# Simple Exercise on Overfitting

In [62]:
# If necessary, install the required packages
# import Pkg; Pkg.add("GLM")
# import Pkg; Pkg.add("DataFrames")

In [67]:
# Load packages
using LinearAlgebra, GLM, DataFrames, Statistics, Random

## 1. First set p=n


In [82]:
# Fix the seed so that the simulated data are reproducible.
Random.seed!(1234)

# Square design: as many regressors as observations.
n = p = 1000

# Regressors: n×p matrix of independent standard Gaussian draws.
X = randn(n, p)

# Outcome: length-n vector of standard Gaussian draws, generated separately from X.
Y = randn(n)

# We cannot run the regression below with `lm`: it needs n > p, otherwise an error is thrown
# (I think this is because of the matrix decomposition procedure it uses).
# Fitted linear regression 
# fitted = lm(X,Y)

"""
    OLSestimator(Y, X)

Fit the linear model `Y ≈ X * β` by ordinary least squares.

`X` is used as given: no intercept column is added.

# Arguments
- `Y`: response vector, one entry per observation.
- `X`: regressor matrix, one row per observation and one column per regressor.

# Returns
A tuple `(β, R_squared, R_squared_adj)`:
- `β`: least-squares coefficients.
- `R_squared`: `1 - SSR/SST`, with `SST` taken around the mean of `Y`.
- `R_squared_adj`: `1 - (1 - R_squared) * (n - 1) / (n - p - 1)`, where `n` is the
  number of observations and `p` the number of columns of `X`. The denominator is
  not positive when `n <= p + 1`, so the value is not meaningful in that case.
"""
function OLSestimator(Y, X)
    # Solve the least-squares problem with `\` rather than inv(X'X) * X'Y: forming and
    # inverting X'X squares the condition number and loses accuracy when the columns
    # of X are close to collinear.
    β = X \ Y

    errors = Y - X * β
    n = size(Y, 1)
    p = size(X, 2)

    R_squared = 1.0 - sum(abs2, errors) / sum(abs2, Y .- mean(Y))
    R_squared_adj = 1.0 - (1.0 - R_squared) * (n - 1.0) / (n - p - 1.0)

    return β, R_squared, R_squared_adj
end

# Run the hand-written estimator on the p = n design; the result is the tuple
# (coefficients, R², adjusted R²).
results_ols = OLSestimator(Y, X)

# Ratio of regressors to observations.
println("p/n is\n", p / n)

# In-sample fit measures: positions 2 and 3 of the returned tuple.
println("R2 is \n", results_ols[2])
println("Adjusted R2 is", results_ols[3])

p/n is
1.0
R2 is 
1.0
Adjusted R2 is1.0


## 2. Second, set p=n/2.

In [12]:
# p must be an integer because it is used as an array dimension in `randn(n, p)`.
# Integer division (`÷`) keeps the computation in Int and avoids the Float64 round-trip
# of `Int(n/2)`, which throws an InexactError when n is odd.
n = 1000
p = n ÷ 2

500

In [13]:
# Inspect the type of n; it is used as an array dimension in `randn(n, p)` below.
typeof(n)

Int64

In [14]:
# Inspect the type of p; it is used as an array dimension in `randn(n, p)` below.
typeof(p)

Int64

In [15]:
# Create an n×p matrix of standard Gaussian draws (the regressors)
X = randn(n, p)

# Create a length-n vector of standard Gaussian draws (the outcome); it is generated
# separately from X, so any in-sample fit found below comes from noise alone
Y = randn(n)

1000-element Vector{Float64}:
 -0.30015490103405207
 -0.22895676630021647
  0.7282028843878053
 -1.6533898186796983
 -0.09364106723489733
  0.6650864931423428
 -0.23256518807716417
  0.021328231157727952
 -1.2067049523328548
 -0.08869083083922816
 -0.14177106470083578
 -2.11340131801068
  0.5106451384482424
  ⋮
 -2.5048162995323193
  1.386638764088553
 -1.2053568302576134
 -2.7348676028700942
  0.37872121675764847
  0.937527099678896
 -0.7578173416199854
  0.17076265666581694
 -0.6054478946541058
 -0.5099968722799145
  1.698619884283135
 -2.6704416285884163

In [16]:
# Fit the least-squares regression of Y on the columns of X with GLM's `lm`.
# X contains only the simulated regressors (no column of ones).
fitted = lm(X,Y)

LinearModel{GLM.LmResp{Vector{Float64}}, GLM.DensePredChol{Float64, CholeskyPivoted{Float64, Matrix{Float64}}}}:

Coefficients:
───────────────────────────────────────────────────────────────────────────
             Coef.  Std. Error      t  Pr(>|t|)     Lower 95%     Upper 95%
───────────────────────────────────────────────────────────────────────────
x1    -0.0261678     0.0446054  -0.59    0.5577  -0.113805      0.0614693
x2     0.014137      0.0465098   0.30    0.7613  -0.0772418     0.105516
x3     0.0145645     0.0452222   0.32    0.7475  -0.0742844     0.103413
x4     0.0604253     0.0453579   1.33    0.1834  -0.0286903     0.149541
x5    -0.0688461     0.044655   -1.54    0.1238  -0.156581      0.0188885
x6     0.0352607     0.0445869   0.79    0.4294  -0.0523402     0.122862
x7     0.0808652     0.0468315   1.73    0.0848  -0.0111456     0.172876
x8    -0.00926784    0.0460025  -0.20    0.8404  -0.0996499     0.0811142
x9    -0.0356026     0.0438818  -0.81    0.4176  -0.12181

In [17]:
# Ratio of regressors to observations, printed on the line after its label.
println("p/n is\n", p / n)

p/n is
0.5


In [18]:
# Print a label, then return the in-sample R² of the fitted model as the cell value.
# Y is pure noise, so a large value here reflects overfitting only.
print("R2 is")
r2(fitted)

R2 is

0.522759164993229

In [19]:
# Print a label, then return the adjusted R² of the fitted model as the cell value.
print("Adjusted R2 is")
adjr2(fitted)

Adjusted R2 is

0.04647281165647166

## 3. Third, set p/n =.05

In [186]:
# p must be an integer because it is used as an array dimension in `randn(n, p)`.
# `round(Int, …)` is used instead of `Int(…)`: the product 0.05 * n is a Float64 and
# `Int` throws an InexactError whenever floating-point rounding leaves it not exactly
# integral.
n = 1000
p = round(Int, 0.05 * n)

50

In [187]:
# Inspect the type of n; it is used as an array dimension in `randn(n, p)` below.
typeof(n)

Int64

In [188]:
# Inspect the type of p; it is used as an array dimension in `randn(n, p)` below.
typeof(p)

Int64

In [189]:
# Create an n×p matrix of standard Gaussian draws (the regressors)
X = randn(n, p)

# Create a length-n vector of standard Gaussian draws (the outcome); it is generated
# separately from X, so any in-sample fit found below comes from noise alone
Y = randn(n)

1000-element Vector{Float64}:
  1.167920811620045
 -0.3280411047077068
 -0.8445415160849165
  1.4553651114710981
  1.5140771128264483
  0.15609705126554174
  0.1020492425396467
 -0.20516975442750912
 -0.7742831329887733
 -1.2802008954860669
 -1.0419521195296317
  0.4867823028015282
  0.9859121021754983
  ⋮
  2.458186249870579
 -0.32607475483433235
 -0.6009803684621868
 -0.8538350833575432
 -0.5932227849943221
 -1.3403802566733682
  1.58209844750404
  0.4890557131029599
  0.5692655432112158
 -1.1819177091007875
 -0.8497959304846961
  2.197411166468226

In [190]:
# Fit the least-squares regression of Y on the columns of X with GLM's `lm`.
# X contains only the simulated regressors (no column of ones).
fitted = lm(X,Y)

LinearModel{GLM.LmResp{Vector{Float64}}, GLM.DensePredChol{Float64, CholeskyPivoted{Float64, Matrix{Float64}}}}:

Coefficients:
──────────────────────────────────────────────────────────────────────
           Coef.  Std. Error      t  Pr(>|t|)    Lower 95%   Upper 95%
──────────────────────────────────────────────────────────────────────
x1    0.0116557    0.0325445   0.36    0.7203  -0.0522117   0.0755231
x2    0.0203533    0.0321537   0.63    0.5269  -0.0427472   0.0834538
x3   -0.0110659    0.0326532  -0.34    0.7348  -0.0751466   0.0530148
x4    0.00764628   0.0326984   0.23    0.8152  -0.0565231   0.0718157
x5    0.0455609    0.0337232   1.35    0.1770  -0.0206196   0.111741
x6   -0.0241702    0.0313633  -0.77    0.4411  -0.0857195   0.0373791
x7   -0.0437027    0.033999   -1.29    0.1990  -0.110425    0.0230192
x8   -0.025032     0.0331398  -0.76    0.4502  -0.0900677   0.0400037
x9   -0.0142894    0.0314473  -0.45    0.6497  -0.0760037   0.0474249
x10   0.00394858   0.0328441  

In [191]:
# Ratio of regressors to observations, printed on the line after its label.
println("p/n is\n", p / n)

p/n is
0.05


In [192]:
# Print a label, then return the in-sample R² of the fitted model as the cell value.
# Y is pure noise, so whatever fit is reported here reflects overfitting only.
print("R2 is")
r2(fitted)

R2 is

0.04567414997924413

In [193]:
# Print a label, then return the adjusted R² of the fitted model as the cell value.
print("Adjusted R2 is")
adjr2(fitted)

Adjusted R2 is

-0.0035489728113000663