# Simple Exercise on Overfitting

In [1]:
# If necessary, install the required packages
# import Pkg; Pkg.add("GLM")
# import Pkg; Pkg.add("DataFrames")

In [2]:
# Import functions
using LinearAlgebra, GLM, DataFrames, Statistics, Random

## 1. First set p=n


In [3]:
# Seed the global RNG so the simulated data below are reproducible
Random.seed!(1234)

# Sample size and number of regressors: as many regressors as observations
n = p = 1000

# n×p design matrix whose entries are standard Gaussian draws
X = randn(Float64, n, p)

# Length-n response vector of standard Gaussian draws
Y = randn(Float64, n)

# We cannot run the regression below with `lm`: it needs n > p, otherwise an error shows up
# (I think this is because of the matrix decomposition procedure it uses).
# Fitted linear regression
# fitted = lm(X,Y)

# This function returns the coefficients, R2 and adjusted R2

"""
    OLSestimator(Y, X)

Fit the least-squares regression of `Y` on the columns of `X` and report its fit.

No intercept column is added; include one in `X` if the model needs it.

# Arguments
- `Y`: response, one entry per observation.
- `X`: design matrix, one row per observation and one column per regressor.

# Returns
A tuple `(β, R_squared, R_squared_adj)`:
- `β`: least-squares coefficients.
- `R_squared`: `1 - SSR/SST`, with `SST` taken around `mean(Y)`.
- `R_squared_adj`: `1 - (1 - R²)(n - 1)/(n - p - 1)` with `n = size(Y, 1)` and
  `p = size(X, 2)`. The denominator is zero or negative when `n ≤ p + 1`, so the
  value is not meaningful in that regime.
"""
function OLSestimator(Y, X)
    # Solve the least-squares problem directly. Forming and inverting X'X squares the
    # condition number and loses accuracy when X is ill-conditioned.
    β = X \ Y

    errors = Y - X * β
    nobs = size(Y, 1)
    nregressors = size(X, 2)

    R_squared = 1.0 - sum(abs2, errors) / sum(abs2, Y .- mean(Y))
    R_squared_adj = 1.0 - (1.0 - R_squared) * (nobs - 1.0) / (nobs - nregressors - 1.0)

    return β, R_squared, R_squared_adj
end

# Fit the p = n regression with the hand-written estimator.
# `results_ols` is the tuple (coefficients, R2, adjusted R2).
results_ols = OLSestimator(Y, X)

println("p/n is")
println(p / n)

print("R2 is \n")
println(results_ols[2])

# End the label with a newline, as the labels above do; without it the value was
# glued onto the text ("Adjusted R2 is1.0").
println("Adjusted R2 is")
println(results_ols[3])

p/n is


1.0
R2 is 
1.0
Adjusted R2 is1.0


## 2. Second, set p=n/2.

In [4]:
# `p` is used as an array dimension in `randn(n, p)`, so it must be an integer.
# Integer division keeps it an `Int` and, unlike `Int(n/2)`, does not throw an
# `InexactError` when `n` is odd.
n = 1000
p = n ÷ 2

500

In [5]:
# Inspect the type of n (it is passed to randn as an array dimension)
typeof(n)

Int64

In [6]:
# Inspect the type of p (it is passed to randn as an array dimension)
typeof(p)

Int64

In [7]:
# Draw the n×p design matrix from a standard Gaussian
X = randn(Float64, n, p)

# Draw the length-n response vector from a standard Gaussian
Y = randn(Float64, n)

1000-element Vector{Float64}:
 -0.7778873488524721
 -0.27826446125803916
 -0.011144975398001689
 -1.0981657588764657
 -1.2088758985491925
 -1.0192912103159015
 -1.1418874258552272
 -0.8245877193228884
  1.5524067169771316
  0.7090651605707038
  ⋮
 -0.032534335390493364
 -1.144674448137436
  0.46399588407671727
  0.45173726531429176
 -1.2985145031108982
 -0.2547426124349291
 -1.742204223094074
 -0.3240561539760303
  1.143900382722039
  0.10515979434676617

In [8]:
# Regress Y on the columns of X using GLM's matrix interface
fitted = fit(LinearModel, X, Y)

LinearModel{GLM.LmResp{Vector{Float64}}, GLM.DensePredChol{Float64, CholeskyPivoted{Float64, Matrix{Float64}}}}:

Coefficients:
───────────────────────────────────────────────────────────────────────────
             Coef.  Std. Error      t  Pr(>|t|)     Lower 95%     Upper 95%
───────────────────────────────────────────────────────────────────────────
x1    -0.066894      0.0449574  -1.49    0.1374  -0.155223      0.0214347
x2    -0.0512237     0.0453923  -1.13    0.2597  -0.140407      0.0379594
x3    -0.00863479    0.0444259  -0.19    0.8460  -0.0959192     0.0786496
x4     0.0517948     0.0443612   1.17    0.2435  -0.0353626     0.138952
x5     0.0359121     0.0453187   0.79    0.4285  -0.0531265     0.124951
x6     0.018522      0.0445151   0.42    0.6775  -0.0689377     0.105982
x7    -0.0417572     0.0420076  -0.99    0.3207  -0.12429       0.040776
x8    -0.00215463    0.0428373  -0.05    0.9599  -0.086318      0.0820087
x9     0.0924483     0.0432439   2.14    0.0330   0.0074

In [9]:
# Report the ratio of regressors to observations, label first and value on the next line
for item in ("p/n is", p / n)
    println(item)
end

p/n is
0.5


In [10]:
# Print the R2 together with its label. A bare `r2(fitted)` is only echoed when it is
# the last expression of an interactive cell, and the label had no separator after it.
println("R2 is ", r2(fitted))

R2 is

0.49480312764499856

In [11]:
# Print the adjusted R2 together with its label. A bare `adjr2(fitted)` is only echoed
# when it is the last expression of an interactive cell, and the label had no separator
# after it.
println("Adjusted R2 is ", adjr2(fitted))

Adjusted R2 is

-0.009383350965292747

## 3. Third, set p/n = 0.05

In [12]:
# `p` is used as an array dimension in `randn(n, p)`, so it must be an integer.
# Rounding makes the conversion robust: `Int(0.05*n)` throws an `InexactError`
# whenever the floating-point product is not exactly integer-valued.
n = 1000
p = round(Int, 0.05 * n)

50

In [13]:
# Inspect the type of n (it is passed to randn as an array dimension)
typeof(n)

Int64

In [14]:
# Inspect the type of p (it is passed to randn as an array dimension)
typeof(p)

Int64

In [15]:
# Draw the n×p design matrix from a standard Gaussian
X = randn(Float64, n, p)

# Draw the length-n response vector from a standard Gaussian
Y = randn(Float64, n)

1000-element Vector{Float64}:
 -0.7004399360253225
 -1.725935064888198
 -0.6771200228212352
 -0.46971747972814115
  0.6232779927229138
  2.2159234921652615
  0.4828988108077087
  1.221778818527537
 -1.0857102577248317
  0.8589489241352118
  ⋮
  0.45133211859320993
  0.9158664076106069
 -0.4721324025810449
  0.17836851610814844
  0.276417746121243
  1.069723137321165
  0.5086636917773945
  0.9959568003890946
  0.10960865214423544
  2.065499208991642

In [16]:
# Regress Y on the columns of X using GLM's matrix interface
fitted = fit(LinearModel, X, Y)

LinearModel{GLM.LmResp{Vector{Float64}}, GLM.DensePredChol{Float64, CholeskyPivoted{Float64, Matrix{Float64}}}}:

Coefficients:
───────────────────────────────────────────────────────────────────────
           Coef.  Std. Error      t  Pr(>|t|)    Lower 95%    Upper 95%
───────────────────────────────────────────────────────────────────────
x1   -0.03127      0.0325922  -0.96    0.3376  -0.095231    0.0326911
x2    0.0483691    0.0320516   1.51    0.1316  -0.014531    0.111269
x3    0.0166312    0.0316983   0.52    0.5999  -0.0455755   0.0788379
x4   -0.00819057   0.0320481  -0.26    0.7983  -0.0710838   0.0547026
x5   -0.00981615   0.0320041  -0.31    0.7591  -0.0726231   0.0529908
x6    0.0469078    0.0334549   1.40    0.1612  -0.0187462   0.112562
x7    0.0274112    0.0331971   0.83    0.4092  -0.0377369   0.0925594
x8   -0.0391661    0.0318588  -1.23    0.2192  -0.101688    0.0233556
x9    0.0166961    0.0315433   0.53    0.5967  -0.0452065   0.0785988
x10  -0.00446421   0.0333214

In [17]:
# Report the ratio of regressors to observations, label first and value on the next line
foreach(println, ("p/n is", p / n))

p/n is
0.05


In [18]:
# Print the R2 together with its label. A bare `r2(fitted)` is only echoed when it is
# the last expression of an interactive cell, and the label had no separator after it.
println("R2 is ", r2(fitted))

R2 is

0.044186224291295706

In [19]:
# Print the adjusted R2 together with its label. A bare `adjr2(fitted)` is only echoed
# when it is the last expression of an interactive cell, and the label had no separator
# after it.
println("Adjusted R2 is ", adjr2(fitted))

Adjusted R2 is

-0.005113644139995488