In [1]:
# Julia 0.7
using CSV, DataFrames, Gadfly, GLM, Distributions

┌ Info: Loading DataFrames support into Gadfly.jl
└ @ Gadfly /Users/jonathan/.julia/packages/Gadfly/ew1SM/src/mapping.jl:228


In [2]:
# Load the tuition/salary dataset (path relative to the working directory) into `data`.
data = CSV.read("tuition_vs_salary.csv")

Unnamed: 0_level_0,University,Tuition,Salary
Unnamed: 0_level_1,String⍰,Int64⍰,Int64⍰
1,Harvey Mudd College,50649,157400
2,Massachusetts Institute of Technology,46704,150400
3,Stanford University,46320,143100
4,California Institute of Technology,45390,143100
5,Harvard University,45278,142600
6,Princeton University,43450,141300
7,Yale University,47600,135400
8,University of California-Berkeley,13431,132300
9,Pennsylvania State University-Main Campus,17514,103100
10,Ohio State University-Main Campus,10037,95100


In [None]:
# Plot of the explanatory variable: scatter plot of Salary (y) against Tuition (x).
fig1 = plot(data, x=:Tuition, y=:Salary, Geom.point)

In [None]:
# Build the response vector and the design matrix of the simple linear model
# Salary = β₀ + β₁·Tuition + ε.
#
# Both columns allow `missing`, so only rows where BOTH values are present are kept.
# Skipping missings in each column independently could pair a tuition with the
# salary of a different row, or produce vectors of different lengths.
complete = map((t, s) -> !(ismissing(t) || ismissing(s)), data[:Tuition], data[:Salary])
X₁ = collect(skipmissing(data[:Tuition][complete]))  # explanatory variable
Y = collect(skipmissing(data[:Salary][complete]))    # response variable
n = length(Y)                                        # number of complete observations
X = hcat(ones(n), X₁)                                # intercept column, then tuition
p = size(X, 2) - 1                                   # number of explanatory variables

In [None]:
# Least-squares estimate of the regression coefficients (intercept, slope).
# `X \ Y` solves the least-squares problem from a factorization of X itself,
# rather than forming X'X (which squares the condition number) and the
# intermediate 2×n matrix (X'X)⁻¹X'.
β̂ = X \ Y

# Observed points.
sample = layer(x=X[:, 2], y=Y, Geom.point, Theme(default_color="deepskyblue"))

# Fitted line evaluated on a 10-point grid spanning the observed tuition range;
# `xx` has the same two-column layout as the design matrix X.
xx = hcat(ones(10), range(minimum(X₁), stop=maximum(X₁), length=10))
regression = layer(x=xx[:, 2], y=xx * β̂, Geom.line, Theme(default_color="red"))

plot(sample, regression,
    Guide.manual_color_key("Légende", ["Échantillon", "Régression"], ["deepskyblue","red"]),
    Guide.xlabel("Tuition"), Guide.ylabel("Salary"))


In [None]:
# Check of assumptions 1 and 2: residuals plotted against the fitted values.
# NOTE(review): the assumption numbering refers to material outside this file.
Ŷ = X * β̂    # fitted values
r = Y - Ŷ     # residuals
plot(
    x = Ŷ,
    y = r,
    Geom.point,
    Guide.xlabel("Valeur prédite"),
    Guide.ylabel("Résidu"),
)

In [None]:
# Check of assumption 4: quantile-quantile plot of the residuals, scaled by their
# sample standard deviation, against the standard normal distribution.
empirical = layer(
    x = Normal(),
    y = r ./ std(r),
    Stat.qq,
    Geom.point,
    Theme(default_color = "deepskyblue"),
)

# Reference line through (-2, -2) and (2, 2).
# NOTE(review): `[-2 2]` is a 1×2 matrix, not a vector; `[-2, 2]` would be the
# conventional form — confirm Gadfly renders both identically before changing it.
theoretical = layer(x = [-2 2], y = [-2 2], Geom.line, Theme(default_color = "red"))

plot(
    empirical,
    theoretical,
    Guide.xlabel("Quantile théorique"),
    Guide.ylabel("Quantile empirique"),
)

In [None]:
# Global significance test of the regression (F-test) and coefficient of determination.
SST = sum(abs2, Y .- mean(Y))   # total sum of squares
SSE = sum(abs2, r)              # residual (error) sum of squares
SSR = SST - SSE                 # sum of squares explained by the regression

# F statistic with p and n-p-1 degrees of freedom. The degrees of freedom are
# written in terms of `p` so they always match the reference distribution used
# for the p-value below (they were previously hard-coded for p = 1).
F₀ = (SSR/p) / (SSE/(n-p-1))

# H₀ is rejected at the 5% level when the p-value associated with F₀ is below 5%.
valeurp = ccdf(FDist(p, n-p-1), F₀)
if valeurp < 0.05
    println("Comme la valeur-p = $valeurp<0.05, on rejette H_0. La régression est donc significative.")
else
    println("Comme la valeur-p = $valeurp>0.05, on ne rejette pas H_0. La régression n'est donc pas significative.")
end

# Coefficient of determination.
R² = SSR/SST
println("Le coefficient de détermination est $R²")