# Full Data Anaylsis 
Run regular linear regression, ridge regression and lasso regression with the full data set.

In [72]:
import pandas as pd
import numpy as np
from math import sqrt

import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import kurtosis, skew

from sklearn.linear_model    import LinearRegression, Ridge
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model    import RidgeCV
from sklearn.pipeline        import make_pipeline
from sklearn.preprocessing   import PolynomialFeatures
from sklearn.model_selection import KFold
from sklearn.model_selection import RepeatedKFold
from sklearn.preprocessing   import StandardScaler
from sklearn.metrics         import mean_squared_error
from sklearn.linear_model    import Lasso
from sklearn                 import linear_model

from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LassoCV

from statistics import mean

from pandas import DataFrame

from sklearn.neighbors import KNeighborsRegressor

In [18]:
def run_analysis(name, res):
    d = pd.read_csv(name)
    y = d['Sr_PR']
    X = d

    del X['Unnamed: 0']
    del X["ID"]
    del X["Name"]
    del X["School_ID"]
    del X["School"]
    del X["Sr_PR"]
    
    enc = OneHotEncoder(handle_unknown='ignore')
    X = enc.fit_transform(X)

    X_train,X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=137)

    lr = LinearRegression()
    lr.fit(X_train, y_train)
    lr.score(X_train,y_train)
    y_pred = lr.predict(X_test)
 
    print(name)
    print('Train R^2: ',lr.score(X_train, y_train))
    print('Train RMSE:', 
          sqrt(mean_squared_error(y_train, lr.predict(X_train))))
    print('Test R^2: ', lr.score(X_test, y_test))
    print('Test RMSE:', 
            sqrt(mean_squared_error(y_test, lr.predict(X_test))))
    
    res.append([lr.score(X_train, y_train),sqrt(mean_squared_error(y_train, lr.predict(X_train))), 
               lr.score(X_test, y_test), sqrt(mean_squared_error(y_test, lr.predict(X_test)))])



In [19]:
res = []
files = ["100_Women_Results.csv","100_Men_Results.csv", "5000_Women_Results.csv", "5000_Men_Results.csv", 
         "PV_Women_Results.csv", "PV_Men_Results.csv", "TJ_Women_Results.csv", "TJ_Men_Results.csv" ]

for name in files:
    run_analysis(name, res)
    
df = DataFrame(res, columns=[1, 2, 3, 4])
df

100_Women_Results.csv
Train R^2:  0.9999999999999762
Train RMSE: 8.018438174050556e-08
Test R^2:  0.52428975791844
Test RMSE: 0.32086892618333096
100_Men_Results.csv
Train R^2:  0.9999999999999893
Train RMSE: 4.380422734546053e-08
Test R^2:  0.25666189062643074
Test RMSE: 0.33466160429912095
5000_Women_Results.csv
Train R^2:  0.9999999999999895
Train RMSE: 1.5959981975382546e-05
Test R^2:  0.16213787712617844
Test RMSE: 198.0407675905636
5000_Men_Results.csv
Train R^2:  0.999999999999991
Train RMSE: 4.229962019941823e-06
Test R^2:  0.4649242239105126
Test RMSE: 36.97145384308919
PV_Women_Results.csv
Train R^2:  0.9999999999999858
Train RMSE: 1.8087117569452591e-07
Test R^2:  -0.47593599351129234
Test RMSE: 0.39597206575871896
PV_Men_Results.csv
Train R^2:  0.9999999999999906
Train RMSE: 1.5614451589003383e-07
Test R^2:  0.07096158455173573
Test RMSE: 1.9508676874464441
TJ_Women_Results.csv
Train R^2:  0.9999999999999952
Train RMSE: 1.2859833047835678e-07
Test R^2:  0.9071862594866187
T

Unnamed: 0,1,2,3,4
0,1.0,8.018438e-08,0.52429,0.320869
1,1.0,4.380423e-08,0.256662,0.334662
2,1.0,1.595998e-05,0.162138,198.040768
3,1.0,4.229962e-06,0.464924,36.971454
4,1.0,1.808712e-07,-0.475936,0.395972
5,1.0,1.561445e-07,0.070962,1.950868
6,1.0,1.285983e-07,0.907186,1.211311
7,1.0,2.228814e-07,-2.201308,1.837482


In [68]:
def run_analysis_ridge(name, res):
    d = pd.read_csv(name)
    y = d['Sr_PR']
    X = d

    del X['Unnamed: 0']
    del X["ID"]
    del X["Name"]
    del X["School_ID"]
    del X["School"]
    del X["Sr_PR"]
    enc = OneHotEncoder(handle_unknown='ignore')
    X = enc.fit_transform(X)

    X_train,X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=137)


    lr = Ridge(alpha = 1.5)
    lr.fit(X_train, y_train)
    lr.score(X_train,y_train)

    y_pred = lr.predict(X_test)

\  
    print(name)
    print('Train R^2: ',lr.score(X_train, y_train))
    print('Train RMSE:', 
          sqrt(mean_squared_error(y_train, lr.predict(X_train))))
    print('Test R^2: ', lr.score(X_test, y_test))
    print('Test RMSE:', 
            sqrt(mean_squared_error(y_test, lr.predict(X_test))))
    
    res.append([lr.score(X_train, y_train),sqrt(mean_squared_error(y_train, lr.predict(X_train))), 
               lr.score(X_test, y_test), sqrt(mean_squared_error(y_test, lr.predict(X_test)))])




In [69]:
res = []
files = ["100_Women_Results.csv","100_Men_Results.csv", "5000_Women_Results.csv", "5000_Men_Results.csv", 
         "PV_Women_Results.csv", "PV_Men_Results.csv", "TJ_Women_Results.csv", "TJ_Men_Results.csv" ]

for name in files:
    run_analysis_ridge(name, res)
    
df = DataFrame(res, columns=[1, 2, 3, 4])
df

100_Women_Results.csv
Train R^2:  0.9034367305568188
Train RMSE: 0.16171943163628016
Test R^2:  0.45904542359644396
Test RMSE: 0.34216596589083137
100_Men_Results.csv
Train R^2:  0.9177170179769076
Train RMSE: 0.12197973431095538
Test R^2:  0.27091629603628664
Test RMSE: 0.331437301914579
5000_Women_Results.csv
Train R^2:  0.9044508368461424
Train RMSE: 48.117001218757224
Test R^2:  0.25127560115063563
Test RMSE: 187.210117121207
5000_Men_Results.csv
Train R^2:  0.9332328059630923
Train RMSE: 11.509252237722045
Test R^2:  0.3542160264018016
Test RMSE: 40.61650173637367
PV_Women_Results.csv
Train R^2:  0.9166358859832549
Train RMSE: 0.43743834120869257
Test R^2:  -0.15075603524991954
Test RMSE: 0.3496410517182923
PV_Men_Results.csv
Train R^2:  0.9027215697004168
Train RMSE: 0.500863844650941
Test R^2:  0.04711159068232307
Test RMSE: 1.9757500495781763
TJ_Women_Results.csv
Train R^2:  0.8993649767341294
Train RMSE: 0.588606438886384
Test R^2:  0.8659232260127132
Test RMSE: 1.455882492369

Unnamed: 0,1,2,3,4
0,0.903437,0.161719,0.459045,0.342166
1,0.917717,0.12198,0.270916,0.331437
2,0.904451,48.117001,0.251276,187.210117
3,0.933233,11.509252,0.354216,40.616502
4,0.916636,0.437438,-0.150756,0.349641
5,0.902722,0.500864,0.047112,1.97575
6,0.899365,0.588606,0.865923,1.455882
7,0.901936,1.566231,-1.643686,1.6698


In [66]:
def run_analysis_lasso(name, res):
    d = pd.read_csv(name)
    y = d['Sr_PR']
    X = d

    del X["ID"]
    del X['Unnamed: 0']
    del X["Name"]
    del X["School_ID"]
    del X["School"]
    del X["Sr_PR"]
    

    enc = OneHotEncoder(handle_unknown='ignore')
    X = enc.fit_transform(X)

    X_train,X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=137)

    
    cv = RepeatedKFold(n_splits=5, n_repeats=3, random_state=1)
    model = LassoCV()
    model.fit(X_train, y_train)
    
    lst = []
    lst.append(model.alpha_)
    y_pred = model.predict(X_test)

    
    print(name)
    print('Train R^2: ',model.score(X_train, y_train))
    print('Train RMSE:', 
          sqrt(mean_squared_error(y_train, model.predict(X_train))))
    print('Test R^2: ', model.score(X_test, y_test))
    print('Test RMSE:', 
            sqrt(mean_squared_error(y_test, model.predict(X_test))))
    
    res.append(lst + [model.score(X_train, y_train),sqrt(mean_squared_error(y_train, model.predict(X_train))), 
               model.score(X_test, y_test), sqrt(mean_squared_error(y_test, model.predict(X_test)))])



In [67]:
res = []
files = ["100_Women_Results.csv","100_Men_Results.csv", "5000_Women_Results.csv", "5000_Men_Results.csv", 
         "PV_Women_Results.csv", "PV_Men_Results.csv", "TJ_Women_Results.csv", "TJ_Men_Results.csv" ]

for name in files:
    run_analysis_lasso(name, res)
    
df = DataFrame(res, columns=[0, 1, 2, 3, 4])
df

100_Women_Results.csv
Train R^2:  0.8189256043939803
Train RMSE: 0.22145462134113292
Test R^2:  0.32878281420178224
Test RMSE: 0.38114299623799286
100_Men_Results.csv
Train R^2:  0.9406568330940362
Train RMSE: 0.10359004729260153
Test R^2:  0.07423638641898778
Test RMSE: 0.37347603790798606
5000_Women_Results.csv
Train R^2:  0.619304655179991
Train RMSE: 96.04480792679227
Test R^2:  0.6330433222182196
Test RMSE: 131.06167671043147
5000_Men_Results.csv
Train R^2:  0.37345824792027515
Train RMSE: 35.25661022246985
Test R^2:  0.34601533754834957
Test RMSE: 40.873578814354985
PV_Women_Results.csv
Train R^2:  0.9921245875419399
Train RMSE: 0.13445096719055158
Test R^2:  0.5238570229394021
Test RMSE: 0.22490519067205156
PV_Men_Results.csv
Train R^2:  0.9966211612097924
Train RMSE: 0.09334587724548608
Test R^2:  -0.0006360990767813046
Test RMSE: 2.024645826497728
TJ_Women_Results.csv
Train R^2:  0.9147652803320558
Train RMSE: 0.5416998327753278
Test R^2:  0.9589122153816797
Test RMSE: 0.80594

Unnamed: 0,0,1,2,3,4
0,0.000988,0.818926,0.221455,0.328783,0.381143
1,0.000476,0.940657,0.10359,0.074236,0.373476
2,1.631314,0.619305,96.044808,0.633043,131.061677
3,0.327242,0.373458,35.25661,0.346015,40.873579
4,0.000726,0.992125,0.134451,0.523857,0.224905
5,0.000436,0.996621,0.093346,-0.000636,2.024646
6,0.004493,0.914765,0.5417,0.958912,0.805946
7,0.00022,0.999958,0.03254,0.366965,0.817096
