In [0]:
import os
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from pandas.plotting import scatter_matrix # optional
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings("ignore")

np.random.seed(42)

url = "https://raw.githubusercontent.com/erinmjohnston/MLproject/master/2016-postsecondary-enrollment.csv"
data = pd.read_csv(url)
data= data.drop("DIV_NAME", axis=1)
data = data.drop("COHORT_GRADUATE_CNT", axis=1)
data = data.drop("PS_ENROLLMENT_CNT", axis=1)

In [0]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler

attributes = ['FEDERAL_RACE_CODE', 'GENDER', 'DISABILITY_FLAG', 'LEP_FLAG', 'DISADVANTAGED_FLAG']

full_pipeline = ColumnTransformer([
    ("cat", OneHotEncoder(sparse=False), attributes)
])

encoder = OneHotEncoder(sparse=False)

prep_data_y = np.array([data["PS_ENROLLMENT_RATE"].values]).transpose()
prep_data_x = encoder.fit_transform(data[attributes])
feature_names = encoder.get_feature_names(attributes)
# feature_names = np.append(encoder.get_feature_names(attributes),"PS_ENROLLMENT_RATE")
# print(feature_names)

In [0]:
x_train, x_test, y_train, y_test = train_test_split(prep_data_x, prep_data_y, test_size=0.2, random_state=42)

In [0]:
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.linear_model import SGDRegressor

mySGDModel = SGDRegressor()
mySGDModel.fit(x_train, y_train)
SGD_predict = mySGDModel.predict(x_test)
mse = mean_squared_error(y_test, SGD_predict)
mae = mean_absolute_error(y_test, SGD_predict)
mySGDModel_rmse = np.sqrt(mse)
print("Feature Coefficients:", mySGDModel.coef_)
print("SGD MAE: ", mae)
print("SGD MSE: ", mse)
print("SGD RMSE: ", mySGDModel_rmse)

Feature Coefficients: [0.00104685 0.104461   0.01595386 0.00453491 0.00130627 0.04029118
 0.14435195 0.02324211 0.15377406 0.01382    0.11351998 0.05407408
 0.16394167 0.00365239]
SGD MAE:  0.12158430935325411
SGD MSE:  0.02223750701069805
SGD RMSE:  0.14912245642658267


In [0]:
from sklearn.linear_model import LinearRegression

lin = LinearRegression()
lin.fit(x_train, y_train)
lin_predict = lin.predict(x_test)
mse = mean_squared_error(y_test, lin_predict)
lin_rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, lin_predict)
print("Feature Coefficients:", lin.coef_)
print("Linear Regression MAE: ", mae)
print("Linear Regression MSE: ", mse)
print("Linear Regression RMSE: ", lin_rmse)

Feature Coefficients: [[ 4.60929858e+11  4.60929858e+11  4.60929858e+11  4.60929858e+11
   4.60929858e+11  4.60929858e+11 -6.64697396e+12 -6.64697396e+12
   1.83960473e+11  1.83960473e+11  5.56470725e+12  5.56470725e+12
   4.08643171e+12  4.08643171e+12]]
Linear Regression MAE:  0.12228753375347222
Linear Regression MSE:  0.022263108839484392
Linear Regression RMSE:  0.14920827336138032


In [0]:
from sklearn import linear_model 

reg = linear_model.Ridge(alpha=.5)
reg.fit(x_train, y_train)
reg_predict = reg.predict(x_test)
mse = mean_squared_error(y_test, reg_predict)
reg_rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, reg_predict)

coef = reg.coef_
coef = pd.DataFrame(coef, columns=feature_names)
print("Feature Coefficients:")
print(coef)
print("Ridge Regression MAE: ", mae)
print("Ridge Regression MSE: ", mse)
print("Ridge Regression RMSE: ", reg_rmse)

Feature Coefficients:
   FEDERAL_RACE_CODE_1  ...  DISADVANTAGED_FLAG_Y
0             0.013653  ...              -0.07878

[1 rows x 14 columns]
Ridge Regression MAE:  0.12232832800742607
Ridge Regression MSE:  0.022235403334532815
Ridge Regression RMSE:  0.1491154027407391


In [0]:
from sklearn.model_selection import cross_val_score

def display_scores(scores):
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard deviation", scores.std())
    print()

SGD_scores = cross_val_score(mySGDModel, x_train, y_train, 
                         scoring = "neg_mean_squared_error", cv=120)

SGD_rmse_scores = np.sqrt(-SGD_scores)

lin_scores = cross_val_score(lin, x_train, y_train, 
                         scoring = "neg_mean_squared_error", cv=120)

lin_rmse_scores = np.sqrt(-lin_scores)

reg_scores = cross_val_score(reg, x_train, y_train, 
                         scoring = "neg_mean_squared_error", cv=120)

reg_rmse_scores = np.sqrt(-reg_scores)

print("SGD:")
display_scores(SGD_rmse_scores)
print("linear regression:")
display_scores(lin_rmse_scores)
print("ridge regression:")
display_scores(reg_rmse_scores)
print("Feature Coefficients:", reg.coef_)

SGD:
Scores: [0.08735001 0.09372358 0.17681805 0.09505034 0.11003537 0.12006504
 0.10507946 0.11969911 0.09060129 0.12122964 0.21191073 0.13882643
 0.15526613 0.18673967 0.16115789 0.10234551 0.12150819 0.1792844
 0.14305981 0.20008501 0.20663104 0.17267046 0.11065469 0.15675359
 0.10204947 0.10254531 0.19468663 0.14159857 0.08670979 0.08161144
 0.0934205  0.07650773 0.15407061 0.16203099 0.13122701 0.11968598
 0.21388015 0.1409204  0.1745795  0.13230947 0.12913207 0.15726539
 0.15172632 0.09829166 0.14278337 0.13125433 0.11190201 0.1469233
 0.19360584 0.19431227 0.25708306 0.23129638 0.07752393 0.16370045
 0.11798672 0.13394676 0.0892751  0.13910832 0.12156012 0.11055122
 0.05784    0.09205442 0.11846655 0.11899767 0.12622461 0.12489046
 0.10332929 0.17774604 0.14127466 0.12290213 0.09944566 0.15075556
 0.13741848 0.18425708 0.12038929 0.07963563 0.21190254 0.21616865
 0.13193904 0.17925191 0.09944951 0.11716795 0.09440642 0.23139603
 0.10266746 0.22497872 0.11171472 0.07535828 0.1753

In [0]:
from sklearn import linear_model 
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import mean_squared_error, mean_absolute_error


reg_predict =  cross_val_predict(reg, x_train, y_train, cv=15)


Feature Weights:


Asian                        0.145284           
Black                       -0.041909   
Hispanic                 -0.074799
White -0.053822
FEDERAL_RACE_CODE_99 0.011592
Female 0.061532
Male -0.061532
DISABILITY_FLAG_N 0.064228
DISABILITY_FLAG_Y -0.064228
LEP_FLAG_N 0.018396
LEP_FLAG_Y -0.018396
DISADVANTAGED_FLAG_N 0.07878
DISADVANTAGED_FLAG_Y -0.07878