In [16]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from scipy import stats

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge, RidgeCV, Lasso
from sklearn.metrics import r2_score

In [6]:
# load our newly transformed dataframe

df_m = pd.read_csv('../data/processed/student-mat-cleaned.csv')

In order to proceed with our statistical analysis, we need to formulate our null and alternative hypothesis based on the topic we have been tackling. So far, we have been basing our analysis around whether or not alcoholic drinking during the weekday and weekend has an effect on student academic performance. We have also tackled the possibility of whether or not the students living together or not had an effect on the students academic perfomance.

Since there was only 41 students who's parents were seperated compared to the 354 who lived together, we can get rid of that topic since we do not have enough data to determine whether or not parents living together or not would affect students academic performance.

So for now, our null hypothesis for the topics we were analyzing is that there is no impact between drinking during the week and on the weekends and student academic performance. The alternative hypothesis is that there is an influence with drinking and students academic performance.

In [7]:
stats.ttest_ind(df_m['Walc'], df_m['G1'])

TtestResult(statistic=-48.106581678312274, pvalue=1.066679111027966e-236, df=788.0)

In [8]:
stats.ttest_ind(df_m['Walc'], df_m['G2'])

TtestResult(statistic=-42.10380578098366, pvalue=7.398123109067024e-204, df=788.0)

In [9]:
stats.ttest_ind(df_m['Walc'], df_m['G3'])

TtestResult(statistic=-33.92762208852603, pvalue=3.052876947654766e-156, df=788.0)

In [10]:
stats.ttest_ind(df_m['Dalc'], df_m['G1'])

TtestResult(statistic=-54.522714364484365, pvalue=1.1987190197925897e-269, df=788.0)

In [11]:
stats.ttest_ind(df_m['Dalc'], df_m['G2'])

TtestResult(statistic=-47.47095222350131, pvalue=2.592268385219424e-233, df=788.0)

In [12]:
stats.ttest_ind(df_m['Dalc'], df_m['G3'])

TtestResult(statistic=-38.04467680540229, pvalue=1.3588855289948016e-180, df=788.0)

# Linear Regression Modeling

In [13]:
y = df_m['G3']
x = df_m.drop('G3', axis = 1)

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=68)

In [14]:
model = LinearRegression()
model.fit(x_train, y_train)

y_pred = model.predict(x_test)

In [15]:
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print('Mean Squared Error: ', mse)
print('R-squared:', r2)

Mean Squared Error:  5.159109091114173
R-squared: 0.79768240118271


# Ridge Regression Model

In [17]:
# Ridge Regression Model
ridgeReg = Ridge(alpha=10)

ridgeReg.fit(x_train, y_train)

train_score_ridge = ridgeReg.score(x_train, y_train)
test_score_ridge = ridgeReg.score(x_test, y_test)

print("The train score for ridge model is {}".format(train_score_ridge))
print("The test score for ridge model is {}".format(test_score_ridge))

The train score for ridge model is 0.8402653586165508
The test score for ridge model is 0.7976315083661282


# Lasso Regression Model

In [19]:
# Lasso Regression Model

lasso = Lasso(alpha=10)
lasso.fit(x_train, y_train)
train_score_lasso = lasso.score(x_train, y_train)
test_score_lasso = lasso.score(x_test, y_test)

print("The train score for lasso model is {}".format(train_score_lasso))
print("The test score for lasso model is {}".format(test_score_lasso))

The train score for lasso model is 0.46239462200687276
The test score for lasso model is 0.4103450043103516
