In [56]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from scipy import stats

from sklearn.linear_model import Lasso, LassoCV, LinearRegression, Ridge, RidgeCV
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [18]:
# Read the cleaned Portuguese-course student dataset produced by the
# preprocessing step into `df_p`, the frame every later cell works from.

df_p = pd.read_csv(filepath_or_buffer='../data/processed/student-por-cleaned.csv')

In order to proceed with our statistical analysis, we need to formulate our null and alternative hypotheses for the topic we have been investigating. So far, our analysis has centred on whether drinking alcohol on weekdays and on weekends has an effect on students' academic performance. We have also considered whether the students' parents living together or apart has an effect on the students' academic performance.

Since there were only 80 students whose parents were separated compared to the 569 whose parents lived together, we can set that topic aside: we do not have enough data to determine whether parents living together or apart affects students' academic performance.

So for now, our null hypothesis is that drinking during the week and on the weekends has no effect on student academic performance. The alternative hypothesis is that drinking does have an effect on students' academic performance.

In [19]:
# Test for an association between weekend drinking (Walc) and the
# first-period grade (G1).
#
# An independent two-sample t-test on these two columns only asks whether the
# mean of the alcohol scale equals the mean of the grade scale. The two are
# measured in different units, so that comparison is always "significant" and
# says nothing about whether drinking relates to grades. Spearman's rank
# correlation tests H0: "no monotonic association" on the paired observations
# and suits an ordinal alcohol scale.
# NOTE: re-run this cell to regenerate its output.
stats.spearmanr(df_p['Walc'], df_p['G1'])

In [20]:
# Test for an association between weekend drinking (Walc) and the
# second-period grade (G2).
#
# A two-sample t-test between the alcohol column and the grade column merely
# compares the means of two differently-scaled variables, so it cannot speak
# to the hypothesis. Spearman's rank correlation tests H0: "no monotonic
# association" between the paired values instead.
# NOTE: re-run this cell to regenerate its output.
stats.spearmanr(df_p['Walc'], df_p['G2'])

In [21]:
# Test for an association between weekend drinking (Walc) and the final
# grade (G3).
#
# A two-sample t-test between the alcohol column and the grade column merely
# compares the means of two differently-scaled variables, so it cannot speak
# to the hypothesis. Spearman's rank correlation tests H0: "no monotonic
# association" between the paired values instead.
# NOTE: re-run this cell to regenerate its output.
stats.spearmanr(df_p['Walc'], df_p['G3'])

In [22]:
# Test for an association between weekday drinking (Dalc) and the
# first-period grade (G1).
#
# A two-sample t-test between the alcohol column and the grade column merely
# compares the means of two differently-scaled variables, so it cannot speak
# to the hypothesis. Spearman's rank correlation tests H0: "no monotonic
# association" between the paired values instead.
# NOTE: re-run this cell to regenerate its output.
stats.spearmanr(df_p['Dalc'], df_p['G1'])

In [23]:
# Test for an association between weekday drinking (Dalc) and the
# second-period grade (G2).
#
# A two-sample t-test between the alcohol column and the grade column merely
# compares the means of two differently-scaled variables, so it cannot speak
# to the hypothesis. Spearman's rank correlation tests H0: "no monotonic
# association" between the paired values instead.
# NOTE: re-run this cell to regenerate its output.
stats.spearmanr(df_p['Dalc'], df_p['G2'])

In [24]:
# Test for an association between weekday drinking (Dalc) and the final
# grade (G3).
#
# A two-sample t-test between the alcohol column and the grade column merely
# compares the means of two differently-scaled variables, so it cannot speak
# to the hypothesis. Spearman's rank correlation tests H0: "no monotonic
# association" between the paired values instead.
# NOTE: re-run this cell to regenerate its output.
stats.spearmanr(df_p['Dalc'], df_p['G3'])

# Linear Regression Modeling

In [52]:
# Target is the final grade (G3); every other column of df_p is a predictor.
x = df_p.drop(columns=['G3'])
y = df_p['G3']

# Hold out 25% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.25,
    random_state=68,
)

In [None]:
# Scaling our data for regression
#
# StandardScaler.transform returns a new array and leaves its input untouched,
# so the scaled values have to be stored explicitly. The scaler is fitted on
# the training split only and then applied to both splits, so no information
# from the test rows leaks into the scaling parameters.
# The results go into new variables rather than overwriting x_train / x_test;
# a model only benefits from scaling if it is fitted on the *_scaled frames.
scale = StandardScaler()
scale.fit(x_train)

x_train_scaled = pd.DataFrame(
    scale.transform(x_train), columns=x_train.columns, index=x_train.index
)
x_test_scaled = pd.DataFrame(
    scale.transform(x_test), columns=x_test.columns, index=x_test.index
)

x_train_scaled.head()

In [53]:
# Ordinary least-squares baseline: fit on the training split, then predict
# the held-out rows so the next cell can score them.
model = LinearRegression().fit(x_train, y_train)
y_pred = model.predict(x_test)

In [55]:
# Report the mean squared error and R-squared of the linear model's
# predictions on the held-out test split.

r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error:  {mse}')
print(f'R-squared: {r2}')

Mean Squared Error:  2.414679582266468
R-squared: 0.7904657301172577


# Ridge Regression Model

In [57]:
# Ridge regression with a fixed penalty strength (alpha=10), fitted on the
# training split and scored on both splits to show the train/test gap.
ridgeReg = Ridge(alpha=10).fit(x_train, y_train)

train_score_ridge = ridgeReg.score(x_train, y_train)
test_score_ridge = ridgeReg.score(x_test, y_test)

print(f"The train score for ridge model is {train_score_ridge}")
print(f"The test score for ridge model is {test_score_ridge}")

The train score for ridge model is 0.8707029447557642
The test score for ridge model is 0.7898750588326082


# Lasso Regression Model

In [60]:
# Lasso Regression Model
#
# The previous run used a fixed alpha=10 and scored about 0.03 (train) and
# 0.01 (test), so the penalty was far too strong for the model to explain the
# grades. Instead of guessing a constant, let LassoCV pick alpha by 5-fold
# cross-validation on the training split. LassoCV exposes the same
# fit / score / predict / coef_ interface as Lasso, so `lasso` can be used the
# same way afterwards.
# NOTE: re-run this cell to regenerate its output.

lasso = LassoCV(cv=5, max_iter=10000)
lasso.fit(x_train, y_train)
train_score_lasso = lasso.score(x_train, y_train)
test_score_lasso = lasso.score(x_test, y_test)

print("The selected alpha for lasso model is {}".format(lasso.alpha_))
print("The train score for lasso model is {}".format(train_score_lasso))
print("The test score for lasso model is {}".format(test_score_lasso))
