In [1]:
import requests
import zipfile
import pandas as pd

### Fetch the dataset

In [2]:
response = requests.get('https://archive.ics.uci.edu/ml/machine-learning-databases/00320/student.zip')
zipcontent= response.content
with open("/tmp/student.zip", 'wb') as f:
    f.write(zipcontent)

zip_ref = zipfile.ZipFile('/tmp/student.zip', 'r')
zip_ref.extractall('/tmp/student/')
zip_ref.close()

### Load the datasets for grades in Maths and Portuguese

In [3]:
df_mat = pd.read_csv('/tmp/student/student-mat.csv', delimiter=';')
df_por = pd.read_csv('/tmp/student/student-por.csv', delimiter=';')

### Have a brief look at the dataset content

In [4]:
df_mat.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 395 entries, 0 to 394
Data columns (total 33 columns):
school        395 non-null object
sex           395 non-null object
age           395 non-null int64
address       395 non-null object
famsize       395 non-null object
Pstatus       395 non-null object
Medu          395 non-null int64
Fedu          395 non-null int64
Mjob          395 non-null object
Fjob          395 non-null object
reason        395 non-null object
guardian      395 non-null object
traveltime    395 non-null int64
studytime     395 non-null int64
failures      395 non-null int64
schoolsup     395 non-null object
famsup        395 non-null object
paid          395 non-null object
activities    395 non-null object
nursery       395 non-null object
higher        395 non-null object
internet      395 non-null object
romantic      395 non-null object
famrel        395 non-null int64
freetime      395 non-null int64
goout         395 non-null int64
Dalc          395 no

### Check if there are any outliers in the dataset

In [5]:
df_mat.isnull().sum()

school        0
sex           0
age           0
address       0
famsize       0
Pstatus       0
Medu          0
Fedu          0
Mjob          0
Fjob          0
reason        0
guardian      0
traveltime    0
studytime     0
failures      0
schoolsup     0
famsup        0
paid          0
activities    0
nursery       0
higher        0
internet      0
romantic      0
famrel        0
freetime      0
goout         0
Dalc          0
Walc          0
health        0
absences      0
G1            0
G2            0
G3            0
dtype: int64

### Convert categorical variables to one hot encoding, keep numerical vars as is.

In [6]:
df_mat = pd.get_dummies(df_mat)
df_mat.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 395 entries, 0 to 394
Data columns (total 59 columns):
age                  395 non-null int64
Medu                 395 non-null int64
Fedu                 395 non-null int64
traveltime           395 non-null int64
studytime            395 non-null int64
failures             395 non-null int64
famrel               395 non-null int64
freetime             395 non-null int64
goout                395 non-null int64
Dalc                 395 non-null int64
Walc                 395 non-null int64
health               395 non-null int64
absences             395 non-null int64
G1                   395 non-null int64
G2                   395 non-null int64
G3                   395 non-null int64
school_GP            395 non-null float64
school_MS            395 non-null float64
sex_F                395 non-null float64
sex_M                395 non-null float64
address_R            395 non-null float64
address_U            395 non-null float64
fam

### Binary categorical variables can be encoding using only 1 var, so let's delete the other var

In [7]:
redundant_vars = [
    'school_MS',
    'sex_M',
    'address_U',
    'famsize_LE3',
    'Pstatus_T',
    'schoolsup_no',
    'famsup_no',
    'paid_no',
    'activities_no',
    'nursery_no',
    'higher_no',
    'internet_no',
    'romantic_no'
]
df_mat.drop(redundant_vars, axis=1, inplace=True)
df_mat.head()

Unnamed: 0,age,Medu,Fedu,traveltime,studytime,failures,famrel,freetime,goout,Dalc,...,guardian_mother,guardian_other,schoolsup_yes,famsup_yes,paid_yes,activities_yes,nursery_yes,higher_yes,internet_yes,romantic_yes
0,18,4,4,2,2,0,4,3,4,1,...,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
1,17,1,1,1,2,0,5,3,3,1,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0
2,15,1,1,1,2,3,4,3,2,2,...,1.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0
3,15,4,2,1,3,0,3,2,2,1,...,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
4,16,3,3,1,2,0,4,3,2,1,...,0.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0


### Correlations for success in Maths

In [8]:
corr_mat = df_mat.corr()
corr_mat_G3 = df_mat.corr()['G3']
corr_mat_G3.sort_values()

failures            -0.360415
age                 -0.161579
goout               -0.132791
romantic_yes        -0.129970
traveltime          -0.117142
Mjob_at_home        -0.115634
address_R           -0.105756
sex_F               -0.103456
reason_course       -0.098950
Mjob_other          -0.096477
guardian_other      -0.087774
schoolsup_yes       -0.082788
famsize_GT3         -0.081407
health              -0.061335
Dalc                -0.054660
Fjob_other          -0.053483
Walc                -0.051939
famsup_yes          -0.039157
reason_home         -0.021359
Fjob_services       -0.016108
Fjob_at_home        -0.013385
freetime             0.011307
activities_yes       0.016100
guardian_mother      0.022338
guardian_father      0.032493
absences             0.034247
school_GP            0.045017
famrel               0.051363
nursery_yes          0.051568
reason_other         0.052008
Fjob_health          0.057111
Mjob_teacher         0.057712
Pstatus_A            0.058009
Mjob_servi

In [9]:
df_por = pd.get_dummies(df_por)
df_por.drop(redundant_vars, axis=1, inplace=True)
df_por.head()

Unnamed: 0,age,Medu,Fedu,traveltime,studytime,failures,famrel,freetime,goout,Dalc,...,guardian_mother,guardian_other,schoolsup_yes,famsup_yes,paid_yes,activities_yes,nursery_yes,higher_yes,internet_yes,romantic_yes
0,18,4,4,2,2,0,4,3,4,1,...,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
1,17,1,1,1,2,0,5,3,3,1,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0
2,15,1,1,1,2,0,4,3,2,2,...,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0
3,15,4,2,1,3,0,3,2,2,1,...,1.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0
4,16,3,3,1,2,0,4,3,2,1,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0


### Correlations for success in Portuguese

In [10]:
corr_por = df_por.corr()
corr_por_G3 = df_por.corr()['G3']
corr_por_G3.sort_values()

failures            -0.393316
Dalc                -0.204719
Walc                -0.176619
address_R           -0.167637
Mjob_at_home        -0.136778
reason_other        -0.132577
traveltime          -0.127173
freetime            -0.122705
age                 -0.106505
health              -0.098851
reason_course       -0.098305
absences            -0.091379
romantic_yes        -0.090583
goout               -0.087641
guardian_other      -0.080729
schoolsup_yes       -0.066405
Mjob_other          -0.059251
paid_yes            -0.054898
Fjob_services       -0.053204
famsize_GT3         -0.045016
Fjob_at_home        -0.038904
Fjob_other          -0.005301
guardian_mother     -0.004415
Pstatus_A            0.000754
nursery_yes          0.028752
Mjob_services        0.038447
Fjob_health          0.039142
reason_home          0.046537
guardian_father      0.051030
famsup_yes           0.059206
activities_yes       0.059791
famrel               0.063361
Mjob_health          0.101244
Fjob_teach

## Train linear regression model on the Maths dataset

In [11]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [12]:
X = df_mat.copy()
X.drop(['failures','G1','G2','G3'], axis=1, inplace=True)
y= df_mat['G3']

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [14]:
model = LinearRegression()
model.fit(X_train.values, y_train.values)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

### Train set MSE

In [15]:
y_train_predict = model.predict(X_train)
mean_squared_error(y_train, y_train_predict)

15.851852921517315

### Test set MSE

In [16]:
y_test_predict = model.predict(X_test)
mean_squared_error(y_test, y_test_predict)

20.167672754237145

## Train decision tree regressor on the Maths dataset

In [17]:
from sklearn.tree import DecisionTreeRegressor

In [18]:
model = DecisionTreeRegressor(min_samples_leaf=10, min_impurity_decrease=0.1, max_features=10, min_samples_split=10, max_depth=12)
model.fit(X_train.values, y_train.values)

DecisionTreeRegressor(criterion='mse', max_depth=12, max_features=10,
           max_leaf_nodes=None, min_impurity_decrease=0.1,
           min_impurity_split=None, min_samples_leaf=10,
           min_samples_split=10, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best')

### Train set MSE

In [19]:
y_train_predict = model.predict(X_train)
mean_squared_error(y_train, y_train_predict)

16.931361506192083

### Test set MSE

In [20]:
y_test_predict = model.predict(X_test)
mean_squared_error(y_test, y_test_predict)

18.202738109813712