### Regression

In [1]:
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set_style('whitegrid')
import statsmodels.api as sm
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsRegressor
from imblearn.over_sampling import RandomOverSampler, SMOTE
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_validate
from sklearn.preprocessing import label_binarize

In [2]:
mat = pd.read_csv("student/student-mat.csv", sep=';')
por = pd.read_csv("student/student-por.csv", sep=';')

In [3]:
mat['subject'] = 'Math'
por['subject'] = 'Portuguese'

In [4]:
df = pd.concat([mat,por])

In [5]:
df.columns = ['school','sex','age','address','family_size','parents_status','mother_education','father_education',
           'mother_job','father_job','reason','guardian','commute_time','study_time','failures','school_support',
          'family_support','paid_classes','activities','nursery','desire_higher_edu','internet','romantic','family_quality',
          'free_time','go_out','weekday_alcohol_usage','weekend_alcohol_usage','health','absences','period1_score','period2_score','final_score',  'subject']

In [6]:
df['final_grade'] = 'na'
df.loc[(df.final_score >= 10) & (df.final_score <= 20), 'final_grade'] = 1 
df.loc[(df.final_score >= 0) & (df.final_score <= 10), 'final_grade'] = 0 
df.head(5)

Unnamed: 0,school,sex,age,address,family_size,parents_status,mother_education,father_education,mother_job,father_job,...,go_out,weekday_alcohol_usage,weekend_alcohol_usage,health,absences,period1_score,period2_score,final_score,subject,final_grade
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,4,1,1,3,6,5,6,6,Math,0
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,3,1,1,3,4,5,5,6,Math,0
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,2,2,3,3,10,7,8,10,Math,0
3,GP,F,15,U,GT3,T,4,2,health,services,...,2,1,1,5,2,15,14,15,Math,1
4,GP,F,16,U,GT3,T,3,3,other,other,...,2,1,2,5,4,6,10,10,Math,0


In [7]:
dfr = df.copy()

In [8]:
dfr.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1044 entries, 0 to 648
Data columns (total 35 columns):
school                   1044 non-null object
sex                      1044 non-null object
age                      1044 non-null int64
address                  1044 non-null object
family_size              1044 non-null object
parents_status           1044 non-null object
mother_education         1044 non-null int64
father_education         1044 non-null int64
mother_job               1044 non-null object
father_job               1044 non-null object
reason                   1044 non-null object
guardian                 1044 non-null object
commute_time             1044 non-null int64
study_time               1044 non-null int64
failures                 1044 non-null int64
school_support           1044 non-null object
family_support           1044 non-null object
paid_classes             1044 non-null object
activities               1044 non-null object
nursery                  1

In [9]:
def preprocessing_data(data):
    bin_dict = {'no': 0, 'yes': 1}
    data['sex'] = data['sex'].map({'F': 0, 'M': 1})
    bin_features = ['school_support', 'family_support', 'paid_classes', 'activities', 'nursery', 'desire_higher_edu', 'internet', 'romantic']
    for feature in bin_features:
        data[feature] = data[feature].map(bin_dict)
    dummies_features = ['school', 'address', 'family_size', 'parents_status', 'mother_job', 'father_job', 'reason', 'guardian', 'subject']
    for feature in dummies_features:
        data = pd.concat((data, pd.get_dummies(data[feature], prefix=feature + '_')),1)
        data = data.drop([feature], axis=1)
    return (data)

In [10]:
dfr = preprocessing_data(dfr)

In [11]:
X = dfr.drop('final_grade',axis=1)
y = dfr.final_grade

In [12]:
columns = np.array(X.columns.values)

In [13]:
sm = SMOTE()
X, y = sm.fit_sample(X, y)

In [14]:
X = pd.DataFrame(X)
X.columns = columns

In [15]:
from sklearn.model_selection import train_test_split
train, test = train_test_split(X,test_size=0.3)

In [16]:
X_train = train.drop('final_score',axis=1)
y_train = train.final_score

X_test = test.drop('final_score',axis=1)
y_test = test.final_score

In [17]:
X_train.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 925 entries, 1299 to 1042
Data columns (total 51 columns):
sex                      925 non-null int64
age                      925 non-null int64
mother_education         925 non-null int64
father_education         925 non-null int64
commute_time             925 non-null int64
study_time               925 non-null int64
failures                 925 non-null int64
school_support           925 non-null int64
family_support           925 non-null int64
paid_classes             925 non-null int64
activities               925 non-null int64
nursery                  925 non-null int64
desire_higher_edu        925 non-null int64
internet                 925 non-null int64
romantic                 925 non-null int64
family_quality           925 non-null int64
free_time                925 non-null int64
go_out                   925 non-null int64
weekday_alcohol_usage    925 non-null int64
weekend_alcohol_usage    925 non-null int64
health     

In [18]:
def train_predict_model(model, X_train, y_train, X_test, y_test):
    #scoring = ['r2', 'neg_mean_squared_error']
    cv = StratifiedKFold(n_splits=5)
    #scores = cross_validate(model, X, y, scoring=scoring, cv=cv, return_train_score=False)
    scores1 = cross_val_score(model, X_train, y_train, cv=cv, scoring='neg_mean_squared_error').mean()
    #ypred = cross_val_predict(model, X, y, cv=cv)    
    #mean_squared_error = metrics.mean_squared_error(y, ypred)
    print ("NMSE, cross_val_score : {:.2f}".format(scores1))
    #print("MSE, cross_val_predict : {:.2f}".format(mean_squared_error))
    #sorted(scores.keys())
    #print("R^2 : {:.2f}".format(scores['test_r2'].mean()))
    #print("MSE : {:.2f}".format(scores['test_neg_mean_squared_error'].mean()))
    model = model.fit(X_train, y_train)
    ypred = model.predict(X_test)
    mean_squared_error = metrics.mean_squared_error(y_test, ypred)
    print("MSE, test : {:.2f}".format(mean_squared_error))

### KNN

In [34]:
reg = KNeighborsRegressor(n_neighbors=3, metric = 'euclidean', weights = 'distance')
train_predict_model(reg, X_train, y_train, X_test, y_test)

NMSE, cross_val_score : -2.48
MSE, test : 2.11




In [20]:
from sklearn.model_selection import GridSearchCV, cross_val_score

knn_params = {'n_neighbors': [3,5,7], 'weights': ['uniform', 'distance'], 'metric': ['euclidean', 'manhattan']}
knn_grid = GridSearchCV(KNeighborsRegressor(), knn_params,cv=5, n_jobs=-1,verbose=True,scoring='neg_mean_squared_error')
knn_grid.fit(X_train, y_train)

print(knn_grid.best_params_)
print(knn_grid.best_score_)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    2.5s
[Parallel(n_jobs=-1)]: Done  45 out of  60 | elapsed:    2.6s remaining:    0.8s


{'metric': 'euclidean', 'n_neighbors': 3, 'weights': 'distance'}
-2.3886453713526814


[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    2.8s finished


### Linear Regression

In [21]:
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
train_predict_model(lr, X_train, y_train, X_test, y_test)

NMSE, cross_val_score : -2.81
MSE, test : 2.95




### Ridge

In [22]:
from sklearn.linear_model import Ridge
ridge = Ridge(alpha=0.1)
train_predict_model(ridge,X_train, y_train, X_test, y_test)

NMSE, cross_val_score : -2.81
MSE, test : 2.94




### Lasso

In [23]:
from sklearn.linear_model import Lasso
lasso = Lasso(alpha=0.1)
train_predict_model(lasso, X_train, y_train, X_test, y_test)



NMSE, cross_val_score : -2.70
MSE, test : 2.89


### Decision tree

In [35]:
from sklearn.tree import DecisionTreeRegressor
tree = DecisionTreeRegressor(max_depth=7, min_samples_leaf = 5)
train_predict_model(tree, X_train, y_train, X_test, y_test)

NMSE, cross_val_score : -2.60
MSE, test : 2.34




In [25]:
from sklearn.model_selection import GridSearchCV, cross_val_score

tree_params = {'max_depth': [3,5,7,9],'min_samples_leaf': [3,5]}

tree_grid = GridSearchCV(DecisionTreeRegressor(), tree_params,cv=5, n_jobs=-1,verbose=True, scoring='neg_mean_squared_error')
tree_grid.fit(X_train, y_train)
print(tree_grid.best_params_)
print(tree_grid.best_score_)

Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


{'max_depth': 7, 'min_samples_leaf': 5}
-2.125162158256766


[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:    0.1s finished


### Random Forest

In [26]:
forest = RandomForestRegressor(n_estimators=500, min_samples_leaf= 3, max_depth=9)
train_predict_model(forest,X_train, y_train, X_test, y_test)



NMSE, cross_val_score : -1.74
MSE, test : 2.10


In [27]:
tree_params = {'max_depth': [3,5,7,9],'min_samples_leaf': [3,5], 'n_estimators': [100, 200, 500]}
grid_search = GridSearchCV(RandomForestRegressor(), tree_params, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)

print(grid_search.best_params_)
print(grid_search.best_score_)

{'max_depth': 9, 'min_samples_leaf': 3, 'n_estimators': 500}
-1.7424466506884164


### Xgboost

In [28]:
!pip install xgboost



In [29]:
import xgboost
xgb = xgboost.XGBRegressor(n_estimators=100, learning_rate=0.2, max_depth=3)
train_predict_model(xgb,X_train, y_train, X_test, y_test)

  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \


NMSE, cross_val_score : -2.06
MSE, test : 1.99


  if getattr(data, 'base', None) is not None and \
  data.base is not None and isinstance(data, np.ndarray) \


In [30]:
xgb_params = {'max_depth': [3,5,7,9],'learning_rate': [0.001, 0.01, 0.1, 0.2], 'n_estimators': [100, 200, 500]}
grid_search = GridSearchCV(xgboost.XGBRegressor(), xgb_params, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(X, y)

print(grid_search.best_params_)
print(grid_search.best_score_)

{'learning_rate': 0.2, 'max_depth': 3, 'n_estimators': 100}
-8.961651639296197e-10


In [31]:
X_train.columns

Index(['sex', 'age', 'mother_education', 'father_education', 'commute_time',
       'study_time', 'failures', 'school_support', 'family_support',
       'paid_classes', 'activities', 'nursery', 'desire_higher_edu',
       'internet', 'romantic', 'family_quality', 'free_time', 'go_out',
       'weekday_alcohol_usage', 'weekend_alcohol_usage', 'health', 'absences',
       'period1_score', 'period2_score', 'school__GP', 'school__MS',
       'address__R', 'address__U', 'family_size__GT3', 'family_size__LE3',
       'parents_status__A', 'parents_status__T', 'mother_job__at_home',
       'mother_job__health', 'mother_job__other', 'mother_job__services',
       'mother_job__teacher', 'father_job__at_home', 'father_job__health',
       'father_job__other', 'father_job__services', 'father_job__teacher',
       'reason__course', 'reason__home', 'reason__other', 'reason__reputation',
       'guardian__father', 'guardian__mother', 'guardian__other',
       'subject__Math', 'subject__Portuguese'],


In [32]:
fi = pd.DataFrame({'feature': X_train.columns, 'importance': xgb.feature_importances_}).sort_values(by='importance', ascending=False)

In [33]:
fi

Unnamed: 0,feature,importance
23,period2_score,0.54858
49,subject__Math,0.073322
21,absences,0.067445
43,reason__home,0.03718
22,period1_score,0.028231
50,subject__Portuguese,0.02295
42,reason__course,0.015914
8,family_support,0.013626
6,failures,0.010953
14,romantic,0.008427
