In [1]:
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from math import sqrt

from sklearn.feature_selection import SelectKBest, chi2
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error,r2_score, make_scorer
from sklearn.preprocessing import normalize, LabelEncoder

from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LogisticRegression, LinearRegression


import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
def save_model(model, name = "model.pickle"):
    """Serialize `model` with pickle and write it to the file `name`.

    The file is opened in binary write mode, so an existing file with the
    same name is overwritten. Returns None.
    """
    with open(name, mode='wb') as sink:
        pickle.dump(model, file=sink)

In [3]:
def load_model(name='model.pickle'):
    """Load and return the object pickled in the file `name`.

    The file is opened inside a `with` block so the handle is closed as soon
    as unpickling finishes (or fails), instead of being left open until the
    garbage collector happens to reclaim it.

    NOTE: `pickle.load` can execute arbitrary code embedded in the file.
    Only load pickle files that were produced by a trusted source.
    """
    with open(name, 'rb') as pickle_in:
        return pickle.load(pickle_in)

In [4]:
# Read both CSV files, discard the first column of each (iloc[:, 1:] —
# presumably a saved row index; verify against the files) and lower-case
# the remaining column names so later cells can refer to e.g. "g3".
stud = pd.read_csv("student_data.csv").iloc[:, 1:].rename(columns=str.lower)
test = pd.read_csv("test_data.csv").iloc[:, 1:].rename(columns=str.lower)

In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) of the training frame's columns.
stud.describe()

Unnamed: 0,age,medu,fedu,traveltime,studytime,failures,famrel,freetime,goout,dalc,...,activities_no,activities_yes,nursery_no,nursery_yes,higher_no,higher_yes,internet_no,internet_yes,romantic_no,romantic_yes
count,545.0,545.0,545.0,545.0,545.0,545.0,545.0,545.0,545.0,545.0,...,545.0,545.0,545.0,545.0,545.0,545.0,545.0,545.0,545.0,545.0
mean,16.719266,2.513761,2.297248,1.563303,1.946789,0.192661,3.941284,3.163303,3.198165,1.506422,...,0.493578,0.506422,0.192661,0.807339,0.106422,0.893578,0.222018,0.777982,0.616514,0.383486
std,1.194386,1.118155,1.07772,0.752528,0.839574,0.553682,0.947249,1.050618,1.160469,0.929725,...,0.500418,0.500418,0.394751,0.394751,0.30866,0.30866,0.415985,0.415985,0.486682,0.486682
min,15.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,16.0,2.0,1.0,1.0,1.0,0.0,4.0,3.0,2.0,1.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
50%,17.0,2.0,2.0,1.0,2.0,0.0,4.0,3.0,3.0,1.0,...,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0
75%,18.0,4.0,3.0,2.0,2.0,0.0,5.0,4.0,4.0,2.0,...,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0
max,21.0,4.0,4.0,4.0,4.0,3.0,5.0,5.0,5.0,5.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [6]:
# Same summary for the test frame, to compare its value ranges with the training frame.
test.describe()

Unnamed: 0,age,medu,fedu,traveltime,studytime,failures,famrel,freetime,goout,dalc,...,activities_no,activities_yes,nursery_no,nursery_yes,higher_no,higher_yes,internet_no,internet_yes,romantic_no,romantic_yes
count,104.0,104.0,104.0,104.0,104.0,104.0,104.0,104.0,104.0,104.0,...,104.0,104.0,104.0,104.0,104.0,104.0,104.0,104.0,104.0,104.0
mean,16.875,2.519231,2.355769,1.596154,1.846154,0.375,3.875,3.269231,3.115385,1.480769,...,0.625,0.375,0.221154,0.778846,0.105769,0.894231,0.288462,0.711538,0.711538,0.288462
std,1.334294,1.222609,1.214142,0.730995,0.772956,0.752826,1.001819,1.054171,1.256499,0.902862,...,0.486467,0.486467,0.417034,0.417034,0.309031,0.309031,0.455241,0.455241,0.455241,0.455241
min,15.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,16.0,1.0,1.0,1.0,1.0,0.0,3.0,3.0,2.0,1.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
50%,17.0,3.0,2.0,1.0,2.0,0.0,4.0,3.0,3.0,1.0,...,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0
75%,18.0,4.0,4.0,2.0,2.0,0.0,5.0,4.0,4.0,2.0,...,1.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0
max,22.0,4.0,4.0,4.0,4.0,3.0,5.0,5.0,5.0,5.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [7]:
def encode(df):
    for c in df.columns:
        if c != "g3":
            le = LabelEncoder()
            df[c] =le.fit_transform(df[c])

In [8]:
encode(stud)
encode(test)

In [9]:
# Target: the "g3" column. Features: every other column of the training frame.
# (A three-column subset — studytime, absences, failures — was an earlier
# alternative feature set.)
y = stud["g3"]
X = stud.drop(columns="g3")
X_columns = X.columns

In [10]:
def stats(model, features_test, labels_test):
    """Evaluate a fitted model on a held-out set.

    Returns a 3-tuple ``(score, rmse, r2)``:
    - score: whatever ``model.score(features_test, labels_test)`` returns,
    - rmse:  square root of the mean squared error between the labels and
             the model's predictions,
    - r2:    ``r2_score`` of the labels against the predictions.
    """
    y_hat = model.predict(features_test)
    return (
        model.score(features_test, labels_test),
        sqrt(mean_squared_error(labels_test, y_hat)),
        r2_score(labels_test, y_hat),
    )

In [11]:
# Hold out part of the labelled data so the model can be scored on rows it
# was not fitted on: 60% of the rows go to training, the rest to evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.6,
    random_state=0,  # fixed seed: the same split on every run
)

In [12]:
def train_model(t_rmse, max_attempts=100):
    """Refit a grid-searched random forest until its hold-out RMSE reaches `t_rmse`.

    Every attempt fits a GridSearchCV over an unseeded RandomForestRegressor
    on the notebook-level ``X_train`` / ``y_train`` and measures the RMSE of
    its predictions on ``X_test`` / ``y_test``. Because the forest has no
    fixed random_state, repeated fits give different models. Whenever an
    attempt improves on the best RMSE so far, the fitted search object is
    pickled with ``save_model`` and the RMSE is printed.

    Parameters
    ----------
    t_rmse : float
        Target RMSE; retraining stops once the best RMSE is <= this value.
    max_attempts : int or None, default 100
        Upper bound on the number of fits. Without a bound the loop never
        terminates when the target is unreachable. Pass None to retry
        indefinitely.

    Returns
    -------
    None

    NOTE(review): the model is selected by its score on X_test, so the
    printed RMSE is an optimistic estimate of performance on unseen data.
    """
    params = {
        "n_estimators": [5, 7],
        "max_depth": [5, 10],
    }
    # Negated MSE: GridSearchCV maximizes its scoring function.
    mse = make_scorer(mean_squared_error, greater_is_better=False)
    rf = GridSearchCV(RandomForestRegressor(), params, scoring=mse)

    # inf instead of a magic 1000 so the first fit always counts as an improvement.
    last_rmse = float("inf")
    attempts = 0
    while last_rmse > t_rmse:
        if max_attempts is not None and attempts >= max_attempts:
            print(
                f"Stopped after {attempts} attempts: best RMSE {last_rmse} "
                f"is still above the target {t_rmse}"
            )
            break
        attempts += 1

        model = rf.fit(X_train, y_train)
        predicted = model.predict(X_test)
        rmse = sqrt(mean_squared_error(y_test, predicted))
        if rmse < last_rmse:
            last_rmse = rmse
            save_model(model)
            print(rmse)

In [14]:
# Train with a target hold-out RMSE of 3 (passed as t_rmse); each improved RMSE is printed below.
train_model(3)

2.792610334031667


In [15]:
# Reload the model most recently written by save_model().
# NOTE: unpickling can execute arbitrary code — only load a model.pickle
# that was produced by this notebook.
pickle_model = load_model()

# Select the training feature columns explicitly so the frame handed to
# predict() has the same columns, in the same order, as the frame the model
# was fitted on, instead of relying on `test` happening to match.
predictions = pickle_model.predict(test[X_columns])

print(predictions.shape)
print(predictions)

# One predicted grade per test row, written with the row number as "index".
results_df = pd.DataFrame(data=predictions, columns=["G3"])
results_df.to_csv('submission.csv',index=True, index_label="index")

(104,)
[13.80628956 13.51246829 12.75319833 11.25811434 13.77518014 13.31452601
  7.04199735 13.38235248 14.44830348 13.68367367  6.7970994  13.00890816
 13.20551805 14.35679701 12.96428947  6.9973545  13.95707767 13.29197422
  5.68102797  5.39893013 13.04416248 13.73894308 12.82235797 13.66168011
 13.19495059  9.36342593 13.02100474 14.13212084 13.93087673  6.73247354
 11.70795312  8.66190476 12.67749475 12.83521658 12.95600183 12.97314608
 11.93199601 12.96428947 12.15270607 12.2712728  12.73815598 13.93087673
 13.46043601  7.4         8.7614418  12.5312939  12.51282276 11.73299757
  9.85449735 12.52780446 10.85470729 11.18951717  9.21390779 13.53547841
 13.19495059 13.25383317 12.05693906 12.78710365 11.12826618 13.71838267
  7.77414021 12.49145233 12.55893233 12.14901724 13.91092051  9.30514706
 14.44830348 13.47408213  7.65964441  7.0730042   5.32771164  7.88941799
 12.65321851  9.4388369  10.83847148 13.93087673 12.18900171 13.47563768
 13.17439018 11.28765596 13.19495059  6.6444