In [1]:
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from math import sqrt

from sklearn.feature_selection import SelectKBest, chi2
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error,r2_score, make_scorer
from sklearn.preprocessing import normalize, LabelEncoder

from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LogisticRegression, LinearRegression


import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
def save_model(model, name = "model.pickle"):
    """Pickle `model` and write it to the file at path `name`.

    The file is opened in binary write mode, so an existing file with the
    same name is overwritten.
    """
    with open(name, mode='wb') as sink:
        pickle.Pickler(sink).dump(model)

In [3]:
def load_model(name='model.pickle'):
    """Load and return the object pickled in the file at path `name`.

    Raises whatever `open` or `pickle.load` raise (for example
    FileNotFoundError when the file does not exist).
    """
    # SECURITY: unpickling can execute arbitrary code; only load files that
    # were produced by a trusted source.
    # The context manager closes the handle even if unpickling fails.
    with open(name, 'rb') as pickle_in:
        return pickle.load(pickle_in)

In [4]:
# Load the training data and the held-out test data.
stud = pd.read_csv("student_data.csv")
test = pd.read_csv("test_data.csv")

# Drop the first column of the training frame.
# NOTE(review): presumably a saved row index - confirm against the CSV.
stud = stud.iloc[:, 1:]

# Convert column names to lowercase. Each frame is renamed from itself:
# the test frame was previously overwritten with a copy of the training frame.
stud = stud.rename(columns=str.lower)
test = test.rename(columns=str.lower)

In [5]:
# Collapse every <name>_yes / <name>_no column pair into one <name> column
# that carries the values of the _yes column.
yes_columns = [col for col in stud.columns if "_yes" in col]
for yes_col in yes_columns:
    base = yes_col.split("_")[0]
    stud[base] = stud[yes_col]
    stud = stud.drop(columns=[yes_col, f"{base}_no"])

In [6]:
# Keep the school_ms indicator as the single "school" column (appended last)
# and discard school_gp.
school_flag = stud.pop("school_ms")
stud = stud.drop(columns=["school_gp"])
stud["school"] = school_flag

In [7]:
def get_guardian(row):
    """Return "father", "mother" or "other" for one row.

    The label returned is the one whose guardian_* value in `row` is the
    largest; on ties the earliest label in that order wins.
    """
    candidates = (
        ("father", row["guardian_father"]),
        ("mother", row["guardian_mother"]),
        ("other", row["guardian_other"]),
    )
    best_label, best_value = candidates[0]
    for label, value in candidates[1:]:
        # Strict comparison keeps the first maximum when values tie.
        if value > best_value:
            best_label, best_value = label, value
    return best_label

# Replace the three guardian indicator columns with one label column.
guardian_labels = stud.apply(get_guardian, axis=1)
stud = stud.assign(guardian=guardian_labels).drop(
    columns=["guardian_father", "guardian_mother", "guardian_other"]
)

In [8]:
def get_mjob(row):
    """Return the job label whose mjob_* value in `row` is the largest.

    Labels, in tie-breaking order: "home" (from mjob_at_home), "health",
    "other", "services", "teacher".
    """
    labels = ["home", "health", "other", "services", "teacher"]
    values = [
        row["mjob_at_home"],
        row["mjob_health"],
        row["mjob_other"],
        row["mjob_services"],
        row["mjob_teacher"],
    ]
    # max() over positions returns the first position holding the largest value.
    winner = max(range(len(values)), key=values.__getitem__)
    return labels[winner]

def get_fjob(row):
    """Return the job label whose fjob_* value in `row` is the largest.

    Labels, in tie-breaking order: "home" (from fjob_at_home), "health",
    "other", "services", "teacher".
    """
    labels = ["home", "health", "other", "services", "teacher"]
    values = [
        row["fjob_at_home"],
        row["fjob_health"],
        row["fjob_other"],
        row["fjob_services"],
        row["fjob_teacher"],
    ]
    # max() over positions returns the first position holding the largest value.
    winner = max(range(len(values)), key=values.__getitem__)
    return labels[winner]

# Derive one job label column per parent, mother first, then discard the
# indicator columns the labels were derived from.
for parent, picker in (("mjob", get_mjob), ("fjob", get_fjob)):
    stud[parent] = stud.apply(picker, axis=1)

job_indicator_columns = [
    "fjob_at_home", "fjob_health", "fjob_other", "fjob_services", "fjob_teacher",
    "mjob_at_home", "mjob_health", "mjob_other", "mjob_services", "mjob_teacher",
]
stud = stud.drop(columns=job_indicator_columns)

In [9]:
# Keep the famsize_gt3 indicator as the single "famsize" column; famsize_le3
# is dropped with it.
stud = stud.assign(famsize=stud["famsize_gt3"]).drop(
    columns=["famsize_gt3", "famsize_le3"]
)

In [10]:
# Keep the sex_m indicator as the single "sex" column (appended last) and
# discard sex_f.
sex_flag = stud.pop("sex_m")
stud = stud.drop(columns=["sex_f"])
stud["sex"] = sex_flag

In [11]:
# Keep the address_r indicator as the single "address" column; address_u is
# dropped with it.
stud = stud.assign(address=stud["address_r"]).drop(
    columns=["address_r", "address_u"]
)

In [12]:
# Keep the pstatus_a indicator as the single "pstatus" column (appended last)
# and discard pstatus_t.
pstatus_flag = stud.pop("pstatus_a")
stud = stud.drop(columns=["pstatus_t"])
stud["pstatus"] = pstatus_flag

In [13]:
def get_reason(row):
    """Return the reason label whose reason_* value in `row` is the largest.

    Labels, in tie-breaking order: "course", "home", "other", "reputation".
    """
    labels = ["course", "home", "other", "reputation"]
    values = [
        row["reason_course"],
        row["reason_home"],
        row["reason_other"],
        row["reason_reputation"],
    ]
    # max() over positions returns the first position holding the largest value.
    winner = max(range(len(values)), key=values.__getitem__)
    return labels[winner]

# Replace the four reason indicator columns with one label column.
reason_labels = stud.apply(get_reason, axis=1)
stud = stud.assign(reason=reason_labels).drop(
    columns=["reason_course", "reason_home", "reason_other", "reason_reputation"]
)

In [14]:
# Display the column labels of the reshaped training frame.
stud.columns

Index(['age', 'medu', 'fedu', 'traveltime', 'studytime', 'failures', 'famrel',
       'freetime', 'goout', 'dalc', 'walc', 'health', 'absences', 'g3',
       'schoolsup', 'famsup', 'paid', 'activities', 'nursery', 'higher',
       'internet', 'romantic', 'school', 'guardian', 'mjob', 'fjob', 'famsize',
       'sex', 'address', 'pstatus', 'reason'],
      dtype='object')

In [32]:
# Preview the first rows of the training frame.
# NOTE(review): the recorded output shows numeric codes for guardian/mjob/fjob/
# reason and carries execution count 32, so it appears to have been produced
# after the label-encoding cell - re-run top to bottom to refresh it.
stud.head()

Unnamed: 0,age,medu,fedu,traveltime,studytime,failures,famrel,freetime,goout,dalc,...,romantic,school,guardian,mjob,fjob,famsize,sex,address,pstatus,reason
0,15,4,2,1,1,0,3,5,2,1,...,0,0,1,4,2,0,1,0,0,0
1,18,2,1,2,2,0,4,3,5,1,...,1,0,1,2,2,1,0,1,0,3
2,17,3,3,2,4,1,5,4,5,3,...,0,1,1,3,1,1,1,0,0,0
3,16,3,3,1,3,0,4,3,3,1,...,1,0,1,2,2,1,0,0,0,1
4,17,1,1,2,1,0,4,4,5,1,...,1,1,1,1,2,1,0,1,0,3


In [16]:
# Replace the string labels in each categorical column with integer codes.
# The single encoder is refitted for every column, so after this cell `le`
# only remembers the classes of the last column in the list.
le = LabelEncoder()
categorical = ["reason", "guardian", "fjob", "mjob"]
encoded_columns = {col: le.fit_transform(stud[col]) for col in categorical}
for col, codes in encoded_columns.items():
    stud[col] = codes

In [31]:
# Target is the g3 column; every other column is used as a feature.
y = stud["g3"]
feature_names = [col for col in stud.columns if col != "g3"]
X = stud[feature_names]
X_columns = X.columns

In [25]:
# NOTE(review): with its default arguments normalize() rescales each *row*
# (sample) to unit L2 norm; it does not scale features column-wise. Confirm
# this is the intended preprocessing.
# normalize() returns a bare ndarray, so wrap the result back into a DataFrame
# to keep the column labels that the later feature-selection cell indexes by.
X = pd.DataFrame(normalize(X), columns=X.columns, index=X.index)

In [19]:
def stats(model, features_test, labels_test):
    """Evaluate a fitted `model` on held-out data.

    Returns a 3-tuple: the value of `model.score(features_test, labels_test)`,
    the root mean squared error of the predictions, and their R2 score.
    """
    predictions = model.predict(features_test)
    return (
        model.score(features_test, labels_test),
        sqrt(mean_squared_error(labels_test, predictions)),
        r2_score(labels_test, predictions),
    )

In [26]:
# Fit on 70% of the rows; the remainder is held out so the models can be
# scored on data they were not trained on. The seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.7,
    random_state=42,
)

In [21]:
# Display the shapes of the training and test feature sets as a sanity check.
X_train.shape, X_test.shape

((381, 30), (164, 30))

In [27]:
# Baseline: a random forest with default hyper-parameters.
# The seed is fixed so the scores below, and the feature importances read from
# this model later, are reproducible across kernel restarts.
base_rf = RandomForestRegressor(random_state=42)
base_model = base_rf.fit(X_train, y_train)

print("Score, RMSE, R2Score")
score1 = stats(base_rf, X_test, y_test)
print(score1)

Score, RMSE, R2Score
(0.22192425876598243, 3.0185394681463675, 0.22192425876598243)


In [23]:
# Hyper-parameter grid for the random forest; the seed is part of the grid so
# every candidate is fitted reproducibly.
params = {
    'n_estimators': [200, 500],
    "max_features": ["sqrt", None],
    "random_state": [42]
}

# Candidates are ranked by negated MSE (greater_is_better=False flips the sign
# so that "higher is better" still holds inside the search).
mse = make_scorer(mean_squared_error, greater_is_better=False)
# n_jobs=-1 fits the candidates in parallel on all cores; the serial search was
# slow enough to be interrupted.
clf = GridSearchCV(RandomForestRegressor(), params, scoring=mse, cv=5, n_jobs=-1)
clf.fit(X_train, y_train)

print("Score, RMSE, R2Score")
# Evaluate the refitted best estimator rather than the search object:
# clf.score() would return the negated MSE from `scoring`, which is not
# comparable with the R2-based "Score" printed for the other models.
score2 = stats(clf.best_estimator_, X_test, y_test)
print(score2)
print(f"Best params:\n{clf.best_params_}")

KeyboardInterrupt: 

In [None]:
# Keep the best hyper-parameter combination found by the grid search so a
# later cell can build a model from it.
_params = clf.best_params_

In [None]:
# Pair each importance value from the baseline model with its feature name.
importances = pd.Series(
    data=base_model.feature_importances_,
    index=X_columns,
)
importances

In [None]:
# Number of highest-ranked features to keep.
top = 3
important_features = importances.abs().sort_values(ascending=False).head(top)

plt.style.use('fivethirtyeight')

# One bar per selected feature, positioned at 0..top-1.
x_values = [*range(len(important_features))]
fig, ax = plt.subplots()
ax.bar(x_values, important_features, orientation='vertical')

# Label each bar with its feature name.
ax.set_xticks(x_values)
ax.set_xticklabels(important_features.index, rotation='vertical', fontsize=20)

# Axis labels and title
ax.set_ylabel('Importance')
ax.set_xlabel('Variable')
ax.set_title('Variable Importances');

In [None]:
# Restrict the train/test features to the selected top columns.
# The splits are rebuilt as labelled frames first: if the features went through
# normalize() they are plain ndarrays, which cannot be indexed by column name.
# When they are already DataFrames this only re-selects the same columns.
important_columns = list(important_features.index)
X_important = pd.DataFrame(X_train, columns=X_columns)[important_columns]
X_important_test = pd.DataFrame(X_test, columns=X_columns)[important_columns]

X_important.head()

In [None]:
# Refit a forest on the selected features using every tuned hyper-parameter.
# Unpacking _params also forwards the tuned random_state, which was previously
# dropped and left this model non-reproducible.
rf2 = RandomForestRegressor(**_params)
model_important = rf2.fit(X_important, y_train)

print("Score, RMSE, R2Score")
score3 = stats(model_important, X_important_test, y_test)
print(score3)

In [None]:
# Load the pickled model from the default path ("model.pickle").
# NOTE(review): no cell in this notebook calls save_model(), so the file must
# already exist on disk - confirm where it is produced.
pickle_model = load_model()

# Predict on the test frame loaded at the top of the notebook. It is named
# `test`; `test_df` was never defined and raised a NameError here.
predictions = pickle_model.predict(test[["studytime", "failures", "absences"]])
print(predictions)

# Write the predictions as a G3 column, with the row index in a column
# labelled "index".
results_df = pd.DataFrame(data=predictions, columns=["G3"])
results_df.to_csv('submission.csv', index=True, index_label="index")