In [1]:
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from math import sqrt

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error,r2_score, make_scorer
from sklearn.preprocessing import LabelEncoder, StandardScaler

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

In [2]:
def save_model(model, name = "model.pickle"):
    # Save model into pickle file
    with open(name, 'wb') as dump_file:
        pickle.dump(model, dump_file)

In [3]:
def load_model(name='model.pickle'):
    # Load pickle model
    pickle_in = open(name, 'rb')
    return pickle.load(pickle_in)

In [4]:
# Import data sets
stud = pd.read_csv("student_data.csv")
test = pd.read_csv("test_data.csv")

In [5]:
# Remove index column
stud = stud.iloc[:,1:]
test = test.iloc[:,1:]

In [6]:
# Clean sets
def _clean(df):
    # Merge "yes" "no" columns
    for c in df.columns:
        if "_yes" in c:
            name = c.split("_")[0]
            df[name] = df[c]
            df.drop(columns=[c, f"{name}_no"], axis = 1, inplace = True)
            
    # Merge "school_ms" "school_gp"
    df["school"] = df["school_GP"]
    df.drop(columns=["school_MS", "school_GP"], axis = 1, inplace = True)
    
    #Merge "sex_m" "sex_f"
    df["sex"] = df["sex_M"]
    df.drop(columns=["sex_M", "sex_F"], axis = 1, inplace = True)
    
    #Merge

In [7]:
_clean(stud)
_clean(test)

In [8]:
stud.columns

Index(['age', 'Medu', 'Fedu', 'traveltime', 'studytime', 'failures', 'famrel',
       'freetime', 'goout', 'Dalc', 'Walc', 'health', 'absences', 'G3',
       'address_R', 'address_U', 'famsize_GT3', 'famsize_LE3', 'Pstatus_A',
       'Pstatus_T', 'Mjob_at_home', 'Mjob_health', 'Mjob_other',
       'Mjob_services', 'Mjob_teacher', 'Fjob_at_home', 'Fjob_health',
       'Fjob_other', 'Fjob_services', 'Fjob_teacher', 'reason_course',
       'reason_home', 'reason_other', 'reason_reputation', 'guardian_father',
       'guardian_mother', 'guardian_other', 'schoolsup', 'famsup', 'paid',
       'activities', 'nursery', 'higher', 'internet', 'romantic', 'school',
       'sex'],
      dtype='object')

In [9]:
# Separate df in features and labels
X = stud.drop(columns="G3", axis=1)
y = stud["G3"]

In [10]:
sc = StandardScaler()
X = sc.fit_transform(X)

In [15]:
# Split into training and test
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.3)

In [22]:
# Random forest regressor
rr = RandomForestRegressor()
rr.fit(X_train, y_train)

# Get model score
predicted = rr.predict(X_test)
rmse = sqrt(mean_squared_error(y_test, predicted))
print("RMSE:", rmse)

RMSE: 2.7526187697796294


In [23]:
save_model(rr)

In [24]:
pickle_model = load_model()
_test = sc.fit_transform(test)
predictions = pickle_model.predict(_test)

print(predictions)

results_df = pd.DataFrame(data=predictions, columns=["G3"])
results_df.to_csv('submission.csv',index=True, index_label="index")

[13.03 13.88 13.4  10.86 14.15 12.16  9.07 12.25 13.92 12.31  8.63 12.98
 12.87 12.71 12.16  7.72 13.62 11.07  7.56  8.81 12.94 13.18 12.83 12.93
 12.52  8.15 11.48 12.25 13.86  8.98 11.3   9.59 11.8  10.   12.68 12.52
 13.77 13.13 11.14 11.81 13.21 13.2  13.52  9.64  9.   11.6  12.15 10.96
 10.29 13.3  11.65 10.05 10.26 11.62 12.4  14.21 11.11 12.51 10.25 12.37
  8.57 12.66 12.53 10.52 13.33  9.36 14.37 14.17  9.63  8.32  7.89  9.55
 12.57  9.06 11.8  12.68 10.66 13.4  13.39 12.16 12.98  6.96 11.93  9.73
  9.71 11.87 13.12 10.15  9.04 13.78  9.75 13.81 12.34 13.8  11.64 13.75
 10.64 12.6   9.16  7.68  8.69 14.46  7.85 12.82]
