In [1]:
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from math import sqrt

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error,r2_score, make_scorer
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.feature_selection import RFECV, SelectFromModel

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

In [2]:
def save_model(model, name = "model.pickle"):
    # Save model into pickle file
    with open(name, 'wb') as dump_file:
        pickle.dump(model, dump_file)

In [3]:
def load_model(name='model.pickle'):
    # Load pickle model
    pickle_in = open(name, 'rb')
    return pickle.load(pickle_in)

In [4]:
# Import data sets
stud = pd.read_csv("student_data.csv")
test = pd.read_csv("test_data.csv")

In [5]:
# Remove index column
stud = stud.iloc[:,1:]
test = test.iloc[:,1:]

In [6]:
def get_guardian(row):
    guardians_list = [row["guardian_father"], row["guardian_mother"],row["guardian_other"]]
    return guardians_list.index(max(guardians_list))

In [7]:
def get_mjob(row):
    jobs_list = [
        row["Mjob_at_home"],
        row["Mjob_health"],
        row["Mjob_other"],
        row["Mjob_services"],
        row["Mjob_teacher"],
    ]
    return jobs_list.index(max(jobs_list))

In [8]:
def get_fjob(row):
    jobs_list = [
        row["Fjob_at_home"],
        row["Fjob_health"],
        row["Fjob_other"],
        row["Fjob_services"],
        row["Fjob_teacher"],
    ]
    return jobs_list.index(max(jobs_list))

In [9]:
# Clean sets
def _clean(df):
    # Merge "yes" "no" columns
    for c in df.columns:
        if "_yes" in c:
            name = c.split("_")[0]
            df[name] = df[c]
            df.drop(columns=[c, f"{name}_no"], axis = 1, inplace = True)
            
    # Merge "school_ms" "school_gp"
    df["school"] = df["school_GP"]
    df.drop(columns=["school_MS", "school_GP"], axis = 1, inplace = True)
    
    #Merge "sex_m" "sex_f"
    df["sex"] = df["sex_M"]
    df.drop(columns=["sex_M", "sex_F"], axis = 1, inplace = True)
    
    #Merge guardian
    df["guardian"] = df.apply(get_guardian, axis=1)
    df.drop(columns=["guardian_father","guardian_mother", "guardian_other"], axis = 1, inplace = True)

    #Merge famsize
    df["famsize"] = df["famsize_GT3"]
    df.drop(columns=["famsize_GT3", "famsize_LE3"], axis=1, inplace= True)
    
    #Merge addrs
    df["address"] = df["address_R"]
    df.drop(columns=["address_R", "address_U"], axis=1, inplace= True)
    
    #Merge Pstatus
    df["pstatus"] = df["Pstatus_A"]
    df.drop(columns=["Pstatus_A", "Pstatus_T"], axis=1, inplace= True)
    
    #Merge jobs
    df["mjob"] = df.apply(get_mjob, axis=1)
    df["fjob"] = df.apply(get_fjob, axis=1)
    df.drop(columns=[
            "Fjob_at_home",
            "Fjob_health",
            "Fjob_other",
            "Fjob_services",
            "Fjob_teacher",
            "Mjob_at_home",
            "Mjob_health",
            "Mjob_other",
            "Mjob_services",
            "Mjob_teacher"], axis=1, inplace=True)

In [10]:
_clean(stud)
_clean(test)

In [11]:
# Separate df in features and labels
X = stud.drop(columns="G3", axis=1)
features = X.columns
y = stud["G3"]

In [13]:
sc = StandardScaler()
X = sc.fit_transform(X)

In [14]:
# Split into training and test
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.25)

In [15]:
# Random forest regressor
rf = RandomForestRegressor(random_state=42)
rf.fit(X_train, y_train)

# Get model score
predicted = rf.predict(X_test)
rmse = sqrt(mean_squared_error(y_test, predicted))
print("RMSE:", rmse)

RMSE: 2.712373781919947


In [17]:
save_model(rf)

In [19]:
pickle_model = load_model()
_test = sc.fit_transform(test)
predictions = pickle_model.predict(_test)

print(predictions)

results_df = pd.DataFrame(data=predictions, columns=["G3"])
results_df.to_csv('submission.csv',index=True, index_label="index")

[13.   13.24 13.38 10.46 13.79 11.5   8.83 11.99 12.98 11.54  8.09 12.96
 12.78 12.67 12.01  7.87 12.71 11.18  7.98  9.53 12.64 13.05 12.73 12.78
 13.09  8.27 11.64 12.58 13.29  9.02 11.5   9.75 11.45 11.28 12.   12.65
 13.54 13.08 11.49 11.85 13.25 13.5  13.69  9.56  8.78 11.73 11.88 10.41
 10.   13.65 11.91 10.14 10.07 11.81 12.8  13.94 11.55 12.95 10.81 12.61
  8.61 13.26 11.91 10.53 13.37  9.38 14.58 14.59  9.6   8.67  8.79  9.33
 12.08  9.48 11.03 12.58 10.21 13.89 13.42 11.26 12.92  7.77 12.02 10.13
  9.78 11.88 13.39  9.91 10.64 13.86  9.77 13.08 12.17 13.33 11.42 14.02
 10.35 12.56  9.43  7.08  8.3  15.03  8.48 11.95]
