In [None]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier

# Reads the student-dropout data from ../data/student_dropout_test.csv
# NOTE(review): the file name suggests a test/alternate extract rather than the pruned
# student_dropout.csv - confirm this is the intended input
raw_data = pd.read_csv('../data/student_dropout_test.csv')

# Sets up variables to hold the X, y and the names of columns for the variables in X
# ('Target' is the label; every remaining column is used as a feature)
y = raw_data['Target']
raw_data = raw_data.drop(columns=['Target'])
column_names = raw_data.columns
X = raw_data.loc[:, column_names]

# One seed shared by the split and the forest so that the score and the feature
# importances are reproducible across kernel restarts
RANDOM_STATE = 42

# Creates the train test split for the data using a 75:25 split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=RANDOM_STATE)

# Fits a random forest with default hyperparameters; without random_state the forest
# is re-randomised on every run and the reported numbers drift
model = RandomForestClassifier(random_state=RANDOM_STATE)
model.fit(X_train, y_train)

In [44]:
# Accuracy of the fitted model on the held-out test split
model.score(X_test, y_test)

0.7719869706840391

In [45]:
# Map every feature name to the importance reported by the fitted model,
# then rebuild the mapping ordered from most to least important.
importances = dict(zip(column_names, model.feature_importances_))
importances = dict(sorted(importances.items(), key=lambda pair: pair[1], reverse=True))

In [46]:
# Displays the feature importances, highest first
importances

{'Curricular units 2nd sem (approved)': 0.22899330471373655,
 'Curricular units 2nd sem (grade)': 0.13325205088657407,
 'Curricular units 2nd sem (evaluations)': 0.07609042036443407,
 'Admission grade': 0.05399090707706609,
 'Previous qualification (grade)': 0.05215982444685561,
 'Age at enrollment': 0.0509391127089939,
 'Course': 0.042455192579119394,
 'Tuition fees up to date': 0.04202527527328145,
 "Mother's occupation": 0.03727827996157894,
 'Unemployment rate': 0.030980230435906354,
 "Father's qualification": 0.030350581113845275,
 'Application mode': 0.030024155576527514,
 'GDP': 0.028977390217080377,
 'Curricular units 2nd sem (enrolled)': 0.028813826371469137,
 "Mother's qualification": 0.028354910461369686,
 'Application order': 0.021912787742029812,
 'Scholarship holder': 0.02132307373444991,
 'Inflation rate': 0.017464303723474538,
 'Gender': 0.015169616865340454,
 'Debtor': 0.011491323933662988,
 'Displaced': 0.010691673306836054,
 'Previous qualification': 0.00726175850636}

In [47]:
# Predict labels for the held-out rows and measure how many match the true labels.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"\nAccuracy: {accuracy:.4f}")


Accuracy: 0.7720


In [48]:
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Reads data stored in the student_dropout.csv file which contains our pruned data set
raw_data = pd.read_csv('student_dropout.csv')

# Sets up variables to hold the X, y and the names of columns for the variables in X
# ('Target' is the label; every remaining column is used as a feature)
y = raw_data['Target']
raw_data = raw_data.drop(columns=['Target'])
column_names = raw_data.columns
X = raw_data.loc[:, column_names]

# One seed shared by the split and the forest so that the saved model, its score and
# its feature importances are reproducible across kernel restarts
RANDOM_STATE = 42

# Creates the train test split for the data using a 75:25 split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=RANDOM_STATE)

# Fits a random forest with default hyperparameters; without random_state the forest
# is re-randomised on every run and the persisted model changes each time
model = RandomForestClassifier(random_state=RANDOM_STATE)
model.fit(X_train, y_train)

# Persists the fitted model. The target directory is created first so the cell also
# runs on a fresh checkout where models/ does not exist yet.
model_path = 'models/random_tree_model.pkl'
Path(model_path).parent.mkdir(parents=True, exist_ok=True)
joblib.dump(model, model_path)

['models/random_tree_model.pkl']

In [49]:
# Scores the fitted model on the held-out test split
model.score(X_test, y_test)

0.7328990228013029

In [50]:
# Map every feature name to the importance reported by the fitted model,
# then rebuild the mapping ordered from most to least important.
importances = dict(zip(column_names, model.feature_importances_))
importances = dict(sorted(importances.items(), key=lambda pair: pair[1], reverse=True))

In [51]:
# Displays the feature importances, highest first
importances

{'Curricular units 2nd sem (approved)': 0.29267470663892986,
 'Curricular units 2nd sem (grade)': 0.18794375308828917,
 'Curricular units 2nd sem (evaluations)': 0.1439877401399991,
 'Age at enrollment': 0.13968597596963866,
 'Unemployment rate': 0.12076864355922673,
 'Tuition fees up to date': 0.06177839678336267,
 'Curricular units 2nd sem (enrolled)': 0.05316078382055386}