In [4]:
import pandas as pd
import numpy as np
import category_encoders as ce
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, LabelEncoder
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.model_selection import train_test_split
from sklearn import model_selection, preprocessing
from pyearth import Earth

import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from sklearn.metrics._plot.confusion_matrix import confusion_matrix
from sklearn.metrics import accuracy_score, precision_score, r2_score, mean_squared_error
import matplotlib.pyplot as plt
from sklearn.metrics import plot_confusion_matrix

In [5]:
# Read the raw survey export straight from the project's GitHub repository.
# Requires network access; `df` is the untouched input for the encoding steps below.
url = 'https://raw.githubusercontent.com/vea-therese/foreshift/main/data-new-with-course-labeling.csv'
df = pd.read_csv(url)

In [6]:
# Four-point agreement scale shared by most survey items; a missing answer maps to 0.
LIKERT_SCALE = {np.nan: 0, 'Strongly Disagree': 1, 'Disagree': 2, 'Agree': 3, 'Strongly Agree': 4}

# Items that are not answered on the agreement scale get their own mapping.
# Course_Environment5 deliberately mirrors the original: it has no entry for missing values.
# NOTE(review): the displayed feature table contains -1.0 for Personal_Assessment4,
# which suggests some answers in the data are not covered by this mapping -- confirm.
SPECIAL_SCALES = {
    'Decision_Making5': {
        np.nan: 0, 'Myself': 1, 'Family': 2, 'Friends and/or boyfriend/girlfriend': 3,
        'Institution': 4, 'Teachers': 5, 'Others': 6,
    },
    'Personal_Assessment4': {
        np.nan: 0, 'Myself': 1, 'Family': 2, 'Friends and/or boyfriend/girlfriend': 3,
        'Institution': 4, 'Teachers': 5, 'Professionals': 6, 'Socioeconomic': 7, 'Others': 8,
    },
    'Course_Environment5': {'10-30': 1, '31-50': 2, '51-70': 3, '70+': 4},
}

# Each survey section contributes items numbered 1 through 5, in this order.
SURVEY_SECTIONS = [
    'Decision_Making',
    'Personal_Assessment',
    'Course_Environment',
    'Course_Satisfaction',
    'Academic_Experience',
]

# One {'col': ..., 'mapping': ...} entry per encoded column.
# Every entry receives its own copy of the scale so no mapping dict is shared.
dictionary = []
for section in SURVEY_SECTIONS:
    for item_number in range(1, 6):
        column = section + str(item_number)
        scale = SPECIAL_SCALES.get(column, LIKERT_SCALE)
        dictionary.append({'col': column, 'mapping': dict(scale)})

# Non-survey columns follow the 25 survey items.
dictionary.append({
    'col': 'Year',
    'mapping': {'First Year': 1, 'Second Year': 2, 'Third Year': 3, 'Fourth Year': 4, 'Fifth Year': 5},
})
for flag_column in ('Same_Course', 'Different_Course'):
    dictionary.append({'col': flag_column, 'mapping': {'Yes': 1, 'No': 0}})

In [7]:
# Columns handed to the ordinal encoder, kept in the original order.
# Each of them has a matching entry in `dictionary`.
ordinal_columns = [
    'Decision_Making1', 'Decision_Making2', 'Decision_Making3', 'Decision_Making4',
    'Personal_Assessment1', 'Personal_Assessment2', 'Personal_Assessment3', 'Personal_Assessment5',
    'Course_Environment1', 'Course_Environment2', 'Course_Environment3', 'Course_Environment4',
    'Course_Environment5',
    'Course_Satisfaction1', 'Course_Satisfaction2', 'Course_Satisfaction3', 'Course_Satisfaction4',
    'Course_Satisfaction5',
    'Academic_Experience1', 'Academic_Experience2', 'Academic_Experience3', 'Academic_Experience4',
    'Academic_Experience5',
    'Year', 'Same_Course', 'Different_Course',
    'Decision_Making5', 'Personal_Assessment4',
]

# The explicit `mapping` fixes the integer code of every answer instead of
# letting the encoder assign codes from the data.
encoder = ce.OrdinalEncoder(cols=ordinal_columns, mapping=dictionary)

In [8]:
# Apply the explicit ordinal mappings to the survey columns.
df_new = encoder.fit_transform(df)

# Columns that get integer codes from LabelEncoder, in the order they are encoded.
# 'Year' was already mapped by the ordinal encoder above; it is re-encoded here
# exactly as before so the resulting codes do not change.
label_encoded_columns = [
    'Sex',
    'Current_Course',
    'Year',
    'SHS_Strand',
    'First_Choice',
    'Second_Choice',
    'Third_Choice',
]

# Fit a separate LabelEncoder per column and keep each one. Refitting a single
# encoder would discard the class list of every column but the last, so the
# codes of the other columns could never be inverse-transformed.
label_encoders = {}
for column in label_encoded_columns:
    lab = LabelEncoder()
    df_new[column] = lab.fit_transform(df_new[column])
    label_encoders[column] = lab
# `lab` ends up bound to the 'Third_Choice' encoder, as it was before this change.

In [9]:
# Persist the encoded frame (the index is written as the first CSV column).
df_new.to_csv('cleaned-data.csv')

# Only the last expression of a cell is rendered, so the frame is displayed
# after the write rather than before it.
df_new

Unnamed: 0,Sex,Year,Current_Course,SHS_Strand,First_Choice,Second_Choice,Third_Choice,Decision_Making1,Decision_Making2,Decision_Making3,...,Course_Satisfaction3,Course_Satisfaction4,Course_Satisfaction5,Academic_Experience1,Academic_Experience2,Academic_Experience3,Academic_Experience4,Academic_Experience5,Same_Course,Different_Course
0,1,1,13,4,23,14,30,4,4,4,...,3,3,3,4,4,4,4,4,1,0
1,1,2,13,0,23,16,15,3,3,3,...,2,3,3,3,2,2,3,3,1,0
2,1,2,3,4,8,6,25,4,4,4,...,3,2,2,3,2,3,2,1,0,1
3,0,2,5,0,28,8,25,4,4,3,...,3,3,3,3,2,2,3,3,1,0
4,0,2,6,3,12,28,32,3,4,4,...,2,2,3,3,1,1,3,3,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1597,0,2,16,4,5,7,10,4,4,4,...,3,2,2,3,3,3,3,1,1,0
1598,0,2,6,0,5,12,8,3,4,4,...,1,4,2,3,1,2,4,2,0,1
1599,0,1,17,3,32,5,38,3,2,2,...,2,2,3,4,3,3,4,2,1,0
1600,1,1,13,4,23,14,0,3,3,4,...,3,4,2,1,3,3,4,3,0,1


In [20]:
# Positional columns 7..31 are the 25 survey items; per the displayed frame they
# run from Decision_Making1 through Academic_Experience5.
feature_positions = list(range(7, 32))
X = df_new.iloc[:, feature_positions]

# Target is the column at position 33.
# NOTE(review): presumably Different_Course (last column shown) -- confirm.
y = df_new.iloc[:, 33]

X

Unnamed: 0,Decision_Making1,Decision_Making2,Decision_Making3,Decision_Making4,Decision_Making5,Personal_Assessment1,Personal_Assessment2,Personal_Assessment3,Personal_Assessment4,Personal_Assessment5,...,Course_Satisfaction1,Course_Satisfaction2,Course_Satisfaction3,Course_Satisfaction4,Course_Satisfaction5,Academic_Experience1,Academic_Experience2,Academic_Experience3,Academic_Experience4,Academic_Experience5
0,4,4,4,4,1,4,4,4,3.0,4,...,3,4,3,3,3,4,4,4,4,4
1,3,3,3,3,1,3,3,3,3.0,3,...,3,3,2,3,3,3,2,2,3,3
2,4,4,4,3,1,2,3,2,3.0,4,...,3,4,3,2,2,3,2,3,2,1
3,4,4,3,4,1,4,3,3,3.0,3,...,3,3,3,3,3,3,2,2,3,3
4,3,4,4,4,1,3,3,3,3.0,2,...,3,2,2,2,3,3,1,1,3,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1597,4,4,4,4,1,3,3,2,-1.0,3,...,3,3,3,2,2,3,3,3,3,1
1598,3,4,4,4,2,1,1,1,2.0,2,...,1,2,1,4,2,3,1,2,4,2
1599,3,2,2,3,1,3,3,3,6.0,3,...,4,3,2,2,3,4,3,3,4,2
1600,3,3,4,4,1,3,3,2,2.0,2,...,1,2,3,4,2,1,3,3,4,3


In [11]:
# Hold out 25% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=3,
)


In [12]:
# py-earth estimator created with its default settings (no arguments are passed).
model = Earth()

In [13]:
# Fit on the training split only. `MARS` holds whatever `fit` returns --
# presumably the fitted estimator itself, as with scikit-learn style APIs.
MARS = model.fit(X_train, y_train)

  pruning_passer.run()
  coef, resid = np.linalg.lstsq(B, weighted_y[:, i])[0:2]


In [14]:
# Continuous model outputs for the held-out rows; later cells turn them into labels.
predictions = MARS.predict(X_test)

# Text report of the fitted basis functions and fit statistics.
model_summary = MARS.summary()
print(model_summary)

Earth Model
-------------------------------------------
Basis Function        Pruned  Coefficient  
-------------------------------------------
(Intercept)           No      0.760443     
Personal_Assessment5  No      -0.175112    
Personal_Assessment2  No      -0.118486    
Course_Satisfaction5  No      0.0681392    
Academic_Experience5  No      0.0382991    
Decision_Making5      No      0.074579     
Decision_Making1      No      0.051661     
Academic_Experience4  Yes     None         
Course_Satisfaction3  Yes     None         
Personal_Assessment3  Yes     None         
Course_Satisfaction4  Yes     None         
Course_Environment3   Yes     None         
Personal_Assessment1  Yes     None         
Course_Satisfaction1  Yes     None         
Decision_Making4      Yes     None         
-------------------------------------------
MSE: 0.1880, GCV: 0.1931, RSQ: 0.2132, GRSQ: 0.1932


In [15]:
# The model outputs a continuous score, so turn it into a 0/1 label with an
# explicit 0.5 threshold. Plain rounding would emit labels such as -1 or 2 for
# scores outside [-0.5, 1.5); inside that interval both approaches agree
# (a score of exactly 0.5 maps to 0 either way).
accuracy_score(y_test, (predictions > 0.5).astype(int))

0.7456359102244389

In [16]:
# Threshold the continuous score at 0.5 to obtain a strict 0/1 label. Rounding
# could yield -1 or 2 for out-of-range scores, which a binary precision
# computation cannot handle; within [-0.5, 1.5) the labels are unchanged.
precision_score(y_test, (predictions > 0.5).astype(int))

0.7522123893805309

In [17]:
# Threshold at 0.5 so the matrix always has exactly the two classes 0 and 1.
# Rounding an out-of-range score would introduce a stray -1 or 2 label and
# silently grow the matrix; within [-0.5, 1.5) the labels are unchanged.
cm = confusion_matrix(y_test, (predictions > 0.5).astype(int))
cm

array([[214,  28],
       [ 74,  85]], dtype=int64)

In [18]:
# Persist the fitted model together with the feature matrix and target.
# NOTE(review): the files are named "*_train" but receive the full X / y, not
# the X_train / y_train split the model was fitted on -- confirm this is intended.
artifacts = (
    (MARS, 'mars_model.pickle'),
    (X, 'X_train.pickle'),
    (y, 'y_train.pickle'),
)
for artifact, target_path in artifacts:
    pd.to_pickle(artifact, target_path)


In [21]:
# One hand-written respondent: 25 encoded answers, one per feature column of X.
# The grouping below assumes the column order shown for X (five items per section).
row = [[
    4, 4, 4, 4, 1,  # Decision_Making1-5
    4, 4, 4, 1, 4,  # Personal_Assessment1-5
    4, 4, 4, 4, 2,  # Course_Environment1-5
    4, 4, 4, 4, 4,  # Course_Satisfaction1-5
    4, 4, 4, 4, 4,  # Academic_Experience1-5
]]

# Raw (unthresholded) model output for that single row.
pred = MARS.predict(row)
print('Prediction: %f' % pred.item(0))


Prediction: 0.293028
