In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
pd.options.mode.chained_assignment = None
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import KFold
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
import warnings
warnings.filterwarnings('ignore')

In [2]:
#I have used a free api to get the gender information for every name
#there were 25 names where the api couldn't guess the gender, I completed those values myself
firstName_gender = pd.read_csv('all_names_gender.csv')
firstName_gender.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,name,ga_first_name,ga_gender,ga_accuracy,ga_samples
0,0,0,abel,Abel,male,98.0,11473.0
1,1,1,achim,Achim,male,99.0,2035.0
2,2,2,ada,Ada,female,85.0,10634.0
3,3,3,adalbert,Adalbert,male,96.0,220.0
4,4,4,adam,Adam,male,99.0,101742.0


In [3]:
#same feature engineering except that a gender column is added to our dataset

df = pd.read_csv('note.csv')
df = df.reset_index()


Y = df[['rezultatul final']]

X = df[['nume','media teze nationale', 'nota la limba romana','judet8','scoala','media de absolvire','nota la matematica',
       'optional8 nota','limba materna8','promotie anterioara','specializare','limba moderna','disciplina profil','optional'
       ,'unitatea de invatamant', 'forma invatamant','judetul']]



def firstname(name):
    fullname_list = name.split(" ")
    return fullname_list[len(fullname_list)-1]

X['nume'] = X['nume'].apply(firstname)


#add gender 
genders = []

for ind, name in enumerate(X['nume']):
    all_first_names = name.split('-')
    male = 0
    female = 0
    
    for first in all_first_names:
        if not firstName_gender[firstName_gender['name'] == first]['ga_gender'].empty:
            gender = firstName_gender[firstName_gender['name'] == first]['ga_gender'].values[0]
            if gender == 'male':
                male +=1
            else:
                female +=1
        
    if male > female:
        genders.append(0)
    else:
        genders.append(1)
        

names = X['nume'].str.get_dummies(sep='-').add_prefix('nume_')
#names.head()

X = pd.concat([X, names], axis = 1)

#drop the old name column
X = X.drop('nume', axis=1)


In [4]:
#scale, normalize data so it can be used for several algorithms

number_columns = []
for col in X.columns:
    if X[col].dtype != 'object' and 'nume' not in col:
        number_columns.append(col)
        X[col] = X[col].astype(float)

#print number_columns

scaler = StandardScaler()

X_scale = X.copy()
X_scale[number_columns] = scaler.fit_transform(X[number_columns])
X_scale.head()


categorical_features = []
categorical_features_indices = []
for i,c in enumerate(X.columns):
        if X[c].dtype == 'object':
            categorical_features.append(c)
            categorical_features_indices.append(i)      
#print categorical_features

X_dummy = pd.get_dummies(X_scale, columns=categorical_features)

X_dummy['gender'] = genders


for i,c in enumerate(Y.columns):
        if Y[c].dtype == 'object':
            lbl = LabelEncoder()
            lbl.fit(list(Y[c].values))
            Y[c] = lbl.transform(list(Y[c].values)) 
Y.head()

Unnamed: 0,rezultatul final
0,0
1,0
2,0
3,0
4,0


In [5]:
#split
X_train, X_test, Y_train, Y_test = train_test_split(X_dummy, Y, test_size=0.20, random_state = 40) #same seed to compare results

In [6]:
#try neural network
#this takes so much time compared to random forest.....
from sklearn.neural_network import MLPClassifier  
mlp = MLPClassifier(hidden_layer_sizes=(10, 10, 10), max_iter=100)  
mlp.fit(X_train, Y_train.values.ravel())  

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(10, 10, 10), learning_rate='constant',
       learning_rate_init=0.001, max_iter=100, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=None, shuffle=True, solver='adam', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False)

In [8]:
Y_pred = mlp.predict(X_test)
print accuracy_score(Y_pred, Y_test)

print confusion_matrix(Y_test, Y_pred)

print precision_score(Y_test, Y_pred, average='macro') 
print recall_score(Y_test, Y_pred, average='macro')

0.7923875432525952
[[ 505  201]
 [ 219 1098]]
0.7713897005346229
0.7745052172397995


In [13]:
model = RandomForestClassifier(n_jobs=-1, n_estimators=100)
model.fit(X_train, Y_train)
Y_pred = model.predict(X_test)
print accuracy_score(Y_pred, Y_test)

print confusion_matrix(Y_test, Y_pred)

print precision_score(Y_test, Y_pred, average='macro') 
print recall_score(Y_test, Y_pred, average='macro')

0.8388531883341572
[[ 535  171]
 [ 155 1162]]
0.8235401241614752
0.8200493223288399


In [22]:
from sklearn.tree import export_graphviz
import os

export_graphviz(model.estimators_[0],
                out_file='tree.dot',
                feature_names=X_dummy.columns,
                filled=True,
                rounded=True)


os.system('dot -Tpng tree.dot -o tree.png')

1

In [None]:
from IPython.display import Image
Image(filename = 'tree_limited.png')