# Problem to solve

The objective is to use the data from handwritings to create a model that allows to predict if some of those handwritings came from a bipolar person or not.

The data include the target label, which makes it a supervised machine learning task.

In [10]:
import pandas as pd 
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV


In this part we will do the machine learning techniques 

In [2]:
data =  pd.read_parquet('../data/processed/data_processed.parquet')

In [4]:
data.head(3)

Unnamed: 0,VHD,VLV,V(mC),V(E),V(Sx),V(L),Men,Femal,"Age(0,0.5,1)","Label(0,1)"
0,5e-05,37807.614,8.144,0.005491,8.122097,245.843,1,0,0.0,0
1,5e-06,29080.23,3.5225,0.00095,0.251,173.653,1,0,0.0,0
2,7e-06,25078.01,4.3917,0.000735,1.5306,287.208,1,0,0.0,0


In [6]:
label = {'Label(0,1)'}
columns_set = set(data.columns.values)
x = data[list(columns_set-label)]
y = data[list(label)]
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=0)

In [9]:
model_params = {
    'random_forest': {
        'model': RandomForestClassifier(),
        'params' : {
            'n_estimators': [1,50,100]
        }
    },
    'kneighbors_classifier':{
        'model': DecisionTreeClassifier(),
        'params': {
            'criterion': ['gini', 'entropy', 'log_loss']
        }
    }
}

In [11]:
scores = []

for model_name, mp in model_params.items():
    clf1 =  GridSearchCV(mp['model'], mp['params'], cv=5, return_train_score=True)
    print(f'---- training {model_name} ----')
    clf1.fit(X_train, y_train)
    scores.append({
        'model': model_name,
        'best_score': clf1.best_score_,
        'best_params': clf1.best_params_
        
    })
    
models_info_data_frame = pd.DataFrame(scores,columns=['model','best_score','best_params'])
models_info_data_frame

---- training random_forest ----


  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


---- training kneighbors_classifier ----


  self.best_estimator_.fit(X, y, **fit_params)


Unnamed: 0,model,best_score,best_params
0,random_forest,0.966667,{'n_estimators': 100}
1,kneighbors_classifier,1.0,{'criterion': 'gini'}


Resampling the Target column


In [12]:
count_df = data.groupby(['Label(0,1)'])['Label(0,1)'].count()

labels_unbalanced = [k for k, v in count_df.items() if v <= 100]

In [13]:
from sklearn.utils import resample


df_sampled =pd.DataFrame()
for j in labels_unbalanced:
    
    df_minority_j = data[data['Label(0,1)']==j]
    df_minority_upsampled = resample(df_minority_j, 
                                 replace=True,     
                                 n_samples=400,    
                                 stratify= df_minority_j,
                                 random_state=123)
    df_sampled = pd.concat([df_sampled, df_minority_upsampled])

In [14]:
data_train = pd.concat([data, df_sampled])

In [16]:
x2 = data_train[list(columns_set-label)]
y2 = data_train[list(label)]
X_train2, X_test2, y_train2, y_test2 = train_test_split(x2, y2, test_size=0.3, random_state=0)

In [17]:
scores = []

for model_name, mp in model_params.items():
    clf1 =  GridSearchCV(mp['model'], mp['params'], cv=5, return_train_score=True)
    print(f'---- training {model_name} ----')
    clf1.fit(X_train, y_train)
    scores.append({
        'model': model_name,
        'best_score': clf1.best_score_,
        'best_params': clf1.best_params_
        
    })
    
models_info_data_frame = pd.DataFrame(scores,columns=['model','best_score','best_params'])
models_info_data_frame

---- training random_forest ----


  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  self.best_estimator_.fit(X, y, **fit_params)


---- training kneighbors_classifier ----


Unnamed: 0,model,best_score,best_params
0,random_forest,0.966667,{'n_estimators': 50}
1,kneighbors_classifier,1.0,{'criterion': 'gini'}


In [19]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score,recall_score, f1_score, confusion_matrix

clf = RandomForestClassifier(n_estimators=50)
clf.fit(X_train2, y_train2)

y_pred = clf.predict(X_test2)
score = accuracy_score(y_test2, y_pred)
print('Accuracy: {}'.format(score))
print('Precision score: ', precision_score(y_test2, y_pred, average='micro'))
print('Recall score: ', recall_score(y_test2, y_pred, average='micro'))

Accuracy: 1.0
Precision score:  1.0
Recall score:  1.0


  clf.fit(X_train2, y_train2)


In [20]:
from sklearn import model_selection

kFold = model_selection.KFold(n_splits=10)
scoring = 'accuracy'
score = (model_selection.cross_val_score(clf, X_train, y_train,  scoring = scoring, cv = kFold))
print (f"( {score.mean()}, {score.std()} )")

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


( 1.0, 0.0 )


  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
