## Train classifier by layers

This notebook trains a classifier that operates in two layers:
- First, we use an SVM classifier to label utterances with a high degree of certainty.
- Afterwards, we use heuristics to complete the labeling.

### Import and path definition

In [2]:
import os
import pandas as pd
import numpy as np
import random
import pickle
import sys
import matplotlib.pyplot as plt

# Project root: two directory levels above the current working directory.
# NOTE(review): this depends on where the kernel was started — confirm the
# notebook is always launched from its own folder.
root_path = os.path.dirname(os.path.dirname(os.path.abspath(os.getcwd())))
# Make the project root importable so that `from src import ...` below resolves.
sys.path.append(root_path)

from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from src import phase_classification as pc

# Data locations, derived from the project root.
data_path = os.path.join(root_path,'data')
tables_path = os.path.join(data_path,'tables')

### Load data

In [3]:
# Stemming flag; it is interpolated into the name of the input table loaded below.
WITH_STEMMING = True
#REMOVE_STOPWORDS = True
# Seed for Python's `random` module; also passed explicitly to the discussion split.
SEED = 10
# Number of topics; it is interpolated into the name of the input table loaded below.
NUM_TOPICS = 60
# NOTE(review): only the stdlib RNG is seeded here; NumPy's global RNG is not.
random.seed(SEED)

In [4]:
# Build the name of the training table from the configuration flags and read it
# from <tables_path>/train.
file_name = '[train]IBL_topic_distribution_by_utterance_minimum_5_words_with_stemming_{}_{}.xlsx'.format(WITH_STEMMING,NUM_TOPICS)
df_data = pd.read_excel(os.path.join(tables_path,'train',file_name))

### Data description

In [8]:
# Count the samples of every phase and derive the class proportions.
#
# The phase labels are taken in ascending order so that position i of
# `proportions` is guaranteed to describe the i-th smallest label. Later cells
# rely on this when they turn an index into a phase with
# `np.argmax(proportions)+1` / `argsort_prop[k]+1`; a plain `set` gives no
# ordering guarantee.
phase_counts = df_data['phase'].value_counts().sort_index()
the_keys = list(phase_counts.index)
class_samples = {key: int(phase_counts[key]) for key in the_keys}
total_samples = sum(class_samples.values())
print(total_samples)

# Proportions are rounded to two decimals (these rounded values are what gets
# stored later), so classes of similar size can end up with equal proportions.
proportions = []
for key in the_keys:
    proportion = round(class_samples[key]*1.0/total_samples,2)
    proportions.append(proportion)
    print("key {}, samples: {}, prop: {}".format(key,class_samples[key],proportion))

1600
key 1, samples: 403, prop: 0.25
key 2, samples: 175, prop: 0.11
key 3, samples: 406, prop: 0.25
key 4, samples: 62, prop: 0.04
key 5, samples: 554, prop: 0.35


### Split data

In [9]:
# Indices of the feature entries (0-59 plus 67 and 68) and of the label entry (60).
# NOTE(review): presumably column positions in the topic-distribution table —
# confirm against src/phase_classification.
filter_rows = list(range(60))+[67,68]
row_label = 60
# Split the table by discussion. SEED is passed, as in the later call to the
# same helper, so that both calls are reproducible and agree with each other.
# NOTE(review): the second argument (.0) is presumably the held-out fraction — confirm.
dfs_all,_ = pc.split_df_discussions(df_data,.0,SEED)

### Classify dummy

In [10]:
# Seeded split by discussion; only the first returned element is kept.
dfs_all,_ = pc.split_df_discussions(df_data,.0,SEED)
# Features and labels built from `dfs_all` with the feature/label indices defined above.
# NOTE(review): "joined" presumably means all discussions are concatenated into
# one flat sample list — confirm in src/phase_classification.
X_all,y_all = pc.get_joined_data_from_df(dfs_all,filter_rows,row_label)
# Largest class proportion, displayed as the cell output.
np.max(proportions)

0.35

In [11]:
# Majority-class baseline: predict the phase with the largest proportion for
# every sample. The +1 converts the 0-based index into a phase label.
phase = np.argmax(proportions)+1
pred = [phase for _ in y_all]

# Pin the confusion matrix to the five phases so that it is always 5x5 and
# matches the row/column names below, even if some phase is absent from
# `y_all` and `pred`.
phase_ids = list(range(1,6))
labels = ["Phase {}".format(i) for i in phase_ids]
df = pd.DataFrame(
    confusion_matrix(y_all, pred, labels=phase_ids),
    index=labels,
    columns=["Predicted {}".format(i) for i in labels]
)
# Classes that are never predicted get precision 0, hence the sklearn warning.
print(classification_report(y_all, pred))
df

              precision    recall  f1-score   support

           1       0.00      0.00      0.00       403
           2       0.00      0.00      0.00       175
           3       0.00      0.00      0.00       406
           4       0.00      0.00      0.00        62
           5       0.35      1.00      0.51       554

   micro avg       0.35      0.35      0.35      1600
   macro avg       0.07      0.20      0.10      1600
weighted avg       0.12      0.35      0.18      1600



  'precision', 'predicted', average, warn_for)


Unnamed: 0,Predicted Phase 1,Predicted Phase 2,Predicted Phase 3,Predicted Phase 4,Predicted Phase 5
Phase 1,0,0,0,0,403
Phase 2,0,0,0,0,175
Phase 3,0,0,0,0,406
Phase 4,0,0,0,0,62
Phase 5,0,0,0,0,554


In [18]:
# Re-print the classification report of the majority-class baseline (same inputs as the cell above).
print(classification_report(y_all, pred))

              precision    recall  f1-score   support

           1       0.00      0.00      0.00       403
           2       0.00      0.00      0.00       175
           3       0.00      0.00      0.00       406
           4       0.00      0.00      0.00        62
           5       0.35      1.00      0.51       554

   micro avg       0.35      0.35      0.35      1600
   macro avg       0.07      0.20      0.10      1600
weighted avg       0.12      0.35      0.18      1600



  'precision', 'predicted', average, warn_for)


In [20]:
# Map every true label (`bs`) and every predicted label (`y_pred`) through
# pc.unit_vector.
# NOTE(review): presumably a one-hot encoding of the phase — confirm in src/phase_classification.
bs = list(map(pc.unit_vector, y_all))
y_pred = list(map(pc.unit_vector, pred))

In [21]:
# Square root of the summed squared differences between predicted and true
# vectors, divided by twice the number of samples.
sq_err = np.square(np.asarray(y_pred) - np.asarray(bs))
np.sqrt(np.sum(sq_err)/(len(y_all)*2))

0.8085480814398115

### Find threshold to split the two major proportions

In [22]:
# Rebind X_all / y_all using the list-based helper. The cells below index the
# result as y_all[i][j], i.e. one label sequence per element of `dfs_all`.
# NOTE(review): this overwrites the flat y_all used by the baseline cells above.
X_all,y_all = pc.get_data_from_list_df(dfs_all,filter_rows,row_label)

In [23]:
# Re-import src.phase_classification so that edits to the module are picked up
# without restarting the kernel.
# NOTE(review): development leftover; not needed on a fresh "Restart & Run All".
import importlib 
importlib.reload(pc)

<module 'src.phase_classification' from 'C:\\Users\\CATALINA ESPINOZA\\Documents\\ciae\\Classification_IBL\\src\\phase_classification.py'>

In [24]:
# Indices of `proportions` in ascending order: argsort_prop[-1] is the largest
# class, argsort_prop[-2] the second largest. `proportions` holds values rounded
# to two decimals, so ties are possible; a stable sort makes the tie-break
# deterministic (equal values keep their original index order).
argsort_prop = np.argsort(proportions, kind='mergesort')

In [25]:
# Grid search over a relative-position threshold. Within each label sequence,
# positions whose relative index j/len exceeds the threshold are assigned the
# largest class and the remaining positions the second-largest class. For every
# candidate threshold the per-sequence accuracy is averaged over all sequences.
steps = np.arange(0,1.0,0.025)
accuracy_try = []
for step in steps:
    per_sequence_accuracy = []
    for true_seq in y_all:
        seq_len = len(true_seq)
        guess = [
            argsort_prop[-1]+1 if pos*1.0/seq_len > step else argsort_prop[-2]+1
            for pos in range(seq_len)
        ]
        # The diagonal of the confusion matrix holds the correctly labeled samples.
        hits = np.sum(confusion_matrix(true_seq, guess).diagonal())
        per_sequence_accuracy.append(hits/seq_len)
    accuracy_try.append(np.mean(per_sequence_accuracy))

In [26]:
# Report the best grid index, its mean accuracy and the matching threshold.
# The threshold is looked up with the computed index rather than a hand-copied
# one, so the three printed values always belong together.
best_idx = np.argmax(accuracy_try)
print(best_idx,np.max(accuracy_try),steps[best_idx])

17 0.3599080476362191 0.42500000000000004


In [27]:
# Apply the best threshold from the grid search above. It is selected from
# `accuracy_try` instead of a hard-coded index, so it stays correct if the data
# or the grid change.
step = steps[np.argmax(accuracy_try)]
# Flattened true labels and predictions over all sequences.
y_alls = []
pred = []
for i in range(len(y_all)):
    for j in range(len(y_all[i])):
        y_alls.append(y_all[i][j])
        # Relative position of sample j inside its sequence, in [0, 1).
        step_j = j*1.0/len(y_all[i])
        if step_j > step:
            pred.append(argsort_prop[-1]+1)
        else:
            pred.append(argsort_prop[-2]+1)

In [28]:
# Number of positions where the prediction equals the true label.
[guess == truth for guess, truth in zip(pred, y_alls)].count(True)

585

In [29]:
# Pooled accuracy over all samples. The number of correct predictions is
# computed here instead of being copied by hand from a previous output.
n_correct = [pred[i]==y_alls[i] for i in range(len(y_alls))].count(True)
n_correct/len(y_alls)

0.365625

In [30]:
# Map the flattened true labels (`bs`) and predictions (`y_pred`) through
# pc.unit_vector.
# NOTE(review): presumably a one-hot encoding of the phase — confirm in src/phase_classification.
bs = list(map(pc.unit_vector, y_alls))
y_pred = list(map(pc.unit_vector, pred))

In [31]:
# Square root of the summed squared differences between predicted and true
# vectors, divided by twice the number of samples.
sq_err = np.square(np.asarray(y_pred) - np.asarray(bs))
np.sqrt(np.sum(sq_err)/(len(y_alls)*2))

0.7964766161036996

In [32]:
# Keep the threshold chosen for the two largest classes; `step` is reused below.
step_1 = step

### Find threshold to split phases 1 and 5

In [33]:
# Same grid search as before, but positions at or below the threshold are
# labeled as phase 1 (instead of the second-largest class); positions above it
# still get the largest class. Per-sequence accuracies are averaged per threshold.
steps = np.arange(0,1.0,0.025)
accuracy_try = []
for step in steps:
    per_sequence_accuracy = []
    for true_seq in y_all:
        seq_len = len(true_seq)
        guess = [
            argsort_prop[-1]+1 if pos*1.0/seq_len > step else 1
            for pos in range(seq_len)
        ]
        # The diagonal of the confusion matrix holds the correctly labeled samples.
        hits = np.sum(confusion_matrix(true_seq, guess).diagonal())
        per_sequence_accuracy.append(hits/seq_len)
    accuracy_try.append(np.mean(per_sequence_accuracy))

In [34]:
# Report the best grid index, its mean accuracy and the matching threshold.
# The threshold is looked up with the computed index rather than a hand-copied
# one, so the three printed values always belong together.
best_idx = np.argmax(accuracy_try)
print(best_idx,np.max(accuracy_try),steps[best_idx])

15 0.4450968597394787 0.375


In [35]:
# Apply the best threshold from the grid search above. It is selected from
# `accuracy_try` instead of a hard-coded index, so it stays correct if the data
# or the grid change.
step = steps[np.argmax(accuracy_try)]
# Flattened true labels and predictions over all sequences.
y_alls = []
pred = []
for i in range(len(y_all)):
    for j in range(len(y_all[i])):
        y_alls.append(y_all[i][j])
        # Relative position of sample j inside its sequence, in [0, 1).
        step_j = j*1.0/len(y_all[i])
        if step_j > step:
            pred.append(argsort_prop[-1]+1)
        else:
            # Early part of the sequence is labeled as phase 1.
            pred.append(1)

In [36]:
# Map the flattened true labels (`bs`) and predictions (`y_pred`) through
# pc.unit_vector.
# NOTE(review): presumably a one-hot encoding of the phase — confirm in src/phase_classification.
bs = list(map(pc.unit_vector, y_alls))
y_pred = list(map(pc.unit_vector, pred))

In [37]:
# Square root of the summed squared differences between predicted and true
# vectors, divided by twice the number of samples.
sq_err = np.square(np.asarray(y_pred) - np.asarray(bs))
np.sqrt(np.sum(sq_err)/(len(y_alls)*2))

0.7454025757937788

In [38]:
# Keep the threshold chosen for the phase 1 / largest-class split.
step_2 = step

In [39]:
# Persist the class proportions and both thresholds as three consecutive
# pickles in a single file; they must be read back with three pickle.load
# calls in the same order.
out_path = os.path.join(data_path,'random_training.pickle')
with open(out_path,'wb') as f:
    for obj in (proportions, step_1, step_2):
        pickle.dump(obj,f)