## Test "best of two" classifier 

This notebook tests a classifier that operates in two layers:
- First, we use an SVM classifier to label the utterances it can classify with a high degree of certainty.
- Afterwards, we use heuristics to complete the labeling of the remaining utterances.

In [1]:
import os
import sys
import pandas as pd
import numpy as np
import random
import pickle
import matplotlib.pyplot as plt

# Make the repository root importable so that `from src import ...` resolves.
# The root is taken to be the parent of the current working directory, so the
# notebook has to be launched from a direct sub-folder of the repository.
root_path = os.path.dirname(os.path.abspath(os.getcwd()))
# Guard the append: re-running this cell must not keep growing sys.path.
if root_path not in sys.path:
    sys.path.append(root_path)

from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from src import phase_classification as pc

# Directory layout, resolved relative to the repository root.
data_path, results_path = (os.path.join(root_path, folder) for folder in ('data', 'results'))
tables_path = os.path.join(data_path, 'tables')     # input tables are read from here
output_path = os.path.join(results_path, 'tables')  # labelled tables are written here

In [2]:
# Development aid: pick up edits made to src/phase_classification.py without
# restarting the kernel.
import importlib
# The trailing semicolon suppresses the cell's displayed result, which would
# otherwise show the module repr including its absolute path on the local machine.
importlib.reload(pc);

<module 'src.phase_classification' from 'C:\\Users\\CATALINA ESPINOZA\\Documents\\ciae\\Classification_IBL\\src\\phase_classification.py'>

In [3]:
# Experiment configuration. These values also select which input table and
# which pickled classifier are used further down.
SEED = 10             # seeds Python's `random`; also passed to pc.split_df_discussions
random.seed(SEED)
WITH_STEMMING = True  # encoded in the input table file name
#REMOVE_STOPWORDS = True   (currently disabled / unused)
NUM_TOPICS = 60       # encoded in the input table file name
CLASS_W = False       # encoded in the pickled classifier file name

In [4]:
# Load the first test table. The file name is the test tag followed by a fixed
# stem that encodes the stemming flag and the number of topics.
test_i = '[test1]'
file_name = test_i + 'IBL_topic_distribution_by_utterance_before_after_{}_{}.xlsx'.format(
    WITH_STEMMING, NUM_TOPICS)
df_data = pd.read_excel(os.path.join(tables_path, 'test', file_name))

In [5]:
# Class balance of the loaded table: number of rows per phase and each phase's
# share of the total.
# value_counts() tallies every label in a single pass, and sort_index() gives a
# deterministic reporting order that does not depend on set iteration order.
phase_counts = df_data['phase'].value_counts().sort_index()
the_keys = phase_counts.index.tolist()
class_samples = dict(zip(the_keys, phase_counts.tolist()))
total_samples = sum(class_samples.values())
print(total_samples)
for key in the_keys:
    print("key {}, samples: {}, prop: {}".format(key,class_samples[key],round(class_samples[key]*1.0/total_samples,2)))

181
key 1, samples: 55, prop: 0.3
key 2, samples: 27, prop: 0.15
key 3, samples: 46, prop: 0.25
key 4, samples: 7, prop: 0.04
key 5, samples: 46, prop: 0.25


In [6]:
# Positions handed to pc.get_joined_data_from_df further down:
#   filter_rows - 0..179 plus 187 and 188 (presumably the feature entries -- TODO confirm)
#   row_label   - 180 (presumably where the phase label sits -- TODO confirm)
filter_rows = [*range(180), 187, 188]
row_label = 180

In [7]:
# Split with a fraction of .0 and keep only the first returned collection
# (presumably this routes every discussion into `dfs_all` -- TODO confirm in
# pc.split_df_discussions).
dfs_all,_ = pc.split_df_discussions(df_data,.0,SEED)
# Build the classifier inputs and the true phase labels from the selected
# positions (y_all_1 is used as ground truth in the evaluation cells).
X_all,y_all_1 = pc.get_joined_data_from_df(dfs_all,filter_rows,row_label)
# Sanity check: number of labelled samples.
len(y_all_1)

181

In [8]:
# First layer: run the pickled SVM classifier on every sample.
# `t` is presumably the certainty threshold below which the first layer
# abstains -- TODO confirm in pc.first_layer_classifier.
t = 0.55
name_classifier = 'classifier_svm_before_after_best_of_two_cw_{}.pickle'.format(CLASS_W)
output_first_layer_1 = pc.first_layer_classifier(X_all, t, name_classifier)
# (first-layer output, true label) pairs, kept for manual inspection.
comparison = [(predicted, expected) for predicted, expected in zip(output_first_layer_1, y_all_1)]

In [9]:
# Store the first-layer output as a column of the table; the second-layer call
# below receives `df_data` and presumably reads this column -- TODO confirm.
df_data['first_layer'] = output_first_layer_1

In [10]:
# Second layer: presumably the heuristic step from the introduction that
# completes the labels the first layer left open -- TODO confirm in
# pc.second_layer_classifier_max_border.
second_layer_1 = pc.second_layer_classifier_max_border(X_all,df_data,name_classifier)

In [11]:
# Persist the table together with the predictions of both layers.
df_data['second_layer'] = second_layer_1
# Create the output folder on demand so the export does not fail when
# results/tables does not exist yet (e.g. on a fresh checkout).
os.makedirs(output_path, exist_ok=True)
df_data.to_excel(os.path.join(output_path,'[second_layer]'+file_name))

In [12]:
# Number of samples still carrying the -1 label after the second layer
# (the later third-layer step maps every non-positive label to phase 5).
sum(1 for label in second_layer_1 if label == -1)

3

In [13]:
# The evaluation needs exactly one prediction per true label; enforce it
# rather than only eyeballing the displayed length.
assert len(second_layer_1) == len(y_all_1), "prediction/label count mismatch"
len(second_layer_1)

181

In [25]:
# Final labelling: keep every positive phase label and fall back to phase 5
# for anything that is not a positive label (e.g. the -1 entries).
third_layer_1 = [label if label > 0 else 5 for label in second_layer_1]

In [17]:
# Evaluate the final labelling of test 1 against the true phases.
# The predictions of this test live in `third_layer_1`; a bare `third_layer`
# is not assigned anywhere in the visible notebook and would only resolve
# through leftover kernel state.
phase_ids = list(range(1, 6))
labels = ["Phase {}".format(i) for i in phase_ids]
# Passing `labels` pins the matrix to 5x5 even if a phase is absent from both
# the true and the predicted labels, so it always matches the column names.
df = pd.DataFrame(
    confusion_matrix(y_all_1, third_layer_1, labels=phase_ids),
    index=labels,
    columns=["Predicted {}".format(i) for i in labels],
)
print(classification_report(y_all_1, third_layer_1))
df

              precision    recall  f1-score   support

           1       0.33      0.25      0.29        55
           2       0.19      0.78      0.31        27
           3       0.00      0.00      0.00        46
           4       0.04      0.14      0.07         7
           5       0.50      0.07      0.12        46

   micro avg       0.22      0.22      0.22       181
   macro avg       0.21      0.25      0.16       181
weighted avg       0.26      0.22      0.17       181



  'precision', 'predicted', average, warn_for)


Unnamed: 0,Predicted Phase 1,Predicted Phase 2,Predicted Phase 3,Predicted Phase 4,Predicted Phase 5
Phase 1,14,41,0,0,0
Phase 2,6,21,0,0,0
Phase 3,10,19,0,17,0
Phase 4,0,3,0,1,3
Phase 5,12,26,0,5,3


### Test 2

In [18]:
# Load the second test table (this rebinds `df_data`, replacing the test 1
# table) and report its class balance.
test_i = '[test2]'
file_name = test_i+'IBL_topic_distribution_by_utterance_before_after_{}_{}.xlsx'.format(WITH_STEMMING,NUM_TOPICS)
df_data = pd.read_excel(os.path.join(tables_path,'test',file_name))
# value_counts() tallies every label in a single pass, and sort_index() gives a
# deterministic reporting order that does not depend on set iteration order.
phase_counts = df_data['phase'].value_counts().sort_index()
the_keys = phase_counts.index.tolist()
class_samples = dict(zip(the_keys, phase_counts.tolist()))
total_samples = sum(class_samples.values())
print(total_samples)
for key in the_keys:
    print("key {}, samples: {}, prop: {}".format(key,class_samples[key],round(class_samples[key]*1.0/total_samples,2)))

100
key 1, samples: 17, prop: 0.17
key 2, samples: 6, prop: 0.06
key 3, samples: 24, prop: 0.24
key 4, samples: 1, prop: 0.01
key 5, samples: 52, prop: 0.52


In [19]:
# Same two-layer pipeline as for test 1, reusing `filter_rows`, `row_label`,
# `t` and `name_classifier` defined in the earlier cells.
dfs_all,_ = pc.split_df_discussions(df_data,.0,SEED)
X_all,y_all_2 = pc.get_joined_data_from_df(dfs_all,filter_rows,row_label)
output_first_layer_2 = pc.first_layer_classifier(X_all,t,name_classifier)
# (first-layer output, true label) pairs, kept for manual inspection.
comparison = list(zip(output_first_layer_2,y_all_2))
df_data['first_layer'] = output_first_layer_2
second_layer_2 = pc.second_layer_classifier_max_border(X_all,df_data,name_classifier)
df_data['second_layer'] = second_layer_2
# Create the output folder on demand so the export does not fail when
# results/tables does not exist yet (e.g. on a fresh checkout).
os.makedirs(output_path, exist_ok=True)
df_data.to_excel(os.path.join(output_path,'[second_layer]'+file_name))

In [24]:
# Final labelling: keep every positive phase label and fall back to phase 5
# for anything that is not a positive label (e.g. the -1 entries).
third_layer_2 = [label if label > 0 else 5 for label in second_layer_2]

In [21]:
# Number of samples still carrying the -1 label after the second layer.
sum(1 for label in second_layer_2 if label == -1)

2

In [22]:
# Evaluate the final labelling of test 2 against the true phases.
# The predictions of this test live in `third_layer_2`; a bare `third_layer`
# is not assigned anywhere in the visible notebook and would only resolve
# through leftover kernel state.
phase_ids = list(range(1, 6))
labels = ["Phase {}".format(i) for i in phase_ids]
# Passing `labels` pins the matrix to 5x5 even if a phase is absent from both
# the true and the predicted labels, so it always matches the column names.
df = pd.DataFrame(
    confusion_matrix(y_all_2, third_layer_2, labels=phase_ids),
    index=labels,
    columns=["Predicted {}".format(i) for i in labels],
)
print(classification_report(y_all_2, third_layer_2))
df

              precision    recall  f1-score   support

           1       0.50      0.76      0.60        17
           2       0.10      1.00      0.19         6
           3       0.00      0.00      0.00        24
           4       0.00      0.00      0.00         1
           5       1.00      0.04      0.07        52

   micro avg       0.21      0.21      0.21       100
   macro avg       0.32      0.36      0.17       100
weighted avg       0.61      0.21      0.15       100



  'precision', 'predicted', average, warn_for)


Unnamed: 0,Predicted Phase 1,Predicted Phase 2,Predicted Phase 3,Predicted Phase 4,Predicted Phase 5
Phase 1,13,4,0,0,0
Phase 2,0,6,0,0,0
Phase 3,5,19,0,0,0
Phase 4,0,1,0,0,0
Phase 5,8,28,0,14,2


In [26]:
# Pool both test sets and evaluate them together.
y_all = y_all_1+y_all_2
pred = third_layer_1 + third_layer_2
# `labels` (the "Phase i" display names) comes from the evaluation cells
# above; the numeric class ids are 1..len(labels).
phase_ids = list(range(1, len(labels) + 1))
# Passing `labels=` pins the matrix shape to the number of display names even
# if a phase is absent from both the true and the predicted labels.
df = pd.DataFrame(
    confusion_matrix(y_all, pred, labels=phase_ids),
    index=labels,
    columns=["Predicted {}".format(i) for i in labels],
)
print(classification_report(y_all, pred))
df

              precision    recall  f1-score   support

           1       0.40      0.38      0.39        72
           2       0.16      0.82      0.27        33
           3       0.00      0.00      0.00        70
           4       0.03      0.12      0.04         8
           5       0.62      0.05      0.09        98

   micro avg       0.21      0.21      0.21       281
   macro avg       0.24      0.27      0.16       281
weighted avg       0.34      0.21      0.16       281



  'precision', 'predicted', average, warn_for)


Unnamed: 0,Predicted Phase 1,Predicted Phase 2,Predicted Phase 3,Predicted Phase 4,Predicted Phase 5
Phase 1,27,45,0,0,0
Phase 2,6,27,0,0,0
Phase 3,15,38,0,17,0
Phase 4,0,4,0,1,3
Phase 5,20,54,0,19,5


In [27]:
# Overall accuracy: correctly classified samples sit on the diagonal of the
# confusion matrix.
n_correct = confusion_matrix(y_all, pred).diagonal().sum()
print("Accuracy {0:.3f}".format(n_correct/len(y_all)))
# Root of the mean squared distance between predicted and true phase vectors,
# normalised by 2 per sample (presumably pc.unit_vector yields one-hot
# encodings, for which 2 is the squared distance between two different
# classes -- TODO confirm).
bs = [pc.unit_vector(label) for label in y_all]
y_pred = [pc.unit_vector(label) for label in pred]
squared_diffs = [np.square(predicted - expected) for predicted, expected in zip(y_pred, bs)]
np.sqrt(np.sum(squared_diffs)/(len(y_all)*2))

Accuracy 0.214


0.8868353107129882