## Test "best of two" classifier 

This notebook tests a classifier that operates in two layers:
- First, we use an SVM classifier to label utterances with a high degree of certainty.
- Afterwards, we use heuristics to complete the labeling.

In [1]:
import os
import sys
import pandas as pd
import numpy as np
import random
import pickle
import matplotlib.pyplot as plt

# The project root is taken to be the parent of the kernel's working directory,
# i.e. the notebook is expected to run from a direct sub-folder of the project.
root_path = os.path.dirname(os.path.abspath(os.getcwd()))
# Make the project's `src` package importable. The membership check keeps the
# cell idempotent: re-running it no longer appends a duplicate entry each time.
if root_path not in sys.path:
    sys.path.append(root_path)

from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from src import phase_classification as pc

# Input folders: <root>/data and <root>/data/tables.
data_path = os.path.join(root_path, 'data')
tables_path = os.path.join(root_path, 'data', 'tables')

# Output folders: <root>/results and <root>/results/tables.
results_path = os.path.join(root_path, 'results')
output_path = os.path.join(root_path, 'results', 'tables')

In [2]:
# Re-execute the already imported project module `pc` (src.phase_classification)
# so that edits made to it on disk are picked up without restarting the kernel.
import importlib
importlib.reload(pc)

<module 'src.phase_classification' from 'C:\\Users\\CATALINA ESPINOZA\\Documents\\ciae\\Classification_IBL\\src\\phase_classification.py'>

In [3]:
# --- Experiment configuration -------------------------------------------
# Reproducibility: SEED seeds Python's `random` module here.
SEED = 10
random.seed(SEED)

# WITH_STEMMING and NUM_TOPICS are interpolated into the input file names.
NUM_TOPICS = 60
WITH_STEMMING = True
#REMOVE_STOPWORDS = True

# CLASS_W is interpolated into the file name of the pickled classifier.
CLASS_W = False

# Placeholder only; `t` is overwritten by the value stored in the classifier pickle.
# NOTE(review): presumably the first-layer decision threshold - confirm in pc.
t = 0

In [5]:
# Load the first test table; its file name encodes the stemming flag and the
# number of topics chosen in the configuration cell.
test_i = '[test1]'
file_name = test_i + 'IBL_topic_distribution_by_utterance_before_after_{}_{}.xlsx'.format(
    WITH_STEMMING, NUM_TOPICS)
test_table_file = os.path.join(tables_path, 'test', 'before_after', file_name)
df_data = pd.read_excel(test_table_file)

In [6]:
# Class balance of the loaded table: number and share of rows per `phase` label.
# value_counts() tallies every label in a single pass (the earlier version
# rebuilt a Python list and rescanned it once per label), and sort_index()
# makes the print order explicit instead of relying on set iteration order.
phase_counts = df_data['phase'].value_counts().sort_index()
the_keys = list(phase_counts.index)
class_samples = {key: int(n) for key, n in phase_counts.items()}
total_samples = sum(class_samples.values())
print(total_samples)
for key in the_keys:
    print("key {}, samples: {}, prop: {}".format(key,class_samples[key],round(class_samples[key]*1.0/total_samples,2)))

181
key 1, samples: 55, prop: 0.3
key 2, samples: 27, prop: 0.15
key 3, samples: 46, prop: 0.25
key 4, samples: 7, prop: 0.04
key 5, samples: 46, prop: 0.25


In [7]:
# Positions passed to pc.get_joined_data_from_df together with the split tables.
# NOTE(review): presumably positions 0-179 are the topic-distribution features,
# 187/188 the two utterance length/time features and 180 the `phase` label -
# confirm against the table layout and the helper in src/phase_classification.py.
row_label = 180
filter_rows = [*range(180), 187, 188]

In [8]:
# Preview the first two rows of the loaded table to check its columns.
df_data.head(2)

Unnamed: 0,Topic before 1,Topic before 2,Topic before 3,Topic before 4,Topic before 5,Topic before 6,Topic before 7,Topic before 8,Topic before 9,Topic before 10,...,Topic after 60,phase,phase_1,phase_2,phase_3,phase_4,phase_5,utterance,length utterance,utterance_relative_time
0,0.026732,0.012012,0.005627,0.058408,0.004902,0.05742,0.006782,0.004468,0.006344,0.007537,...,0.007149,5,0,0,0,0,1,paina suora,0.043478,0.0
1,0.026732,0.012012,0.005627,0.058342,0.004902,0.057418,0.006782,0.004468,0.006344,0.007537,...,0.005513,5,0,0,0,0,1,no emmä tiiä olis ihan,0.173913,0.005181


In [9]:
# Split the table with the project helper, passing 0.0 as its second argument
# and the fixed SEED; only the first element of the returned pair is kept.
# NOTE(review): presumably 0.0 is a hold-out fraction, so every discussion ends
# up in dfs_all - confirm in src/phase_classification.py.
dfs_all,_ = pc.split_df_discussions(df_data,.0,SEED)
# Build the inputs (X_all) and the true labels (y_all_1) from the positions
# given by filter_rows and row_label.
X_all,y_all_1 = pc.get_joined_data_from_df(dfs_all,filter_rows,row_label)

In [10]:
# Display the CLASS_W flag; it is part of the classifier file name loaded next.
CLASS_W

False

In [11]:
# Load the stored model. The pickle file holds three consecutive objects that
# are read back in order and bound to `svc`, `coeff` and `t`.
# NOTE: pickle.load can execute arbitrary code - only open files produced by
# this project.
name_classifier = 'classifier_svm_linear_combination_svc_ba_cw_{}.pickle'.format(CLASS_W)
classifier_file = os.path.join(data_path, name_classifier)
with open(classifier_file, 'rb') as pickle_in:
    svc = pickle.load(pickle_in)
    coeff = pickle.load(pickle_in)
    t = pickle.load(pickle_in)
#t = 0.59

# First layer: run the helper on the test-1 inputs with the loaded `t` and
# classifier, then pair each output with its true label for inspection.
output_first_layer_1 = pc.first_layer_classifier(X_all, t, svc)
comparison = [(predicted, truth) for predicted, truth in zip(output_first_layer_1, y_all_1)]

In [12]:
# Attach the first-layer output to the table as a new column (mutates df_data in place).
# NOTE(review): assumes the helper returns one value per row, in df_data's row order - confirm.
df_data['first_layer'] = output_first_layer_1

In [13]:
# Second layer: run the project helper on the same inputs, using the `coeff`
# and `svc` objects loaded from the classifier pickle.
second_layer_1 = pc.second_layer_combination_test(X_all,coeff,svc)

In [14]:
# Count how many second-layer predictions equal -1.
# NOTE(review): -1 presumably marks an utterance left without a label - confirm in pc.
second_layer_1.count(-1)

0

In [15]:
# Attach the second-layer prediction to the table and export the annotated table.
df_data['second_layer'] = second_layer_1
# to_excel does not create missing folders, so make sure the output directory
# exists (e.g. on a fresh checkout) before writing.
os.makedirs(output_path, exist_ok=True)
df_data.to_excel(os.path.join(output_path,'[second_layer]'+file_name))

In [16]:
# Confusion matrix (rows: true phase, columns: predicted phase) and per-class
# report for test set 1.
phase_ids = list(range(1, 6))
labels = ["Phase {}".format(i) for i in phase_ids]
# labels=phase_ids pins the matrix to 5x5 in phase order. Without it sklearn
# sizes the matrix by the labels actually present, and the five fixed row and
# column names below would not fit when a phase is missing from both the truth
# and the predictions.
df = pd.DataFrame(
    confusion_matrix(y_all_1, second_layer_1, labels=phase_ids),
    index=labels,
    columns=["Predicted {}".format(i) for i in labels],
)
print(classification_report(y_all_1, second_layer_1))
df

              precision    recall  f1-score   support

           1       0.28      0.25      0.27        55
           2       0.64      0.26      0.37        27
           3       0.53      0.35      0.42        46
           4       0.00      0.00      0.00         7
           5       0.31      0.61      0.41        46

   micro avg       0.36      0.36      0.36       181
   macro avg       0.35      0.29      0.29       181
weighted avg       0.39      0.36      0.35       181



  'precision', 'predicted', average, warn_for)


Unnamed: 0,Predicted Phase 1,Predicted Phase 2,Predicted Phase 3,Predicted Phase 4,Predicted Phase 5
Phase 1,14,0,11,0,30
Phase 2,8,7,2,0,10
Phase 3,11,1,16,0,18
Phase 4,0,3,0,0,4
Phase 5,17,0,1,0,28


In [17]:
# Accuracy of the loaded SVM on its own on the '[test1]' data.
# The message previously said "training set", but `svc` is loaded from a pickle
# and X_all / y_all_1 are built from the '[test1]' table.
print('Accuracy of SVM classifier on test set: {:.2f}'
     .format(svc.score(X_all, y_all_1)))

Accuracy of SVM classifier on training set: 0.30


### Test 2

In [18]:
# Load the second test table; its file name encodes the stemming flag and the
# number of topics chosen in the configuration cell.
test_i = '[test2]'
file_name = test_i+'IBL_topic_distribution_by_utterance_before_after_{}_{}.xlsx'.format(WITH_STEMMING,NUM_TOPICS)
df_data = pd.read_excel(os.path.join(tables_path,'test','before_after',file_name))

# Class balance: number and share of rows per `phase` label.
# value_counts() tallies every label in a single pass (the earlier version
# rebuilt a Python list and rescanned it once per label), and sort_index()
# makes the print order explicit instead of relying on set iteration order.
phase_counts = df_data['phase'].value_counts().sort_index()
the_keys = list(phase_counts.index)
class_samples = {key: int(n) for key, n in phase_counts.items()}
total_samples = sum(class_samples.values())
print(total_samples)
for key in the_keys:
    print("key {}, samples: {}, prop: {}".format(key,class_samples[key],round(class_samples[key]*1.0/total_samples,2)))

100
key 1, samples: 17, prop: 0.17
key 2, samples: 6, prop: 0.06
key 3, samples: 24, prop: 0.24
key 4, samples: 1, prop: 0.01
key 5, samples: 52, prop: 0.52


In [19]:
# Build inputs and true labels for test set 2 with the same positions as test set 1.
dfs_all,_ = pc.split_df_discussions(df_data,.0,SEED)
X_all,y_all_2 = pc.get_joined_data_from_df(dfs_all,filter_rows,row_label)
# First layer: pass the loaded classifier object, exactly as the test-1 call
# does. This call previously received `name_classifier` (the pickle's file
# name) in that position.
output_first_layer_2 = pc.first_layer_classifier(X_all,t,svc)
comparison = list(zip(output_first_layer_2,y_all_2))
df_data['first_layer'] = output_first_layer_2
# Second layer, then export the annotated table.
second_layer_2 = pc.second_layer_combination_test(X_all,coeff,svc)
df_data['second_layer'] = second_layer_2
# to_excel does not create missing folders, so make sure the output directory
# exists before writing.
os.makedirs(output_path, exist_ok=True)
df_data.to_excel(os.path.join(output_path,'[second_layer]'+file_name))

In [20]:
# Count how many second-layer predictions equal -1.
# NOTE(review): -1 presumably marks an utterance left without a label - confirm in pc.
second_layer_2.count(-1)

0

In [21]:
# Confusion matrix (rows: true phase, columns: predicted phase) and per-class
# report for test set 2.
phase_ids = list(range(1, 6))
labels = ["Phase {}".format(i) for i in phase_ids]
# labels=phase_ids pins the matrix to 5x5 in phase order. Without it sklearn
# sizes the matrix by the labels actually present, and the five fixed row and
# column names below would not fit when a phase is missing from both the truth
# and the predictions (phase 4 has a single sample in this table).
df = pd.DataFrame(
    confusion_matrix(y_all_2, second_layer_2, labels=phase_ids),
    index=labels,
    columns=["Predicted {}".format(i) for i in labels],
)
print(classification_report(y_all_2, second_layer_2))
df

              precision    recall  f1-score   support

           1       0.48      0.82      0.61        17
           2       0.67      0.33      0.44         6
           3       0.73      0.67      0.70        24
           4       0.00      0.00      0.00         1
           5       0.74      0.65      0.69        52

   micro avg       0.66      0.66      0.66       100
   macro avg       0.52      0.50      0.49       100
weighted avg       0.68      0.66      0.66       100



  'precision', 'predicted', average, warn_for)


Unnamed: 0,Predicted Phase 1,Predicted Phase 2,Predicted Phase 3,Predicted Phase 4,Predicted Phase 5
Phase 1,14,0,0,0,3
Phase 2,0,2,0,0,4
Phase 3,4,0,16,0,4
Phase 4,0,0,0,0,1
Phase 5,11,1,6,0,34


In [22]:
# Accuracy of the loaded SVM on its own on the '[test2]' data.
# The message previously said "training set", but `svc` is loaded from a pickle
# and X_all / y_all_2 are built from the '[test2]' table.
print('Accuracy of SVM classifier on test set: {:.2f}'
     .format(svc.score(X_all, y_all_2)))

Accuracy of SVM classifier on training set: 0.63


In [23]:
# Pool the true labels and the second-layer predictions of both test sets.
# list() makes `+` a concatenation whatever sequence type the helpers return;
# a bare `+` on numpy arrays would add element-wise (or fail on the differing
# lengths) instead of concatenating.
y_all = list(y_all_1) + list(y_all_2)
pred = list(second_layer_1) + list(second_layer_2)

In [24]:
# Confusion matrix (rows: true phase, columns: predicted phase) and per-class
# report over both test sets pooled. `labels` holds the five "Phase i" display
# names defined with the per-test-set reports.
# labels=[1..5] pins the matrix to 5x5 in phase order so it always matches the
# five row/column names, even if a phase is absent from truth and predictions.
df = pd.DataFrame(
    confusion_matrix(y_all, pred, labels=list(range(1, 6))),
    index=labels,
    columns=["Predicted {}".format(i) for i in labels],
)
print(classification_report(y_all, pred))
df

              precision    recall  f1-score   support

           1       0.35      0.39      0.37        72
           2       0.64      0.27      0.38        33
           3       0.62      0.46      0.52        70
           4       0.00      0.00      0.00         8
           5       0.46      0.63      0.53        98

   micro avg       0.47      0.47      0.47       281
   macro avg       0.41      0.35      0.36       281
weighted avg       0.48      0.47      0.46       281



  'precision', 'predicted', average, warn_for)


Unnamed: 0,Predicted Phase 1,Predicted Phase 2,Predicted Phase 3,Predicted Phase 4,Predicted Phase 5
Phase 1,28,0,11,0,33
Phase 2,8,9,2,0,14
Phase 3,15,1,32,0,22
Phase 4,0,3,0,0,5
Phase 5,28,1,7,0,62


In [25]:
# Overall accuracy: sum of the confusion-matrix diagonal divided by the number
# of pooled samples.
pooled_cm = confusion_matrix(y_all, pred)
print("Accuracy {0:.3f}".format(pooled_cm.diagonal().sum()/len(y_all)))

# Error between the encoded labels: every true and predicted label is mapped
# through pc.unit_vector, the squared differences are summed over all samples
# and components, divided by 2 * number of samples, and the square root taken.
# NOTE(review): pc.unit_vector presumably returns a one-hot style vector - confirm.
bs = [pc.unit_vector(label) for label in y_all]
y_pred = [pc.unit_vector(label) for label in pred]
squared_diffs = [np.square(predicted - truth) for predicted, truth in zip(y_pred, bs)]
np.sqrt(np.sum(squared_diffs)/(len(y_all)*2))

Accuracy 0.466


0.7306215362152245