## Test "best of two" classifier 

This notebook tests a classifier that operates in two layers:
- First, we use an SVM classifier to label utterances with a high degree of certainty.
- Afterwards, we use heuristics to complete the labeling.

In [1]:
import os
import sys
import pandas as pd
import numpy as np
import random
import pickle
import matplotlib.pyplot as plt

# The project root is the parent of the current working directory; it is put on
# sys.path so that project-local packages can be imported from the notebook.
notebook_dir = os.path.abspath(os.getcwd())
root_path = os.path.dirname(notebook_dir)
sys.path.append(root_path)

from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from src import phase_classification as pc

# Input and output folders, all resolved relative to the project root.
data_path = os.path.join(root_path, 'data')
tables_path = os.path.join(root_path, 'data', 'tables')
results_path = os.path.join(root_path, 'results')
output_path = os.path.join(root_path, 'results', 'tables')

In [2]:
import importlib
# Reload the already-imported helper module so that edits made to its source
# file after the first import take effect without restarting the kernel.
importlib.reload(pc)

<module 'src.phase_classification' from 'C:\\Users\\CATALINA ESPINOZA\\Documents\\ciae\\Classification_IBL\\src\\phase_classification.py'>

In [3]:
# Run configuration.
# SEED seeds Python's built-in `random` module.
SEED = 10
random.seed(SEED)

# These two values are substituted into the names of the input tables.
WITH_STEMMING = True
NUM_TOPICS = 60

# Second argument given to pc.first_layer_classifier.
# NOTE(review): presumably a decision threshold -- confirm in src/phase_classification.py.
t = 0

In [4]:
# Load the topic-distribution table of the first test set.
test_i = '[test1]'
file_name = (
    test_i
    + 'IBL_topic_distribution_by_utterance_with_phrase_before_and_after_time_utterance_minimum_0_words_with_stemming_{}_{}.xlsx'.format(WITH_STEMMING, NUM_TOPICS)
)
test_table_path = os.path.join(tables_path, 'test', file_name)
df_data = pd.read_excel(test_table_path)

In [5]:
# Class balance of the loaded table: number of rows per phase and its share of the total.
the_keys = list(set(df_data['phase']))
phase_values = list(df_data.phase.values)
class_samples = {key: phase_values.count(key) for key in the_keys}
total_samples = sum(class_samples.values())
print(total_samples)
for key in the_keys:
    n = class_samples[key]
    proportion = round(n*1.0/total_samples,2)
    print("key {}, samples: {}, prop: {}".format(key,n,proportion))

181
key 1, samples: 55, prop: 0.3
key 2, samples: 27, prop: 0.15
key 3, samples: 46, prop: 0.25
key 4, samples: 7, prop: 0.04
key 5, samples: 46, prop: 0.25


In [6]:
# Index list handed to pc.get_data_from_dict: 0..179 plus 187 and 188.
# NOTE(review): presumably positions of the feature columns to keep -- confirm against the helper.
filter_rows = [*range(180), 187, 188]
# NOTE(review): not referenced by the cells reviewed here -- confirm before removing.
filter_labels = [180]

In [7]:
# Preview the first two rows to check the columns of the loaded table.
df_data.head(2)

Unnamed: 0,Topic before 1,Topic before 2,Topic before 3,Topic before 4,Topic before 5,Topic before 6,Topic before 7,Topic before 8,Topic before 9,Topic before 10,...,Topic after 60,phase,phase_1,phase_2,phase_3,phase_4,phase_5,utterance,length utterance,utterance_relative_time
0,0.008418,0.011185,0.005599,0.005092,0.003249,0.004417,0.004143,0.007655,0.011196,0.013947,...,0.011693,5,0,0,0,0,1,paina suora,0.043478,0.0
1,0.008418,0.011185,0.005599,0.005092,0.003249,0.004417,0.004143,0.007655,0.011196,0.013947,...,0.007614,5,0,0,0,0,1,no emmä tiiä olis ihan,0.173913,0.005181


In [10]:
# Turn the loaded table into model inputs (X_all) and the true phase labels (y_all)
# using the project helpers; filter_rows is forwarded to get_data_from_dict.
all_set = pc.split_df_test(df_data)
X_all,y_all = pc.get_data_from_dict(all_set,filter_rows)
# NOTE(review): this bare expression is not the last line of the cell, so its value is not displayed.
len(y_all)
# Positions of the features kept from every input vector.
# The order is left exactly as given.
# NOTE(review): presumably it must match the feature order the pickled classifier was trained with -- confirm.
selected_features = [
    3, 8, 140, 142, 20, 149, 22, 153, 26, 29,
    159, 33, 163, 36, 39, 170, 43, 175, 181, 63,
    80, 82, 86, 89, 93, 99, 103, 110, 115,
]

In [11]:
# Keep only the selected feature positions of every input vector.
# NOTE(review): X_all is overwritten with its own reduced version, so this cell is not
# idempotent -- re-run the cell that builds X_all before running this one again.
X_all = [element[selected_features] for element in X_all] 

In [12]:
# The pickle file holds two consecutive objects: the first is bound to `svc`,
# the second to `coeff`.
# NOTE: pickle.load can execute arbitrary code -- only load files from a trusted source.
name_classifier = 'classifier_svm_linear_combination_svc_ba_selected.pickle'
classifier_file = os.path.join(data_path, name_classifier)
with open(classifier_file, 'rb') as model_file:
    svc = pickle.load(model_file)
    coeff = pickle.load(model_file)

# First layer: run the project's first-layer classifier on the selected features
# and pair each output with the true label for inspection.
output_first_layer = pc.first_layer_classifier(X_all, t, svc)
comparison = list(zip(output_first_layer, y_all))

In [13]:
# Attach the first-layer output to the table as a new column.
df_data['first_layer'] = output_first_layer

In [14]:
# Second layer: project helper that takes the feature vectors, `coeff` and the classifier.
# Its result is used further down as the final predicted phase per utterance.
second_layer = pc.second_layer_combination_test(X_all,coeff,svc)

In [15]:
# Count how many predictions equal -1.
# NOTE(review): presumably -1 marks an utterance left unlabeled -- confirm in src/phase_classification.py.
second_layer.count(-1)

0

In [16]:
# Attach the second-layer predictions to the table as a new column.
df_data['second_layer'] = second_layer
# Optional export of the labeled table (disabled):
#df_data.to_excel(os.path.join(output_path,'[second_layer]'+file_name))

In [17]:
# Evaluate the second-layer predictions against the true phases.
phase_ids = list(range(1, 6))
labels = ["Phase {}".format(i) for i in phase_ids]
# Pin the label set so the matrix is always 5x5 and its rows/columns line up with
# `labels`, even if a phase is missing from both y_all and second_layer or an
# unexpected value appears in the predictions.
df = pd.DataFrame(
    confusion_matrix(y_all, second_layer, labels=phase_ids),
    index=labels,
    columns=["Predicted {}".format(i) for i in labels],
)
print(classification_report(y_all, second_layer))
df

              precision    recall  f1-score   support

           1       0.35      0.33      0.34        55
           2       1.00      0.26      0.41        27
           3       0.50      0.37      0.42        46
           4       0.00      0.00      0.00         7
           5       0.28      0.54      0.37        46

   micro avg       0.37      0.37      0.37       181
   macro avg       0.43      0.30      0.31       181
weighted avg       0.45      0.37      0.37       181



  'precision', 'predicted', average, warn_for)


Unnamed: 0,Predicted Phase 1,Predicted Phase 2,Predicted Phase 3,Predicted Phase 4,Predicted Phase 5
Phase 1,18,0,11,0,26
Phase 2,7,7,2,0,11
Phase 3,10,0,17,0,19
Phase 4,0,0,0,0,7
Phase 5,17,0,4,0,25


### Test 2

In [18]:
# Load the topic-distribution table of the second test set.
test_i = '[test2]'
file_name = (
    test_i
    + 'IBL_topic_distribution_by_utterance_with_phrase_before_and_after_time_utterance_minimum_0_words_with_stemming_{}_{}.xlsx'.format(WITH_STEMMING, NUM_TOPICS)
)
test_table_path = os.path.join(tables_path, 'test', file_name)
df_data = pd.read_excel(test_table_path)

# Class balance of the loaded table: number of rows per phase and its share of the total.
the_keys = list(set(df_data['phase']))
phase_values = list(df_data.phase.values)
class_samples = {key: phase_values.count(key) for key in the_keys}
total_samples = sum(class_samples.values())
print(total_samples)
for key in the_keys:
    n = class_samples[key]
    proportion = round(n*1.0/total_samples,2)
    print("key {}, samples: {}, prop: {}".format(key,n,proportion))

100
key 1, samples: 17, prop: 0.17
key 2, samples: 6, prop: 0.06
key 3, samples: 24, prop: 0.24
key 4, samples: 1, prop: 0.01
key 5, samples: 52, prop: 0.52


In [19]:
# Run the two-layer pipeline on the second test set.
all_set = pc.split_df_test(df_data)
X_all,y_all = pc.get_data_from_dict(all_set,filter_rows)
# Keep only the selected feature positions of every input vector.
X_all = [element[selected_features] for element in X_all]
# Pass the loaded classifier object (not the pickle file name), matching the call
# made for the first test set.
output_first_layer = pc.first_layer_classifier(X_all,t,svc)
comparison = list(zip(output_first_layer,y_all))
df_data['first_layer'] = output_first_layer
second_layer = pc.second_layer_combination_test(X_all,coeff,svc)
df_data['second_layer'] = second_layer
# Optional export of the labeled table (disabled):
#df_data.to_excel(os.path.join(output_path,'[second_layer]'+file_name))

In [20]:
# Count how many predictions equal -1.
# NOTE(review): presumably -1 marks an utterance left unlabeled -- confirm in src/phase_classification.py.
second_layer.count(-1)

0

In [21]:
# Evaluate the second-layer predictions against the true phases.
phase_ids = list(range(1, 6))
labels = ["Phase {}".format(i) for i in phase_ids]
# Pin the label set so the matrix is always 5x5 and its rows/columns line up with
# `labels`, even if a phase is missing from both y_all and second_layer or an
# unexpected value appears in the predictions.
df = pd.DataFrame(
    confusion_matrix(y_all, second_layer, labels=phase_ids),
    index=labels,
    columns=["Predicted {}".format(i) for i in labels],
)
print(classification_report(y_all, second_layer))
df

              precision    recall  f1-score   support

           1       0.42      0.76      0.54        17
           2       1.00      0.33      0.50         6
           3       0.58      0.46      0.51        24
           4       0.00      0.00      0.00         1
           5       0.67      0.62      0.64        52

   micro avg       0.58      0.58      0.58       100
   macro avg       0.53      0.43      0.44       100
weighted avg       0.62      0.58      0.58       100



  'precision', 'predicted', average, warn_for)


Unnamed: 0,Predicted Phase 1,Predicted Phase 2,Predicted Phase 3,Predicted Phase 4,Predicted Phase 5
Phase 1,13,0,0,0,4
Phase 2,0,2,0,0,4
Phase 3,6,0,11,0,7
Phase 4,0,0,0,0,1
Phase 5,12,0,8,0,32


In [22]:
# Mean accuracy of the SVM alone (without the second-layer heuristics).
# X_all / y_all hold the second test set at this point, so this is a test-set
# score, not a training-set score.
print('Accuracy of SVM classifier on test set: {:.2f}'
     .format(svc.score(X_all, y_all)))

Accuracy of SVM classifier on training set: 0.55
