In [1]:
import sys
import os
from pprint import pprint
sys.path.append(os.path.abspath("../../../"))
import warnings
warnings.filterwarnings('ignore')
from IPython.display import display, HTML
from IPython.display import Markdown
import ipywidgets as iw
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf

In [2]:
import usure.common.logging as usurelogging
from usure.config import config
from usure.classification.infrastructure import (
     BasicSentenceCleaner 
    ,FileLabeledCommentsDao
    ,FileWordVectorsRep
    ,CsvLabeledCommentsDao)
from usure.classification.core import SvmLab, ClassifierInput, LabReport, WordVectorsService, LabeledComments
from usure.classification.infrastructure import FileModelDao
import usure.classification.ui.utils as ui

Using TensorFlow backend.


In [3]:
%%javascript
// Replace the output area's scroll check with one that always answers "no".
// NOTE(review): presumably this keeps long cell outputs fully expanded instead of
// collapsing them into a scroll box -- confirm against the Jupyter front-end in use.
IPython.OutputArea.prototype._should_scroll = function (lines) {
    return false;
};

<IPython.core.display.Javascript object>

In [4]:
# Display settings for the result tables rendered further down:
# left-align column headers and allow up to 200 characters per cell before truncation.
pd.options.display.colheader_justify = 'left'
pd.options.display.max_colwidth = 200

In [5]:
# --- Data access -----------------------------------------------------------
# The sentence cleaner is built from `config.assets` and handed to the CSV DAO,
# which is rooted at `config.sets`.
cleaner = BasicSentenceCleaner(config.assets)
comments_dao = CsvLabeledCommentsDao(config.sets, cleaner)

# --- Embeddings and model storage ------------------------------------------
wv_rep = FileWordVectorsRep(config.embeddings)
model_dao = FileModelDao(config.models)
lab = SvmLab(model_dao)

# --- Datasets ---------------------------------------------------------------
# Train and test sets are both fetched through the same DAO.
comments, test_comments = comments_dao.get("train.csv"), comments_dao.get("test.csv")

# One entry per embedding; filled by the training cell and read by position
# in show_statistics_by_model.
test_inputs = list()

In [6]:
def classification_lab_report(word_vector_no, input, lab):
    """Announce the embedding being processed and train on it with 10 folds.

    Args:
        word_vector_no: Number shown in the section heading (displayed as given).
        input: Classifier input; its ``embeddings_name`` is shown in the heading
            and the object itself is passed to the lab. (The name shadows the
            ``input`` builtin inside this function; it is kept because it is
            part of the call signature.)
        lab: Object exposing ``train_by_stratifiedkfold(input, folds=...)``.

    Returns:
        Whatever ``lab.train_by_stratifiedkfold`` returns for ``folds=10``.
    """
    heading = f'## {word_vector_no}. Creating models with embeddings: {input.embeddings_name}'
    display(Markdown(heading))
    return lab.train_by_stratifiedkfold(input, folds=10)

def model_architecture_summary(lab_report):
    """Display the aggregated summary and the per-fold table of a lab report.

    Args:
        lab_report: Object exposing ``summary`` (displayed as-is) and
            ``model_reports`` (converted with ``ui.model_reports_to_DataFrame``
            before being displayed).
    """
    # Aggregated figures first: heading, then the summary object itself.
    display(
        Markdown(f'### Mean Accuracies and losses'),
        lab_report.summary,
    )
    # Per-fold breakdown: the heading is shown before the table is built.
    display(Markdown(f'### Accuracies and losses by Stratified KFold (folds with balanced class distributions)'))
    display(ui.model_reports_to_DataFrame(lab_report.model_reports))
    
def show_statistics_by_model(word_vector_no, word_vector_name, lab_report):
    """Show training, validation and test statistics for each fold's model.

    For every entry of ``lab_report.model_reports`` this renders the learning
    curves (via ``ui.plot_learning_curves_if_epoches``), the training and
    validation classification report and confusion matrix, then evaluates the
    model on the test input and saves the labeled predictions.

    Reads the notebook globals ``lab``, ``test_inputs`` and ``comments_dao``.

    Args:
        word_vector_no: Zero-based position of the embedding; selects
            ``test_inputs[word_vector_no]`` and is shown 1-based in the heading.
        word_vector_name: Embedding name, used in the heading and in the name
            given to the saved predictions.
        lab_report: Object exposing an iterable ``model_reports``.
    """
    display(Markdown(f'## {word_vector_no+1}. {word_vector_name}'))

    for fold_number, report in enumerate(lab_report.model_reports, start=1):
        display(Markdown(f'### Fold number: {fold_number}. Name: {report.name}'))
        ui.plot_learning_curves_if_epoches(report)

        # Training and validation are rendered identically:
        # heading, textual classification report, confusion matrix.
        for heading, split_name in (('### Training', 'training'),
                                    ('### Validation', 'validation')):
            display(Markdown(heading))
            split_metrics = getattr(report, split_name)
            print(split_metrics.classification_report)
            display(split_metrics.confusion_matrix)

        display(Markdown('### Test'))
        test_input = test_inputs[word_vector_no]
        test_metrics, labeled_predictions = lab.test(report.name, test_input)
        print(f"Accuracy: {test_metrics.accuracy}")
        print(f"Loss: {test_metrics.loss}")
        print(test_metrics.classification_report)
        display(test_metrics.confusion_matrix)

        # Save the test predictions next to the original test set.
        # NOTE(review): the ".cnn.csv" suffix looks copied from a CNN notebook while
        # this one trains SVMs -- confirm whether downstream code expects this name.
        predictions_name = f"{report.name}-{word_vector_name}.cnn.csv"
        labeled_comments = LabeledComments(
            predictions_name,
            test_input.labeled_comments.comments,
            labeled_predictions,
        )
        comments_dao.save_from_origin(labeled_comments, "test.csv")

In [None]:
#%matplotlib notebook
display(Markdown(f'# Suport Vector Machine'))

# `test_inputs` is created in an earlier cell and read by position inside
# show_statistics_by_model. Empty it in place (no rebinding, so that function
# keeps seeing the same list) to make this cell safe to re-run: otherwise a
# second run appends new entries behind stale ones and the positions no longer
# line up with the embeddings processed below.
test_inputs.clear()
word_vectors = wv_rep.get_all()
word_vector_names = []
lab_reports = []

# Debug aid: uncomment to restrict the run to the first embedding only.
#word_vectors = [next(word_vectors)]

# Pass 1: train on every embedding and show the aggregated fold results.
for i, word_vector in enumerate(word_vectors):
    wv_service = WordVectorsService(word_vector)
    # The test input is built from the same word-vector service as the training
    # input, and stored at the same position as the embedding's name/report.
    test_inputs.append(ClassifierInput(test_comments, wv_service))
    # Named `train_input` rather than `input` so the builtin is not shadowed
    # for the rest of the kernel session.
    train_input = ClassifierInput(comments, wv_service)
    lab_report = classification_lab_report(i+1, train_input, lab)
    model_architecture_summary(lab_report)
    word_vector_names.append(word_vector.name)
    lab_reports.append(lab_report)

# Pass 2: detailed per-fold statistics, in the same embedding order as pass 1.
for i, (word_vector_name, lab_report) in enumerate(zip(word_vector_names, lab_reports)):
    show_statistics_by_model(i, word_vector_name, lab_report)
    

# Suport Vector Machine

## 1. Creating models with embeddings: CorpusFBCR2013.txt.usu.bw.kvs

### Mean Accuracies and losses

Unnamed: 0,train_acc_mean,train_acc_stdev,val_acc_mean,val_acc_stdev
0,0.603105,0.009009,0.506229,0.033513


### Accuracies and losses by Stratified KFold (folds with balanced class distributions)

Unnamed: 0,model name,train_acc,train_loss,val_acc,val_loss
0,fa4cc2280f954e2ca6e59770f0abafae,0.58874,0.978398,0.5,1.164455
1,584b91c288d64882b361cdaddcaf3fb4,0.616412,0.946215,0.483051,1.178563
2,edfda12dfc95442b90ecd515d318e628,0.601145,0.964906,0.483051,1.157143
3,792f8d6d27c143ae9491a94620950148,0.590086,0.996671,0.521368,1.126805
4,0da52d5a736e465fa37d479ba5eb7425,0.600572,0.963307,0.495726,1.179918
5,1151129676c14511885c985bc8605465,0.612965,0.946937,0.504274,1.229007
6,665bb1e775914778995e6d1f973e93b8,0.611429,0.960011,0.448276,1.195221
7,622bf0f6e4514e259205015c0d28efeb,0.60381,0.981805,0.517241,1.150636
8,5c083f198508433aadbcd2c601cf0276,0.603235,0.976069,0.53913,1.165174
9,11b8b3ddf93949a1af0067362b4cb082,0.602662,0.970335,0.570175,1.13616


## 2. Creating models with embeddings: CorpusFBCR2013.txt.usu.sw.kvs

### Mean Accuracies and losses

Unnamed: 0,train_acc_mean,train_acc_stdev,val_acc_mean,val_acc_stdev
0,0.615684,0.008415,0.505805,0.042922


### Accuracies and losses by Stratified KFold (folds with balanced class distributions)

Unnamed: 0,model name,train_acc,train_loss,val_acc,val_loss
0,eac95f700623407ba1fefc85d838613c,0.604962,0.987332,0.533898,1.157721
1,e5498edcb4574de7b2fd550f79841779,0.620229,0.990112,0.550847,1.102816
2,c6fcef528a104b818dc1fded965aed9c,0.624046,0.958837,0.525424,1.200535
3,a38357ddfc534cb5a407eb84b895ac47,0.609152,0.979428,0.538462,1.098635
4,b5f2cfd8481341e986aa95447c038a53,0.619638,0.965965,0.452991,1.20966
5,9e6559167dbb4133beb38c0d01d26315,0.618684,0.95569,0.444444,1.231778
6,fc52c69a26514810bc5880b685a2c0b2,0.618095,0.957086,0.551724,1.164691
7,a4c705c5f1f0474f8683c3053b16f388,0.599048,0.989437,0.517241,1.106289
8,5374d849b0164e24aa4b9c89265feb39,0.624167,0.975924,0.495652,1.196772
9,39523724bf304ea8877ccbbf8496c046,0.618821,0.951282,0.447368,1.260744


## 3. Creating models with embeddings: tweets.txt.usu.bw.kvs

### Mean Accuracies and losses

Unnamed: 0,train_acc_mean,train_acc_stdev,val_acc_mean,val_acc_stdev
0,0.624733,0.010621,0.521965,0.042149


### Accuracies and losses by Stratified KFold (folds with balanced class distributions)

Unnamed: 0,model name,train_acc,train_loss,val_acc,val_loss
0,4156a60ce6854863b4588c31deffde7b,0.622137,0.883402,0.525424,1.167512
1,357adf25c646404ea7a2d90ac31670cf,0.612595,0.918894,0.567797,1.074952
2,263011b2b1fe446f91837633b3776258,0.616412,0.918886,0.584746,1.036477
3,90cf2624711744e1a591b8879a54f6c5,0.621544,0.902573,0.521368,1.065163
4,9f13bf14cb77489fbf6c054bd0960d92,0.632031,0.91906,0.521368,1.096087
5,0497960fb2074b2898cddd93a51dffe3,0.64061,0.895708,0.504274,1.196235
6,23db7007648049b2bdba94c8cef2265c,0.612381,0.919116,0.560345,1.057737
7,f76a6f87251e4776b2be11237bab96f8,0.620952,0.905222,0.517241,1.132765
8,84a60285401f43d1a1520414d977aec1,0.642245,0.884041,0.452174,1.192187
9,20f5c266b4e2467296f42b9effdd795c,0.626426,0.889955,0.464912,1.169194


## 4. Creating models with embeddings: tweets.txt.usu.sw.kvs

### Mean Accuracies and losses

Unnamed: 0,train_acc_mean,train_acc_stdev,val_acc_mean,val_acc_stdev
0,0.637795,0.005874,0.535855,0.042696


### Accuracies and losses by Stratified KFold (folds with balanced class distributions)

Unnamed: 0,model name,train_acc,train_loss,val_acc,val_loss
0,ecd92b52b8154e298705409c0aa15796,0.647901,0.861294,0.474576,1.184344
1,6cf9e690f14240009ffab7d8a368f4e2,0.633588,0.902204,0.584746,1.042084
2,e75e703637224a99a7fa63fc9496c25f,0.63645,0.901472,0.542373,1.05408
3,c2adff136cf84c9ba66f026f39126747,0.632031,0.90534,0.598291,1.024706
4,8fe2248b3a0545049a6534b5e7ed4ec3,0.63775,0.903306,0.547009,1.088542
5,8ad733ad936a46d7acdbbad822a7bc9b,0.645377,0.89566,0.57265,1.103381
6,36c5e28c0c284abdbfcd426992cc45e7,0.63619,0.882567,0.534483,1.081609
7,cad17b08216649779dff5f2a8d2d5caa,0.641905,0.868014,0.482759,1.224046
8,0ba454b191674055a2fc9c3eb4f0415a,0.628925,0.900539,0.530435,1.124314
9,c93ce1a4bdc848f29642026ca978d6b9,0.637833,0.874982,0.491228,1.155178


## 5. Creating models with embeddings: SBW-vectors-300-min5.bin

### Mean Accuracies and losses

Unnamed: 0,train_acc_mean,train_acc_stdev,val_acc_mean,val_acc_stdev
0,0.59291,0.005929,0.514834,0.034616


### Accuracies and losses by Stratified KFold (folds with balanced class distributions)

Unnamed: 0,model name,train_acc,train_loss,val_acc,val_loss
0,de840a1e673140e289c9b9375177d01d,0.594466,0.989602,0.449153,1.202188
1,ddf18b23637a4500a67a092f16ed950e,0.594466,0.992424,0.508475,1.156528
2,b0c3058a7856480eb65d1f9c752562a3,0.589695,0.975242,0.483051,1.205039
3,51b73e1b4edd4621a0cf4af2eb04eabe,0.584366,0.978328,0.529915,1.182302
4,6d22fda6ed1b426682e41844c4305de7,0.605338,0.969633,0.487179,1.192994
5,77ccbb9c63e24824a75a7835f53ef4fd,0.594852,0.981616,0.547009,1.155183
6,6bc1889b7c4742c7adb2f81145b71683,0.588571,1.001049,0.517241,1.136857
7,ad462d32d90041e4a64ee399ec6b9f80,0.591429,0.975688,0.517241,1.182919
8,c8886ad64a8f417b9e857637ac789ce8,0.588011,0.992007,0.565217,1.106237
9,195e913e1fbd4d9387915c821366836a,0.597909,0.989871,0.54386,1.095989


## 1. CorpusFBCR2013.txt.usu.bw.kvs

### Fold number: 1. Name: fa4cc2280f954e2ca6e59770f0abafae

### Training

              precision    recall  f1-score   support

           N       0.52      0.94      0.67       410
         NEU       1.00      0.06      0.11       134
        NONE       0.84      0.21      0.34       200
           P       0.71      0.59      0.65       304

   micro avg       0.59      0.59      0.59      1048
   macro avg       0.77      0.45      0.44      1048
weighted avg       0.70      0.59      0.53      1048



Unnamed: 0_level_0,N,NEU,NONE,P
Pred/True,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
N,386,97,133,121
NEU,0,8,0,0
NONE,0,5,43,3
P,24,24,24,180


### Validation

              precision    recall  f1-score   support

           N       0.49      0.87      0.63        46
         NEU       0.00      0.00      0.00        15
        NONE       0.60      0.13      0.21        23
           P       0.50      0.47      0.48        34

   micro avg       0.50      0.50      0.50       118
   macro avg       0.40      0.37      0.33       118
weighted avg       0.45      0.50      0.43       118



Unnamed: 0_level_0,N,NEU,NONE,P
Pred/True,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
N,40,8,16,17
NEU,0,0,0,0
NONE,1,0,3,1
P,5,7,4,16


### Test

Accuracy: 0.4995715509854327
Loss: 1.1807735300142308
              precision    recall  f1-score   support

           N       0.47      0.90      0.62       456
         NEU       0.00      0.00      0.00       148
        NONE       0.49      0.08      0.14       224
           P       0.60      0.46      0.52       339

   micro avg       0.50      0.50      0.50      1167
   macro avg       0.39      0.36      0.32      1167
weighted avg       0.45      0.50      0.42      1167



Unnamed: 0_level_0,N,NEU,NONE,P
Pred/True,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
N,409,116,172,174
NEU,0,0,1,0
NONE,6,4,18,9
P,41,28,33,156


### Fold number: 2. Name: 584b91c288d64882b361cdaddcaf3fb4

### Training

              precision    recall  f1-score   support

           N       0.55      0.94      0.70       410
         NEU       1.00      0.05      0.10       134
        NONE       0.79      0.26      0.39       200
           P       0.72      0.67      0.69       304

   micro avg       0.62      0.62      0.62      1048
   macro avg       0.77      0.48      0.47      1048
weighted avg       0.70      0.62      0.56      1048



Unnamed: 0_level_0,N,NEU,NONE,P
Pred/True,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
N,384,91,123,95
NEU,0,7,0,0
NONE,2,6,52,6
P,24,30,25,203


### Validation

              precision    recall  f1-score   support

           N       0.47      0.98      0.63        46
         NEU       0.00      0.00      0.00        15
        NONE       0.00      0.00      0.00        23
           P       0.63      0.35      0.45        34

   micro avg       0.48      0.48      0.48       118
   macro avg       0.28      0.33      0.27       118
weighted avg       0.36      0.48      0.38       118



Unnamed: 0_level_0,N,NEU,NONE,P
Pred/True,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
N,45,12,18,21
NEU,0,0,0,0
NONE,0,2,0,1
P,1,1,5,12


### Test

Accuracy: 0.49271636675235647
Loss: 1.180334696579936
              precision    recall  f1-score   support

           N       0.47      0.87      0.61       456
         NEU       0.00      0.00      0.00       148
        NONE       0.39      0.08      0.13       224
           P       0.58      0.47      0.52       339

   micro avg       0.49      0.49      0.49      1167
   macro avg       0.36      0.36      0.32      1167
weighted avg       0.43      0.49      0.42      1167



Unnamed: 0_level_0,N,NEU,NONE,P
Pred/True,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
N,398,111,171,167
NEU,1,0,0,0
NONE,9,6,18,13
P,48,31,35,159


### Fold number: 3. Name: edfda12dfc95442b90ecd515d318e628

### Training

              precision    recall  f1-score   support

           N       0.54      0.94      0.69       410
         NEU       1.00      0.07      0.14       134
        NONE       0.84      0.24      0.37       200
           P       0.70      0.61      0.65       304

   micro avg       0.60      0.60      0.60      1048
   macro avg       0.77      0.47      0.46      1048
weighted avg       0.70      0.60      0.55      1048



Unnamed: 0_level_0,N,NEU,NONE,P
Pred/True,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
N,386,89,127,114
NEU,0,10,0,0
NONE,0,5,48,4
P,24,30,25,186


### Validation

              precision    recall  f1-score   support

           N       0.49      0.78      0.61        46
         NEU       0.00      0.00      0.00        15
        NONE       0.50      0.17      0.26        23
           P       0.47      0.50      0.49        34

   micro avg       0.48      0.48      0.48       118
   macro avg       0.37      0.36      0.34       118
weighted avg       0.43      0.48      0.43       118



Unnamed: 0_level_0,N,NEU,NONE,P
Pred/True,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
N,36,7,15,15
NEU,1,0,0,0
NONE,0,2,4,2
P,9,6,4,17


### Test

Accuracy: 0.493573264781491
Loss: 1.1779778296141825
              precision    recall  f1-score   support

           N       0.47      0.87      0.61       456
         NEU       0.50      0.01      0.01       148
        NONE       0.40      0.08      0.13       224
           P       0.58      0.47      0.52       339

   micro avg       0.49      0.49      0.49      1167
   macro avg       0.49      0.36      0.32      1167
weighted avg       0.49      0.49      0.42      1167



Unnamed: 0_level_0,N,NEU,NONE,P
Pred/True,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
N,397,113,169,166
NEU,1,1,0,0
NONE,7,7,17,12
P,51,27,38,161


### Fold number: 4. Name: 792f8d6d27c143ae9491a94620950148

### Training

              precision    recall  f1-score   support

           N       0.53      0.95      0.68       410
         NEU       1.00      0.06      0.11       134
        NONE       0.80      0.22      0.34       201
           P       0.71      0.58      0.64       304

   micro avg       0.59      0.59      0.59      1049
   macro avg       0.76      0.45      0.44      1049
weighted avg       0.69      0.59      0.53      1049



Unnamed: 0_level_0,N,NEU,NONE,P
Pred/True,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
N,390,92,134,122
NEU,0,8,0,0
NONE,1,5,44,5
P,19,29,23,177


### Validation

              precision    recall  f1-score   support

           N       0.49      0.78      0.61        46
         NEU       0.00      0.00      0.00        15
        NONE       0.50      0.14      0.21        22
           P       0.59      0.65      0.62        34

   micro avg       0.52      0.52      0.52       117
   macro avg       0.40      0.39      0.36       117
weighted avg       0.46      0.52      0.46       117



Unnamed: 0_level_0,N,NEU,NONE,P
Pred/True,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
N,36,11,16,10
NEU,0,0,1,0
NONE,1,0,3,2
P,9,4,2,22


### Test

Accuracy: 0.4892887746358183
Loss: 1.1800753423150212
              precision    recall  f1-score   support

           N       0.47      0.87      0.61       456
         NEU       0.00      0.00      0.00       148
        NONE       0.47      0.09      0.15       224
           P       0.57      0.45      0.50       339

   micro avg       0.49      0.49      0.49      1167
   macro avg       0.38      0.35      0.32      1167
weighted avg       0.44      0.49      0.41      1167

