In [1]:
import sys
import os
from pprint import pprint
sys.path.append(os.path.abspath("../../../"))
import warnings
warnings.filterwarnings('ignore')
from IPython.display import display, HTML
from IPython.display import Markdown
import ipywidgets as iw
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf

In [2]:
import usure.common.logging as usurelogging
from usure.config import config
from usure.classification.infrastructure import (
     BasicSentenceCleaner 
    ,CsvLabeledCommentsDao
    ,FileWordVectorsRep
    ,FileModelDao)
from usure.classification.core import CnnLab, ClassifierInput, LabReport, WordVectorsService, LabeledComments
import usure.classification.ui.utils as ui

Using TensorFlow backend.


In [3]:
%%javascript
// Replace the output area's `_should_scroll` hook with a function that always
// returns false, whatever `lines` value it is given.
// NOTE(review): presumably this keeps long cell outputs from being collapsed
// into a scrollable box; it relies on a global `IPython` JS object being
// available in the notebook front end — confirm for the target environment.
IPython.OutputArea.prototype._should_scroll = lines => false

<IPython.core.display.Javascript object>

In [4]:
# Notebook-wide pandas display settings, addressed by their full option keys:
# left-justify column headers and raise the per-cell width limit to 200.
pd.set_option("display.colheader_justify", "left")
pd.set_option("display.max_colwidth", 200)

In [5]:
# Build the collaborators shared by the cells below. Every location comes from
# the project's `config` object (assets, sets, embeddings, models).
cleaner = BasicSentenceCleaner(config.assets)
# The comments DAO is handed the cleaner built above.
comments_dao = CsvLabeledCommentsDao(config.sets, cleaner)
wv_rep = FileWordVectorsRep(config.embeddings)
model_dao = FileModelDao(config.models)
lab = CnnLab(model_dao)
# `comments` feeds the k-fold training loop further down; `test_comments` is
# only referenced by a currently commented-out line in that loop.
comments = comments_dao.get("train.csv")
test_comments = comments_dao.get("test.csv")
# Indexed by show_statistics_by_model() as test_inputs[word_vector_no]. The
# append that would populate it is commented out in the training cell, so this
# list stays empty unless that line is re-enabled.
test_inputs = []

In [6]:
def classification_lab_report(word_vector_no, input, lab, folds=10):
    """Announce the embedding being evaluated and train the lab's models on it.

    Args:
        word_vector_no: Number shown in the section heading.
        input: Classifier input; its ``embeddings_name`` is shown in the
            heading. The name shadows the ``input`` builtin inside this
            function but is kept so existing keyword callers keep working.
        lab: Object exposing ``train_by_stratifiedkfold(input, folds=...)``.
        folds: Number of folds requested from the lab. Defaults to 10, the
            value that was previously hard-coded.

    Returns:
        Whatever ``lab.train_by_stratifiedkfold`` returns (the lab report).
    """
    display(Markdown(f'## {word_vector_no}. Creating models with embeddings: {input.embeddings_name}'))
    return lab.train_by_stratifiedkfold(input, folds=folds)

def model_architecture_summary(lab_report):
    """Render the cross-validation results held by a lab report.

    Displays ``lab_report.summary`` under a "Mean Accuracies and losses"
    heading, then the table that ``ui.model_reports_to_DataFrame`` builds from
    ``lab_report.model_reports`` under a per-fold heading.
    """
    mean_heading = Markdown('### Mean Accuracies and losses')
    display(mean_heading)
    display(lab_report.summary)

    fold_heading = Markdown('### Accuracies and losses by Stratified KFold (folds with balanced class distributions)')
    display(fold_heading)
    display(ui.model_reports_to_DataFrame(lab_report.model_reports))
    
def show_statistics_by_model(word_vector_no, word_vector_name, lab_report):
    """Show per-fold statistics for every model report in ``lab_report``.

    For each model report this draws the learning curves through ``ui``,
    prints the training and validation classification reports and displays
    their confusion matrices. It then runs ``lab.test`` on
    ``test_inputs[word_vector_no]``, shows the resulting metrics, and hands
    the labeled predictions to ``comments_dao.save_from_origin`` with
    ``"test.csv"`` as the origin.

    Relies on the notebook globals ``lab``, ``test_inputs`` and
    ``comments_dao``.

    Args:
        word_vector_no: Zero-based position of the embedding; indexes
            ``test_inputs`` and is shown one-based in the heading.
        word_vector_name: Embedding name used in the heading and in the name
            given to the labeled predictions.
        lab_report: Object whose ``model_reports`` are iterated.
    """
    display(Markdown(f'## {word_vector_no+1}. {word_vector_name}'))
    for fold_number, report in enumerate(lab_report.model_reports, start=1):
        display(Markdown(f'### Fold number: {fold_number}. Name: {report.name}'))
        ui.plot_learning_curves_if_epoches(report)

        # Training and validation sections share the same layout.
        for title, split in (('Training', 'training'), ('Validation', 'validation')):
            display(Markdown(f'### {title}'))
            split_metrics = getattr(report, split)
            print(split_metrics.classification_report)
            display(split_metrics.confusion_matrix)

        display(Markdown('### Test'))
        test_input = test_inputs[word_vector_no]
        test_metrics, labeled_predictions = lab.test(report.name, test_input)
        print(f"Accuracy: {test_metrics.accuracy}")
        print(f"Loss: {test_metrics.loss}")
        print(test_metrics.classification_report)
        display(test_metrics.confusion_matrix)

        predictions_name = f"{report.name}-{word_vector_name}.cnn.csv"
        labeled_comments = LabeledComments(
            predictions_name,
            test_input.labeled_comments.comments,
            labeled_predictions,
        )
        comments_dao.save_from_origin(labeled_comments, "test.csv")

In [7]:
#%matplotlib notebook
display(Markdown('# Convolutional Neural Network'))
word_vectors = wv_rep.get_all()
word_vector_names = []
lab_reports = []

# Debug shortcut (disabled): restrict the run to the first embedding only.
#word_vectors = [word_vectors.__next__()]
for i, word_vector in enumerate(word_vectors):
    # Temporary filter: embeddings whose name contains "FBCR2013" are skipped.
    # `i` still counts skipped embeddings, so heading numbers are not contiguous.
    if "FBCR2013" in word_vector.name:
        continue
    wv_service = WordVectorsService(word_vector)
    #test_inputs.append(ClassifierInput(test_comments, wv_service))
    # Named `classifier_input` rather than `input` so the builtin `input()`
    # is not shadowed for the rest of the kernel session.
    classifier_input = ClassifierInput(comments, wv_service)
    lab_report = classification_lab_report(i+1, classifier_input, lab)
    model_architecture_summary(lab_report)
    #word_vector_names.append(word_vector.name)
    #lab_reports.append(lab_report)

# Per-fold statistics (disabled). Kept as `#` comments instead of a bare
# triple-quoted string: a string literal as the last expression of a cell is
# echoed as the cell's output. To re-enable, also uncomment the three
# `append` lines inside the loop above.
#for i, word_vector_name in enumerate(word_vector_names):
#    lab_report = lab_reports[i]
#    show_statistics_by_model(i, word_vector_name, lab_report)
    

# Convolutional Neural Network

## 13. Creating models with embeddings: tweets.txt.usu.16_w.kvs

### Mean Accuracies and losses

Unnamed: 0,train_acc_mean,train_acc_stdev,val_acc_mean,val_acc_stdev
0,0.903842,0.036859,0.441808,0.034054


### Accuracies and losses by Stratified KFold (folds with balanced class distributions)

Unnamed: 0,model name,train_acc,train_loss,val_acc,val_loss
0,27e5826572654dea8feafa383f2ea741,0.915076,0.34117,0.398305,1.612852
1,d2b8240288ff4286923877b758877ab4,0.929389,0.317729,0.466102,1.371312
2,56b38d2108254d6cab9257aeac69f1c0,0.857824,0.424208,0.40678,1.554492
3,96622e0859874dd58b079e69ed22a4d1,0.917064,0.334048,0.444444,1.41436
4,8efcd4bfb58240338907c621aea9e926,0.931363,0.310426,0.401709,1.52177
5,c136a57f3cc14a63a7d9e80f935e429c,0.902765,0.353265,0.461538,1.467327
6,85b0dcf88b444623820fa9c4e1d15ba5,0.890476,0.36197,0.508621,1.278828
7,60f37ee09e3a4e2d80b5d6e3bb3c0371,0.85619,0.379118,0.439655,1.555049
8,7988480119b640caa71062821b73dfb2,0.866794,0.426069,0.434783,1.470233
9,333609ff39734b459765eb2b98cf426c,0.971483,0.24052,0.45614,1.355721


## 14. Creating models with embeddings: tweets.txt.usu.6_w.kvs

### Mean Accuracies and losses

Unnamed: 0,train_acc_mean,train_acc_stdev,val_acc_mean,val_acc_stdev
0,0.892455,0.063853,0.467214,0.055778


### Accuracies and losses by Stratified KFold (folds with balanced class distributions)

Unnamed: 0,model name,train_acc,train_loss,val_acc,val_loss
0,dd6b1673eb2c41f1866c91f7715d0778,0.937023,0.283204,0.483051,1.442682
1,1ce43e16c173469a99439a38f801c8c5,0.907443,0.321954,0.567797,1.326335
2,fcd513e5876e423a959cb1467bc0df47,0.927481,0.288777,0.364407,1.643344
3,f321012cbbab450fa8421ea83ba7d039,0.898951,0.356192,0.512821,1.401693
4,d8f3e2a932c7409fbc77d8d18bfbf66a,0.857007,0.399758,0.487179,1.396441
5,d708dbc2021f482f836396f0fd2217b4,0.900858,0.366987,0.495726,1.288342
6,2b710e00d51c4027b0c1bde0b0d23284,0.854286,0.391397,0.431034,1.820984
7,35558b024961415ba353003b964bac80,0.960952,0.275935,0.456897,1.25961
8,259ac53baf4b45748906bbff1e81f44e,0.941009,0.303249,0.452174,1.50949
9,c88fda397f49494bb014dafa61750f64,0.739544,0.566792,0.421053,1.584266


## 15. Creating models with embeddings: tweets.txt.usu.10_w.kvs

### Mean Accuracies and losses

Unnamed: 0,train_acc_mean,train_acc_stdev,val_acc_mean,val_acc_stdev
0,0.878837,0.066674,0.433076,0.05003


### Accuracies and losses by Stratified KFold (folds with balanced class distributions)

Unnamed: 0,model name,train_acc,train_loss,val_acc,val_loss
0,01f33b5bcdc54b6d989cd02a1407d17b,0.895038,0.340197,0.347458,1.575003
1,8bd7220328c240f2aeb3a250374ee67f,0.966603,0.228377,0.516949,1.244017
2,35cdb27558a1487685386006260c3393,0.902672,0.327324,0.457627,1.512805
3,9d9a91fb84a7411289ee9cc0ba93d98d,0.883699,0.392594,0.393162,1.58895
4,50914726103e4732a3e8ed8cc4cf626f,0.857007,0.415825,0.410256,1.547536
5,08231febc88449d2a6af13069ed5bcf2,0.894185,0.358772,0.444444,1.489529
6,25bc5bb7c0cf4d309f113823175e3cfe,0.90381,0.343646,0.465517,1.401103
7,9f35c71370734621930996e2489972b1,0.935238,0.290637,0.474138,1.348453
8,c6fcc8b1771547418d758dd9a0ebfa38,0.723121,0.652494,0.382609,1.815114
9,7d9130cd66c8435d887e18032a7b34aa,0.826996,0.425412,0.438596,1.412348


## 16. Creating models with embeddings: tweets.txt.usu.8_w.kvs

### Mean Accuracies and losses

Unnamed: 0,train_acc_mean,train_acc_stdev,val_acc_mean,val_acc_stdev
0,0.879047,0.060169,0.444477,0.044147


### Accuracies and losses by Stratified KFold (folds with balanced class distributions)

Unnamed: 0,model name,train_acc,train_loss,val_acc,val_loss
0,4ca8e11718fd4c79b655e6240959d21f,0.914122,0.338853,0.474576,1.370304
1,dbe241216f964cb189d7f0d713a6c5c7,0.910305,0.349369,0.40678,1.569619
2,0f6100fe468544128d707860f475e050,0.751908,0.642894,0.40678,1.753419
3,8f2c7467dec84262a1064bf55288b6b3,0.824595,0.47659,0.410256,1.523132
4,1cd8dc17b18e428cbbbfeee4821757a0,0.835081,0.45247,0.367521,1.647467
5,c7361291793f49fb94c884957d457434,0.948522,0.280756,0.495726,1.316995
6,06fbd70034064706b7bdc6267ea52f46,0.860952,0.425865,0.482759,1.408615
7,ca1623cd91914866a39af8290738ef9c,0.908571,0.343941,0.439655,1.49049
8,85d252206e634df08ceec97c0e6949ad,0.907707,0.372695,0.478261,1.402847
9,576f7da939e64afe99b7bf0fad03cd79,0.928707,0.295357,0.482456,1.295941


## 17. Creating models with embeddings: tweets.txt.usu.12_w.kvs

### Mean Accuracies and losses

Unnamed: 0,train_acc_mean,train_acc_stdev,val_acc_mean,val_acc_stdev
0,0.911313,0.05942,0.452919,0.042781


### Accuracies and losses by Stratified KFold (folds with balanced class distributions)

Unnamed: 0,model name,train_acc,train_loss,val_acc,val_loss
0,af874e8bad0e4e82bc53147f05088ecd,0.937023,0.284356,0.40678,1.392993
1,c991e836b561417db0f418dab1d70da3,0.896947,0.351851,0.381356,1.641273
2,3b0a8fd38e774fda94087ce5d41ceb4a,0.969466,0.195637,0.5,1.234384
3,4d643c63f6994b469f5f108213002ed1,0.845567,0.419856,0.521368,1.334866
4,b2fce541da734d50aaf3ba086b2d41ac,0.93899,0.279141,0.42735,1.561772
5,22007665331449498bf9e1fa1a8d8068,0.902765,0.301618,0.444444,1.507577
6,1c3b57e721534fa19e5790b71300abf4,0.932381,0.284024,0.448276,1.512011
7,6187b848d35d4a109c14dfe399dd987b,0.955238,0.255458,0.491379,1.26633
8,9292d706fa944f07b854864fc3f3b693,0.957184,0.243423,0.46087,1.257729
9,8d92b0e687df4c3ba017e8b544cb1013,0.777567,0.501297,0.447368,1.607212


## 18. Creating models with embeddings: tweets.txt.usu.1_w.kvs

### Mean Accuracies and losses

Unnamed: 0,train_acc_mean,train_acc_stdev,val_acc_mean,val_acc_stdev
0,0.899934,0.037481,0.47819,0.055906


### Accuracies and losses by Stratified KFold (folds with balanced class distributions)

Unnamed: 0,model name,train_acc,train_loss,val_acc,val_loss
0,135eab0f15e244b2b12afc39d9f54c61,0.896947,0.389458,0.449153,1.347493
1,93edfae1e9674d0c958928cc3ccbf3f6,0.895038,0.351331,0.398305,1.629091
2,532e73f1d55f4af68fa42e8b780aae4c,0.817748,0.496292,0.423729,1.640844
3,7d291064596f43c7aea47cb13b3c75a8,0.925643,0.308042,0.410256,1.522557
4,3b536e11e63747cc8839a9d494a20c08,0.908484,0.375249,0.504274,1.203712
5,40e877afa5ce4f54a33809df4b252289,0.967588,0.241325,0.470085,1.227509
6,0c0aaf5ea1c549fda7b802d2e8fa01ab,0.892381,0.328754,0.543103,1.344863
7,55353988a4b04b0f804e8324b20ebf81,0.895238,0.359299,0.508621,1.338895
8,780aa8ab213d498298965cc649e8afb5,0.884872,0.370039,0.521739,1.472228
9,c699b8e33d7e475d8dbff379d0dfe73c,0.915399,0.342905,0.552632,1.313474


## 19. Creating models with embeddings: tweets.txt.usu.14_w.kvs

### Mean Accuracies and losses

Unnamed: 0,train_acc_mean,train_acc_stdev,val_acc_mean,val_acc_stdev
0,0.901096,0.035229,0.437953,0.043677


### Accuracies and losses by Stratified KFold (folds with balanced class distributions)

Unnamed: 0,model name,train_acc,train_loss,val_acc,val_loss
0,98b4ba50d62d4515b86f9302a7ef66ee,0.888359,0.374228,0.516949,1.470201
1,c9c4d4d3b3e344f79e96e415a838923e,0.906489,0.326422,0.491525,1.377727
2,e3e04c4f409a44ea9aeda8d4f48891e6,0.95229,0.263521,0.415254,1.491282
3,bbe1c9138a684a64a0772906ee0ca9e7,0.867493,0.415499,0.470085,1.422621
4,d2b51cca57344e1e8788f9e4b1c2e37b,0.925643,0.297555,0.410256,1.481413
5,b649117c57874f79891af3d05e9cac3b,0.916111,0.32358,0.42735,1.439481
6,40a31531d81148fbbf62eaeafeba7e74,0.899048,0.343006,0.456897,1.329594
7,3d0a9861a31a4c5f9c14e88219435d0e,0.895238,0.349663,0.396552,1.771855
8,d0bbc70385c848d69d861d0b84bfba37,0.827783,0.456842,0.408696,1.697973
9,aa4df669a4aa42e0875b0f3edd5afd26,0.93251,0.295514,0.385965,1.443259


## 20. Creating models with embeddings: tweets.txt.usu.9_w.kvs

### Mean Accuracies and losses

Unnamed: 0,train_acc_mean,train_acc_stdev,val_acc_mean,val_acc_stdev
0,0.909216,0.060301,0.450892,0.060177


### Accuracies and losses by Stratified KFold (folds with balanced class distributions)

Unnamed: 0,model name,train_acc,train_loss,val_acc,val_loss
0,3bc9516eabbd4a71b1ef081b9e571e52,0.948473,0.27652,0.5,1.353124
1,6e05f10c70084f05bf1f553e307a77e5,0.948473,0.272345,0.483051,1.217227
2,3cd9901c7461447fa836daecac3b622a,0.916031,0.337184,0.449153,1.37621
3,3b36d002a687464180abaeaf769e8f54,0.826501,0.475629,0.444444,1.653918
4,596c8657a0874f228916d7fd74be84a7,0.943756,0.26933,0.487179,1.47363
5,f5d15b4a9c2941709eb05252e2017301,0.935176,0.302754,0.384615,1.527699
6,ee3bad31c2154315bbbaa4fc39078861,0.952381,0.269284,0.517241,1.318258
7,ae7c734637f148e7a02539c355e7c62f,0.85619,0.415118,0.37931,1.555476
8,788c7a453a0440abb07962cc27b70ffe,0.968601,0.210661,0.513043,1.252511
9,007622e384e84dca91f0e2056694f282,0.796578,0.522401,0.350877,1.751703


## 21. Creating models with embeddings: tweets.txt.usu.18_w.kvs

### Mean Accuracies and losses

Unnamed: 0,train_acc_mean,train_acc_stdev,val_acc_mean,val_acc_stdev
0,0.926617,0.028233,0.450234,0.058945


### Accuracies and losses by Stratified KFold (folds with balanced class distributions)

Unnamed: 0,model name,train_acc,train_loss,val_acc,val_loss
0,b235b7311ff44355a67fea2a8be0d925,0.928435,0.316072,0.542373,1.212983
1,235bb4ed7825413998f1cccadd34463c,0.946565,0.281058,0.491525,1.261005
2,aa06f67cdded43acb873e391f62d1185,0.866412,0.40306,0.432203,1.599961
3,9db452327dd141c79539bcaf41d29e5a,0.953289,0.244097,0.452991,1.388317
4,86779f62be324b3a876f3ec87446cacc,0.905624,0.322501,0.34188,1.67695
5,b8fe9345a95a4adf91030b5c9c0801e8,0.923737,0.287058,0.435897,1.586973
6,3ed9682ceba4462fb4abe4c94b604c1e,0.915238,0.309965,0.37069,1.562155
7,34f9059e7cc5492788994f0c53820554,0.96,0.241012,0.482759,1.309038
8,93bc628538c64e18aedf8ae5ca76b355,0.951475,0.250872,0.469565,1.40848
9,2c4d2c7e28a84dddacf8e30f566a7412,0.915399,0.328161,0.482456,1.427094


## 22. Creating models with embeddings: tweets.txt.usu.3_w.kvs

### Mean Accuracies and losses

Unnamed: 0,train_acc_mean,train_acc_stdev,val_acc_mean,val_acc_stdev
0,0.894792,0.053204,0.465067,0.053555


### Accuracies and losses by Stratified KFold (folds with balanced class distributions)

Unnamed: 0,model name,train_acc,train_loss,val_acc,val_loss
0,6300bb3dd65c43679f768cca9d5fe5f3,0.891221,0.37114,0.40678,1.309361
1,c29edbd646b4444db0c031f087a8bb9e,0.936069,0.271621,0.483051,1.397047
2,9a25e94a75954fa28b7f12cadc0dfd11,0.81584,0.446143,0.474576,1.50515
3,3a6fd2b46d104de7b521872dc72b6c50,0.85796,0.440001,0.418803,1.470021
4,cdda942b1d5144c7bfb08ae0789c7d8a,0.962822,0.211899,0.547009,1.288656
5,a1a79f629b7c4bc8b0635a3c90da4c22,0.908484,0.360538,0.376068,1.518104
6,95f29aa4bbfe45fd80af43212d13dda7,0.892381,0.391387,0.456897,1.427942
7,00a1d67797ef466fa5ce1ccce0b24fa7,0.930476,0.287822,0.474138,1.31612
8,681b799c4bb24a20b45244a492bc1b0b,0.807802,0.480169,0.478261,1.55167
9,bcd3c533abbb41259cb4a735b9b7486e,0.944867,0.26794,0.535088,1.154483


## 23. Creating models with embeddings: tweets.txt.usu.5_w.kvs

### Mean Accuracies and losses

Unnamed: 0,train_acc_mean,train_acc_stdev,val_acc_mean,val_acc_stdev
0,0.900928,0.051828,0.448942,0.060686


### Accuracies and losses by Stratified KFold (folds with balanced class distributions)

Unnamed: 0,model name,train_acc,train_loss,val_acc,val_loss
0,e931ef90780549f3a2c1885d926eb238,0.91126,0.342881,0.466102,1.359312
1,fd83b7cd5bf845e198fdd934b8505529,0.94084,0.264885,0.525424,1.106056
2,53dd274524634463ba0f27153c90af96,0.958015,0.232812,0.576271,1.155552
3,b0c42a990b77474c8ee6a58856e81e80,0.937083,0.273956,0.452991,1.382593
4,286af9a96c9b4899be317c901c97cfb2,0.898951,0.353183,0.42735,1.63402
5,51dc7d5e07e84006af397379082a0c76,0.919924,0.300358,0.435897,1.583312
6,d293840426a44a728274f4bd45c99e88,0.915238,0.292469,0.387931,1.539511
7,874fd5ea39c64923ad33b87f8382812e,0.804762,0.52956,0.413793,1.54521
8,e3a91ee1c9274169b606b0f14d2f4515,0.811608,0.459957,0.382609,1.710582
9,cedd655458974b6db255dbee3efefa54,0.911597,0.292507,0.421053,1.429253


## 24. Creating models with embeddings: tweets.txt.usu.20_w.kvs

### Mean Accuracies and losses

Unnamed: 0,train_acc_mean,train_acc_stdev,val_acc_mean,val_acc_stdev
0,0.92053,0.048849,0.469209,0.054397


### Accuracies and losses by Stratified KFold (folds with balanced class distributions)

Unnamed: 0,model name,train_acc,train_loss,val_acc,val_loss
0,7cc8c0d5aaaa4aafa948bc54cdc22c83,0.951336,0.265301,0.516949,1.408914
1,5efbb9e138b54d57a1e7f99507b88e74,0.881679,0.357819,0.40678,1.573302
2,34c23a5178e14be08f6d5a78afc9962d,0.930344,0.289683,0.5,1.367749
3,dcded2ff78294bdc9f519fbfba848072,0.968541,0.217596,0.504274,1.310742
4,f1061efd485443e6b72e38fe79e1f330,0.91897,0.313576,0.435897,1.468291
5,90c670a37767434ba89948b1bdbe5770,0.954242,0.245901,0.478632,1.365536
6,ab2902e5bac34f41bb2fb75de28fc9f5,0.941905,0.283848,0.456897,1.385558
7,2c3530e3c8b34220b08de601625f7e21,0.821905,0.444479,0.413793,1.640612
8,adab8e5981fc48f48604c5bd2622a96c,0.866794,0.398059,0.408696,1.505969
9,f8179950214841d48ce64d5814fe2d39,0.969582,0.205721,0.570175,1.155118


'   \nfor i, word_vector_name in enumerate(word_vector_names): \n    lab_report = lab_reports[i]\n    show_statistics_by_model(i, word_vector_name, lab_report)\n'