In [1]:
import pandas as pd
import numpy as np
import os, sys 
sys.path.append(os.environ['HOME'] + '/src/models/')
from deeplearning_models import DLTextClassifier
from sklearn.model_selection import train_test_split

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


### Experiment 1

- Train: C3 Train
- Test: C3 Test

In [2]:
def run_dl_experiment(C3_train_df, 
                      C3_test_df, 
                      model = 'cnn'):
    """
    Train a DLTextClassifier on one dataframe and evaluate it on another.

    Parameters
    ----------
    C3_train_df : pandas.DataFrame
        Training data. Must provide the 'pp_comment_text' and
        'constructive_binary' columns.
    C3_test_df : pandas.DataFrame
        Evaluation data with the same two columns.
    model : str, default 'cnn'
        Architecture selector. A name ending in 'lstm' calls
        ``build_bilstm()``; a name ending in 'cnn' calls ``build_cnn()``.

    Returns
    -------
    object
        Whatever ``write_model_scores_df(C3_test_df)`` returns
        (presumably a DataFrame of per-comment scores -- TODO confirm
        against DLTextClassifier).

    Raises
    ------
    ValueError
        If ``model`` ends in neither 'lstm' nor 'cnn'.
    """
    # Validate the architecture name before doing any work, so a typo fails
    # immediately instead of reaching train() with no model built.
    use_lstm = model.endswith('lstm')
    use_cnn = model.endswith('cnn')
    if not (use_lstm or use_cnn):
        raise ValueError(
            "Unknown model %r: expected a name ending in 'lstm' or 'cnn'" % (model,)
        )

    X_train = C3_train_df['pp_comment_text'].astype(str)
    y_train = C3_train_df['constructive_binary']

    X_test = C3_test_df['pp_comment_text'].astype(str)
    y_test = C3_test_df['constructive_binary']

    dlclf = DLTextClassifier(X_train, y_train)

    if use_lstm:
        dlclf.build_bilstm()
    else:
        dlclf.build_cnn()

    dlclf.train(X_train, y_train)
    print('\nTrain results: \n\n')
    dlclf.evaluate(X_train, y_train)

    print('\nTest results: \n\n')
    dlclf.evaluate(X_test, y_test)

    # Hand the score table back to the caller instead of discarding it.
    results_df = dlclf.write_model_scores_df(C3_test_df)
    return results_df

### Experiment 1

- Train: C3 Train (80%)
- Test: C3 Test (20%)

In [3]:
# Load the C3 train and test splits; the CSV paths are taken from the
# C3_TRAIN and C3_TEST environment variables.
C3_train_df = pd.read_csv(os.environ['C3_TRAIN'])
C3_test_df = pd.read_csv(os.environ['C3_TEST'])

In [5]:
# Experiment 1, BiLSTM: a model name ending in 'lstm' makes
# run_dl_experiment call build_bilstm().
run_dl_experiment(C3_train_df, C3_test_df, model = 'lstm')

len of encoded docs:  9600
Pad sequences (samples x time)
Padded data shape: (9600, 100)
Number of words not found in glove embeddings:  655
Percentage non-zero elements:  0.9757530955461098
Building model...
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 100, 300)          8116500   
_________________________________________________________________
bidirectional_1 (Bidirection (None, 256)               439296    
_________________________________________________________________
dropout_3 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 257       
Total params: 8,556,053
Trainable params: 439,553
Non-trainable params: 8,116,500
_________________________________________________________________
None
Training...
Train on 8640 samples, validate 

In [4]:
# Experiment 1, CNN: a model name ending in 'cnn' makes
# run_dl_experiment call build_cnn().
run_dl_experiment(C3_train_df, C3_test_df, model = 'cnn')

len of encoded docs:  9600
Pad sequences (samples x time)
Padded data shape: (9600, 100)
Number of words not found in glove embeddings:  655
Percentage non-zero elements:  0.9757530955461098
Building CNN model...
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 100, 300)          8116500   
_________________________________________________________________
dropout_1 (Dropout)          (None, 100, 300)          0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 98, 250)           225250    
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 49, 250)           0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 47, 250)           187750    
_____________________________________________________________

### Experiment 2

- Train: C3_MINUS_LB + C3-LB train (80%)
- Test: C3-LB test (20%)

In [5]:
# NOTE: this rebinds the Experiment 1 names to different datasets.
# From here on C3_train_df holds the C3_MINUS_LB data and C3_test_df holds
# the C3_LB data; any later cell that uses these names sees the new frames.
C3_train_df = pd.read_csv(os.environ['C3_MINUS_LB'])
C3_test_df = pd.read_csv(os.environ['C3_LB'])

In [6]:
C3_train_df.columns

Index(['comment_counter', 'pp_comment_text', 'constructive',
       'constructive_binary'],
      dtype='object')

In [7]:
# Split the LB frame (currently bound to C3_test_df) 80/20 with a fixed seed.
# 'constructive_binary' is passed separately as the label vector, so it is
# left out of the feature columns here.
feats = ['comment_counter', 'pp_comment_text', 'constructive']
LB_X_train, LB_X_test, LB_y_train, LB_y_test = train_test_split(
    C3_test_df[feats],
    C3_test_df['constructive_binary'],
    train_size=0.80,
    random_state=1,
)



In [8]:
# Re-attach the labels to the LB training features column-wise, so the frame
# carries the 'constructive_binary' column that run_dl_experiment reads.
inter_df = pd.concat([LB_X_train, LB_y_train], axis = 1)

In [9]:
inter_df.head()

Unnamed: 0,comment_counter,pp_comment_text,constructive,constructive_binary
705,source1_24211771_152,King Ralph and many of his supporters have die...,0.2,0.0
341,source1_26845638_31,canada did the right things during the economi...,0.6,1.0
662,source1_14853747_24,One would think Phil Fontaine would be more co...,0.2,0.0
577,source2_26842506_870,Makes you wonder what the Harper government wo...,0.2,0.0
671,source1_21138349_21,No one can tell me that there are n't plenty e...,0.2,0.0


In [10]:
C3_train_df.head()

Unnamed: 0,comment_counter,pp_comment_text,constructive,constructive_binary
0,source1_26023945_62,And this Conservative strategy has produced th...,1.0,1.0
1,source1_24565777_106,I commend Harper for holding the debates outsi...,1.0,1.0
2,source1_28775443_136,What a joke Rachel Notley is . This is what wa...,1.0,1.0
3,source1_8996700_50,Do you need to write an essay to prove the poi...,1.0,1.0
4,source1_29405071_126,Rob Ford was no saint . He should never have b...,1.0,1.0


In [11]:
# Experiment 2 training set: the C3_MINUS_LB rows followed by the LB 80% split.
# Row labels are kept as-is (no ignore_index), so index values may repeat
# across the two sources.
train_df = pd.concat([C3_train_df, inter_df])

In [12]:
train_df.shape

(11670, 4)

In [13]:
# Experiment 2 test set: the LB 20% split with its labels re-attached column-wise.
test_df = pd.concat([LB_X_test, LB_y_test], axis = 1)

In [14]:
# Experiment 2, CNN: train on C3_MINUS_LB plus the LB 80% split,
# evaluate on the LB 20% split.
run_dl_experiment(train_df, test_df, model = 'cnn')

len of encoded docs:  11670
Pad sequences (samples x time)
Padded data shape: (11670, 100)
Number of words not found in glove embeddings:  795
Percentage non-zero elements:  0.9729408165346568
Building CNN model...
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 100, 300)          8825100   
_________________________________________________________________
dropout_3 (Dropout)          (None, 100, 300)          0         
_________________________________________________________________
conv1d_4 (Conv1D)            (None, 98, 250)           225250    
_________________________________________________________________
max_pooling1d_3 (MaxPooling1 (None, 49, 250)           0         
_________________________________________________________________
conv1d_5 (Conv1D)            (None, 47, 250)           187750    
___________________________________________________________

In [15]:
# Experiment 2, BiLSTM: train on C3_MINUS_LB plus the LB 80% split,
# evaluate on the LB 20% split.
run_dl_experiment(train_df, test_df, model = 'lstm')

len of encoded docs:  11670
Pad sequences (samples x time)
Padded data shape: (11670, 100)
Number of words not found in glove embeddings:  795
Percentage non-zero elements:  0.9729408165346568
Building model...
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 100, 300)          8825100   
_________________________________________________________________
bidirectional_1 (Bidirection (None, 256)               439296    
_________________________________________________________________
dropout_5 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_5 (Dense)              (None, 1)                 257       
Total params: 9,264,653
Trainable params: 439,553
Non-trainable params: 8,825,100
_________________________________________________________________
None
Training...
Train on 10503 samples, valida

### Experiment 3

- Train: LB Train (80%)
- Test: LB test (20%)

In [16]:
# LB-only train/test frames: features and labels from the 80/20 split joined
# column-wise. These are the same concatenations as inter_df and test_df,
# kept under experiment-specific names.
LB_train_df = pd.concat([LB_X_train, LB_y_train], axis = 1)
LB_test_df = pd.concat([LB_X_test, LB_y_test], axis = 1)

In [17]:
# Experiment 3, CNN: train on the LB 80% split, evaluate on the LB 20% split.
run_dl_experiment(LB_train_df, LB_test_df, model = 'cnn')

len of encoded docs:  1318
Pad sequences (samples x time)
Padded data shape: (1318, 100)
Number of words not found in glove embeddings:  72
Percentage non-zero elements:  0.9909395556658807
Building CNN model...
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 100, 300)          2417100   
_________________________________________________________________
dropout_6 (Dropout)          (None, 100, 300)          0         
_________________________________________________________________
conv1d_7 (Conv1D)            (None, 98, 250)           225250    
_________________________________________________________________
max_pooling1d_5 (MaxPooling1 (None, 49, 250)           0         
_________________________________________________________________
conv1d_8 (Conv1D)            (None, 47, 250)           187750    
______________________________________________________________

In [18]:
# Experiment 3, BiLSTM: train on the LB 80% split, evaluate on the LB 20% split.
run_dl_experiment(LB_train_df, LB_test_df, model = 'lstm')

len of encoded docs:  1318
Pad sequences (samples x time)
Padded data shape: (1318, 100)
Number of words not found in glove embeddings:  72
Percentage non-zero elements:  0.9909395556658807
Building model...
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 100, 300)          2417100   
_________________________________________________________________
bidirectional_2 (Bidirection (None, 256)               439296    
_________________________________________________________________
dropout_8 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_8 (Dense)              (None, 1)                 257       
Total params: 2,856,653
Trainable params: 439,553
Non-trainable params: 2,417,100
_________________________________________________________________
None
Training...
Train on 1186 samples, validate o

### Experiment 4

- Train: LB Train (80%)
- Test:  C3 test (20%)

In [19]:
# Experiment 4 evaluates on the C3 test split. `C3_test_df` was rebound to the
# LB data (os.environ['C3_LB']) in Experiment 2, and that frame contains the
# rows LB_train_df was split from, so reusing it here would test on training
# data. Reload the C3 test split under its own name instead.
C3_holdout_df = pd.read_csv(os.environ['C3_TEST'])
run_dl_experiment(LB_train_df, C3_holdout_df, model = 'cnn')

len of encoded docs:  1318
Pad sequences (samples x time)
Padded data shape: (1318, 100)
Number of words not found in glove embeddings:  72
Percentage non-zero elements:  0.9909395556658807
Building CNN model...
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_6 (Embedding)      (None, 100, 300)          2417100   
_________________________________________________________________
dropout_9 (Dropout)          (None, 100, 300)          0         
_________________________________________________________________
conv1d_10 (Conv1D)           (None, 98, 250)           225250    
_________________________________________________________________
max_pooling1d_7 (MaxPooling1 (None, 49, 250)           0         
_________________________________________________________________
conv1d_11 (Conv1D)           (None, 47, 250)           187750    
______________________________________________________________

In [20]:
# Experiment 4 evaluates on the C3 test split. `C3_test_df` was rebound to the
# LB data (os.environ['C3_LB']) in Experiment 2, and that frame contains the
# rows LB_train_df was split from, so reusing it here would test on training
# data. Reload the C3 test split under its own name instead.
C3_holdout_df = pd.read_csv(os.environ['C3_TEST'])
run_dl_experiment(LB_train_df, C3_holdout_df, model = 'lstm')

len of encoded docs:  1318
Pad sequences (samples x time)
Padded data shape: (1318, 100)
Number of words not found in glove embeddings:  72
Percentage non-zero elements:  0.9909395556658807
Building model...
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_7 (Embedding)      (None, 100, 300)          2417100   
_________________________________________________________________
bidirectional_3 (Bidirection (None, 256)               439296    
_________________________________________________________________
dropout_11 (Dropout)         (None, 256)               0         
_________________________________________________________________
dense_11 (Dense)             (None, 1)                 257       
Total params: 2,856,653
Trainable params: 439,553
Non-trainable params: 2,417,100
_________________________________________________________________
None
Training...
Train on 1186 samples, validate o