In [1]:
import pandas as pd
import numpy as np
import os, sys 
sys.path.append(os.environ['HOME'] + '/src/models/')
from deeplearning_models import DLTextClassifier
from feature_based_models import FBConstructivenessClassifier
from sklearn.model_selection import train_test_split

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
import matplotlib.pyplot as plt
%matplotlib inline

# classifiers / models
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LinearRegression, LogisticRegression, SGDClassifier
from sklearn.model_selection import ShuffleSplit
from sklearn.svm import SVC, SVR
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler

# other
from sklearn.preprocessing import normalize
from sklearn.metrics import log_loss, accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, cross_val_score
import nltk
import time

import xgboost as xgb
from sklearn.metrics import f1_score, classification_report

In [3]:
def show_scores(model, X_train, y_train, X_valid, y_valid):
    """
    Print accuracy and a classification report for both data splits.

    The two accuracies are printed first (training, then validation),
    followed by the training and the validation classification reports.

    Parameters
    ----------
    model : fitted estimator exposing ``score(X, y)`` and ``predict(X)``.
    X_train, X_valid : inputs handed unchanged to ``model``.
    y_train, y_valid : true labels; each must provide ``tolist()``.

    Returns
    -------
    None. All results are written to stdout.
    """
    train_accuracy = model.score(X_train, y_train)
    print("Training accuracy:   %.2f" % train_accuracy)
    valid_accuracy = model.score(X_valid, y_valid)
    print("Validation accuracy: %.2f" % valid_accuracy)

    # Same reporting steps for each split, training first.
    report_inputs = (
        ('TRAIN CLASSIFICATION REPORT\n\n', X_train, y_train),
        ('VALIDATION CLASSIFICATION REPORT\n\n', X_valid, y_valid),
    )
    for heading, features, labels in report_inputs:
        predicted = list(model.predict(features))
        expected = labels.tolist()
        print(heading, classification_report(expected, predicted))

### Read train and test data

In [4]:
# Load the C3 train and test splits from the CSV files named by the
# C3_TRAIN / C3_TEST environment variables, casting the preprocessed
# comment text column to str in each frame.
C3_train_df = pd.read_csv(os.environ['C3_TRAIN']).astype({'pp_comment_text': str})

C3_test_df = pd.read_csv(os.environ['C3_TEST']).astype({'pp_comment_text': str})

In [5]:
# Inspect which columns the C3 training frame provides.
C3_train_df.columns

Index(['article_id', 'comment_author', 'comment_counter', 'comment_text',
       'njudgements_constructiveness_expt', 'njudgements_toxicity_expt',
       'agree_constructiveness_expt', 'agree_toxicity_expt', 'constructive',
       'crowd_toxicity_level', 'has_content', 'crowd_discard',
       'constructive_characteristics', 'non_constructive_characteristics',
       'toxicity_characteristics', 'crowd_comments_constructiveness_expt',
       'crowd_comments_toxicity_expt', 'other_con_chars', 'other_noncon_chars',
       'other_toxic_chars', 'SEVERE_TOXICITY_probability',
       'SEXUALLY_EXPLICIT_probability', 'TOXICITY_probability',
       'TOXICITY_IDENTITY_HATE_probability', 'TOXICITY_INSULT_probability',
       'TOXICITY_OBSCENE_probability', 'TOXICITY_THREAT_probability',
       'ATTACK_ON_AUTHOR_probability', 'ATTACK_ON_COMMENTER_probability',
       'ATTACK_ON_PUBLISHER_probability', 'INCOHERENT_probability',
       'INFLAMMATORY_probability', 'LIKELY_TO_REJECT_probability',
     

In [6]:
# Split each C3 frame into its label column ('constructive_binary')
# and a frame holding all remaining columns.
y_C3_train = C3_train_df['constructive_binary']
X_C3_train = C3_train_df.drop(columns=['constructive_binary'])

# The SOCC_a corpus is not loaded here; it is read further down from the
# path in the SOCC_ANNOTATED_FEATS_PREPROCESSED environment variable.

y_C3_test = C3_test_df['constructive_binary']
X_C3_test = C3_test_df.drop(columns=['constructive_binary'])

### Fit dummy classifier

In [7]:
# Baseline run: a DummyClassifier trained on the length features only,
# fitted on the C3 train split and scored on the C3 test split.
# NOTE(review): no random_state is passed; the recorded output reports
# strategy='stratified', so the scores can differ between runs.
feature_set = ['length_feats']

classifier = FBConstructivenessClassifier(
    X_C3_train, y_C3_train, X_C3_test, y_C3_test
)
pipeline = classifier.train_pipeline(
    classifier=DummyClassifier(),
    feature_set=feature_set,
)

classifier.show_scores(pipeline)

Classifier:  DummyClassifier(constant=None, random_state=None, strategy='stratified')
Feature set:  ['length_feats']
COMMENTS COL:  pp_comment_text
Model trained and pickled in file:  /home/vkolhatk/dev/constructiveness/models/saved_model.h5
Training accuracy:   0.50
Validation accuracy: 0.52
TRAIN CLASSIFICATION REPORT

              precision    recall  f1-score   support

        0.0       0.45      0.45      0.45      4391
        1.0       0.54      0.54      0.54      5209

avg / total       0.50      0.50      0.50      9600

VALIDATION CLASSIFICATION REPORT

              precision    recall  f1-score   support

        0.0       0.45      0.44      0.45      1093
        1.0       0.54      0.55      0.55      1307

avg / total       0.50      0.50      0.50      2400

sklearn micro-F1-Score: 0.5029166666666667


### Train on C3 train and test on SOCC_a 

In [8]:
# Test corpus: read the SOCC annotated CSV named by the
# SOCC_ANNOTATED_FEATS_PREPROCESSED environment variable and cast the
# preprocessed comment text column to str.
SOCC_a_df = pd.read_csv(os.environ['SOCC_ANNOTATED_FEATS_PREPROCESSED']).astype(
    {'pp_comment_text': str}
)

In [9]:
# Separate the SOCC_a label column ('constructive') from the other columns.
y_SOCC_a = SOCC_a_df['constructive']
X_SOCC_a = SOCC_a_df.drop(columns=['constructive'])

feature_set = ['length_feats']

In [10]:
# Candidate estimator classes, keyed by the label printed before each run.
# 'SVM' maps to SGDClassifier; the recorded output shows it running with
# loss='hinge'.
models = {'logistic regression': LogisticRegression, 
          'SVM' : SGDClassifier, 
          'random forest' : RandomForestClassifier, 
          'xgboost' : xgb.XGBClassifier
         }

# Train on the C3 training split, evaluate on the SOCC_a corpus.
classifier = FBConstructivenessClassifier(X_C3_train, y_C3_train, X_SOCC_a, y_SOCC_a)

# NOTE(review): `feature_set` is not passed to train_pipeline here, so the
# pipeline falls back to its default; the recorded output lists the full
# feature set for these runs -- confirm that this is intended.
for model_name, model_class in models.items():
    # perf_counter is monotonic, so the measured duration cannot be skewed
    # by system clock adjustments while a model trains.
    t = time.perf_counter()
    print(model_name, ":")
    # A single fresh estimator per run (the previous extra, unused
    # `model_class()` instantiation has been removed).
    pipeline = classifier.train_pipeline(classifier = model_class())
    classifier.show_scores(pipeline)
    elapsed_time = time.perf_counter() - t
    print("Elapsed time: %.1f s" % elapsed_time)
    print()

SVM :
Classifier:  SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', max_iter=None, n_iter=None,
       n_jobs=1, penalty='l2', power_t=0.5, random_state=None,
       shuffle=True, tol=None, verbose=0, warm_start=False)
Feature set:  ['ngram_feats', 'tfidf_feats', 'length_feats', 'argumentation_feats', 'text_quality_feats', 'named_entity_feats', 'perspective_content_value_feats', 'perspective_aggressiveness_feats', 'perspecitive_toxicity_feats']
COMMENTS COL:  pp_comment_text




Model trained and pickled in file:  /home/vkolhatk/dev/constructiveness/models/saved_model.h5
Training accuracy:   1.00
Validation accuracy: 0.87
TRAIN CLASSIFICATION REPORT

              precision    recall  f1-score   support

        0.0       0.99      1.00      1.00      4391
        1.0       1.00      1.00      1.00      5209

avg / total       1.00      1.00      1.00      9600

VALIDATION CLASSIFICATION REPORT

              precision    recall  f1-score   support

        0.0       0.83      0.92      0.87       484
        1.0       0.92      0.83      0.87       551

avg / total       0.88      0.87      0.87      1035

sklearn micro-F1-Score: 0.8714975845410629
Elapsed time: 41.5 s

random forest :
Classifier:  RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
  

  if diff:


Training accuracy:   0.95


  if diff:


Validation accuracy: 0.88


  if diff:


TRAIN CLASSIFICATION REPORT

              precision    recall  f1-score   support

        0.0       0.94      0.95      0.95      4391
        1.0       0.96      0.95      0.96      5209

avg / total       0.95      0.95      0.95      9600



  if diff:


VALIDATION CLASSIFICATION REPORT

              precision    recall  f1-score   support

        0.0       0.83      0.93      0.88       484
        1.0       0.93      0.83      0.88       551

avg / total       0.88      0.88      0.88      1035

sklearn micro-F1-Score: 0.8772946859903382
Elapsed time: 100.8 s

logistic regression :
Classifier:  LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)
Feature set:  ['ngram_feats', 'tfidf_feats', 'length_feats', 'argumentation_feats', 'text_quality_feats', 'named_entity_feats', 'perspective_content_value_feats', 'perspective_aggressiveness_feats', 'perspecitive_toxicity_feats']
COMMENTS COL:  pp_comment_text
Model trained and pickled in file:  /home/vkolhatk/dev/constructiveness/models/saved_model.h5
Training accuracy:   1.00
Val

In [25]:
# Re-attach each label column to its feature frame (column-wise concat):
# C3 train split for training, SOCC_a for testing.
train_df = pd.concat([X_C3_train, y_C3_train], axis='columns')
test_df = pd.concat([X_SOCC_a, y_SOCC_a], axis='columns')

In [26]:
def run_dl_experiment(C3_train_df, 
                      C3_test_df, 
                      results_csv_path = os.path.join(os.environ['HOME'], 'models', 'test_predictions.csv'),
                      model = 'cnn'):
    """
    Train a deep-learning text classifier and report train/test results.

    A DLTextClassifier is built from the training text and labels, the
    requested network is constructed and trained, both splits are
    evaluated, and the test frame is passed to
    ``write_model_scores_df`` together with ``results_csv_path``.

    Parameters
    ----------
    C3_train_df : DataFrame
        Training data; must contain the 'pp_comment_text' and
        'constructive_binary' columns.
    C3_test_df : DataFrame
        Evaluation data with the same two columns.
    results_csv_path : str
        Path forwarded to ``DLTextClassifier.write_model_scores_df``.
        Defaults to ``$HOME/models/test_predictions.csv`` (the previous
        default concatenated HOME and 'models/...' without a separator).
    model : str
        A name ending in 'lstm' selects ``build_bilstm()``; a name ending
        in 'cnn' selects ``build_cnn()``.

    Raises
    ------
    ValueError
        If ``model`` ends with neither 'lstm' nor 'cnn'. Previously no
        network was built in that case and training proceeded anyway.
    """
    # Fail fast on an unsupported architecture, before any model work starts.
    if not model.endswith(('lstm', 'cnn')):
        raise ValueError(
            "Unsupported model %r: expected a name ending in 'lstm' or 'cnn'" % (model,)
        )

    X_train = C3_train_df['pp_comment_text'].astype(str)
    y_train = C3_train_df['constructive_binary']

    X_test = C3_test_df['pp_comment_text'].astype(str)
    y_test = C3_test_df['constructive_binary']

    dlclf = DLTextClassifier(X_train, y_train)

    if model.endswith('lstm'):
        dlclf.build_bilstm()
    else:
        # Only 'cnn' names can reach this branch thanks to the check above.
        dlclf.build_cnn()

    dlclf.train(X_train, y_train)
    print('\nTrain results: \n\n')
    dlclf.evaluate(X_train, y_train)

    print('\nTest results: \n\n')
    dlclf.evaluate(X_test, y_test)
    # The return value is intentionally not kept: this function returns None.
    dlclf.write_model_scores_df(C3_test_df, results_csv_path)

In [29]:
# Give the SOCC_a label the 'constructive_binary' name that
# run_dl_experiment reads from both of its input frames.
test_df = test_df.rename(columns={'constructive': 'constructive_binary'})

In [30]:
# Inspect the column names of test_df.
test_df.columns

Index(['article_id', 'comment_counter', 'comment_text', 'crowd_toxicity_level',
       'SEVERE_TOXICITY_probability', 'SEXUALLY_EXPLICIT_probability',
       'TOXICITY_probability', 'TOXICITY_IDENTITY_HATE_probability',
       'TOXICITY_INSULT_probability', 'TOXICITY_OBSCENE_probability',
       'TOXICITY_THREAT_probability', 'ATTACK_ON_AUTHOR_probability',
       'ATTACK_ON_COMMENTER_probability', 'ATTACK_ON_PUBLISHER_probability',
       'INCOHERENT_probability', 'INFLAMMATORY_probability',
       'LIKELY_TO_REJECT_probability', 'OBSCENE_probability',
       'OFF_TOPIC_probability', 'SPAM_probability',
       'UNSUBSTANTIAL_probability', 'source',
       'njudgements_constructiveness_expt', 'njudgements_toxicity_expt',
       'has_conjunctions_and_connectives', 'has_stance_adverbials',
       'has_reasoning_verbs', 'has_modals', 'has_shell_nouns', 'length',
       'average_word_length', 'ncaps', 'noov', 'readability_score',
       'personal_exp_score', 'named_entity_count', 'nSents',

In [31]:
# Train the CNN variant on train_df and evaluate it on test_df.
run_dl_experiment(train_df, test_df, model = 'cnn')

len of encoded docs:  9600
Pad sequences (samples x time)
Padded data shape: (9600, 100)
Number of words not found in glove embeddings:  655
Percentage non-zero elements:  0.9757530955461098
Building CNN model...
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 100, 300)          8116500   
_________________________________________________________________
dropout_1 (Dropout)          (None, 100, 300)          0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 98, 250)           225250    
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 49, 250)           0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 47, 250)           187750    
_____________________________________________________________

In [32]:
# Same experiment with model = 'lstm', which selects build_bilstm() inside
# run_dl_experiment.
# NOTE(review): results_csv_path is left at its default, exactly as in the
# CNN call -- presumably the predictions file is overwritten; confirm.
run_dl_experiment(train_df, test_df, model = 'lstm')

len of encoded docs:  9600
Pad sequences (samples x time)
Padded data shape: (9600, 100)
Number of words not found in glove embeddings:  655
Percentage non-zero elements:  0.9757530955461098
Building model...
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 100, 300)          8116500   
_________________________________________________________________
bidirectional_1 (Bidirection (None, 256)               439296    
_________________________________________________________________
dropout_3 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 257       
Total params: 8,556,053
Trainable params: 439,553
Non-trainable params: 8,116,500
_________________________________________________________________
None
Training...
Train on 8640 samples, validate 