In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell runs and edits pulled from git are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

# NOTE(review): unpinned install. vaderSentiment is not imported directly in
# the visible cells; presumably the cloned project code needs it -- confirm.
!pip install vaderSentiment

# GitLab account and branch interpolated into the git commands further down
# in this cell.
GITLAB_USER = 'thomasdi'
GITLAB_BRANCH = 'td_experiments'

import pandas as pd
import matplotlib.pylab as plt
from getpass import getpass
from urllib.parse import quote
from google.colab import drive
import os
import numpy as np

# Mount the user's Google Drive inside the Colab VM at /content/gdrive.
drive.mount('/content/gdrive')

# Clone the project only when no 'nlu_project_2' directory exists yet in the
# current working directory.
if not os.path.isdir('nlu_project_2'):
    # Ask for the password without echoing it, then percent-encode it so that
    # special characters survive being embedded in the clone URL.
    # NOTE(review): the password becomes part of the remote URL passed to git;
    # consider an access token or credential helper instead.
    # NOTE(review): the repository path below hard-codes 'thomasdi' and does
    # not follow GITLAB_USER.
    passwd = quote(getpass('Enter GitLab password'))
    ! git clone https://$GITLAB_USER:$passwd@gitlab.ethz.ch/thomasdi/nlu_project_2.git

# Switch the working copy to the configured branch and fetch/merge its latest
# commits; this runs on every execution of the cell.
! cd nlu_project_2/ && git checkout $GITLAB_BRANCH && git pull

# Put the cloned repository on the module search path so its modules can be
# imported from this notebook.
import sys
sys.path.append('nlu_project_2')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
Already on 'td_experiments'
Your branch is up to date with 'origin/td_experiments'.
remote: Enumerating objects: 3, done.[K
remote: Counting objects: 100% (3/3), done.[K
remote: Compressing objects: 100% (3/3), done.[K
remote: Total 3 (delta 0), reused 0 (delta 0)[K
Unpacking objects: 100% (3/3), done.
From https://gitlab.ethz.ch/thomasdi/nlu_project_2
   a53a64b..8dee31a  td_experiments -> origin/td_experiments
Updating a53a64b..8dee31a
Fast-forward
 models.py | 14 [32m+++++++++++[m[31m---[m
 1 file changed, 11 insertions(+), 3 deletions(-)


In [0]:
import os
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from tensorflow.keras import models, layers
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
import pandas as pd

# Directory on the mounted Google Drive that holds the pre-computed feature
# files of this experiment. The evaluation cell writes its results here too.
PREFIX = '/content/gdrive/My Drive/colab_output/experiment-8'

################################################################################

# Fix: the original referenced the undefined names `OUTPUT_DIR` and `prefix`,
# which only worked through leftover kernel state. Everything now goes through
# the single constant PREFIX so the cell survives Restart & Run All.
os.makedirs(PREFIX, exist_ok=True)

# Data the classifiers are fitted on. Note that it is read from the 'dev-*'
# files. `*_tasks` are text tables, `*_hidden` are binary .npy arrays.
X_train_tasks = np.loadtxt(os.path.join(PREFIX, 'dev-transform-features.tsv'))
X_train_hidden = np.load(os.path.join(PREFIX, 'dev-transform-hidden.npy'))
y_train = np.loadtxt(os.path.join(PREFIX, 'dev-transform-labels.tsv'))

# Held-out data used to compute the test accuracy.
X_test_tasks = np.loadtxt(os.path.join(PREFIX, 'test-transform-features.tsv'))
X_test_hidden = np.load(os.path.join(PREFIX, 'test-transform-hidden.npy'))
y_test = np.loadtxt(os.path.join(PREFIX, 'test-transform-labels.tsv'))

# Report set: features only, no label file is loaded for it.
X_report_tasks = np.loadtxt(os.path.join(PREFIX, 'report-transform-features.tsv'))
X_report_hidden = np.load(os.path.join(PREFIX, 'report-transform-hidden.npy'))

def build_model(n_hidden=1000, dropout=0.4):
    """Create and compile a feed-forward binary classifier with one hidden layer.

    Layer stack, in order: Dense(n_hidden) -> Dropout(dropout) -> ReLU ->
    Dense(1) -> sigmoid.

    Args:
        n_hidden: Number of units in the hidden dense layer.
        dropout: Dropout rate applied to the hidden layer's output.

    Returns:
        A Keras ``Sequential`` model compiled with binary cross-entropy loss,
        the Adam optimizer and accuracy as the tracked metric.
    """
    network = models.Sequential([
        layers.Dense(n_hidden),
        layers.Dropout(dropout),
        layers.Activation('relu'),
        layers.Dense(1),
        layers.Activation('sigmoid'),
    ])
    network.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )
    return network

def select_features(feature_type, X_tasks, X_hidden, n_proba=6):
    """Assemble the feature matrix for one named feature combination.

    Args:
        feature_type: One of 'proba', 'sentiment', 'proba+sentiment',
            'hidden', 'hidden+sentiment' or 'hidden+proba+sentiment'.
        X_tasks: 2-D array whose first ``n_proba`` columns are used as the
            'proba' features and whose remaining columns are used as the
            'sentiment' features.
        X_hidden: 2-D array used as the 'hidden' features; it needs as many
            rows as ``X_tasks`` for the combined feature types.
        n_proba: Column index at which ``X_tasks`` is split into 'proba' and
            'sentiment'. Defaults to 6, the value that was hard-coded before.

    Returns:
        The selected columns. 'proba+sentiment' and 'hidden' return the input
        array itself, 'proba' and 'sentiment' return slices of ``X_tasks``,
        and the 'hidden+...' types return a new array concatenated along
        axis 1.

    Raises:
        NotImplementedError: If ``feature_type`` is not one of the names
            above. The original code wrote ``raise NotImplemented()``, which
            fails with a confusing ``TypeError`` because ``NotImplemented`` is
            a constant, not an exception class.
    """
    if feature_type == 'proba+sentiment':
        return X_tasks
    if feature_type == 'proba':
        return X_tasks[:, :n_proba]
    if feature_type == 'sentiment':
        return X_tasks[:, n_proba:]
    if feature_type == 'hidden':
        return X_hidden
    if feature_type == 'hidden+sentiment':
        return np.concatenate([X_hidden, X_tasks[:, n_proba:]], axis=1)
    if feature_type == 'hidden+proba+sentiment':
        return np.concatenate([X_hidden, X_tasks], axis=1)
    raise NotImplementedError('unknown feature type: %r' % (feature_type,))
        
# Names of the feature combinations to evaluate; each one must be a value
# understood by select_features().
feature_types = ['proba', 'sentiment', 'hidden', 'proba+sentiment', 'hidden+sentiment', 'hidden+proba+sentiment']

# One entry per classifier: (short name, display name, zero-argument factory).
# The short name ends up in output file names, so it has to be unique. The
# original list used 'nnc' for both neural-network variants, which made the
# second variant overwrite the first one's prediction file.
# The factories are lambdas so that each call yields a new, unfitted estimator.
classifiers = [
    ('gnb', 'GaussianNB', lambda: GaussianNB()),
    ('svc', 'SVC(gamma=auto)', lambda: SVC(gamma='auto')),
    ('rfc', 'RandomForestClassifier(n_estimators=100)', lambda: RandomForestClassifier(n_estimators=100)),
    ('nnc500', 'NN(n_hidden=500, dropout=0.4)', lambda: KerasClassifier(build_model, epochs=5, batch_size=5, shuffle=True, n_hidden=500, verbose=0, dropout=0.4)),
    ('nnc1000', 'NN(n_hidden=1000, dropout=0.4)', lambda: KerasClassifier(build_model, epochs=5, batch_size=5, shuffle=True, n_hidden=1000, verbose=0, dropout=0.4))
]

In [132]:
# Fit every classifier on every feature combination, record train/test
# accuracy, and store the predictions for the report set.
result = []
for model_shortname, model_name, model_fn in classifiers:
    for feature_type in feature_types:
        print('*** evaluating %s/%s...' % (model_name, feature_type))
        X_train = select_features(feature_type, X_train_tasks, X_train_hidden)
        X_test = select_features(feature_type, X_test_tasks, X_test_hidden)
        X_report = select_features(feature_type, X_report_tasks, X_report_hidden)

        # Build a new, unfitted estimator for each combination so that no
        # fitted state is shared between feature sets of different width.
        model = model_fn()
        model.fit(X_train, y_train)

        # Flatten the predictions to 1-D. The original squeezed only the
        # report predictions, presumably because the Keras wrapper can return
        # an (n, 1) column -- TODO confirm. ravel() also copes with a single
        # row, where squeeze() would yield a 0-d array.
        y_train_pred = np.ravel(model.predict(X_train))
        y_test_pred = np.ravel(model.predict(X_test))
        y_report_pred = np.ravel(model.predict(X_report))

        # Fix: the file name used to contain only the classifier short name,
        # so each feature type silently overwrote the previous one's
        # predictions. It now names both the classifier and the feature set.
        # NOTE: entries in `classifiers` that share a short name still write
        # to the same file.
        predictions_name = 'classifier-predictions-%s-%s.tsv' % (model_shortname, feature_type)
        np.savetxt(os.path.join(PREFIX, predictions_name), y_report_pred, fmt='%d')

        acc_train = accuracy_score(y_train, y_train_pred)
        acc_test = accuracy_score(y_test, y_test_pred)

        result.append({
            'classifier': model_name,
            'features': feature_type,
            'acc_train': acc_train,
            'acc_test': acc_test
        })

# One row per (classifier, feature set); saved next to the input data and
# displayed as the cell's output.
df = pd.DataFrame.from_records(result, columns=['classifier', 'features', 'acc_train', 'acc_test'])
df.to_csv(os.path.join(PREFIX, 'classifier-accuracies.tsv'), sep='\t')
df

*** evaluating GaussianNB/proba...
*** evaluating GaussianNB/sentiment...
*** evaluating GaussianNB/hidden...
*** evaluating GaussianNB/proba+sentiment...
*** evaluating GaussianNB/hidden+sentiment...
*** evaluating GaussianNB/hidden+proba+sentiment...
*** evaluating SVC(gamma=auto)/proba...
*** evaluating SVC(gamma=auto)/sentiment...
*** evaluating SVC(gamma=auto)/hidden...
*** evaluating SVC(gamma=auto)/proba+sentiment...
*** evaluating SVC(gamma=auto)/hidden+sentiment...
*** evaluating SVC(gamma=auto)/hidden+proba+sentiment...
*** evaluating RandomForestClassifier(n_estimators=100)/proba...
*** evaluating RandomForestClassifier(n_estimators=100)/sentiment...
*** evaluating RandomForestClassifier(n_estimators=100)/hidden...
*** evaluating RandomForestClassifier(n_estimators=100)/proba+sentiment...
*** evaluating RandomForestClassifier(n_estimators=100)/hidden+sentiment...
*** evaluating RandomForestClassifier(n_estimators=100)/hidden+proba+sentiment...
*** evaluating NN(n_hidden=500,

Unnamed: 0,classifier,features,acc_train,acc_test
0,GaussianNB,proba,0.660075,0.632817
1,GaussianNB,sentiment,0.592197,0.59861
2,GaussianNB,hidden,0.655799,0.629075
3,GaussianNB,proba+sentiment,0.657937,0.649385
4,GaussianNB,hidden+sentiment,0.655265,0.630144
5,GaussianNB,hidden+proba+sentiment,0.659006,0.634955
6,SVC(gamma=auto),proba,0.970604,0.560128
7,SVC(gamma=auto),sentiment,0.599679,0.586317
8,SVC(gamma=auto),hidden,0.703367,0.642972
9,SVC(gamma=auto),proba+sentiment,0.947622,0.584714


In [134]:
# Show the accuracy table ordered from highest to lowest test accuracy.
df.sort_values(by=['acc_test'], ascending=[False])

Unnamed: 0,classifier,features,acc_train,acc_test
22,"NN(n_hidden=500, dropout=0.4)",hidden+sentiment,0.863175,0.687867
11,SVC(gamma=auto),hidden+proba+sentiment,0.77178,0.685195
23,"NN(n_hidden=500, dropout=0.4)",hidden+proba+sentiment,0.774452,0.673437
27,"NN(n_hidden=1000, dropout=0.4)",proba+sentiment,0.669161,0.669161
28,"NN(n_hidden=1000, dropout=0.4)",hidden+sentiment,0.86852,0.668092
21,"NN(n_hidden=500, dropout=0.4)",proba+sentiment,0.668626,0.663282
20,"NN(n_hidden=500, dropout=0.4)",hidden,0.830037,0.651523
3,GaussianNB,proba+sentiment,0.657937,0.649385
29,"NN(n_hidden=1000, dropout=0.4)",hidden+proba+sentiment,0.72047,0.644575
17,RandomForestClassifier(n_estimators=100),hidden+proba+sentiment,1.0,0.644575
