In [685]:
from utils import go_to_project_root
from scipy.stats import mode
import data
import os
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn import svm
from sklearn.metrics import balanced_accuracy_score
from sklearn.model_selection import KFold, train_test_split
from matplotlib import pyplot as plt

from sklearn.neural_network import MLPClassifier, BernoulliRBM
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, AdaBoostRegressor, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import NuSVC
import keras
from sklearn.feature_selection import RFE
import tensorflow as tf

In [686]:
def classify(classifier, xtrain, ytrain, xtest, ytest):
    classifier.fit(xtrain, ytrain)
    pred = predict(classifier, xtest)
    return balanced_accuracy_score(ytest, pred)

def feature_elim(classifier, xtrain, ytrain):
    rfe = RFE(estimator=classifier, n_features_to_select=20, step=1)
    rfe.fit(xtrain, ytrain)
    return rfe.ranking_

def read_data(_path):
    path = data_root + _path
    xtrain = pd.read_csv(path + "X_train.csv", index_col=0).to_numpy()[:200].astype(float)
    ytrain = pd.read_csv(path + "y_train.csv", index_col=0).to_numpy()[:200]
    ytest = pd.read_csv(path + "y_test.csv", index_col=0).to_numpy()
    xtest = []
    for file in os.listdir(path + "X_test/"):
        xtest += [pd.read_csv(path + "X_test/" + file, index_col=0).to_numpy()]

    new_xt = get_new_xtests(_path)
    mean_xt = [np.mean(new_xt[0], axis=0) for xt in new_xt]

    for i, x in enumerate(xtest):
        xtest[i][:,[2, 3, 4, 5]] = mean_xt[i]

    return xtrain, ytrain, xtest, ytest

def predict(classifier, xtest):
    majority_vote_preds = []
    for x in xtest:
        x = np.delete(x, [2, 3, 4, 5], 1)
        majority_vote_preds += [np.sum(classifier.predict(x).astype(int)) > 1]
    return majority_vote_preds

def get_new_xtests(path): 
    path = data_root[:-1] + "s/" + path
    xtest = []
    for file in os.listdir(path + "X_test/"):
        xtest += [pd.read_csv(path + "X_test/" + file, index_col=0).to_numpy()[:, [2, 3, 4, 5]]]
    return xtest

In [687]:
go_to_project_root()
data_root = "data/processed/800/"
datasets = [read_data(f"K{k+1}/") for k in range(3)]

In [688]:
#ndims = xtrain.shape[1]
def my_model(ndims):
    return keras.models.Sequential([
        keras.layers.Dense(150, # Number of hidden neurons 
                            input_shape=(ndims,),
                              activation='relu', 
                              name='hidden_layer'),
        keras.layers.Dropout(0.2),
        keras.layers.Dense(1, # Number of hidden neurons 
                              activation='sigmoid', 
                              name='output_layer') ])

#model = my_model(name='training')

print(model.summary())

Model: "sequential_445"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
hidden_layer (Dense)         (None, 150)               53400     
_________________________________________________________________
dropout_21 (Dropout)         (None, 150)               0         
_________________________________________________________________
output_layer (Dense)         (None, 1)                 151       
Total params: 53,551
Trainable params: 53,551
Non-trainable params: 0
_________________________________________________________________
None


In [689]:
#ndims = xtrain.shape[1]
def my_model(ndims):
    return keras.models.Sequential([
        keras.layers.Dense(1, # Number of hidden neurons 
                           input_shape=(ndims,),
                              activation='sigmoid', 
                              name='output_layer') ])

#model = my_model(name='training')

print(model.summary())

Model: "sequential_445"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
hidden_layer (Dense)         (None, 150)               53400     
_________________________________________________________________
dropout_21 (Dropout)         (None, 150)               0         
_________________________________________________________________
output_layer (Dense)         (None, 1)                 151       
Total params: 53,551
Trainable params: 53,551
Non-trainable params: 0
_________________________________________________________________
None


In [690]:
feature_sets = {
    "lexical": [0, 1],
    "semantic": [2, 3, 4, 5],
    "clusters": [6, 7],
    "nonling": list(range(8, 29)),
    "pos": list(range(29, 47)),
    "emotion": list(range(47, 57)),
    "embeddings": list(range(57, 357))
}

In [692]:
fs = feature_sets["clusters"]
for i in range(3):
    xtrain, ytrain, xtest, ytest = datasets[i]
    #xtrain = np.delete(xtrain, [2,3,4,5], 1)
    #for i in range(len(xtest)):
    #    xtest[i] = np.delete(xtest[i], np.arange(2,6), axis=1)
    model = my_model(ndims=np.delete(xtrain, fs, axis=1).shape[1])
    # Define optimization algorithm
    adam = tf.optimizers.Adam(lr=0.01)

    # Compile model (i.e., build compute graph)
    model.compile(optimizer=adam, loss='binary_crossentropy', metrics=['binary_accuracy'])
    history = model.fit(np.delete(xtrain, fs, axis=1), ytrain, epochs=20, verbose=False)
    preds = []

    for i in range(len(xtest)):
        ones = 0
        zeros = 0
        for result in model.predict(np.delete(xtest[i], fs, axis=1)):
            if result > 0.5 :
                ones += 1
            else:
                zeros += 1
        if ones > zeros:
            preds.append(1)
        else:
            preds.append(0)
    print(balanced_accuracy_score(ytest, preds))

ValueError: could not convert string to float: 'None'

In [638]:
fs = feature_sets["emotion"] + feature_sets["embeddings"]
for i in range(3):
    xtrain, ytrain, xtest, ytest = datasets[i]
    #xtrain = np.delete(xtrain, [2,3,4,5], 1)
    #for i in range(len(xtest)):
    #    xtest[i] = np.delete(xtest[i], np.arange(2,6), axis=1)
    model = my_model(ndims=xtrain[:,fs].shape[1])
    # Define optimization algorithm
    adam = tf.optimizers.Adam(lr=0.001)

    # Compile model (i.e., build compute graph)
    model.compile(optimizer=adam, loss='binary_crossentropy', metrics=['binary_accuracy'])
    history = model.fit(xtrain[:,fs], ytrain, epochs=10, verbose=False)
    preds = []

    for i in range(len(xtest)):
        ones = 0
        zeros = 0
        for result in model.predict(xtest[i][:,fs]):
            if result > 0.5 :
                ones += 1
            else:
                zeros += 1
        if ones > zeros:
            preds.append(1)
        else:
            preds.append(0)
    print(balanced_accuracy_score(ytest, preds))

0.71
0.66
0.64


In [691]:
for i in range(3):
    xtrain, ytrain, xtest, ytest = datasets[i]
    #xtrain = np.delete(xtrain, [2,3,4,5], 1)
    #for i in range(len(xtest)):
    #    xtest[i] = np.delete(xtest[i], np.arange(2,6), axis=1)
    model = my_model(ndims=xtrain.shape[1])
    # Define optimization algorithm
    adam = tf.optimizers.Adam(lr=0.001)

    # Compile model (i.e., build compute graph)
    model.compile(optimizer=adam, loss='binary_crossentropy', metrics=['binary_accuracy'])
    history = model.fit(xtrain, ytrain, epochs=20, verbose=False)
    preds = []

    for i in range(len(xtest)):
        ones = 0
        zeros = 0
        for result in model.predict(xtest[i]):
            if result > 0.5 :
                ones += 1
            else:
                zeros += 1
        if ones > zeros:
            preds.append(1)
        else:
            preds.append(0)
    print(balanced_accuracy_score(ytest, preds))

ValueError: could not convert string to float: 'None'

In [574]:
import scipy.stats
preds = []

for i in range(len(xtest)):
    ones = 0
    zeros = 0
    for result in model.predict(xtest[i]):
        if result > 0.5 :
            ones += 1
        else:
            zeros += 1
    if ones > zeros:
        preds.append(1)
    else:
        preds.append(0)

ValueError: Error when checking input: expected hidden_layer_input to have shape (300,) but got array with shape (357,)

In [514]:
balanced_accuracy_score(ytest, preds)

0.69

In [705]:
wrong = [0,
 1,
 9,
 12,
 13,
 14,
 15,
 16,
 22,
 25,
 26,
 30,
 33,
 36,
 40,
 41,
 42,
 43,
 51,
 53,
 58,
 64,
 66,
 69,
 70,
 72,
 80,
 90,
 94,
 95,
 98]

In [717]:
import os
go_to_project_root()
IDS= os.listdir("data/processed/800s/K3/X_test/")
IDS_wrong = []
for i in wrong:
    IDS_wrong.append(IDS[i][:-4])

In [712]:
Authors = data.get_processed_data()

In [718]:
for ID in IDS_wrong:
    print(Authors[ID].truth)
    print(Authors[ID].tweets[:5])
    print()

0
['Amber Smith “Kandy Halloween: Return of the Haunted Mansion” Red Carpet #URL# via #USER#', "Kourtney Kardashian Reveals Scott Disick 'Checks in' With Her Every Single Day #URL#", 'Serena Williams Steps Out at the 2019 Met Gala — See the Tennis Pro’s Gorgeous Ensemble! #URL#', 'See the campiest looks from 2019 Met Gala pink carpet #URL#', 'Trouble in Paradise? Kris Jenner and Corey Gamble Had ‘Zero Chemistry’ at Met Gala #URL#']

0
['Rachel Bilson Was Asked Point-Blank if She’s Dating Nick Viall, and She Gave the Most Cryptic Answer - Cosmopolitan #URL#', 'Bill Hader and Rachel Bilson make red carpet debut at Golden Globes - Citizentribune #URL# #URL#', 'What Do The Stars Say About Rachel Bilson? (12/09/2019) - Tallahasseescene #URL#', 'Celebrities front row at NYFW 2020 - Page Six #URL# #URL#', 'Rachel Bilson dating Bill Hader? - Arizona Daily Star #URL# #URL#']

1
["Hands-on with the Samsung Galaxy S20 lineup #URL#   We got our hands on Samsung's latest flagship… #URL#", 'No more 