In [115]:
from utils import go_to_project_root
from scipy.stats import mode
import data
import os
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn import svm
from sklearn.metrics import balanced_accuracy_score
from sklearn.model_selection import KFold, train_test_split
from matplotlib import pyplot as plt

from sklearn.neural_network import MLPClassifier, BernoulliRBM
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, AdaBoostRegressor, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import NuSVC
import keras
from sklearn.feature_selection import RFE

In [120]:
def classify(classifier, xtrain, ytrain, xtest, ytest):
    """Fit ``classifier`` on the training data and score it on the test data.

    The test predictions come from ``predict`` (one label per entry of
    ``xtest``) and are compared with ``ytest`` via ``balanced_accuracy_score``.

    Args:
        classifier: Estimator exposing ``fit`` and ``predict``.
        xtrain: Training feature matrix passed to ``fit``.
        ytrain: Training labels passed to ``fit``.
        xtest: Iterable of per-sample test matrices, forwarded to ``predict``.
        ytest: Ground-truth labels for the test samples.

    Returns:
        The balanced accuracy of the predictions against ``ytest``.
    """
    classifier.fit(xtrain, ytrain)
    test_votes = predict(classifier, xtest)
    score = balanced_accuracy_score(ytest, test_votes)
    return score

def feature_elim(classifier, xtrain, ytrain, n_features_to_select=20, step=1):
    """Rank features with recursive feature elimination (RFE).

    Args:
        classifier: Estimator handed to ``RFE`` as its ``estimator``.
        xtrain: Training feature matrix.
        ytrain: Training labels.
        n_features_to_select: Number of features RFE keeps. Defaults to 20,
            the value that used to be hard-coded here.
        step: Number of features removed per elimination round. Defaults to 1.

    Returns:
        The fitted selector's ``ranking_`` array; the features that were kept
        carry rank 1.
    """
    selector = RFE(
        estimator=classifier,
        n_features_to_select=n_features_to_select,
        step=step,
    )
    selector.fit(xtrain, ytrain)
    return selector.ranking_

def read_data(path):
    """Load one dataset directory located below the module-level ``data_root``.

    Reads ``X_train.csv``, ``y_train.csv`` and ``y_test.csv`` plus every file
    inside the ``X_test`` sub-directory. Each CSV is read with its first
    column as the index and converted to a NumPy array.

    Args:
        path: Directory name appended to ``data_root`` (e.g. ``"K1/"``).

    Returns:
        A tuple ``(xtrain, ytrain, xtest, ytest)`` where ``xtest`` is a list
        with one array per file in ``X_test``.
    """
    base = data_root + path
    xtrain = pd.read_csv(os.path.join(base, "X_train.csv"), index_col=0).to_numpy()
    ytrain = pd.read_csv(os.path.join(base, "y_train.csv"), index_col=0).to_numpy()
    ytest = pd.read_csv(os.path.join(base, "y_test.csv"), index_col=0).to_numpy()

    test_dir = os.path.join(base, "X_test")
    # os.listdir returns entries in arbitrary order; sort the names so the
    # test matrices are loaded in the same order on every machine and run.
    # NOTE(review): presumably the rows of y_test.csv follow this sorted file
    # order - confirm against the code that writes the dataset.
    xtest = [
        pd.read_csv(os.path.join(test_dir, name), index_col=0).to_numpy()
        for name in sorted(os.listdir(test_dir))
    ]
    return xtrain, ytrain, xtest, ytest

def predict(classifier, xtest, drop_columns=(2, 3, 4, 5)):
    """Predict one label per test group by majority vote over its rows.

    For every matrix in ``xtest`` the columns listed in ``drop_columns`` are
    removed, each remaining row is classified, and the group is labelled
    positive when strictly more than half of its rows are predicted positive.
    The previous fixed threshold (``sum > 1``) only matched a majority for
    groups of two or three rows; results for those sizes are unchanged.

    Args:
        classifier: Fitted estimator exposing ``predict``.
        xtest: Iterable of 2-D arrays, one per test group.
        drop_columns: Column indices removed from every group before
            prediction. Defaults to ``(2, 3, 4, 5)``, the same columns the
            training cell strips from ``xtrain``.

    Returns:
        A list with one boolean per entry of ``xtest``.
    """
    # NOTE(review): the vote count assumes the classifier emits 0/1 labels
    # (they are cast to int and summed) - confirm for other label encodings.
    majority_vote_preds = []
    for x in xtest:
        x = np.delete(x, list(drop_columns), axis=1)
        votes = classifier.predict(x).astype(int)
        # Strict majority: a tie (exactly half positive) counts as negative.
        majority_vote_preds.append(np.sum(votes) > votes.size / 2)
    return majority_vote_preds

In [74]:
# The data path below is relative; go_to_project_root() presumably moves the
# working directory to the repository root first (helper lives in utils).
go_to_project_root()
data_root = "data/processed/800/"
# Load the K1/, K2/ and K3/ dataset directories, in that order.
datasets = [read_data(f"K{fold}/") for fold in range(1, 4)]

In [121]:
# Only the first dataset is processed; the loop variables stay in the notebook
# namespace and are reused by the scoring cell that follows.
for i in range(1):
    xtrain, ytrain, xtest, ytest = datasets[i]
    # Delete semantic similarity :(
    xtrain = np.delete(xtrain, [2, 3, 4, 5], axis=1)
    # print(classify(RandomForestClassifier(), xtrain, ytrain, xtest, ytest))
    # Keep the positions whose RFE rank is 1; np.where returns a one-element
    # tuple holding the index array.
    selected = np.where(
        feature_elim(RandomForestClassifier(), xtrain, ytrain) == 1
    )

In [128]:
# Train and score on the RFE-selected columns only. Indexing with the flattened
# index array yields a 2-D (n_samples, n_selected) matrix directly, so the
# dataset size no longer has to be hard-coded in a reshape.
# NOTE(review): predict() only strips columns 2-5 from each test matrix and
# does not apply `selected`; confirm the test matrices end up with the same
# number of features the classifier is trained on here.
print(classify(RandomForestClassifier(), xtrain[:, np.ravel(selected)], ytrain, xtest, ytest))

(800, 20)

In [90]:
def plot_history(history):
    """Draw training/validation accuracy and loss curves side by side.

    Args:
        history: Object whose ``history`` attribute is a mapping with the
            keys ``'accuracy'``, ``'val_accuracy'``, ``'loss'`` and
            ``'val_loss'``, each holding one value per epoch.
    """
    record = history.history
    # Fetch every series up front so a missing key fails before any drawing.
    train_acc, valid_acc = record['accuracy'], record['val_accuracy']
    train_loss, valid_loss = record['loss'], record['val_loss']
    epochs = range(1, len(train_acc) + 1)

    # (training series, validation series, training label, validation label, title)
    panels = (
        (train_acc, valid_acc, 'Training acc', 'Validation acc',
         'Training and validation accuracy'),
        (train_loss, valid_loss, 'Training loss', 'Validation loss',
         'Training and validation loss'),
    )

    plt.figure(figsize=(12, 5))
    for position, (train_series, valid_series, train_label, valid_label, title) in enumerate(panels, start=1):
        plt.subplot(1, 2, position)
        plt.plot(epochs, train_series, 'b', label=train_label)
        plt.plot(epochs, valid_series, 'r', label=valid_label)
        plt.title(title)
        plt.legend()

In [98]:
# Binary text classifier: embedding -> GRU -> single-unit output.
# NOTE(review): vocab_size, embedding_dim, maxlen, X_train, y_train, X_test and
# y_test are not defined in the cells visible here (the recorded run stops with
# a NameError on vocab_size) - define them upstream before running this cell.
model = keras.Sequential()
model.add(keras.layers.Embedding(vocab_size, embedding_dim, input_length=maxlen))
model.add(keras.layers.GRU(units=32, dropout=0.2, recurrent_dropout=0.2))
# binary_crossentropy expects a probability in [0, 1]; sigmoid provides that,
# whereas the previous tanh output ranged over [-1, 1].
model.add(keras.layers.Dense(1, activation='sigmoid'))
model.compile(optimizer='adam',
                loss='binary_crossentropy',
                metrics=['accuracy'])

# Stop once validation loss has not improved for 5 epochs, and keep the
# weights of the epoch with the best validation accuracy on disk.
es = keras.callbacks.EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=5)
mc = keras.callbacks.ModelCheckpoint('best_model.h5', monitor='val_accuracy', mode='max', verbose=1, save_best_only=True)

# NOTE(review): the test split doubles as the validation data that drives early
# stopping and checkpointing, so the testing accuracy printed below is not an
# independent estimate - consider a separate validation split.
history = model.fit(X_train, y_train, batch_size=10, epochs=20, validation_data=(X_test, y_test), callbacks=[es, mc])

# Report accuracy of the final in-memory model on both splits.
loss, accuracy = model.evaluate(X_train, y_train, verbose=False)
print("Training Accuracy: {:.4f}".format(accuracy))
loss, accuracy = model.evaluate(X_test, y_test, verbose=False)
print("Testing Accuracy:  {:.4f}".format(accuracy))

# Plot the training and validation curves recorded during fitting.
plot_history(history)

NameError: name 'vocab_size' is not defined