## MLP Testing

First thing that is to be done is to import the data and generate splits

In [38]:
# Build the network
from tensorflow.keras import models
from tensorflow.keras import layers
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras import initializers
import numpy as np
import tensorflow as tf
from tensorflow.keras.utils import to_categorical
from tensorflow.keras import regularizers

# Set random seed
np.random.seed(0)

def sigmoid(z):
    return 1 / (1 + tf.exp(-z))

def classifier_model(layer_size, num_layers, activation, input_dim, dropout_rate=0.7):
    network = models.Sequential()

    # Add first layer
    network.add(layers.Dense(layer_size, 
                             input_dim=input_dim, 
                             activation=activation, 
                             kernel_initializer=initializers.RandomNormal(stddev=0.01),
                             kernel_regularizer=regularizers.l2(0.01)))

    for _ in range(num_layers):
        network.add(layers.Dense(layer_size, 
                                 activation=activation, 
                                 kernel_initializer=initializers.RandomNormal(stddev=0.01),
                                 kernel_regularizer=regularizers.l2(0.01)))
        network.add(BatchNormalization())
        network.add(layers.Dropout(dropout_rate))
    network.add(layers.Dense(4, activation='softmax')) # Add the output layer

    # Compile the network
    network.compile(optimizer='rmsprop',
                    loss='categorical_crossentropy',
                    metrics=['accuracy'])
    
    return network

model_1 = classifier_model(
    layer_size=300,
    num_layers=5, 
    input_dim=3000,
    activation=sigmoid,
    )

In [39]:
import sys
sys.path.append('../')

from prep import *
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPClassifier

# Load data
df = pd.read_csv('../../Datasets/train.csv', encoding='cp1252')

# Do some basic cleaning
df = prep_data(df)

# Split data using KFold
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Make list to store accuracies
test_acc = []

with tf.device('/gpu:0'):
    # Iterate through folds
    for train_index, val_index in kf.split(df['body'], df['subreddit']):
        # Split data
        train = df.iloc[train_index]
        val = df.iloc[val_index]

        # Reduce features based on mutual information
        subreddits = ['Toronto', 'London', 'Paris', 'Montreal']
        train, _ = remove_common_words(train, subreddits, 300)
        train = mutual_info_transform(train, 3250)
        train, vocab = remove_common_words(train, subreddits, 25)

        # Remove words not in vocab from val
        val['body'] = val['body'].apply(lambda x: ' '.join([word for word in x.split() if word in vocab]))

        # Split into X and y
        X_train = train['body']
        y_train = train['subreddit']
        y_train = y_train.map({'Toronto': 0, 'London': 1, 'Paris': 2, 'Montreal': 3})
        y_train = to_categorical(y_train)
        X_val = val['body']
        y_val = val['subreddit']
        y_val = y_val.map({'Toronto': 0, 'London': 1, 'Paris': 2, 'Montreal': 3})
        y_val = to_categorical(y_val)

        # Vectorize data
        vectorizer = TfidfVectorizer(max_features=3000)
        X_train = vectorizer.fit_transform(X_train)
        X_val = vectorizer.transform(X_val)

        # Train model
        X_train = X_train.toarray()
        model_1.fit(X_train, y_train, epochs=10, batch_size=128)

        # Evaluate on validation set
        X_val = X_val.toarray()
        _, acc = model_1.evaluate(X_val, y_val)
        test_acc.append(acc)

        # Print accuracy
        print(f'Accuracy: {test_acc[-1]}')

    test_acc = np.mean(test_acc)
    print(f'Average accuracy: {test_acc}')

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Accuracy: 0.25
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Accuracy: 0.25
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Accuracy: 0.25
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Accuracy: 0.25
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Accuracy: 0.251748263835907
Average accuracy: 0.2503496527671814


In [43]:
# Retrain on full dataset and test on Kaggle test set
kaggle_test = pd.read_csv('../../Datasets/Kaggle/test.csv', encoding='cp1252')
test_body = kaggle_test['body'].copy()
kaggle_test = prep_data(kaggle_test)
# kaggle_test = word_replacement(kaggle_test)

test_df = df.copy()

# Reduce features based on mutual information
subreddits = ['Toronto', 'London', 'Paris', 'Montreal']
# test_df = word_replacement(test_df)
# test_df, _ = remove_common_words(test_df, subreddits, 300)
# test_df = mutual_info_transform(test_df, 3250)
# test_df, vocab = remove_common_words(test_df, subreddits, 25)

# Remove words not in vocab from kaggle test set
kaggle_test['body'] = kaggle_test['body'].apply(lambda x: ' '.join([word for word in x.split() if word in vocab]))

# Split into X and y
X_train = test_df['body']
y_train = test_df['subreddit']
y_train = y_train.map({'Toronto': 0, 'London': 1, 'Paris': 2, 'Montreal': 3})
y_train = to_categorical(y_train)
print(y_train)

# Vectorize data
vectorizer = TfidfVectorizer(max_features=3000)
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(kaggle_test['body'])

# Train model
X_train = X_train.toarray()
X_test = X_test.toarray()
model_1.fit(X_train, y_train, epochs=3, batch_size=16)

# Make predictions on test set
y_pred = model_1.predict(X_train)

# Make kaggle test answer array
# First 70 are 0, next 70 are 1, etc.
kaggle_ans = np.zeros(280)
for i in range(4):
    kaggle_ans[i*70:(i+1)*70] = i
# remove last value
kaggle_ans = kaggle_ans[:-1]

# Convert predictions to labels
y_pred = np.argmax(y_pred, axis=1)

print(y_pred)

[[1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 ...
 [0. 0. 0. 1.]
 [0. 0. 0. 1.]
 [0. 0. 0. 1.]]
Epoch 1/3
Epoch 2/3
Epoch 3/3
[3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 