In [45]:
# Python 3 notebook for neural network
import numpy as np
import math
import statistics
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

from keras.utils import to_categorical
from keras.utils import plot_model
import IPython.display
import matplotlib.pyplot as plt

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from sklearn.linear_model import LogisticRegression
from sklearn import preprocessing

import importlib
import sys
import pickle
import torch
from torch.autograd import Variable
import torch.nn as nn
from random import shuffle

# https://machinelearningmastery.com/multi-class-classification-tutorial-keras-deep-learning-library/
from keras.wrappers.scikit_learn import KerasClassifier
from keras.utils import np_utils
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
we = importlib.import_module("word_embeddings")
import re

In [57]:
""" Global Parameters """
# Load some data
wordEmbedDict = we.getWordEmbeddingDict() # Load the dictionary

# Labels
top_20 = sorted(['AskReddit', 'leagueoflegends', 'nba', 'funny', 'pics', 'nfl', 'pcmasterrace', \
          'videos', 'news', 'todayilearned', 'DestinyTheGame', 'worldnews', 'soccer', \
          'DotA2', 'AdviceAnimals', 'WTF', 'GlobalOffensive', 'hockey', 'movies', 'SquaredCircle'])

# Indices of our desired data
TRUE_LABEL = 8 # Index of the true label, hard coded
BODY_INDEX = 17 # Index of the reddit comment, hard coded

# Neural Network Parameters
NUM_SUBREDDITS = len(top_20)
NUM_FEATURES = 300 # length returned from embedding
NUM_EXAMPLES = 15000 # Arbitrary, choose however many we want to grab from the dataset
NUM_EPOCHS = 100
NUM_HIDDEN_NEURONS = 3
NUM_LAYERS = 3
LEARNING_RATE = 0.0001

SUBREDDIT = "leagueoflegends" # used for debugging,delete later
unparsed = "./data/condensed_dataset.pkl" # will change this so we can just call the whole pkl set

# Encoder
encoder = LabelEncoder()
encoder.fit(top_20) # Encodes the NUM_SUBREDDITS subreddits

# Seed
# fix random seed for reproducibility
seed = 7
np.random.seed(seed)
""" """

' '

In [77]:
'''Helper Functions'''

# Normalization of dataset

# Get mean of a column feature
def getMean(column):
    sum = 0
    n = len(column)
    for i in range(n):
        sum += column.iloc[i]
    mean = sum / float(n)
    return mean

def getVariance(column, mean):
    squareMeanSum = 0
    n = len(column)
    for i in range(n):
        squareMeanSum += (column.iloc[i] - mean)**2
    var = math.sqrt(squareMeanSum / float(n))
    return var

def normalizeSet(set):
    numRow = len(set.index)
    numCol = len(set.columns)
    for col in range(numCol):
        column = set.iloc[:,col]
        mean = getMean(column)
        var = getVariance(column, mean)
        
        for row in range(numRow):
            set.iloc[row, col] = float(set.iloc[row, col] - mean) / var
    return set

# @param dir: string, directory of pickle data
# @return dataset: unpickled dataset
def loadPickleData(dir):
    with open(dir, 'rb') as f:
        dataset = pickle.load(f)
    return dataset

# Returns [X, Y] with m examples
def loadData(pickleDir, m):
    pickle = loadPickleData(pickleDir)
    return vectorizeDataSet(pickle, m)

def stripNonAlpha(word):
    word = re.sub(r'\W+', '', word)
    return word

def vectorizeWord(word):
    word = stripNonAlpha(word)
    keyset = wordEmbedDict.keys() # words in the dictionary
    zeroVec = np.zeros((1, NUM_FEATURES))
    vWord = pd.DataFrame(zeroVec)
    
    if word in keyset:
        vWord = pd.DataFrame(wordEmbedDict[word]).transpose()
    return vWord # returns zero vector if the word is not in the dictionary

def vectorizeComment(body):
#     print(body)
    vComment = np.zeros((1, NUM_FEATURES))
    vComment = pd.DataFrame(vComment)
    words = body.split()
#     print(vComment)
    
    numWords = 0
    for word in words:
        vWord = vectorizeWord(word)
        numWords += 1
        vComment = vComment + vWord
    vComNP = vComment.values
    vComScaled = vComNP * (1/float(numWords))
    vComScaled = pd.DataFrame(vComScaled)
    return vComScaled

# Encodes a subreddit string into an unrolled one-hot pandas vector
def oneHotEncode(subreddit):
    #   https://machinelearningmastery.com/multi-class-classification-tutorial-keras-deep-learning-library/
    encoded_Y = encoder.transform([subreddit])[0] # get the integer category
    
    oneHot = [0 for _ in range(NUM_SUBREDDITS)]
    oneHot[encoded_Y] = 1
    pandasOneHot = pd.DataFrame(oneHot)
    
    return pandasOneHot.transpose()
    
# Returns a squished array from 3D data 
# entries are examples, not comments (body)
def to2D(data):
    data_2D = [example for i in range(NUM_SUBREDDITS) for example in data[i]]
    df = pd.DataFrame(data_2D)
    return df
    
    
#TODO: check if seed is bad?? just set a new random seed
#TODO: limit number of features 
def vectorizeDataSet(data, m):
    data = pd.DataFrame(data)
    data = data.sample(frac=1, random_state=88).reset_index(drop=True) # Shuffles data
    
    data_2D = to2D(data)
    
    comments = data_2D.pop(BODY_INDEX)
#     print(len(comments))
    true_labels = data_2D.pop(TRUE_LABEL)
    
    unrollComment = comments[0]
    print("unrollcomment: \n", unrollComment)
    print(type(unrollComment))
    X = vectorizeComment(unrollComment)
    print(X)
    firstSubreddit = true_labels[0]
    Y = oneHotEncode(firstSubreddit)

    # For each example in old data set, get the actual comment and featurize it into X
    # Also get unrolled true label
    for i in range(1, m):
        comment = comments[i]
        example = vectorizeComment(comment)
        subreddit = true_labels[i]
        oneHot = oneHotEncode(subreddit)
        
        X = pd.concat([X, example])
        Y = pd.concat([Y, oneHot])
        
    X_scaled = preprocessing.StandardScaler().fit_transform(X)
    X_scaled_df = pd.DataFrame(X_scaled)
    X_scaled_df.reset_index(drop=True, inplace=True)
    Y.reset_index(drop=True, inplace=True)
    concat = pd.concat([X_scaled_df, Y], axis=1)
    return concat

In [39]:
# Debugging
data = loadPickleData(unparsed)
print(data[0][0])

[1430438400, 3, 't5_2qh1i', 't3_34f9rh', 't1_cqug90j', 0, None, None, 'AskReddit', 'cqug90j', None, 0, 0, 0, 'jesse9o3', 3, 1432703079, "No one has a European accent either  because it doesn't exist. There are accents from Europe but not a European accent.", None, 0, 0, 't1_cqug2sr', 0]


In [64]:
# Debugging
# print(data[2][0])
# print(len(data[0]))

data_2D = to2D(data)
# print(data_2D[0])
df = pd.DataFrame(data_2D)
# df.tail()
comments = df.pop(BODY_INDEX)
print(len(comments))

1000000


In [59]:
# Split dataset
#https://stackoverflow.com/questions/38250710/how-to-split-data-into-3-sets-train-validation-and-test/38251213#38251213
def train_validate_test_split(df, train_percent=.9, validate_percent=.05, seed=seed):
    m = len(df.index)
    
    train_end = int(train_percent * m)
    validate_end = int(validate_percent * m) + train_end
    train = df.iloc[:train_end]
    validate = df.iloc[train_end:validate_end]
    test = df.loc[validate_end:]
    return train, validate, test

In [78]:
%%time
df = loadData(unparsed, NUM_EXAMPLES)
train, validate, test = train_validate_test_split(df)

train_labels = train.iloc[:, NUM_FEATURES:]
validate_labels = validate.iloc[:, NUM_FEATURES:]
test_labels = test.iloc[:, NUM_FEATURES:]

train = train.iloc[:, :NUM_FEATURES]
validate = validate.iloc[:, :NUM_FEATURES]
test = test.iloc[:, :NUM_FEATURES]

unrollcomment: 
 Yeah, but [60% boards,](http://i.imgur.com/gKoR38B.png) and the like, have layouts that are of normal size, just very minimal in all other aspects. What I mean is, all the keys are about the same size, and about the same distance from each other as larger, more regular keyboards; they just have fewer keys and lower profile casings etc.

This keyboard looks like all the keys are smaller, and not spaced out to compensate for the size, meaning the whole layout is very compressed. Must be intended for mobile/portable use.
<class 'str'>
        0         1         2         3         4         5         6    \
0  0.032985  0.008206 -0.114413 -0.188197  0.103022 -0.049788 -3.186192   

        7         8         9      ...          290       291      292  \
0  0.259812  0.018643 -0.341993    ...     0.031002 -0.119123 -0.03924   

        293       294      295       296      297       298       299  
0 -0.011097  0.021538  0.04608 -0.080472 -0.13224  0.058556  0.079446  



KeyError: 400

In [7]:
%%time
# # Save the dataframe as a pickle file to be read later
# df.to_pickle("./data/pandas-pickle-small.pkl")

# check if potentially overwriting data with the same feature everytime
unpickled = pd.read_pickle("./data/pandas-pickle-small.pkl")
unpickled.tail()

CPU times: user 0 ns, sys: 93.8 ms, total: 93.8 ms
Wall time: 569 ms


In [8]:
%%time
# Check softmax output (final layer output)
# Check gradients inside the keras model built in function
# Neural network function
def build_nn(hidden_layer_sizes):
    model = keras.Sequential()
    
    # Input layer
    model.add(layers.Dense(hidden_layer_sizes[0], input_dim=NUM_FEATURES, activation='relu'))

    # Conv1d Layer to fix high variance
#     model.add(layers.Conv1D(kernel_size = (10),strides=10,filters=2, input_shape=(hidden_layer_sizes[0],NUM_FEATURES),kernel_initializer= 'uniform',activation='relu'))
    
#      https://datascience.stackexchange.com/questions/19407/keras-built-in-multi-layer-shortcut
#     Hidden layers
    for size in hidden_layer_sizes[1:]:
        model.add(layers.Dense(size, activation='relu', kernel_initializer='random_uniform', use_bias=True, activity_regularizer=regulizers.l1(0.01)))
        
    # Fixes somethings for some reason
#     model.add(layers.Flatten())
    
    # Output layer
    model.add(layers.Dense(NUM_SUBREDDITS, activation='softmax'))
    
    # Optimizer. Can change this to whatever we want
    optimizer = tf.keras.optimizers.Adam(lr=LEARNING_RATE)

    # Compile model
    model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy', 'categorical_crossentropy'])
    return model

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 10.7 µs


In [9]:
%%time
# Build the model
# Features might be an issue, pick different subreddits
# potentially try out CNN with matrix word embedding without combined word embedding
sizes_list = [NUM_HIDDEN_NEURONS for i in range(NUM_LAYERS)]
nn_model = build_nn(sizes_list)
nn_model.summary()

Instructions for updating:
Colocations handled automatically by placer.


NameError: name 'regulizers' is not defined

In [10]:
%%time
# Get neural network history
# Change batch size (inspect what happens with each ITERATION, not EPOCH)
# History is the progress of our neural network, will be used to plot cost functions
nn_history = nn_model.fit(train, train_labels, epochs=NUM_EPOCHS, verbose=2,
         validation_data=(validate, validate_labels))

NameError: name 'nn_model' is not defined

In [11]:
# Plot metrics
def plot_history(history):
    hist = pd.DataFrame(history.history)
    hist['epoch'] = history.epoch

    plt.figure()
    plt.xlabel('Epoch')
    plt.ylabel('Cross Entropy Error')
    plt.plot(hist['epoch'], hist['categorical_crossentropy'],
           label='Train Error')
    plt.plot(hist['epoch'], hist['val_categorical_crossentropy'],
           label = 'Val Error')
    plt.ylim([1,4])
    plt.legend()
    plt.show()
    
plot_history(nn_history)

NameError: name 'nn_history' is not defined

In [None]:
# See Training Accuracy
hist = pd.DataFrame(nn_history.history)
accuracy_vec = hist.pop("acc")
finalAcc = accuracy_vec[len(accuracy_vec) - 1]
print("Final accuracy: {}%".format(finalAcc*100))

In [None]:
# Expects list
def accuracy(predictions, true_label):
    count = 0
    total = len(predictions)
    for i in range(total):
        if predictions[i] == true_label[i]:
            count += 1
    return float(count) / total

In [None]:
# Test accuracy:
y_prob = nn_model.predict(test)
y_classes = y_prob.argmax(axis=-1)
# print(y_classes)
# print(test_labels)
accuracy(y_classes, test_labels)

In [None]:
%%time
# Logistic Regression Model
df2 = pd.read_pickle("./data/pandas-pickle-small.pkl")
df2 = df2.sample(frac=1, random_state=2).reset_index(drop=True) # Shuffles data
# # print("This is our X: ", df2.iloc[:, ]
numTrain = 9000
numTest = 1000

X = df2.iloc[0:numTrain, :NUM_FEATURES]
X_array = X.values

y = df2.iloc[0:numTrain, NUM_FEATURES:]
y_array = y.values

y_array_rows = y_array.shape[0]

y_integer_classes = np.array([])
for i in range(y_array_rows):
    row = y_array[i]
    index = np.where(row==1)[0][0]
    y_integer_classes = np.append(y_integer_classes, int(index))

testX = df2.iloc[numTrain:numTrain+numTest, :NUM_FEATURES]
testX_array = testX.values
testY = df2.iloc[numTrain:numTrain+numTest, NUM_FEATURES:]
testY_array = testY.values

testy_array_rows = testY_array.shape[0]
testy_integer_classes = np.array([])
for i in range(testy_array_rows):
    row = testY_array[i]
    index = np.where(row==1)[0][0]
    testy_integer_classes = np.append(testy_integer_classes, int(index))

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import BaggingClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC
n_estimators = 10
print("Prediction results: \n")
predictions = OneVsRestClassifier(BaggingClassifier(LinearSVC(random_state=0, max_iter=1000), n_estimators = n_estimators)).fit(X_array, y_integer_classes).predict(testX_array)
print(predictions)
print("True results: \n")
print(testy_integer_classes)


# Expects list
def accuracy(predictions, true_label):
    count = 0
    total = len(predictions)
    for i in range(total):
        if predictions[i] == true_label[i]:
            count += 1
    return float(count) / total

pred_list = predictions.tolist()
testy_list = testy_integer_classes.tolist()
print("SVM accuracy: {}%".format(accuracy(pred_list, testy_list)*100))
        