In [1]:
import numpy as np
import pandas as pd
import re
import os
import plotly.graph_objects as go

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable

# Ask CUDA to launch kernels synchronously (set before any CUDA work happens).
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

# Seed every random number generator this notebook configures with the same value.
for seed_fn in (torch.manual_seed, torch.cuda.manual_seed, np.random.seed):
    seed_fn(103)

# Report how many CUDA devices are visible.
deviceCount = torch.cuda.device_count()
print(deviceCount)

# Keep a handle to the first GPU when one exists, otherwise None.
has_gpu = deviceCount > 0
if has_gpu:
    print(torch.cuda.get_device_name(0))
cuda0 = torch.device('cuda:0') if has_gpu else None

In [2]:
df = pd.read_csv('../input/history-of-philosophy/philosophy_data.csv')
# Start 'y_expected' as a copy of an existing column so that single cells
# can later be overwritten with tensor objects.
df['y_expected'] = df['tokenized_txt']

# Position of the "1" in each philosopher's one-hot label.
author_slot = {'Aristotle': 0, 'Plato': 1, 'Hegel': 2, 'Foucault': 3, 'Heidegger': 4}

# Rows by any other author are collected here and dropped afterwards.
drop_list = []
for i in range(df.shape[0]):
    slot = author_slot.get(df.at[i, 'author'])
    if slot is None:
        drop_list.append(i)
        continue
    one_hot = [0] * len(author_slot)
    one_hot[slot] = 1
    df.at[i, 'y_expected'] = torch.tensor(one_hot)

# NOTE: the index is not reset here, so row labels have gaps after this drop.
df = df.drop(drop_list)

In [None]:
# clean and tokenize each text entry

def clean_text(text):
    """Normalise a raw sentence and split it into word tokens.

    Steps, in order: lower-case the text, blank out URLs, blank out every
    character that is not a-z, an apostrophe or a space, then blank out
    apostrophes that are not surrounded by letters on both sides.

    Args:
        text: the raw sentence string.

    Returns:
        list[str]: the whitespace-separated tokens that remain.
    """
    # lower case characters only
    text = text.lower()

    # remove urls (raw string: '\S' is not a valid escape in a plain literal)
    text = re.sub(r'http\S+', ' ', text)

    # only alphabets, spaces and apostrophes
    text = re.sub(r"[^a-z' ]+", ' ', text)

    # remove all apostrophes which are not used in word contractions;
    # the padding spaces let leading/trailing apostrophes match the pattern
    text = ' ' + text + ' '
    text = re.sub(r"[^a-z]'|'[^a-z]", ' ', text)

    return text.split()


# Replace the tokenised column with tokens rebuilt from the raw sentence text.
df['tokenized_txt'] = df['sentence_str'].apply(clean_text)

In [None]:
# remove entries with 300 or more tokens
#
# The previous loop guarded on `i in df`, which tests *column* labels on a
# DataFrame, so the condition was always False and nothing was dropped.
# A boolean mask works on whatever row labels the frame currently has.

entry_len = 300

too_long = df['tokenized_txt'].apply(len) >= entry_len
drop_list = df.index[too_long].tolist()

df = df.drop(drop_list).reset_index(drop = True)

In [None]:
# drop every entry that has 100 or more tokens

entry_len = 100

# Row labels whose token list reaches the limit.
too_long = df['tokenized_txt'].apply(len) >= entry_len
drop_list = df.index[too_long].tolist()

df = df.drop(drop_list).reset_index(drop = True)

In [None]:
# Sanity check: how many entries still have more than 100 tokens?
count = sum(1 for tokens in df['tokenized_txt'] if len(tokens) > 100)

print(count)

In [None]:
# load GloVe embeddings

text_embed_dim = 100                                        # changes the embedding dimension used
glove_file = '../input/glove6b/glove.6B.{}d.txt'.format(text_embed_dim)    # defines the GloVe file path -- change if using a new encoding dataset

# Maps each word to its coefficient vector (float32, parsed from the text line).
embed_dict = {}
# Explicit encoding: without it open() falls back to the platform default,
# which can fail on non-ASCII tokens in the embedding file.
with open(glove_file, encoding='utf-8') as f:
    for line in f:
        # First whitespace-separated field is the word; the rest are coefficients.
        word, coefs = line.split(maxsplit=1)
        embed_dict[word] = np.fromstring(coefs, "f", sep=" ")

print("Found %s word vectors." % len(embed_dict))

In [None]:
# select the first 2,000 entries for each of our philosophers
num_entries = 2000

# Sort the author names: iterating a bare set of strings yields a different
# order from one interpreter session to the next, which would change the row
# order of `df` between runs despite the fixed seeds.
author_names = sorted(set(df['author']))
df_list = [df[df['author'] == author][:num_entries].reset_index(drop=True) for author in author_names]
df = pd.concat(df_list).reset_index(drop = True)

In [None]:
entry_len = 100

# replaces strings in an input sentence with word vectors
def embeddings(sent, embed_lookup=None, embed_dim=None, max_len=None):
    """Turn a token list into a fixed-size matrix of word vectors.

    Row j holds the vector of sent[j]; tokens missing from the lookup and
    positions past the end of the sentence stay as zero (padding) rows.
    Tokens beyond `max_len` are ignored instead of raising IndexError.

    Args:
        sent: sequence of token strings.
        embed_lookup: mapping word -> vector; defaults to the global `embed_dict`.
        embed_dim: vector length; defaults to the global `text_embed_dim`.
        max_len: number of rows in the output; defaults to the global `entry_len`.

    Returns:
        torch.Tensor of shape (max_len, embed_dim), dtype float64.
    """
    if embed_lookup is None:
        embed_lookup = embed_dict
    if embed_dim is None:
        embed_dim = text_embed_dim
    if max_len is None:
        max_len = entry_len

    # One pre-allocated array avoids building a tensor from a Python list of
    # ndarrays, which torch handles element by element.
    out = np.zeros((max_len, embed_dim))

    for j, token in enumerate(sent[:max_len]):
        vec = embed_lookup.get(token)
        if vec is not None:
            out[j] = vec

    return torch.from_numpy(out)

In [None]:
# this step applies embedding and padding to all tokens
# Iterate over the rows that actually exist rather than assuming exactly
# num_entries*5 of them (an author with fewer rows caused an IndexError).
# NOTE(review): the contraction-expansion cell comes later in the notebook, so
# on a top-to-bottom run these embeddings use the unexpanded tokens -- confirm
# the intended cell order.
tokens_list = [embeddings(tokens) for tokens in df['tokenized_txt'].values]

In [None]:
# this is just list storage of labels for the same data
# Take every row that exists instead of assuming exactly 5*num_entries rows.
identity_list = list(df['y_expected'].values)

In [None]:
# shuffle dataset
import random as rand

# zip to a combined reference so features and labels stay paired
to_shuffle = list(zip(tokens_list, identity_list))

# shuffle combined reference
# A dedicated generator seeded with the same value the notebook uses for
# torch and numpy makes the order repeatable without touching the global
# `random` state.
rand.Random(103).shuffle(to_shuffle)

# unzip combined reference (both names become tuples)
tokens_list, identity_list = zip(*to_shuffle)

In [None]:
# expand most common contractions in text entries

# Lower-case contraction -> expanded form (may be more than one word).
contractions  = { "i'm" : "i am", "it's" : "it is", "don't" : "do not", "can't" : "cannot", 
                  "you're" : "you are", "that's" : "that is", "we're" : "we are", "i've" : "i have", 
                  "he's" : "he is", "there's" : "there is", "i'll" : "i will", "i'd" : "i would", 
                  "doesn't" : "does not", "what's" : "what is", "didn't" : "did not", 
                  "wasn't" : "was not", "hasn't" : "has not", "they're" : "they are", 
                  "let's" : "let us", "she's" : "she is", "isn't" : "is not", "ain't" : "not", 
                  "aren't" : "are not", "haven't" : "have not", "you'll" : "you will", 
                  "we've" : "we have", "you've" : "you have", "y'all" : "you all", 
                  "weren't" : "were not", "couldn't" : "could not", "would've" : "would have", 
                  "they've" : "they have", "they'll" : "they will", "you'd" : "you would", 
                  "they'd" : "they would", "it'll" : "it will", "where's" : "where is", 
                  "we'll" : "we will", "we'd" : "we would", "he'll" : "he will", "shouldn't" : "should not", 
                  "wouldn't" : "would not", "won't" : "will not" }


def expand_contractions(words):
    """Return a new token list with known contractions expanded.

    Each token found in `contractions` is replaced by its expansion; the
    result is re-split so multi-word expansions become separate tokens.
    The input list is left untouched (the earlier version overwrote its
    elements in place).

    Args:
        words: list of token strings.

    Returns:
        list[str]: the expanded tokens.
    """
    expanded = ' '.join(contractions.get(word, word) for word in words)
    return expanded.split()


# precautionary cleaning for any remaining apostrophes
def remove_apostrophes(words):
    """Delete every apostrophe from the tokens and return the re-split list.

    Tokens that consist only of apostrophes disappear from the result.
    """
    joined = ' '.join(words)
    return joined.replace("'", '').split()


# Expand contractions first, then strip whatever apostrophes are left.
df['tokenized_txt'] = (
    df['tokenized_txt']
    .apply(expand_contractions)
    .apply(remove_apostrophes)
)

In [None]:
# remove most frequent stop words

# Words (and stray single letters) treated as noise by remove_stop_words.
stop_words = [ 'the', 'a', 'in', 'to', 'of', 'i', 'and', 'is', 'you', 'for', 'on', 'it', 'my', 'that',
               'with', 'are', 'at', 'by', 'this', 'have', 'from', 'be', 'was', 'do', 'will', 'as', 'up', 
               'me', 'am', 'so', 'we', 'your', 'has', 'when', 'an', 's', 'they', 'about', 'been', 'there',
               'who', 'would', 'into', 'his', 'them', 'did', 'w', 'their', 'm', 'its', 'does', 'where', 'th',
               'b', 'd', 'x', 'p', 'o', 'r', 'c', 'n', 'e', 'g', 'v', 'k', 'l', 'f', 'j', 'z', 'us', 'our',
               'all', 'can', 'may' ] 

def remove_stop_words(words):
    """Return the tokens of `words` that are not listed in `stop_words`, in order."""
    return [token for token in words if token not in stop_words]

# df['tokenized_txt'] = df['tokenized_txt'].apply(lambda words: remove_stop_words(words))

In [None]:
# count words not in GloVe embeddings

# Every out-of-vocabulary token (duplicates kept) and the running token total.
# Both are reset here so re-running the cell does not double count.
unknown_words = []
total_words = 0

def find_unknown_words(words):
    """Record which tokens of `words` have no entry in `embed_dict`.

    Side effects: adds len(words) to the global `total_words` and appends
    each unknown token to the global `unknown_words` list.

    Returns:
        The `words` argument, unchanged.
    """
    global total_words
    total_words = total_words + len(words)

    unknown_words.extend(word for word in words if word not in embed_dict)

    return words


df['tokenized_txt'].apply(find_unknown_words)

# Guard against an empty corpus, and use an explicit fixed-point format:
# the bare spec `5.2` switches to scientific notation for values of 10 or more.
unknown_pct = len(unknown_words) / total_words * 100 if total_words else 0.0
print( f'{unknown_pct:5.2f} % of words are unknown' )

In [None]:
# analyze and create a table of remaining unknown words

def analyze_unknown_words(unknown_words):
    """Show a plotly table of the distinct unknown words and how often each occurs.

    Args:
        unknown_words: sequence of tokens, duplicates included.

    The table is sorted by descending count and displayed via `fig.show()`;
    nothing is returned.
    """
    vocab, freq = np.unique(np.array(unknown_words), return_counts=True)

    word_freq = pd.DataFrame({'word': vocab, 'count': freq})
    word_freq = word_freq.sort_values('count', ascending=False)

    header_style = dict(values=list(word_freq.columns),
                        fill_color='paleturquoise',
                        align='left')
    cell_style = dict(values=[word_freq['word'], word_freq['count']],
                      fill_color='lavender',
                      align='left')
    table = go.Table(header=header_style, cells=cell_style)

    fig = go.Figure(data=[table])
    fig.update_layout(width=300, height=300, margin={'b':0, 'l':0, 'r':0, 't':0, 'pad':0})
    fig.show()

analyze_unknown_words(unknown_words)

In [None]:
# MLP definition

class MLPmodel(nn.Module):
    """Bias-free MLP that classifies one embedded sentence at a time.

    fc1-fc3 are applied along the last dimension of the input (per token),
    the per-token outputs are flattened into one vector, and fc4 maps that
    vector to class scores which are passed through ReLU and softmax.

    Args:
        embed_dim: size of each token vector (input width of fc1).
        seq_len: number of token rows per sentence; fc4 expects
            seq_len * 5 flattened features.
        num_classes: number of output probabilities.

    The defaults reproduce the original fixed architecture
    (100 -> 100 -> 200 -> 5 per token, then 500 -> 5).
    """

    def __init__(self, embed_dim=100, seq_len=100, num_classes=5):
        super().__init__()

        # Input is flattened to 1-D before the softmax, so dim=0 is the class axis.
        self.softmax = nn.Softmax(dim=0)

        # Layers are created in the same order as before so a fixed torch seed
        # yields the same initial weights.
        self.fc1 = nn.Linear(embed_dim, 100, False)
        self.fc2 = nn.Linear(100, 200, False)
        self.fc3 = nn.Linear(200, 5, False)

        self.fc4 = nn.Linear(seq_len * 5, num_classes, False)

    def forward(self, x):
        """Return a 1-D tensor of `num_classes` probabilities for one sentence.

        Args:
            x: float tensor whose fc3 output flattens to seq_len * 5 values,
               e.g. shape (seq_len, embed_dim). Batches are not supported
               because the whole tensor is flattened.
        """
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        x = x.flatten()
        # NOTE(review): ReLU on the final scores clamps them to be non-negative
        # before the softmax -- kept as in the original; confirm it is intended.
        x = F.relu(self.fc4(x))
        return self.softmax(x)

In [None]:
def validate(X_test, Y_test, model=None):
    """Return the fraction of samples whose top-scoring class matches the label.

    Args:
        X_test: sequence of per-sample input tensors.
        Y_test: sequence of one-hot label tensors aligned with X_test.
        model: callable mapping one input tensor to a 1-D score tensor;
            defaults to the global `mlp_model`.

    Returns:
        float accuracy in [0, 1]; 0.0 when X_test is empty.
    """
    size = len(X_test)
    if size == 0:
        # Nothing to score; avoids a ZeroDivisionError.
        return 0.0

    if model is None:
        model = mlp_model

    correct = 0
    # Evaluation only: no autograd graph is needed for these forward passes.
    with torch.no_grad():
        for features, current_label in zip(X_test, Y_test):
            pred = model(features.type(torch.FloatTensor))

            # Index of the highest score (first one on ties), for any class count.
            maxIdx = int(torch.argmax(pred))

            if current_label[maxIdx] == 1:
                correct += 1

    return correct / size

In [None]:
lr = 1e-4

loss_fn = nn.MSELoss()
# No optimizer is built here: `mlp_model` is only instantiated in the training
# cell below, so referencing it at this point raised NameError on a fresh
# kernel. The training cell creates the optimizer right after the model.

output = [[],[]]
author_list = [tokens_list, identity_list]

from sklearn.model_selection import train_test_split

# 80/20 split; with no random_state the split draws from numpy's global RNG.
X_train, X_test, Y_train, Y_test = train_test_split(tokens_list, identity_list, test_size=0.2)
train_list = [X_train, Y_train]
test_list = [X_test, Y_test]

In [None]:
num_epochs = 30
batch_size = 20
current_batch = 0
run_number = 0

validation_loss = []
validation_accuracy = []
train_loss = []
train_accuracy = []

mlp_model = MLPmodel()

lr = 1e-4
loss_fn = nn.MSELoss()
optimizer = torch.optim.Adam(mlp_model.parameters(), lr=lr)

# Number of training samples. `len(train_list)` is 2 (the [X, Y] pair), which
# previously made every batch cycle over samples 0 and 1 only.
num_train = len(train_list[0])

# NOTE(review): each "epoch" performs one optimizer step on a single batch of
# `batch_size` samples, not a full pass over the training set -- kept as is.
for epoch in range(num_epochs):
    current_loss = 0

    # Accumulate the loss over one batch of consecutive samples.
    for i in range(batch_size):
        sample_index = (i + epoch*batch_size) % num_train

        pred = mlp_model(train_list[0][sample_index].type(torch.FloatTensor))
        # Use the label of the same sample that was fed to the model.
        current_loss += loss_fn(pred, train_list[1][sample_index].type(torch.FloatTensor))
        run_number += 1

    optimizer.zero_grad()
    current_loss.backward()
    optimizer.step()

    # Store a plain float, once per epoch, so the autograd graph can be freed.
    current_loss = current_loss.item()
    train_loss.append(current_loss)

    val_accuracy = validate(X_test, Y_test)
    validation_accuracy.append(val_accuracy)

    train_accuracy.append(validate(X_train, Y_train))

    if (epoch % 5 == 0):
        print("Epoch: {}".format(epoch+1))
        print(f'Train Loss: {current_loss}')
        print(f'Train Accuracy: {train_accuracy[epoch]}')
        print(f'Validation Accuracy: {validation_accuracy[epoch]}')
        print('')

In [None]:
import matplotlib.pyplot as plt

# Training vs. validation accuracy, one point per recorded epoch.
fig, ax = plt.subplots()
ax.plot(train_accuracy, label = 'Training Accuracy')
ax.plot(validation_accuracy, label = "Validation Accuracy")
ax.set_xlabel("Number of Epochs")
ax.set_ylabel("Accuracy")
ax.set_title("Multi-layer Perceptron Accuracy over Time")
ax.legend()
plt.show()