<a href="https://colab.research.google.com/github/faizasheraz/AuthorCategorization/blob/main/CommentAuthorIdentification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Classification of authors of reddit comments**


In [1]:
import pandas as pd
from google.colab import drive

drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
#
# Data Cleaning
#----------------

import re
import statistics
import sys

file_name = "/content/drive/My Drive/workspace/ColabNotebooks/top_author_comments.csv"

author_subreddits_comments_df = pd.read_csv(file_name)
print(author_subreddits_comments_df.head(10))

author_list = set(author_subreddits_comments_df["author"])

#REMOVING 
#URLs (with slashes, .coms, www.)
# text like <f0><U+361 .. > # example : <f0><U+009>
#non alphanumeric characters 

author_subreddits_comments_df["body"] = author_subreddits_comments_df["body"].str.replace(r"/r/\.*\S+", " ")
author_subreddits_comments_df["body"] = author_subreddits_comments_df["body"].str.replace(r"https?:/\/\.*\S+", " ")
author_subreddits_comments_df["body"] = author_subreddits_comments_df["body"].str.replace(r"(<f0>)?<U\+\S+", " ")
author_subreddits_comments_df["body"] = author_subreddits_comments_df["body"].str.replace(r"[^a-zA-Z\d\s']", " ")

#removing rows with duplicate comments
author_subreddits_comments_df.drop_duplicates(subset="body", keep="first", inplace=True)
author_subreddits_comments_df.sort_values("author", inplace=True)


   Unnamed: 0         author        subreddit  \
0      278137  ThisIs_MyName  ProgrammerHumor   
1      277405  ThisIs_MyName        Economics   
2      277404  ThisIs_MyName      programming   
3      277397  ThisIs_MyName   instant_regret   
4      277396  ThisIs_MyName        Economics   
5      277395  ThisIs_MyName  ProgrammerHumor   
6      277394  ThisIs_MyName      programming   
7      277412  ThisIs_MyName      programming   
8      277410  ThisIs_MyName      programming   
9      277393  ThisIs_MyName            anime   

                                                body  
0  Works sometimes... Try `System.out.printf("u n...  
1                               the `/s` is implicit  
2                     Are you taking about Postgres?  
3                                                Aww  
4  Not sure what you're asking. US-based companie...  
5                        How can structs have holes?  
6  Interesting, what a weird implementation of fo...  
7  It changes the ha



In [3]:
#Selecting the same number of comments for the model
# Balance of comments of different authors
#-------------------------------------------------------------------------------------------------

min_comments = len(author_subreddits_comments_df[author_subreddits_comments_df["author"] == list(author_list)[0]]) 
author_dfs = []
for author in author_list:
  df = author_subreddits_comments_df[author_subreddits_comments_df["author"] == author]
  author_dfs.append(df)
  if(len(df) < min_comments):
    min_comments = len(df)
print("minimum_comments" , min_comments)

# For now taking only equal amount of comments for each author which is the minimum comments by an author
author_dfs = [df.head(min_comments) for df in author_dfs]
author_subreddits_comments_df = pd.concat(author_dfs)



minimum_comments 2885


In [4]:
#
#  Shuffling dataset and converting authors(labels) to integar form for the 
#  neural network
#----------------------------------------------------------------------------------

#shuffling entries of dataframe so that comments of one author are not together as a group
author_subreddits_comments_df = author_subreddits_comments_df.sample(frac=1).reset_index(drop=True)

#getting authors corresponding to comments of authors
comment_authors_list = author_subreddits_comments_df["author"]
print(comment_authors_list[0:10])

# map authors of comments(our labels) to integars
dict_all_authors = dict(zip(author_list, list(range(0,len(author_list)))))
print(dict_all_authors)

int_comment_authors_list = [dict_all_authors[author] for author in comment_authors_list]
print(int_comment_authors_list[0:10])


0       meatduck12
1         awhaling
2         awhaling
3    ThisIs_MyName
4    ThisIs_MyName
5    ThisIs_MyName
6         awhaling
7       meatduck12
8    ThisIs_MyName
9       meatduck12
Name: author, dtype: object
{'meatduck12': 0, 'ThisIs_MyName': 1, 'awhaling': 2}
[0, 2, 2, 1, 1, 1, 2, 0, 1, 0]


In [5]:
#  Converting input data (comments) to integars for the neural network
#
#---------------------------------------------------------------------------

import statistics
import sys 

#tokenize comments text into words
comments_list = author_subreddits_comments_df["body"]
tokenized_comments_list = [comment.split() for comment in comments_list] 

#print(comments_list[0:3])
#print(tokenized_comments_list[0:3])

# Get words vocabury from comments and map to integars 
all_words = []
for comment in tokenized_comments_list:
  all_words += comment
all_words = set(all_words)

dict_all_words = dict(zip(all_words, list(range(1, len(all_words)+1))))
print(list(dict_all_words)[:10])

#get integars list of tokenized comments
int_tokenized_comments_list = [[dict_all_words[word] for word in comment] for comment in tokenized_comments_list]
print(tokenized_comments_list[0:10])
print(int_tokenized_comments_list[0:10])


['incentivized', 'TH', 'complications', 'tossing', 'initial', 'soldiers', 'schools', 'showing', '24', 'Racoon']
[['If', 'the', 'federal', 'government', 'can', 'regulate', 'civil', 'rights', 'they', 'should', 'be', 'able', 'to', 'regulate', 'this', 'shit', 'too', 'This', 'is', 'getting', 'absurd'], ["That's", 'not', 'remotely', 'funny'], ['The', 'only', 'time', 'it', 'made', 'sense', 'was', 'when', 'the', 'uprising', 'event', 'game', 'out', 'and', 'I', 'guess', 'if', 'you', 'wanted', 'to', 'see', 'it', 'up', 'close', 'But', 'yeah', 'how', 'stupid', 'is', 'that'], ['I', 'call', 'BS'], ["You're", 'talking', 'about', 'pinch', 'collars', 'that', 'stop', 'dogs', 'from', 'pulling', 'the', 'leash', 'and', 'strangling', 'themselves', 'The', 'guy', "you're", 'replying', 'to', 'is', 'talking', 'about', 'spiked', 'collars', 'that', 'protect', 'the', 'animal', 'from', 'predators'], ['Yes', 'use', 'the', 'rotate', 'buttons', 'on', 'the', 'top', 'left', "I'm", 'pretty', 'sure', 'there', 'is', 'a', 'h

In [6]:
#
# Getting an idea of average comment lengths to get rid of comments that are too
# long or too short as compared to mean and median
#
#-----------------------------------------------------------------------------

comments_words_len = [len(tokenized_comment) for tokenized_comment in tokenized_comments_list]
mean = statistics.mean(comments_words_len)
median = statistics.median(comments_words_len)
print("mean: ", mean)
print("median: ", median)
print("max", max(comments_words_len))

count = 0
for length in comments_words_len:
  if length > 100:
    count = (count+1)
print(count)

#selecting 100 as seq_len based on above data
seq_len = 100


mean:  23.344425187752744
median:  14
max 1055
223


In [7]:
# Removing zero length msgs and truncating/padding comments with zeros to get
# even length comments
#-----------------------------------------------------------------------------

# removing zero length integar comments and their corresponding authors
zero_length_comment_indices = []

for comment_index in range(len(int_tokenized_comments_list)):
  if (len(int_tokenized_comments_list[comment_index]))==0:
    zero_length_comment_indices.append(comment_index)

print(zero_length_comment_indices)
print(len(int_tokenized_comments_list))
print(len(int_comment_authors_list))

#sorting zero length comments in descending order so that poping elements does
#not change indices of other zero length comments
zero_length_comment_indices.sort(reverse=True)

for comment_index in zero_length_comment_indices:
  print("comment popped", int_tokenized_comments_list.pop(comment_index))
  print("author popped", int_comment_authors_list.pop(comment_index))

print(len(int_tokenized_comments_list))
print(len(int_comment_authors_list))

# todo: change name of variable from start
target_labels = int_comment_authors_list

padded_features = []

for comment in int_tokenized_comments_list:
  padded_comment = [0]*seq_len
  if len(comment)<=seq_len:
    padded_comment[seq_len-len(comment):] = comment[:]
  else:
    padded_comment[:] = comment[:seq_len]
  
  padded_features.append(padded_comment)

print("padded features ", padded_features[0:10])

[1349, 1409, 2236, 3336, 7985]
8655
8655
comment popped []
author popped 1
comment popped []
author popped 1
comment popped []
author popped 1
comment popped []
author popped 1
comment popped []
author popped 1
8650
8650
padded features  [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 14088, 10241, 14569, 11818, 5539, 15284, 7977, 13186, 12931, 15891, 15081, 13329, 14820, 15284, 1725, 7434, 10383, 2708, 3138, 661, 1003], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5602, 1288, 3208, 901], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

In [8]:
# Divide into test, train and validate sets
# and covert to numpy arrays
#-------------------------------------------------
import numpy

split_frac = 0.8
split_indx = int(split_frac*len(padded_features))

train_x, remaining_x = numpy.array(padded_features[:split_indx]), padded_features[split_indx:]
train_y, remaining_y = numpy.array(target_labels[:split_indx]), target_labels[split_indx:]

test_val_frac = 0.5
test_val_indx = int(test_val_frac*len(remaining_x))

test_x, vald_x = numpy.array(remaining_x[:test_val_indx]), numpy.array(remaining_x[:test_val_indx])
test_y, vald_y = numpy.array(remaining_y[:test_val_indx]), numpy.array(remaining_y[:test_val_indx])

print(train_x.shape, train_y.shape)
print(test_x.shape, test_y.shape)
print(vald_x.shape, vald_y.shape)


(6920, 100) (6920,)
(865, 100) (865,)
(865, 100) (865,)


In [10]:
# creating tensor dataloaders 
#
#########################################

import torch
from torch.utils.data import TensorDataset, DataLoader

train_data = TensorDataset(torch.from_numpy(train_x), torch.from_numpy(train_y))
test_data = TensorDataset(torch.from_numpy(test_x), torch.from_numpy(test_y))
vald_data = TensorDataset(torch.from_numpy(vald_x), torch.from_numpy(vald_y))

batch_size = 32

train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size, drop_last=True)
test_loader = DataLoader(test_data, shuffle=True, batch_size=batch_size, drop_last=True)
valid_loader = DataLoader(vald_data, shuffle=True, batch_size=batch_size, drop_last=True)

dataiter = iter(train_loader)
sample_x, sample_y = dataiter.next()
print(sample_x.size(), sample_y.size())


torch.Size([32, 100]) torch.Size([32])


In [11]:
# creating LSTM neural network
#
######################################
import torch.nn as nn

# First checking if GPU is available
train_on_gpu=torch.cuda.is_available()

class AuthorIdentification(nn.Module):

  def __init__(self, output_size, n_layers, hidden_dim, vocab_size, embedding_dim, drop_prob=0.5):
    super(AuthorIdentification, self).__init__()

    self.output_size = output_size
    self.n_layers = n_layers
    self.hidden_dim = hidden_dim

    #embedding and lstm layers
    self.embedding = nn.Embedding(vocab_size, embedding_dim)
    self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers,
                        dropout=drop_prob, batch_first=True)
    
    #drop out layer
    self.dropout = nn.Dropout(0.5)

    #linear layer
    self.fc = nn.Linear(hidden_dim, output_size)

  def forward(self, x, hidden):

    batch_size = x.size(0)
    x = x.long()
    embeds = self.embedding(x)
    lstm_out, hidden = self.lstm(embeds, hidden)
    
    lstm_out =  lstm_out[:,-1,:] #getting the last time stepout

    out = self.dropout(lstm_out)
    out = self.fc(out)

    return out, hidden

  def init_hidden(self, batch_size):
    #create two new tensors with dimension n_layer x batch_size x hidden_dim,
    #initialized to zero, for hidden state and cell state of lstm

    weight = next(self.parameters()).data

    if (train_on_gpu):
      hidden = (weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().cuda(),
               weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().cuda())
    else:
      hidden = (weight.new(self.n_layers, batch_size, self.hidden_dim).zero_(),
               weight.new(self.n_layers, batch_size, self.hidden_dim).zero_())

    return hidden


In [12]:
#Instantiate the model with hyperparameters

vocab_size = len(all_words)+1 # +1 for the padded zero
output_size = 3
embedding_dim = 400
hidden_dim = 128
n_layers = 2

net = AuthorIdentification(output_size, n_layers, hidden_dim, vocab_size, embedding_dim)
print(net)

AuthorIdentification(
  (embedding): Embedding(16295, 400)
  (lstm): LSTM(400, 128, num_layers=2, batch_first=True, dropout=0.5)
  (dropout): Dropout(p=0.5, inplace=False)
  (fc): Linear(in_features=128, out_features=3, bias=True)
)


In [13]:
# Let the training begin!
#
########################################
import torch

#loss and optimization functions

lr = 0.00006
#criterion = nn.CrossEntropyLoss()
criterion = nn.MultiMarginLoss()
optimizer = torch.optim.Adam(net.parameters(), lr=lr)


In [14]:
#training parameters

epochs = 20  #loss stops decreasing after this point

counter = 0
print_every = 100
clip = 5 #gradient clipping

if (train_on_gpu):
  net.cuda()

net.train()
for e in range(epochs):
  #initialize hidden state
  h = net.init_hidden(batch_size)

  #batch loop
  for inputs, labels in train_loader:
    counter += 1

    if (train_on_gpu):
      inputs, labels = inputs.cuda(), labels.cuda()

    #creating new variables for hidden state, otherwise we will backprop
    #through the entire training history
    h = tuple([each.data for each in h])
    
    #zero accumulated gradients
    net.zero_grad()

    #get the output from the model
    output, h = net(inputs, h)

    #calculate the loss and perform the backprop
    loss = criterion(output.squeeze(), labels)
    loss.backward()

    #clip gradient helps prevent exploding gradient problem in LSTMs and RNNs
    nn.utils.clip_grad_norm_(net.parameters(),clip)
    optimizer.step()

    # loss stats
    if counter % print_every == 0:
      # Get validation loss
      val_h = net.init_hidden(batch_size)
      val_losses = []
      net.eval()
      for inputs, labels in valid_loader:

        # Creating new variables for the hidden state, otherwise
        # we'd backprop through the entire training history
        val_h = tuple([each.data for each in val_h])

        if(train_on_gpu):
          inputs, labels = inputs.cuda(), labels.cuda()

        output, val_h = net(inputs, val_h)
        val_loss = criterion(output.squeeze(), labels)
        val_losses.append(val_loss.item())

      net.train()
      print("Epoch: {}/{}...".format(e+1, epochs),
            "Step: {}...".format(counter),
            "Loss: {:.6f}...".format(loss.item()),
            "Val Loss: {:.6f}...".format(numpy.mean(val_losses)))


    

  allow_unreachable=True, accumulate_grad=True)  # Calls into the C++ engine to run the backward pass


Epoch: 1/20... Step: 100... Loss: 0.669105... Val Loss: 0.655003...
Epoch: 1/20... Step: 200... Loss: 0.631179... Val Loss: 0.630441...
Epoch: 2/20... Step: 300... Loss: 0.571465... Val Loss: 0.611588...
Epoch: 2/20... Step: 400... Loss: 0.625084... Val Loss: 0.594783...
Epoch: 3/20... Step: 500... Loss: 0.593477... Val Loss: 0.568086...
Epoch: 3/20... Step: 600... Loss: 0.543577... Val Loss: 0.543356...
Epoch: 4/20... Step: 700... Loss: 0.486937... Val Loss: 0.531375...
Epoch: 4/20... Step: 800... Loss: 0.570336... Val Loss: 0.515077...
Epoch: 5/20... Step: 900... Loss: 0.422605... Val Loss: 0.498140...
Epoch: 5/20... Step: 1000... Loss: 0.494463... Val Loss: 0.478303...
Epoch: 6/20... Step: 1100... Loss: 0.398685... Val Loss: 0.466584...
Epoch: 6/20... Step: 1200... Loss: 0.592785... Val Loss: 0.457393...
Epoch: 7/20... Step: 1300... Loss: 0.497938... Val Loss: 0.444745...
Epoch: 7/20... Step: 1400... Loss: 0.320680... Val Loss: 0.442378...
Epoch: 7/20... Step: 1500... Loss: 0.519998

In [16]:
# Getting test loss and accuracy
# 
##################################

import torch.nn.functional as F

test_losses = []
num_correct = 0
total_uc = 0 # total unclassified-if none of categories has more than 0.5 probability


def one_hot_decode(encoded):
  decoded = []
  #print("length of encoded array: ", len(encoded))
  for code in encoded:
    #print(code)
    index_array = numpy.nonzero(code)
    #print(index_array[0])
    if len(index_array[0])!=0:
      decoded.append(index_array[0][0])
    else:
      decoded.append(-1)
  
  #decoded = [numpy.nonzero(code)[0][0] for code in encoded if len(numpy.nonzero(code))!=0]
  #print("length of decoded array:", len(decoded))
  #print(decoded)
  return decoded


#initialize hidden
h = net.init_hidden(batch_size)

net.eval()

#iterating over the test set
for inputs, labels in test_loader:
  h = tuple([each.data for each in h])

  if train_on_gpu:
    inputs, labels = inputs.cuda(), labels.cuda()

  output, h = net(inputs, h)

  test_loss = criterion(output.squeeze(), labels)
  test_losses.append(test_loss.item())

  p = F.softmax(output, dim=1).data

  if train_on_gpu:
    p = p.cpu()

  # getting the index with highest probability - this will also be the pred label
  #pred = torch.argmax(p, dim=1)
  pred = torch.round(p)
  pred_classes = torch.from_numpy(numpy.array(one_hot_decode(pred.numpy())))
  correct_tensor = pred_classes.eq(labels.cpu())
  correct = numpy.squeeze(correct_tensor.numpy()) if not train_on_gpu else numpy.squeeze(correct_tensor.cpu().numpy())
  num_correct += numpy.sum(correct)

  #getting total unclassified outputs
  uc_indices = torch.where(pred_classes == -1)
  total_uc += len(uc_indices[0])


print("Total correct", num_correct)
print("total unclassified", total_uc)
print("Total test", len(test_loader.dataset))

# accuracy over all test data
test_acc = num_correct/(len(test_loader.dataset)-total_uc)
print("Test accuracy: {:.3f}".format(test_acc))
print("Test losses: {:.6f}".format(numpy.mean(val_losses)))
 

Total correct 490
total unclassified 146
Total test 865
Test accuracy: 0.682
Test losses: 0.396758
