In [1]:
import gzip
import random
import string
from collections import defaultdict

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, ENGLISH_STOP_WORDS
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import PowerTransformer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline

from torch import nn
import torch
from torch.autograd import Variable
import torch.nn.functional as F
import torch.utils.data as Data

from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize 

from gensim.models import Word2Vec

from tqdm import tqdm, notebook

%load_ext memory_profiler

In [2]:
data_path = 'aita_raw.csv'

In [3]:
df = pd.read_csv(data_path)
df = df.dropna(axis=0)
df['verdict'] = df['verdict'].str.lower()
df

Unnamed: 0,id,timestamp,title,body,edited,verdict,score,num_comments
0,1ytr72,1.393275e+09,[AITA] Construction worker here,I have been on a parking structure project for...,False,too close to call,63,9.0
1,1ytxov,1.393279e+09,[AITA] I wrote an explanation in TIL and came ...,[Here is the post in question](http://www.redd...,False,asshole,52,13.0
2,1yu29c,1.393281e+09,[AITA] Threw my parent's donuts away,"My parents are diabetic, morbidly obese, and a...",1393290576.0,asshole,140,27.0
3,1yu41e,1.393282e+09,[AITA] I Put My Empty Beer on a Bar Table,"Relevant Facts:\n\n1) It was a crowded bar, th...",False,nothing happened,45,7.0
4,1yu8hi,1.393285e+09,I told a goth girl she looked like a clown.,I was four.,False,not the asshole,74,15.0
...,...,...,...,...,...,...,...,...
167755,ex970f,1.580577e+09,AITA for telling my husband to f* off after he...,My husband (28M) and I (32F) are married for a...,1580584475.0,not the a-hole,1373,304.0
167756,ex97ye,1.580577e+09,AITA for refusing to give my ticket to my brot...,[deleted],False,no a-holes here,4,16.0
167758,ex9dwo,1.580578e+09,AITA for attempting to keep my students out of...,Upfront apologies for formatting. I’m also try...,False,not the a-hole,4,15.0
167759,ex9egs,1.580578e+09,WIBTA if I left my brothers fate up to the state?,A little back story my mom is a drug addict an...,False,not the a-hole,280,140.0


In [4]:
counts = df['verdict'].value_counts()
print(counts)
v = df['verdict']
df = df[v.replace(counts.gt(300))]
replace_dict = {
    'not the asshole': 'not the a-hole',
    'no a--holes here': 'no a-holes here'
}
df = df.replace({'verdict': replace_dict})

not the a-hole           75393
asshole                  27449
no a-holes here          15129
everyone sucks            7417
not enough info           2713
                         ...  
not the a-hole (meta)        1
should've towed              1
not the slave                1
not not a-hole               1
rant                         1
Name: verdict, Length: 410, dtype: int64


In [5]:
df['verdict'].value_counts()

not the a-hole     75766
asshole            27449
no a-holes here    15432
everyone sucks      7417
not enough info     2713
Name: verdict, dtype: int64

In [6]:
v = df['verdict']
df = df[v.replace(counts.gt(3000))]
replace_dict = {
    'no a-holes here': 'not the a-hole',
    'everyone sucks': 'asshole'
}
df = df.replace({'verdict': replace_dict})
df['verdict'].value_counts()

not the a-hole    91198
asshole           34866
Name: verdict, dtype: int64

In [7]:
v = df['body']
df = df[v != '[deleted]']
df['verdict'].value_counts()

not the a-hole    71536
asshole           26863
Name: verdict, dtype: int64

In [8]:
verdicts = df['verdict'].unique()
print(verdicts)
num_classes = len(verdicts)
print(num_classes)

['asshole' 'not the a-hole']
2


In [9]:
class LemmaTokenizer:
    ignore_tokens = string.punctuation
    def __init__(self):
        self.wnl = WordNetLemmatizer()
    def __call__(self, doc):
        return [self.wnl.lemmatize(t) for t in word_tokenize(doc) 
                if not any(i in t for i in self.ignore_tokens) and len(t) > 2]

In [10]:
df['edited'] = [0 if 'false' in d else 1 for d in df['edited'].str.lower() ]

In [11]:
df['num_comments'] = df['num_comments'].astype(int)
df['score'] = df['score'].astype(int)

In [12]:
df['body'] = df['body'].str.lower()
df['body'] = df['body'].str.replace('[^\w\s]', ' ')
df['body'] = df['body'].str.replace('\d+', '')

In [13]:
classes = df['verdict'].unique()
verdict_map = dict(zip(classes, range(len(classes))))
df['verdict'] = df['verdict'].replace(verdict_map)

In [14]:
df_train, df_temp = train_test_split(df, test_size=0.2, stratify=df['verdict'])
df_val, df_test = train_test_split(df_temp, test_size=0.5, stratify=df_temp['verdict'])

In [15]:
del df

In [16]:
transformed_data = df_train[['edited', 'score', 'num_comments']].to_numpy()
labels = df_train['verdict']

In [17]:
tokenizer=LemmaTokenizer()
word_list = ['aita']

stop_words = set(ENGLISH_STOP_WORDS.union(word_list))
tokenized_stop = tokenizer(' '.join(stop_words))

In [18]:
vectorizer = TfidfVectorizer(max_features=1000, sublinear_tf=True, max_df=0.95, 
                             min_df=5, ngram_range=(1, 2), tokenizer=tokenizer, stop_words=tokenized_stop)
text_features = vectorizer.fit_transform(df_train['body']).toarray()
print(text_features.shape)

(78719, 1000)


In [19]:
feature_names = vectorizer.get_feature_names()

In [20]:
%memit

peak memory: 2231.68 MiB, increment: -27.46 MiB


In [21]:
text_features = np.array(np.hstack((transformed_data, text_features)))
print(text_features.shape)

(78719, 1003)


In [22]:
%memit

peak memory: 2236.47 MiB, increment: 0.00 MiB


In [23]:
X_train = text_features
y_train = np.array(labels)
input_size = len(X_train[0])
print(input_size)
print(X_train.shape)
print(y_train.shape)

1003
(78719, 1003)
(78719,)


In [25]:
from imblearn.under_sampling import RandomUnderSampler
rus = RandomUnderSampler(random_state=0)
X_resampled, y_resampled = rus.fit_resample(X_train, y_train)
print(X_resampled.shape)
print(y_resampled.shape)

(42980, 1003)
(42980,)


In [26]:
feature_rank = SelectKBest(chi2, k=500)
feature_rank.fit(X_train, y_train)

SelectKBest(k=500, score_func=<function chi2 at 0x7f1e6fbae9e0>)

In [27]:
feat = []
for i, (score, feature) in enumerate(zip(feature_rank.scores_, feature_names)):
    feat.append((score, feature))
    
dfObj = pd.DataFrame(feat) 
dfObj.sort_values(by=[0], ascending = False)

Unnamed: 0,0,1
2,2.001124e+05,abusive
1,1.191386e+05,absolutely
0,5.651446e+02,able
268,7.679568e+01,emotional
580,3.969713e+01,money
...,...,...
114,2.923383e-05,car
238,7.657551e-06,don care
806,3.289485e-06,social medium
690,1.845324e-06,question


In [28]:
transformed_val = df_val[['edited', 'score', 'num_comments']].to_numpy()
labels_val = df_val['verdict']
X_val = np.array(np.hstack((transformed_val, vectorizer.transform(df_val['body']).toarray())))
y_val = np.array(labels_val)
del transformed_val
print(X_val.shape)
print(y_val.shape)

(9840, 1003)
(9840,)


In [29]:
%memit

peak memory: 2650.51 MiB, increment: -0.31 MiB


In [30]:
# process = make_pipeline(PowerTransformer(method='yeo-johnson'))
# ct = ColumnTransformer([("power", process, [x for x in range(0, 6)])], remainder='passthrough')
# X_train = ct.fit_transform(X_train)
# X_test = ct.transform(X_test)

In [31]:
%memit

peak memory: 2650.51 MiB, increment: 0.00 MiB


In [32]:
train_dataset = Data.TensorDataset(Variable(torch.from_numpy(X_train.astype(np.float32))), 
                                   Variable(torch.from_numpy(y_train.astype(np.long))))
test_dataset = Data.TensorDataset(Variable(torch.from_numpy(X_val.astype(np.float32))), 
                                  Variable(torch.from_numpy(y_val.astype(np.long))))

In [33]:
class TrainHelper():
    '''
    Helper class that makes it a bit easier and cleaner to define the training routine
    
    '''
    
    def __init__(self,model,train_set,test_set,opts):
        self.model = model 
        
        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        self.model.to(self.device)
        
        self.epochs = opts['epochs']
        self.optimizer = torch.optim.AdamW(model.parameters(), lr=opts['lr'], weight_decay=opts['weight_decay']) # optimizer method for gradient descent
        self.criterion = torch.nn.CrossEntropyLoss()                      # loss function
        self.train_loader = torch.utils.data.DataLoader(dataset=train_set,
                                                        batch_size=opts['batch_size'],
                                                        shuffle=True)
        self.test_loader = torch.utils.data.DataLoader(dataset=test_set,
                                                       batch_size=opts['batch_size'],
                                                       shuffle=True)
        
    def train(self):
        self.model.train() #put model in training mode
        self.train_accuracy = []

        for epoch in range(self.epochs):
            self.tr_loss = []
            for i, (data,labels) in notebook.tqdm(enumerate(self.train_loader),
                                                   total = len(self.train_loader)):

                data, labels = data.to(self.device),labels.to(self.device)
                self.optimizer.zero_grad()  
                outputs = self.model(data)  
                loss = self.criterion(outputs, labels) 
                loss.backward()                        
                self.optimizer.step()                  
                self.tr_loss.append(loss.item())
                y_pred_softmax = torch.log_softmax(outputs.data, dim = 1)
                _, predicted = torch.max(y_pred_softmax, dim = 1)
                self.train_accuracy.append((predicted == labels).sum().item() / predicted.size(0))

            
            self.test(epoch) # run through the validation set
        
    def test(self,epoch):
            
            self.model.eval()    # puts model in eval mode - not necessary for this demo but good to know
            self.test_loss = []
            self.test_accuracy = []
            
            for i, (data, labels) in enumerate(self.test_loader):
                
                data, labels = data.to(self.device),labels.to(self.device)
                
                # pass data through network
                # turn off gradient calculation to speed up calcs and reduce memory
                with torch.no_grad():
                    outputs = self.model(data)
                
                # make our predictions and update our loss info
                loss = self.criterion(outputs, labels)
                self.test_loss.append(loss.item())
                y_pred_softmax = torch.log_softmax(outputs.data, dim = 1)
                _, predicted = torch.max(y_pred_softmax, dim = 1)
                self.test_accuracy.append((predicted == labels).sum().item() / predicted.size(0))
            
            print('epoch: {}, train loss: {}, test loss: {}, train accuracy: {}, test accuracy: {}'.format( 
                  epoch+1, np.mean(self.tr_loss), np.mean(self.test_loss), 
                  np.mean(self.train_accuracy), np.mean(self.test_accuracy)))

In [34]:
class Net(torch.nn.Module):
    def __init__(self, input_size, classes):
        super(Net, self).__init__()

        self.fc1 = torch.nn.Linear(input_size, 2000)
        self.relu1 = torch.nn.ReLU()
        self.drop1 = torch.nn.Dropout(0.8)
        self.bn1 = torch.nn.BatchNorm1d(2000)

        self.fc2 = torch.nn.Linear(2000, 1000)
        self.relu2 = torch.nn.ReLU()
        self.drop2 = torch.nn.Dropout(0.5)
        self.bn2 = torch.nn.BatchNorm1d(1000)


        self.fc3 = torch.nn.Linear(1000, 500)
        self.relu3 = torch.nn.ReLU()
        self.drop3 = torch.nn.Dropout(0.2)
        self.bn3 = torch.nn.BatchNorm1d(500)


        self.fc4 = torch.nn.Linear(500, classes)

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu1(x)
        x = self.drop1(x)
        x = self.bn1(x)
        
        x = self.fc2(x)
        x = self.relu2(x)
        x = self.drop2(x)
        x = self.bn2(x)
        
        x = self.fc3(x)
        x = self.relu3(x)
        x = self.drop3(x)
        x = self.bn3(x)
        
        x = self.fc4(x)
        return x


In [35]:
model = Net(input_size=input_size, classes=num_classes)
opts = {
    'lr': 1e-4,
    'weight_decay': 4e-3,
    'epochs': 5,
    'batch_size': 64
}



NetTrainer = TrainHelper(model = model,
                      train_set = train_dataset,
                      test_set = test_dataset,opts = opts)

In [36]:
NetTrainer.train()

HBox(children=(FloatProgress(value=0.0, max=1230.0), HTML(value='')))


epoch: 1, train loss: 0.5991532109374923, test loss: 0.5666547814359912, train accuracy: 0.705955002258356, test accuracy: 0.739752435064935


HBox(children=(FloatProgress(value=0.0, max=1230.0), HTML(value='')))


epoch: 2, train loss: 0.5555298519570653, test loss: 0.5343112395955371, train accuracy: 0.7272387969415408, test accuracy: 0.746178300865801


HBox(children=(FloatProgress(value=0.0, max=1230.0), HTML(value='')))


epoch: 3, train loss: 0.5212569423322755, test loss: 0.5292709787170609, train accuracy: 0.7357220824192369, test accuracy: 0.7483089826839827


HBox(children=(FloatProgress(value=0.0, max=1230.0), HTML(value='')))


epoch: 4, train loss: 0.5236331029151513, test loss: 0.5249236887925631, train accuracy: 0.7404690322944896, test accuracy: 0.7516910173160173


HBox(children=(FloatProgress(value=0.0, max=1230.0), HTML(value='')))


epoch: 5, train loss: 0.5080004737144563, test loss: 0.5310277557605273, train accuracy: 0.7438558604336044, test accuracy: 0.7505073051948052


In [35]:
model_path = 'predict_aita_nn.pt'

In [36]:
# torch.save(model.state_dict(), model_path)

In [37]:
# model = Net(input_size=input_size, classes=num_classes)
# model.load_state_dict(torch.load(model_path))
# print(model)

In [38]:
def clean_datum(datum):
    datum['early_access'] = int(datum['early_access'])
    datum['hours'] = np.log1p(datum['hours']) / max_hours if datum['hours'] > 0 else 0
    datum['text'] = datum['text'].lower()
    datum['text'] = datum['text'].replace('[^\w\s]', ' ')
    datum['text'] = datum['text'].replace('\d+', ' ')
    time = pd.to_datetime(datum['date'])
    time_tran = [(np.sin(2 * np.pi* time.month / 12) + 1) / 2, (np.cos(2 * np.pi* time.month / 12) + 1) / 2, 
                   (np.sin(2 * np.pi* time.day / 31) + 1) / 2, (np.cos(2 * np.pi* time.day / 31) + 1) / 2]
    del time
    
    transformed_datum = np.array([datum['early_access'], datum['hours'], *time_tran]).reshape((1, -1))
    del time_tran
    feat = vectorizer.transform([datum['text']]).toarray()
    transformed_datum = np.array(np.hstack((transformed_datum, feat))).astype(np.float32)
    del feat
#     transformed_datum = ct.transform(transformed_datum)

    return transformed_datum

In [39]:
def readJSON(path):
    for l in gzip.open(path, 'rt'):
        d = eval(l)
        u = d['userID']
        try:
            g = d['gameID']
        except Exception as e:
            g = None
        yield u,g,d

In [40]:
predictions = open("predictions_Category.txt", 'w')
predictions.write("userID-reviewID,prediction\n")
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
for u,_,d in tqdm(readJSON(test_path)):
    revID = d['reviewID']
    d = clean_datum(d)
    d = torch.from_numpy(d).to(device)
    out = model(d)
    y_pred_softmax = torch.log_softmax(out.data, dim = 1)
    _, pred = torch.max(y_pred_softmax, dim = 1)
    pred = pred.item()
    
    predictions.write(f'{u}-{revID},{pred}\n')

predictions.close()

10000it [00:21, 455.89it/s]
