In [1]:
cd /content/drive/My\ Drive/111\ Ethicsbot

/content/drive/My Drive/111 Ethicsbot


# Data Preprocessing

In [0]:
import sys
import string
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn

The data needs to be in tsv format, with four columns, and no header.

In [3]:
file_path = "aita_training.csv"
data = pd.read_csv(file_path, header=0)
# drop na labels
data_ = data[data.label1.isna() == False]
print("There are only {} lines left after dropping na labels".format(data_.shape[0]))
# examine data skewness
print("Original labels are {}".format(np.unique(data_.label1.values)))
print("There are {} YTA labels, and {} NTA labels".format(len(data_[data_.label1 == "YTA"]),
                                                          len(data_[data_.label1 == "NTA"])))
D = data_[(data_.label1 == "YTA") | (data_.label1 == "NTA")]
# select useful cols
D_ = pd.concat([D.Title, D.Text, D.label1], axis=1).dropna()
# concat title and text
features = [D_.Title.iloc[i] + " " + D_.Text.iloc[i] for i in range(D_.shape[0])]
features = pd.Series(features, name="text")

  interactivity=interactivity, compiler=compiler, result=result)


There are only 27987 lines left after dropping na labels
Original labels are ['ESH' 'NAH' 'NTA' 'YTA']
There are 10251 YTA labels, and 14928 NTA labels


In [15]:
features.shape

(25149,)

In [16]:
features.head()

0    AITA for cleaning my son and DIL's apartment w...
1    AITA for doing it via text? So I went on three...
2    AITA: Not driving a co worker home. I work a j...
3    AITA for not sitting with someone who didn’t i...
4    AITA for asking my boyfriend to not be in the ...
Name: text, dtype: object

In [0]:
features = features.replace("\r\n\r\n", " ", regex=True)

In [0]:
features = features.replace("AITA", "Am I the asshole", regex=True)

In [0]:
features = features.replace("WIBTA", "Would I be the asshole", regex=True)

In [20]:
features

0        Am I the asshole for cleaning my son and DIL's...
1        Am I the asshole for doing it via text? So I w...
2        Am I the asshole: Not driving a co worker home...
3        Am I the asshole for not sitting with someone ...
4        Am I the asshole for asking my boyfriend to no...
                               ...                        
25144    Would I be the asshole if I exposed my ex-frie...
25145    Am I the asshole for getting a babysitter on m...
25146    Am I the asshole for not wanting to pay for th...
25147    Am I the asshole For not wanting to babysit my...
25148    Would I be the asshole for putting my foot dow...
Name: text, Length: 25149, dtype: object

In [25]:
# tokenize
X = features.apply(lambda x: tokenize(x))
# label encoding
D_.label1 = (D_.label1 == "NTA").astype("int")
y = D_.label1
# remove questions that are shorter than 15 words
sent_len = X.apply(lambda x: len(x))
print("max/average/min sentence length: {},{},{}".format(np.max(sent_len), np.mean(sent_len), np.min(sent_len)))
dat = pd.concat([features, pd.Series(y.values), sent_len], axis=1)
dat_ = pd.DataFrame(dat.values, columns=["x", "y", "len"])
dat_ = dat_.sort_values(by=["len"])
cleaned_data = dat_[dat_.len > 15]
print("Cleaned data shape: {}".format(cleaned_data.shape))


max/average/min sentence length: 3423,325.6089705356078,5
Cleaned data shape: (25130, 3)


# Train/test split

In [26]:
# train/dev split
X_train, X_test, y_train, y_test = train_test_split(cleaned_data.x, cleaned_data.y, 
                                                    test_size=0.2, random_state=42,
                                                    shuffle= True)
X_train.shape, X_test.shape

((20104,), (5026,))

In [27]:
train_df_bert = pd.DataFrame({
    'id':range(len(X_train)),
    'label':y_train,
    'alpha':['a']*X_train.shape[0],
    'text': X_train
})

train_df_bert.head()

Unnamed: 0,id,label,alpha,text
16629,0,1,a,Am I the asshole because I don't wanna help ta...
19153,1,0,a,Am I the asshole for always adding an egg to m...
11752,2,0,a,Am I the asshole for putting condoms in friend...
21014,3,1,a,Am I the asshole for not being happy about my ...
1064,4,1,a,"Am I the asshole, I want my best friend of nin..."


In [28]:
dev_df_bert = pd.DataFrame({
    'id':range(len(X_test)),
    'label':y_test,
    'alpha':['a']*X_test.shape[0],
    'text': X_test
})

dev_df_bert.head()

Unnamed: 0,id,label,alpha,text
17358,0,1,a,Am I the asshole for not kicking a work friend...
7707,1,1,a,Am I the asshole for saying no to being an org...
14293,2,0,a,Would I be the asshole if I cancelled a game o...
24231,3,0,a,Am I the asshole for peeing by my friends tent...
12885,4,1,a,Would I be the asshole if i asked my roommate ...


In [0]:
# # transform to bert friendly
# train_df_bert.to_csv('data/train.tsv', sep='\t', index=False, header=False)
# dev_df_bert.to_csv('data/dev.tsv', sep='\t', index=False, header=False)

In [29]:
train_df = pd.DataFrame({
    'text': train_df_bert.text.replace(r'\n', ' ', regex=True),
    'label': train_df_bert.label
})

print(train_df.head())

eval_df = pd.DataFrame({
    'text': dev_df_bert.text.replace(r'\n', ' ', regex=True),
    'label': dev_df_bert.label
})

print(eval_df.head())

                                                    text label
16629  Am I the asshole because I don't wanna help ta...     1
19153  Am I the asshole for always adding an egg to m...     0
11752  Am I the asshole for putting condoms in friend...     0
21014  Am I the asshole for not being happy about my ...     1
1064   Am I the asshole, I want my best friend of nin...     1
                                                    text label
17358  Am I the asshole for not kicking a work friend...     1
7707   Am I the asshole for saying no to being an org...     1
14293  Would I be the asshole if I cancelled a game o...     0
24231  Am I the asshole for peeing by my friends tent...     0
12885  Would I be the asshole if i asked my roommate ...     1


# BERT model

In [16]:
# !git clone https://www.github.com/nvidia/apex

Cloning into 'apex'...
remote: Enumerating objects: 9, done.[K
remote: Counting objects: 100% (9/9), done.[K
remote: Compressing objects: 100% (9/9), done.[K
remote: Total 6907 (delta 2), reused 5 (delta 0), pack-reused 6898[K
Receiving objects: 100% (6907/6907), 13.77 MiB | 10.70 MiB/s, done.
Resolving deltas: 100% (4633/4633), done.
Checking out files: 100% (267/267), done.


In [5]:
cd apex

/content/drive/My Drive/111 Ethicsbot/apex


In [6]:
!python setup.py install

torch.__version__  =  1.5.0+cu101
running install
running bdist_egg
running egg_info
writing apex.egg-info/PKG-INFO
writing dependency_links to apex.egg-info/dependency_links.txt
writing top-level names to apex.egg-info/top_level.txt
writing manifest file 'apex.egg-info/SOURCES.txt'
installing library code to build/bdist.linux-x86_64/egg
running install_lib
running build_py
creating build/bdist.linux-x86_64/egg
creating build/bdist.linux-x86_64/egg/apex
copying build/lib/apex/__init__.py -> build/bdist.linux-x86_64/egg/apex
creating build/bdist.linux-x86_64/egg/apex/RNN
copying build/lib/apex/RNN/RNNBackend.py -> build/bdist.linux-x86_64/egg/apex/RNN
copying build/lib/apex/RNN/__init__.py -> build/bdist.linux-x86_64/egg/apex/RNN
copying build/lib/apex/RNN/models.py -> build/bdist.linux-x86_64/egg/apex/RNN
copying build/lib/apex/RNN/cells.py -> build/bdist.linux-x86_64/egg/apex/RNN
creating build/bdist.linux-x86_64/egg/apex/amp
copying build/lib/apex/amp/rnn_compat.py -> build/bdist.lin

In [7]:
pip install simpletransformers



In [8]:
pip install transformers 



In [0]:
# parameter setting
self.args = {
   'model_type':  'roberta',
   'model_name': 'roberta-base',
   'output_dir': 'outputs/',
   'cache_dir': 'cache/',
   'fp16': True,
   'fp16_opt_level': 'O1',
   'max_seq_length': 128,
   'train_batch_size': 32,
   'eval_batch_size': 32,
   'gradient_accumulation_steps': 1,
   'num_train_epochs': 10,
   'weight_decay': 0,
   'learning_rate': 0.01,
   'adam_epsilon': 1e-8,
   'warmup_ratio': 0.06,
   'warmup_steps': 0,
   'max_grad_norm': 1.0,
   'logging_steps': 50,
   'evaluate_during_training': False,
   'save_steps': 2000,
   'eval_all_checkpoints': True,
   'use_tensorboard': True,
   'overwrite_output_dir': True,
   'reprocess_input_data': False,
}

In [35]:
from simpletransformers.classification import ClassificationModel
import sklearn

# Create a TransformerModel
model = ClassificationModel('bert', 'bert-base-cased-finetuned-mrpc', args={'num_train_epochs': 4,
                                                             'overwrite_output_dir': True,
                                                             'train_batch_size': 32,
                                                             'eval_batch_size': 32,
                                                             'learning_rate': 2e-5})

# Train the model
model.train_model(train_df)

# Evaluate the model
# result, model_outputs, wrong_predictions = model.eval_model(eval_df)
result, model_outputs, wrong_predictions = model.eval_model(eval_df, 
                                                            acc=sklearn.metrics.accuracy_score,
                                                            auc=sklearn.metrics.roc_auc_score)

  "Dataframe headers not specified. Falling back to using column 0 as text and column 1 as labels."


HBox(children=(FloatProgress(value=0.0, max=20104.0), HTML(value='')))


Selected optimization level O1:  Insert automatic casts around Pytorch functions and Tensor methods.

Defaults for this optimization level are:
enabled                : True
opt_level              : O1
cast_model_type        : None
patch_torch_functions  : True
keep_batchnorm_fp32    : None
master_weights         : None
loss_scale             : dynamic
Processing user overrides (additional kwargs that are not None)...
After processing overrides, optimization options are:
enabled                : True
opt_level              : O1
cast_model_type        : None
patch_torch_functions  : True
keep_batchnorm_fp32    : None
master_weights         : None
loss_scale             : dynamic


HBox(children=(FloatProgress(value=0.0, description='Epoch', max=4.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Current iteration', max=629.0, style=ProgressStyle(descri…

Running loss: 1.620689Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 32768.0
Running loss: 1.772897



Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 16384.0
Running loss: 0.655058



Running loss: 0.779511




HBox(children=(FloatProgress(value=0.0, description='Current iteration', max=629.0, style=ProgressStyle(descri…

Running loss: 0.386126


HBox(children=(FloatProgress(value=0.0, description='Current iteration', max=629.0, style=ProgressStyle(descri…

Running loss: 0.625195


HBox(children=(FloatProgress(value=0.0, description='Current iteration', max=629.0, style=ProgressStyle(descri…

Running loss: 0.215228



  "Dataframe headers not specified. Falling back to using column 0 as text and column 1 as labels."


HBox(children=(FloatProgress(value=0.0, max=5026.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=158.0), HTML(value='')))




In [36]:
result

{'acc': 0.6510147234381217,
 'auc': 0.6313123718386877,
 'eval_loss': 0.7437014998514441,
 'fn': 728,
 'fp': 1026,
 'mcc': 0.27008939989369307,
 'tn': 1074,
 'tp': 2198}

In [38]:
# predicting 

predictions, raw_outputs = model.predict([eval_df.values[0,0]])
print(predictions, raw_outputs)

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))


[1] [[-1.4736328  1.0263672]]
