In [242]:
!pip install emoji transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [243]:
import os
from google.colab import files

# Prompt for an upload only when kaggle.json is not already in the working directory.
if not os.path.isfile('kaggle.json'):
  files.upload()

In [244]:
if not os.path.exists('/root/.kaggle'):
  !mkdir ~/.kaggle
  !cp kaggle.json ~/.kaggle/
  !chmod 600 ~/.kaggle/kaggle.json
  !kaggle datasets list

In [245]:
# Download the GoEmotions archive from Kaggle unless it is already present locally.
if not os.path.exists('goemotions.zip'):
  !kaggle datasets download -d debarshichanda/goemotions

In [246]:
# Extract the archive into input/goemotions the first time only.
if not os.path.exists('input/goemotions'):
  !mkdir -p input/goemotions
  !unzip goemotions.zip -d input/goemotions

In [247]:
import re
import string
import json
import emoji
import numpy as np
import pandas as pd
from sklearn import metrics
from bs4 import BeautifulSoup
import transformers
import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, AutoTokenizer, BertModel, BertConfig, AutoModel, AdamW
import warnings
warnings.filterwarnings('ignore')

pd.set_option("display.max_columns", None)

In [248]:
# Both splits are headerless tab-separated files with the same three columns.
df_train, df_dev = (
    pd.read_csv(f"input/goemotions/data/{split}.tsv", sep='\t', header=None, names=['Text', 'Class', 'ID'])
    for split in ('train', 'dev')
)

In [249]:
# 'Class' holds comma-separated class ids; split them into a list and record how many there are.
df_train['List of classes'] = df_train['Class'].apply(lambda raw_ids: raw_ids.split(','))
df_train['Len of classes'] = df_train['List of classes'].apply(len)

df_dev['List of classes'] = df_dev['Class'].apply(lambda raw_ids: raw_ids.split(','))
df_dev['Len of classes'] = df_dev['List of classes'].apply(len)

In [250]:
df_train.head(10)

Unnamed: 0,Text,Class,ID,List of classes,Len of classes
0,My favourite food is anything I didn't have to...,27,eebbqej,[27],1
1,"Now if he does off himself, everyone will thin...",27,ed00q6i,[27],1
2,WHY THE FUCK IS BAYLESS ISOING,2,eezlygj,[2],1
3,To make her feel threatened,14,ed7ypvh,[14],1
4,Dirty Southern Wankers,3,ed0bdzj,[3],1
5,OmG pEyToN iSn'T gOoD eNoUgH tO hElP uS iN tHe...,26,edvnz26,[26],1
6,Yes I heard abt the f bombs! That has to be wh...,15,ee3b6wu,[15],1
7,We need more boards and to create a bit more s...,820,ef4qmod,"[8, 20]",2
8,Damn youtube and outrage drama is super lucrat...,0,ed8wbdn,[0],1
9,It might be linked to the trust factor of your...,27,eczgv1o,[27],1


In [251]:
# Load the JSON file that groups fine-grained emotion names under coarser Ekman categories.
with open('input/goemotions/data/ekman_mapping.json') as file:
    ekman_mapping = json.load(file)

In [252]:
ekman_mapping

{'anger': ['anger', 'annoyance', 'disapproval'],
 'disgust': ['disgust'],
 'fear': ['fear', 'nervousness'],
 'joy': ['joy',
  'amusement',
  'approval',
  'excitement',
  'gratitude',
  'love',
  'optimism',
  'relief',
  'pride',
  'admiration',
  'desire',
  'caring'],
 'sadness': ['sadness', 'disappointment', 'embarrassment', 'grief', 'remorse'],
 'surprise': ['surprise', 'realization', 'confusion', 'curiosity']}

In [253]:
# Read the emotion names, one per line. A name's position in the list is the
# class id that idx2class looks up. The context manager closes the handle,
# which the bare open() call used to leave open.
with open("input/goemotions/data/emotions.txt", "r") as emotion_file:
    emotion_list = emotion_file.read().split("\n")
print(emotion_list)

['admiration', 'amusement', 'anger', 'annoyance', 'approval', 'caring', 'confusion', 'curiosity', 'desire', 'disappointment', 'disapproval', 'disgust', 'embarrassment', 'excitement', 'fear', 'gratitude', 'grief', 'joy', 'love', 'nervousness', 'optimism', 'pride', 'realization', 'relief', 'remorse', 'sadness', 'surprise', 'neutral']


In [254]:
def idx2class(idx_list):
    """Translate class ids into emotion names.

    Args:
        idx_list: Iterable of class ids; each item is passed through ``int()``
            and used as a position in the notebook-level ``emotion_list``.

    Returns:
        List of emotion names in the same order as ``idx_list``.
    """
    return [emotion_list[int(class_id)] for class_id in idx_list]

In [255]:
# Attach the human-readable emotion names for every row of both splits.
for frame in (df_train, df_dev):
    frame['Emotions'] = frame['List of classes'].apply(idx2class)

In [256]:
def EmotionMapping(emotion_list):
    """Map fine-grained emotion names onto Ekman categories.

    Each input name contributes one output entry per Ekman group of the
    notebook-level ``ekman_mapping`` that contains it, and ``'neutral'`` is
    passed through unchanged. Duplicates are kept, so two names from the same
    group yield that group twice.

    Args:
        emotion_list: Iterable of emotion names for one example.

    Returns:
        List of category names, in input order.
    """
    # Fixed probe order, independent of the key order of the loaded JSON.
    ekman_groups = ('anger', 'disgust', 'fear', 'joy', 'sadness', 'surprise')

    mapped = []
    for emotion in emotion_list:
        mapped.extend(group for group in ekman_groups if emotion in ekman_mapping[group])
        if emotion == 'neutral':
            mapped.append('neutral')
    return mapped

In [257]:
# Collapse each row's emotion names into Ekman categories for both splits.
for frame in (df_train, df_dev):
    frame['Mapped Emotions'] = frame['Emotions'].apply(EmotionMapping)

In [258]:
df_train.head(10)

Unnamed: 0,Text,Class,ID,List of classes,Len of classes,Emotions,Mapped Emotions
0,My favourite food is anything I didn't have to...,27,eebbqej,[27],1,[neutral],[neutral]
1,"Now if he does off himself, everyone will thin...",27,ed00q6i,[27],1,[neutral],[neutral]
2,WHY THE FUCK IS BAYLESS ISOING,2,eezlygj,[2],1,[anger],[anger]
3,To make her feel threatened,14,ed7ypvh,[14],1,[fear],[fear]
4,Dirty Southern Wankers,3,ed0bdzj,[3],1,[annoyance],[anger]
5,OmG pEyToN iSn'T gOoD eNoUgH tO hElP uS iN tHe...,26,edvnz26,[26],1,[surprise],[surprise]
6,Yes I heard abt the f bombs! That has to be wh...,15,ee3b6wu,[15],1,[gratitude],[joy]
7,We need more boards and to create a bit more s...,820,ef4qmod,"[8, 20]",2,"[desire, optimism]","[joy, joy]"
8,Damn youtube and outrage drama is super lucrat...,0,ed8wbdn,[0],1,[admiration],[joy]
9,It might be linked to the trust factor of your...,27,eczgv1o,[27],1,[neutral],[neutral]


In [259]:
# Add one float placeholder column (0.0) per category to both splits.
# A single loop with a scalar fill replaces fourteen copy-pasted assignments
# that each wrote an (n, 1) two-dimensional array into a one-dimensional column.
for frame in (df_train, df_dev):
    for emotion_name in ['anger', 'disgust', 'fear', 'joy', 'sadness', 'surprise', 'neutral']:
        frame[emotion_name] = 0.0

In [260]:
df_train.head(10)

Unnamed: 0,Text,Class,ID,List of classes,Len of classes,Emotions,Mapped Emotions,anger,disgust,fear,joy,sadness,surprise,neutral
0,My favourite food is anything I didn't have to...,27,eebbqej,[27],1,[neutral],[neutral],0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,"Now if he does off himself, everyone will thin...",27,ed00q6i,[27],1,[neutral],[neutral],0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,WHY THE FUCK IS BAYLESS ISOING,2,eezlygj,[2],1,[anger],[anger],0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,To make her feel threatened,14,ed7ypvh,[14],1,[fear],[fear],0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Dirty Southern Wankers,3,ed0bdzj,[3],1,[annoyance],[anger],0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,OmG pEyToN iSn'T gOoD eNoUgH tO hElP uS iN tHe...,26,edvnz26,[26],1,[surprise],[surprise],0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,Yes I heard abt the f bombs! That has to be wh...,15,ee3b6wu,[15],1,[gratitude],[joy],0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,We need more boards and to create a bit more s...,820,ef4qmod,"[8, 20]",2,"[desire, optimism]","[joy, joy]",0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,Damn youtube and outrage drama is super lucrat...,0,ed8wbdn,[0],1,[admiration],[joy],0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,It might be linked to the trust factor of your...,27,eczgv1o,[27],1,[neutral],[neutral],0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [261]:
# Turn each category into a 0/1 indicator: 1 when the name occurs in the row's 'Mapped Emotions'.
for emotion_name in ['anger', 'disgust', 'fear', 'joy', 'sadness', 'surprise','neutral']:
    for frame in (df_train, df_dev):
        frame[emotion_name] = frame['Mapped Emotions'].apply(
            lambda mapped: int(emotion_name in mapped)
        )

In [262]:
df_train.head(10)

Unnamed: 0,Text,Class,ID,List of classes,Len of classes,Emotions,Mapped Emotions,anger,disgust,fear,joy,sadness,surprise,neutral
0,My favourite food is anything I didn't have to...,27,eebbqej,[27],1,[neutral],[neutral],0,0,0,0,0,0,1
1,"Now if he does off himself, everyone will thin...",27,ed00q6i,[27],1,[neutral],[neutral],0,0,0,0,0,0,1
2,WHY THE FUCK IS BAYLESS ISOING,2,eezlygj,[2],1,[anger],[anger],1,0,0,0,0,0,0
3,To make her feel threatened,14,ed7ypvh,[14],1,[fear],[fear],0,0,1,0,0,0,0
4,Dirty Southern Wankers,3,ed0bdzj,[3],1,[annoyance],[anger],1,0,0,0,0,0,0
5,OmG pEyToN iSn'T gOoD eNoUgH tO hElP uS iN tHe...,26,edvnz26,[26],1,[surprise],[surprise],0,0,0,0,0,1,0
6,Yes I heard abt the f bombs! That has to be wh...,15,ee3b6wu,[15],1,[gratitude],[joy],0,0,0,1,0,0,0
7,We need more boards and to create a bit more s...,820,ef4qmod,"[8, 20]",2,"[desire, optimism]","[joy, joy]",0,0,0,1,0,0,0
8,Damn youtube and outrage drama is super lucrat...,0,ed8wbdn,[0],1,[admiration],[joy],0,0,0,1,0,0,0
9,It might be linked to the trust factor of your...,27,eczgv1o,[27],1,[neutral],[neutral],0,0,0,0,0,0,1


In [263]:
# Remove, in place, every row flagged as neutral and then every row flagged as disgust.
for excluded in ('neutral', 'disgust'):
    df_train.drop(df_train[df_train[excluded] == 1].index, inplace=True)
    df_dev.drop(df_dev[df_dev[excluded] == 1].index, inplace=True)

In [264]:
# Drop the intermediate columns and the two label columns whose rows were just removed.
df_train.drop(
    columns=['Class', 'List of classes', 'Len of classes', 'Emotions', 'Mapped Emotions', 'neutral', 'disgust'],
    inplace=True,
)
df_dev.drop(
    columns=['Class', 'List of classes', 'Len of classes', 'Emotions', 'Mapped Emotions', 'neutral', 'disgust'],
    inplace=True,
)

In [265]:
df_train.head(10)

Unnamed: 0,Text,ID,anger,fear,joy,sadness,surprise
2,WHY THE FUCK IS BAYLESS ISOING,eezlygj,1,0,0,0,0
3,To make her feel threatened,ed7ypvh,0,1,0,0,0
4,Dirty Southern Wankers,ed0bdzj,1,0,0,0,0
5,OmG pEyToN iSn'T gOoD eNoUgH tO hElP uS iN tHe...,edvnz26,0,0,0,0,1
6,Yes I heard abt the f bombs! That has to be wh...,ee3b6wu,0,0,1,0,0
7,We need more boards and to create a bit more s...,ef4qmod,0,0,1,0,0
8,Damn youtube and outrage drama is super lucrat...,ed8wbdn,0,0,1,0,0
10,Demographics? I don’t know anybody under 35 wh...,eel6g5h,0,0,0,0,1
11,"Aww... she'll probably come around eventually,...",edex4ki,0,0,1,0,0
13,R/sleeptrain Might be time for some sleep trai...,efh7xnk,0,0,1,0,0


In [266]:
contraction_mapping = {"ain't": "is not", "aren't": "are not","can't": "cannot", "'cause": "because", "could've": "could have", "couldn't": "could not", 
                       "didn't": "did not",  "doesn't": "does not", "don't": "do not", "hadn't": "had not", "hasn't": "has not", "haven't": "have not", 
                       "he'd": "he would","he'll": "he will", "he's": "he is", "how'd": "how did", "how'd'y": "how do you", "how'll": "how will", 
                       "how's": "how is",  "I'd": "I would", "I'd've": "I would have", "I'll": "I will", "I'll've": "I will have","I'm": "I am",
                       "I've": "I have", "i'd": "i would", "i'd've": "i would have", "i'll": "i will",  "i'll've": "i will have","i'm": "i am", 
                       "i've": "i have", "isn't": "is not", "it'd": "it would", "it'd've": "it would have", "it'll": "it will", "it'll've": "it will have",
                       "it's": "it is", "let's": "let us", "ma'am": "madam", "mayn't": "may not", "might've": "might have","mightn't": "might not",
                       "mightn't've": "might not have", "must've": "must have", "mustn't": "must not", "mustn't've": "must not have", "needn't": "need not", 
                       "needn't've": "need not have","o'clock": "of the clock", "oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not",
                       "sha'n't": "shall not", "shan't've": "shall not have", "she'd": "she would", "she'd've": "she would have", "she'll": "she will", 
                       "she'll've": "she will have", "she's": "she is", "should've": "should have", "shouldn't": "should not", "shouldn't've": "should not have",
                       "so've": "so have","so's": "so as", "this's": "this is","that'd": "that would", "that'd've": "that would have", "that's": "that is",
                       "there'd": "there would", "there'd've": "there would have", "there's": "there is", "here's": "here is","they'd": "they would",
                       "they'd've": "they would have", "they'll": "they will", "they'll've": "they will have", "they're": "they are", "they've": "they have",
                       "to've": "to have", "wasn't": "was not", "we'd": "we would", "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have",
                       "we're": "we are", "we've": "we have", "weren't": "were not", "what'll": "what will", "what'll've": "what will have", 
                       "what're": "what are",  "what's": "what is", "what've": "what have", "when's": "when is", "when've": "when have", "where'd": "where did",
                       "where's": "where is", "where've": "where have", "who'll": "who will", "who'll've": "who will have", "who's": "who is", 
                       "who've": "who have", "why's": "why is", "why've": "why have", "will've": "will have", "won't": "will not", "won't've": "will not have", 
                       "would've": "would have", "wouldn't": "would not", "wouldn't've": "would not have", "y'all": "you all", "y'all'd": "you all would",
                       "y'all'd've": "you all would have","y'all're": "you all are","y'all've": "you all have","you'd": "you would", "you'd've": "you would have",
                       "you'll": "you will", "you'll've": "you will have", "you're": "you are", "you've": "you have", 'u.s':'america', 'e.g':'for example'}

punct = [',', '.', '"', ':', ')', '(', '-', '!', '?', '|', ';', "'", '$', '&', '/', '[', ']', '>', '%', '=', '#', '*', '+', '\\', '•',  '~', '@', '£', 
 '·', '_', '{', '}', '©', '^', '®', '`',  '<', '→', '°', '€', '™', '›',  '♥', '←', '×', '§', '″', '′', 'Â', '█', '½', 'à', '…', 
 '“', '★', '”', '–', '●', 'â', '►', '−', '¢', '²', '¬', '░', '¶', '↑', '±', '¿', '▾', '═', '¦', '║', '―', '¥', '▓', '—', '‹', '─', 
 '▒', '：', '¼', '⊕', '▼', '▪', '†', '■', '’', '▀', '¨', '▄', '♫', '☆', 'é', '¯', '♦', '¤', '▲', 'è', '¸', '¾', 'Ã', '⋅', '‘', '∞', 
 '∙', '）', '↓', '、', '│', '（', '»', '，', '♪', '╩', '╚', '³', '・', '╦', '╣', '╔', '╗', '▬', '❤', 'ï', 'Ø', '¹', '≤', '‡', '√', ]

punct_mapping = {"‘": "'", "₹": "e", "´": "'", "°": "", "€": "e", "™": "tm", "√": " sqrt ", "×": "x", "²": "2", "—": "-", "–": "-", "’": "'", "_": "-",
                 "`": "'", '“': '"', '”': '"', '“': '"', "£": "e", '∞': 'infinity', 'θ': 'theta', '÷': '/', 'α': 'alpha', '•': '.', 'à': 'a', '−': '-', 
                 'β': 'beta', '∅': '', '³': '3', 'π': 'pi', '!':' '}

mispell_dict = {'colour': 'color', 'centre': 'center', 'favourite': 'favorite', 'travelling': 'traveling', 'counselling': 'counseling', 'theatre': 'theater',
                'cancelled': 'canceled', 'labour': 'labor', 'organisation': 'organization', 'wwii': 'world war 2', 'citicise': 'criticize', 'youtu ': 'youtube ',
                'Qoura': 'Quora', 'sallary': 'salary', 'Whta': 'What', 'narcisist': 'narcissist', 'howdo': 'how do', 'whatare': 'what are', 'howcan': 'how can',
                'howmuch': 'how much', 'howmany': 'how many', 'whydo': 'why do', 'doI': 'do I', 'theBest': 'the best', 'howdoes': 'how does', 
                'mastrubation': 'masturbation', 'mastrubate': 'masturbate', "mastrubating": 'masturbating', 'pennis': 'penis', 'Etherium': 'Ethereum', 
                'narcissit': 'narcissist', 'bigdata': 'big data', '2k17': '2017', '2k18': '2018', 'qouta': 'quota', 'exboyfriend': 'ex boyfriend', 
                'airhostess': 'air hostess', "whst": 'what', 'watsapp': 'whatsapp', 'demonitisation': 'demonetization', 'demonitization': 'demonetization',
                'demonetisation': 'demonetization'}

In [267]:
def clean_text(text):
    '''Normalise a raw comment into lower-case ASCII letters and basic punctuation.

    Steps, in order: convert emoji to ``:name:`` codes and delete them, lower-case,
    delete ``[...]`` spans, strip HTML, delete URLs and leftover tags, delete
    newlines, delete words containing a digit, and finally replace every run of
    characters other than letters and ``? . ! , ¿ '`` with a single space.
    '''
    text = emoji.demojize(text)
    # Deletes the :name: codes produced above, and any other text enclosed by two colons.
    text = re.sub(r'\:(.*?)\:', '', text)
    text = str(text).lower()
    text = re.sub(r'\[.*?\]', '', text)
    # Strip HTML markup before the URL/tag regexes run.
    text = BeautifulSoup(text, 'lxml').get_text()

    # Remaining substitutions, applied in this exact order.
    substitutions = (
        (r'https?://\S+|www\.\S+', ''),   # links
        (r'<.*?>+', ''),                   # leftover tags
        ('\n', ''),                        # newlines
        (r'\w*\d\w*', ''),                 # words containing digits
        (r"[^a-zA-Z?.!,¿']+", " "),        # everything outside the kept character set
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text

def clean_contractions(text, mapping):
    '''Expand contractions, then strip ASCII punctuation and collapse spacing.

    Args:
        text: Input string.
        mapping: Dict of contraction -> expansion, applied in dict order.

    Returns:
        The processed string.
    '''
    # Normalise the apostrophe look-alikes so the mapping keys can match.
    for apostrophe_variant in ("’", "‘", "´", "`"):
        text = text.replace(apostrophe_variant, "'")

    for contraction, expansion in mapping.items():
        if contraction in text:
            text = text.replace(contraction, expansion)

    # Remove every character listed in string.punctuation.
    punctuation_class = '[%s]' % re.escape(string.punctuation)
    text = re.sub(punctuation_class, '', text)
    # Pad any remaining ? . ! , ¿ with spaces, e.g. "he is a boy." => "he is a boy ."
    text = re.sub(r"([?.!,¿])", r" \1 ", text)
    # Collapse runs of spaces and double quotes into one space.
    return re.sub(r'[" "]+', " ", text)

def clean_special_chars(text, punct, mapping):
    '''Replace mapped symbols, pad punctuation with spaces, and drop a few stray tokens.

    Args:
        text: Input string.
        punct: Iterable of characters to surround with single spaces.
        mapping: Dict of symbol -> replacement, applied before the padding.

    Returns:
        The processed string.
    '''
    leftovers = {'\u200b': ' ', '…': ' ... ', '\ufeff': '', 'करना': '', 'है': ''}

    for symbol, substitute in mapping.items():
        text = text.replace(symbol, substitute)

    for mark in punct:
        text = text.replace(mark, f' {mark} ')

    for token, substitute in leftovers.items():
        text = text.replace(token, substitute)

    return text

def correct_spelling(x, dic):
    '''Apply every ``wrong -> right`` substring replacement from ``dic`` to ``x``.

    Replacements run in dict order and match plain substrings, not whole words.
    '''
    for misspelled, corrected in dic.items():
        x = x.replace(misspelled, corrected)
    return x

def remove_space(text):
    '''Trim the string and squeeze every whitespace run down to one space.'''
    words = text.split()
    return " ".join(words)

def text_preprocessing_pipeline(text):
    '''Run the full cleaning chain on one string and return the result.

    Order: clean_text -> clean_contractions -> clean_special_chars ->
    correct_spelling -> remove_space, using the notebook-level lookup tables.
    '''
    cleaned = clean_text(text)
    expanded = clean_contractions(cleaned, contraction_mapping)
    spaced = clean_special_chars(expanded, punct, punct_mapping)
    respelled = correct_spelling(spaced, mispell_dict)
    return remove_space(respelled)

In [268]:
# df_train['Text'] = df_train['Text'].apply(text_preprocessing_pipeline)
# df_dev['Text'] = df_dev['Text'].apply(text_preprocessing_pipeline)

In [269]:
# Write both filtered splits to CSV with a fresh 0..n-1 row order and no index column.
df_train.reset_index(drop=True).to_csv("train.csv", index=False)
df_dev.reset_index(drop=True).to_csv("val.csv", index=False)

In [270]:
# Renumber the rows 0..n-1 after the earlier row drops left gaps in the index.
df_train, df_dev = (frame.reset_index(drop=True) for frame in (df_train, df_dev))

In [271]:
df_train.head(10)

Unnamed: 0,Text,ID,anger,fear,joy,sadness,surprise
0,WHY THE FUCK IS BAYLESS ISOING,eezlygj,1,0,0,0,0
1,To make her feel threatened,ed7ypvh,0,1,0,0,0
2,Dirty Southern Wankers,ed0bdzj,1,0,0,0,0
3,OmG pEyToN iSn'T gOoD eNoUgH tO hElP uS iN tHe...,edvnz26,0,0,0,0,1
4,Yes I heard abt the f bombs! That has to be wh...,ee3b6wu,0,0,1,0,0
5,We need more boards and to create a bit more s...,ef4qmod,0,0,1,0,0
6,Damn youtube and outrage drama is super lucrat...,ed8wbdn,0,0,1,0,0
7,Demographics? I don’t know anybody under 35 wh...,eel6g5h,0,0,0,0,1
8,"Aww... she'll probably come around eventually,...",edex4ki,0,0,1,0,0
9,R/sleeptrain Might be time for some sleep trai...,efh7xnk,0,0,1,0,0


In [272]:
df_dev.head(10)

Unnamed: 0,Text,ID,anger,fear,joy,sadness,surprise
0,I've never been this sad in my life!,edcu99z,0,0,0,1,0
1,He could have easily taken a real camera from ...,eepig6r,0,0,1,0,0
2,"Thank you for your vote of confidence, but we ...",eczm50f,0,0,1,0,0
3,Wah Mum other people call me on my bullshit an...,ed4yr9r,1,0,0,0,0
4,At least now [NAME] has more time to gain his ...,eekez9p,0,0,1,0,0
5,Good. We don't want more thrash liberal offspr...,ee0fxpu,1,0,0,0,0
6,It's better to say a moment like that could tr...,eelgxk0,0,0,1,0,0
7,I went to a destination wedding being the only...,eczyj0h,0,0,0,1,1
8,He died 4 days later of dehydration,edwloev,0,0,0,1,0
9,Lol dream on buddy. You’ve had enough attentio...,ed2znun,1,0,1,1,0


In [273]:
print(df_train.shape)
print(df_dev.shape)

(28427, 7)
(3564, 7)


In [274]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cuda'

In [275]:
# Training configuration

# Key values used later by the dataset, data loaders, optimizer and training loop
MAX_LEN = 200  # max_length passed to the tokenizer (sequences are padded/truncated to it)
TRAIN_BATCH_SIZE = 64
VALID_BATCH_SIZE = 64
EPOCHS = 10  # number of passes over the training loader
LEARNING_RATE = 2e-5
# Tokenizer matching the 'roberta-base' checkpoint loaded by the model class
tokenizer = AutoTokenizer.from_pretrained('roberta-base')

In [276]:
# Every column other than the comment text and its id is a label column.
target_cols = [name for name in df_train.columns if name not in ('Text', 'ID')]
target_cols

['anger', 'fear', 'joy', 'sadness', 'surprise']

In [277]:
class BERTDataset(Dataset):
    """Dataset that tokenises one comment per item for multi-label classification.

    Args:
        df: DataFrame with a ``Text`` column and one numeric column per label.
        tokenizer: Object exposing ``encode_plus`` that returns ``input_ids``,
            ``attention_mask`` and ``token_type_ids``.
        max_len: ``max_length`` handed to the tokenizer.
        target_columns: Names of the label columns. When omitted, the
            notebook-level ``target_cols`` list is used, as before.
    """

    def __init__(self, df, tokenizer, max_len, target_columns=None):
        if target_columns is None:
            target_columns = target_cols
        self.df = df
        self.max_len = max_len
        self.text = df.Text
        self.tokenizer = tokenizer
        self.targets = df[target_columns].values

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        """Return the encoded text and float label vector at position ``index``."""
        # Positional lookup, matching the positional ``self.targets`` array.
        # The previous label lookup (``self.text[index]``) only worked when the
        # frame had a fresh 0..n-1 index and otherwise raised KeyError or
        # paired a text with the wrong labels.
        text = self.text.iloc[index]
        inputs = self.tokenizer.encode_plus(
            text,
            truncation=True,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=True
        )

        return {
            'ids': torch.tensor(inputs['input_ids'], dtype=torch.long),
            'mask': torch.tensor(inputs['attention_mask'], dtype=torch.long),
            'token_type_ids': torch.tensor(inputs["token_type_ids"], dtype=torch.long),
            'targets': torch.tensor(self.targets[index], dtype=torch.float)
        }

In [278]:
# Wrap both splits in the tokenising dataset, sharing one tokenizer and sequence length.
train_dataset = BERTDataset(df_train, tokenizer, MAX_LEN)
valid_dataset = BERTDataset(df_dev, tokenizer, MAX_LEN)

In [279]:
# Batch both datasets; only the training loader shuffles, the validation loader keeps row order.
train_loader = DataLoader(train_dataset, batch_size=TRAIN_BATCH_SIZE, 
                          num_workers=4, shuffle=True, pin_memory=True)
valid_loader = DataLoader(valid_dataset, batch_size=VALID_BATCH_SIZE, 
                          num_workers=4, shuffle=False, pin_memory=True)

In [280]:
class BERTClass(torch.nn.Module):
    """Pretrained encoder followed by one linear layer that emits a logit per label.

    Args:
        model_name: Checkpoint name passed to ``AutoModel.from_pretrained``.
            Defaults to ``'roberta-base'``, the checkpoint used previously.
        num_labels: Number of output logits. Defaults to 5, the value used previously.
    """

    def __init__(self, model_name='roberta-base', num_labels=5):
        super(BERTClass, self).__init__()
        self.roberta = AutoModel.from_pretrained(model_name)
        # Take the input width from the encoder config instead of hard-coding 768
        # (the two agree for roberta-base).
        self.fc = torch.nn.Linear(self.roberta.config.hidden_size, num_labels)

    def forward(self, ids, mask, token_type_ids):
        """Return raw logits; callers apply the sigmoid (or a logits-based loss)."""
        # With return_dict=False the encoder returns a tuple. The second element
        # feeds the head; per the transformers docs this is the pooled output.
        _, features = self.roberta(ids, attention_mask = mask, token_type_ids = token_type_ids, return_dict=False)
        return self.fc(features)

    def predict(self, ids, mask, token_type_ids):
        """Inference alias for ``forward``, kept so existing callers still work.

        It does not switch to eval mode or disable gradients; callers do that.
        """
        return self.forward(ids, mask, token_type_ids)


# Build the classifier and move its parameters to the selected device.
model = BERTClass()
# Trailing semicolon suppresses the module repr in the cell output.
model.to(device);

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [281]:
def loss_fn(outputs, targets):
    """Binary cross-entropy on raw logits, with the criterion's default settings.

    Args:
        outputs: Logits produced by the model.
        targets: Float tensor of the same shape holding the 0/1 labels.

    Returns:
        The loss tensor produced by ``torch.nn.BCEWithLogitsLoss``.
    """
    criterion = torch.nn.BCEWithLogitsLoss()
    return criterion(outputs, targets)

In [282]:
# Use PyTorch's own AdamW. The AdamW class exported by `transformers` is
# deprecated and absent from recent releases, so relying on it breaks with
# an unpinned install.
optimizer = torch.optim.AdamW(params=model.parameters(), lr=LEARNING_RATE, weight_decay=1e-6)

In [283]:
# Pull batches from the training loader and stop after the fourth one, leaving
# its tensors in ids / mask / token_type_ids / targets for inspection below.
idx = 0
for _, data in enumerate(train_loader, 0):
  ids = data['ids'].to(device, dtype = torch.long)
  mask = data['mask'].to(device, dtype = torch.long)
  token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
  targets = data['targets'].to(device, dtype = torch.float)
  idx += 1
  # idx counts batches seen so far; break once the fourth has been loaded.
  if idx == 4:
    break

In [284]:
print(ids.shape)
print(mask.shape)
print(token_type_ids.shape)
print(targets.shape)

torch.Size([64, 200])
torch.Size([64, 200])
torch.Size([64, 200])
torch.Size([64, 5])


In [285]:
ids[63]

tensor([  0, 713,  16,  98,   6,  98, 372,   4,   2,   1,   1,   1,   1,   1,
          1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
          1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
          1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
          1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
          1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
          1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
          1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
          1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
          1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
          1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
          1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
          1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,  

In [286]:
mask[63]

tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0], device='cuda:0')

In [287]:
token_type_ids[63]

tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0], device='cuda:0')

In [288]:
targets[63]

tensor([0., 0., 1., 0., 0.], device='cuda:0')

In [289]:
def train(epoch):
    """Run one pass over ``train_loader``, updating the notebook-level ``model``.

    Args:
        epoch: Epoch number, used only in the progress message.

    The loss is printed for every 500th batch, starting with the first.
    """
    model.train()
    for step, batch in enumerate(train_loader):
        ids = batch['ids'].to(device, dtype=torch.long)
        mask = batch['mask'].to(device, dtype=torch.long)
        token_type_ids = batch['token_type_ids'].to(device, dtype=torch.long)
        targets = batch['targets'].to(device, dtype=torch.float)

        logits = model(ids, mask, token_type_ids)
        loss = loss_fn(logits, targets)

        if step % 500 == 0:
            print(f'Epoch: {epoch}, Loss:  {loss.item()}')

        # Backpropagate, apply the update, then clear gradients for the next batch.
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

In [290]:
# Train for EPOCHS full passes over the training loader.
for epoch in range(EPOCHS):
    train(epoch)

Epoch: 0, Loss:  0.7013913989067078
Epoch: 1, Loss:  0.16987574100494385
Epoch: 2, Loss:  0.12768509984016418
Epoch: 3, Loss:  0.1525919884443283
Epoch: 4, Loss:  0.1342325061559677
Epoch: 5, Loss:  0.11880135536193848
Epoch: 6, Loss:  0.10655412822961807
Epoch: 7, Loss:  0.051942128688097
Epoch: 8, Loss:  0.04491617903113365
Epoch: 9, Loss:  0.03040575422346592


In [291]:
def validation():
    """Score every batch of ``valid_loader`` with the notebook-level ``model``.

    Returns:
        ``(fin_outputs, fin_targets)``: two lists of per-example lists, holding
        the sigmoid probabilities and the gold labels respectively.
    """
    model.eval()
    fin_targets, fin_outputs = [], []
    with torch.no_grad():
        for batch in valid_loader:
            ids = batch['ids'].to(device, dtype=torch.long)
            mask = batch['mask'].to(device, dtype=torch.long)
            token_type_ids = batch['token_type_ids'].to(device, dtype=torch.long)
            targets = batch['targets'].to(device, dtype=torch.float)

            logits = model(ids, mask, token_type_ids)
            probabilities = torch.sigmoid(logits)

            fin_targets += targets.cpu().detach().numpy().tolist()
            fin_outputs += probabilities.cpu().detach().numpy().tolist()
    return fin_outputs, fin_targets

In [292]:
# Score the validation split and threshold each probability at 0.5 to get predicted labels.
outputs, targets = validation()
outputs = np.array(outputs) >= 0.5
# Accuracy plus micro- and macro-averaged F1 from sklearn.metrics.
accuracy = metrics.accuracy_score(targets, outputs)
f1_score_micro = metrics.f1_score(targets, outputs, average='micro')
f1_score_macro = metrics.f1_score(targets, outputs, average='macro')
print(f"Accuracy Score = {accuracy}")
print(f"F1 Score (Micro) = {f1_score_micro}")
print(f"F1 Score (Macro) = {f1_score_macro}")

Accuracy Score = 0.7567340067340067
F1 Score (Micro) = 0.8249544152122948
F1 Score (Macro) = 0.7512207685206688


In [302]:
def sentiment_predict(new_sentence):
    """Predict the label mask for one sentence with the notebook-level model.

    Args:
        new_sentence: Text to classify.

    Returns:
        Boolean NumPy array of shape ``(1, num_labels)``; an entry is True when
        the sigmoid probability for that label is at least 0.5.
    """
    model.eval()
    with torch.no_grad():
        inputs = tokenizer.encode_plus(
            new_sentence,
            truncation=True,
            add_special_tokens=True,
            max_length=MAX_LEN,
            padding='max_length',
            return_token_type_ids=True
        )

        # view(1, -1) adds the batch dimension for any sequence length. The
        # previous view(-1, 200) hard-coded the length and would mis-shape or
        # fail as soon as MAX_LEN changed.
        ids = torch.tensor(inputs['input_ids'], dtype=torch.long).view(1, -1)
        mask = torch.tensor(inputs['attention_mask'], dtype=torch.long).view(1, -1)
        token_type_ids = torch.tensor(inputs["token_type_ids"], dtype=torch.long).view(1, -1)

        ids = ids.to(device, dtype=torch.long)
        mask = mask.to(device, dtype=torch.long)
        token_type_ids = token_type_ids.to(device, dtype=torch.long)

        outputs = model.predict(ids, mask, token_type_ids)
        probabilities = torch.sigmoid(outputs).cpu().detach().numpy().tolist()

    return np.array(probabilities) >= 0.5

In [315]:
df_dev.head(30)

Unnamed: 0,Text,ID,anger,fear,joy,sadness,surprise
0,I've never been this sad in my life!,edcu99z,0,0,0,1,0
1,He could have easily taken a real camera from ...,eepig6r,0,0,1,0,0
2,"Thank you for your vote of confidence, but we ...",eczm50f,0,0,1,0,0
3,Wah Mum other people call me on my bullshit an...,ed4yr9r,1,0,0,0,0
4,At least now [NAME] has more time to gain his ...,eekez9p,0,0,1,0,0
5,Good. We don't want more thrash liberal offspr...,ee0fxpu,1,0,0,0,0
6,It's better to say a moment like that could tr...,eelgxk0,0,0,1,0,0
7,I went to a destination wedding being the only...,eczyj0h,0,0,0,1,1
8,He died 4 days later of dehydration,edwloev,0,0,0,1,0
9,Lol dream on buddy. You’ve had enough attentio...,ed2znun,1,0,1,1,0


In [317]:
# Spot-check validation row 0: print its text, the predicted label mask, and the gold row.
test = df_dev.iloc[0]
text = test.Text
print(text)
print(sentiment_predict(text))
print(test)

I've never been this sad in my life!
[[False False False  True False]]
Text        I've never been this sad in my life!
ID                                       edcu99z
anger                                          0
fear                                           0
joy                                            0
sadness                                        1
surprise                                       0
Name: 0, dtype: object


In [316]:
# Spot-check validation row 7: print its text, the predicted label mask, and the gold row.
test = df_dev.iloc[7]
text = test.Text
print(text)
print(sentiment_predict(text))
print(test)

I went to a destination wedding being the only single person. Promised to never put myself in that situation again.
[[False False  True False False]]
Text        I went to a destination wedding being the only...
ID                                                    eczyj0h
anger                                                       0
fear                                                        0
joy                                                         0
sadness                                                     1
surprise                                                    1
Name: 7, dtype: object


In [318]:
# Spot-check validation row 9: print its text, the predicted label mask, and the gold row.
test = df_dev.iloc[9]
text = test.Text
print(text)
print(sentiment_predict(text))
print(test)

Lol dream on buddy. You’ve had enough attention today. Actually learn what your talking about helps a lot. Sorry your stuck in free roam smokin crack
[[False False  True  True False]]
Text        Lol dream on buddy. You’ve had enough attentio...
ID                                                    ed2znun
anger                                                       1
fear                                                        0
joy                                                         1
sadness                                                     1
surprise                                                    0
Name: 9, dtype: object


In [319]:
# Spot-check validation row 16: print its text, the predicted label mask, and the gold row.
test = df_dev.iloc[16]
text = test.Text
print(text)
print(sentiment_predict(text))
print(test)

oh shoot, im sorry to hear that. was it someone close to you? 
[[False False False  True  True]]
Text        oh shoot, im sorry to hear that. was it someon...
ID                                                    ed2qbu3
anger                                                       0
fear                                                        0
joy                                                         0
sadness                                                     1
surprise                                                    1
Name: 16, dtype: object
