In [281]:
import os
import re
import string
import json
import emoji
import numpy as np
import pandas as pd
from sklearn import metrics
from bs4 import BeautifulSoup
import transformers
import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, AutoTokenizer, BertModel, BertConfig, AutoModel, AdamW
import warnings
warnings.filterwarnings('ignore')

pd.set_option("display.max_columns", None)

In [208]:
# train/test dataset from https://www.kaggle.com/datasets/5c493139434db3371ed713ee501ed0cfa1ed39e6f808feb59bae312379767830/code?resource=download
# kaggle GoEmotions
# https://www.kaggle.com/code/debarshichanda/bert-multi-label-text-classification

In [282]:
df_train = pd.read_csv("/home/keonwoo/anaconda3/envs/bgmRS/data/eng/train_eng.tsv", sep='\t', header=None, names=['Text', 'Class', 'ID'])
df_dev = pd.read_csv("/home/keonwoo/anaconda3/envs/bgmRS/data/eng/dev_eng.tsv", sep='\t', header=None, names=['Text', 'Class', 'ID'])

In [283]:
df_train['List of classes'] = df_train['Class'].apply(lambda x: x.split(','))
df_train['Len of classes'] = df_train['List of classes'].apply(lambda x: len(x))
df_dev['List of classes'] = df_dev['Class'].apply(lambda x: x.split(','))
df_dev['Len of classes'] = df_dev['List of classes'].apply(lambda x: len(x))

In [284]:
ekman_mapping = {
    "Chill" : ["pride"],
    "Exciting(Tense)" : ["excitement", "joy"],
    "Funny" : ["surprise", "curiosity", "amusement"] ,
    "Groovy" : ["desire", "gratitude"],
    "Peaceful" : ["caring", "relief"],
    "Romantic" : ["love"],
    "Scary/Creepy/Mysterious" : ["fear", "nervousness", "anger", "annoyance", "disapproval", "disgust"],
    "Touching" : ["sadness", "disappointment", "remorse"],
    "Uplifting/Hopeful" : ["admiration", "grief", "optimism"]
}

In [285]:
"""
ekman_mapping = {
"anger": ["anger", "annoyance", "disapproval"],
"disgust": ["disgust"],
"fear": ["fear", "nervousness"],
"joy": ["joy", "amusement", "approval", "excitement", "gratitude",  "love", "optimism", "relief", "pride", "admiration", "desire", "caring"],
"sadness": ["sadness", "disappointment", "embarrassment", "grief",  "remorse"],
"surprise": ["surprise", "realization", "confusion", "curiosity"]
}
"""
emotion_list = ['admiration',
'amusement',
'anger',
'annoyance',
'approval',
'caring',
'confusion',
'curiosity',
'desire',
'disappointment',
'disapproval',
'disgust',
'embarrassment',
'excitement',
'fear',
'gratitude',
'grief',
'joy',
'love',
'nervousness',
'optimism',
'pride',
'realization',
'relief',
'remorse',
'sadness',
'surprise',
'neutral']

In [286]:
def idx2class(idx_list):
    arr = []
    for i in idx_list:
        arr.append(emotion_list[int(i)])
    return arr

In [287]:
df_train['Emotions'] = df_train['List of classes'].apply(idx2class)
df_dev['Emotions'] = df_dev['List of classes'].apply(idx2class)

In [288]:
def EmotionMapping(emotion_list):
    map_list = []
    
    for i in emotion_list:
        if i in ekman_mapping['Chill']:
            map_list.append('Chill')
        if i in ekman_mapping['Exciting(Tense)']:
            map_list.append('Exciting(Tense)')
        if i in ekman_mapping['Funny']:
            map_list.append('Funny')
        if i in ekman_mapping['Groovy']:
            map_list.append('Groovy')
        if i in ekman_mapping['Peaceful']:
            map_list.append('Peaceful')
        if i in ekman_mapping['Romantic']:
            map_list.append('Romantic')
        if i in ekman_mapping["Scary/Creepy/Mysterious"]:
            map_list.append("Scary/Creepy/Mysterious")
        if i in ekman_mapping['Touching']:
            map_list.append('Touching')
        if i in ekman_mapping['Uplifting/Hopeful']:
            map_list.append('Uplifting/Hopeful')
        if i == 'neutral':
            map_list.append('neutral')
            
    return map_list

In [289]:
df_train['Mapped Emotions'] = df_train['Emotions'].apply(EmotionMapping)
df_dev['Mapped Emotions'] = df_dev['Emotions'].apply(EmotionMapping)

In [290]:
ekman_mapping = {
    "Chill" : ["pride"],
    "Exciting(Tense)" : ["excitement", "joy"],
    "Funny" : ["surprise", "curiosity", "amusement"] ,
    "Groovy" : ["desire", "gratitude"],
    "Peaceful" : ["caring", "relief"],
    "Romantic" : ["love"],
    "Scary/Creepy/Mysterious" : ["fear", "nervousness", "anger", "annoyance", "disapproval", "disgust"],
    "Touching" : ["sadness", "disappointment", "remorse"],
    "Uplifting/Hopeful" : ["admiration", "grief", "optimism"]
}

In [291]:
df_train['Chill'] = np.zeros((len(df_train),1))
df_train['Exciting(Tense)'] = np.zeros((len(df_train),1))
df_train['Funny'] = np.zeros((len(df_train),1))
df_train['Groovy'] = np.zeros((len(df_train),1))
df_train['Peaceful'] = np.zeros((len(df_train),1))
df_train['Romantic'] = np.zeros((len(df_train),1))
df_train['Scary/Creepy/Mysterious'] = np.zeros((len(df_train),1))
df_train['Touching'] = np.zeros((len(df_train),1))
df_train['Uplifting/Hopeful'] = np.zeros((len(df_train),1))

df_dev['Chill'] = np.zeros((len(df_dev),1))
df_dev['Exciting(Tense)'] = np.zeros((len(df_dev),1))
df_dev['Funny'] = np.zeros((len(df_dev),1))
df_dev['Groovy'] = np.zeros((len(df_dev),1))
df_dev['Peaceful'] = np.zeros((len(df_dev),1))
df_dev['Romantic'] = np.zeros((len(df_dev),1))
df_dev['Scary/Creepy/Mysterious'] = np.zeros((len(df_dev),1))
df_dev['Touching'] = np.zeros((len(df_dev),1))
df_dev['Uplifting/Hopeful'] = np.zeros((len(df_dev),1))

In [292]:
for i in ['Chill', 'Exciting(Tense)', 'Funny', 'Groovy', 'Peaceful', 'Romantic','Scary/Creepy/Mysterious', 'Touching', 'Uplifting/Hopeful']:
    df_train[i] = df_train['Mapped Emotions'].apply(lambda x: 1 if i in x else 0)
    df_dev[i] = df_dev['Mapped Emotions'].apply(lambda x: 1 if i in x else 0)

In [293]:
df_train.drop(['Class', 'List of classes', 'Len of classes', 'Emotions', 'Mapped Emotions'], axis=1, inplace=True)
df_dev.drop(['Class', 'List of classes', 'Len of classes', 'Emotions', 'Mapped Emotions'], axis=1, inplace=True)

In [221]:
# df_train.drop(df_train[df_train['neutral'] == 1].index, inplace=True)
# df_dev.drop(df_dev[df_dev['neutral'] == 1].index, inplace=True)
# df_train.drop(df_train[df_train['disgust'] == 1].index, inplace=True)
# df_dev.drop(df_dev[df_dev['disgust'] == 1].index, inplace=True)

In [294]:
contraction_mapping = {"ain't": "is not", "aren't": "are not","can't": "cannot", "'cause": "because", "could've": "could have", "couldn't": "could not", 
                       "didn't": "did not",  "doesn't": "does not", "don't": "do not", "hadn't": "had not", "hasn't": "has not", "haven't": "have not", 
                       "he'd": "he would","he'll": "he will", "he's": "he is", "how'd": "how did", "how'd'y": "how do you", "how'll": "how will", 
                       "how's": "how is",  "I'd": "I would", "I'd've": "I would have", "I'll": "I will", "I'll've": "I will have","I'm": "I am",
                       "I've": "I have", "i'd": "i would", "i'd've": "i would have", "i'll": "i will",  "i'll've": "i will have","i'm": "i am", 
                       "i've": "i have", "isn't": "is not", "it'd": "it would", "it'd've": "it would have", "it'll": "it will", "it'll've": "it will have",
                       "it's": "it is", "let's": "let us", "ma'am": "madam", "mayn't": "may not", "might've": "might have","mightn't": "might not",
                       "mightn't've": "might not have", "must've": "must have", "mustn't": "must not", "mustn't've": "must not have", "needn't": "need not", 
                       "needn't've": "need not have","o'clock": "of the clock", "oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not",
                       "sha'n't": "shall not", "shan't've": "shall not have", "she'd": "she would", "she'd've": "she would have", "she'll": "she will", 
                       "she'll've": "she will have", "she's": "she is", "should've": "should have", "shouldn't": "should not", "shouldn't've": "should not have",
                       "so've": "so have","so's": "so as", "this's": "this is","that'd": "that would", "that'd've": "that would have", "that's": "that is",
                       "there'd": "there would", "there'd've": "there would have", "there's": "there is", "here's": "here is","they'd": "they would",
                       "they'd've": "they would have", "they'll": "they will", "they'll've": "they will have", "they're": "they are", "they've": "they have",
                       "to've": "to have", "wasn't": "was not", "we'd": "we would", "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have",
                       "we're": "we are", "we've": "we have", "weren't": "were not", "what'll": "what will", "what'll've": "what will have", 
                       "what're": "what are",  "what's": "what is", "what've": "what have", "when's": "when is", "when've": "when have", "where'd": "where did",
                       "where's": "where is", "where've": "where have", "who'll": "who will", "who'll've": "who will have", "who's": "who is", 
                       "who've": "who have", "why's": "why is", "why've": "why have", "will've": "will have", "won't": "will not", "won't've": "will not have", 
                       "would've": "would have", "wouldn't": "would not", "wouldn't've": "would not have", "y'all": "you all", "y'all'd": "you all would",
                       "y'all'd've": "you all would have","y'all're": "you all are","y'all've": "you all have","you'd": "you would", "you'd've": "you would have",
                       "you'll": "you will", "you'll've": "you will have", "you're": "you are", "you've": "you have", 'u.s':'america', 'e.g':'for example'}

punct = [',', '.', '"', ':', ')', '(', '-', '!', '?', '|', ';', "'", '$', '&', '/', '[', ']', '>', '%', '=', '#', '*', '+', '\\', '•',  '~', '@', '£', 
 '·', '_', '{', '}', '©', '^', '®', '`',  '<', '→', '°', '€', '™', '›',  '♥', '←', '×', '§', '″', '′', 'Â', '█', '½', 'à', '…', 
 '“', '★', '”', '–', '●', 'â', '►', '−', '¢', '²', '¬', '░', '¶', '↑', '±', '¿', '▾', '═', '¦', '║', '―', '¥', '▓', '—', '‹', '─', 
 '▒', '：', '¼', '⊕', '▼', '▪', '†', '■', '’', '▀', '¨', '▄', '♫', '☆', 'é', '¯', '♦', '¤', '▲', 'è', '¸', '¾', 'Ã', '⋅', '‘', '∞', 
 '∙', '）', '↓', '、', '│', '（', '»', '，', '♪', '╩', '╚', '³', '・', '╦', '╣', '╔', '╗', '▬', '❤', 'ï', 'Ø', '¹', '≤', '‡', '√', ]

punct_mapping = {"‘": "'", "₹": "e", "´": "'", "°": "", "€": "e", "™": "tm", "√": " sqrt ", "×": "x", "²": "2", "—": "-", "–": "-", "’": "'", "_": "-",
                 "`": "'", '“': '"', '”': '"', '“': '"', "£": "e", '∞': 'infinity', 'θ': 'theta', '÷': '/', 'α': 'alpha', '•': '.', 'à': 'a', '−': '-', 
                 'β': 'beta', '∅': '', '³': '3', 'π': 'pi', '!':' '}

mispell_dict = {'colour': 'color', 'centre': 'center', 'favourite': 'favorite', 'travelling': 'traveling', 'counselling': 'counseling', 'theatre': 'theater',
                'cancelled': 'canceled', 'labour': 'labor', 'organisation': 'organization', 'wwii': 'world war 2', 'citicise': 'criticize', 'youtu ': 'youtube ',
                'Qoura': 'Quora', 'sallary': 'salary', 'Whta': 'What', 'narcisist': 'narcissist', 'howdo': 'how do', 'whatare': 'what are', 'howcan': 'how can',
                'howmuch': 'how much', 'howmany': 'how many', 'whydo': 'why do', 'doI': 'do I', 'theBest': 'the best', 'howdoes': 'how does', 
                'mastrubation': 'masturbation', 'mastrubate': 'masturbate', "mastrubating": 'masturbating', 'pennis': 'penis', 'Etherium': 'Ethereum', 
                'narcissit': 'narcissist', 'bigdata': 'big data', '2k17': '2017', '2k18': '2018', 'qouta': 'quota', 'exboyfriend': 'ex boyfriend', 
                'airhostess': 'air hostess', "whst": 'what', 'watsapp': 'whatsapp', 'demonitisation': 'demonetization', 'demonitization': 'demonetization',
                'demonetisation': 'demonetization'}

In [295]:
def clean_text(text):
    '''Clean emoji, Make text lowercase, remove text in square brackets,remove links,remove punctuation
    and remove words containing numbers.'''
    text = emoji.demojize(text)
    text = re.sub(r'\:(.*?)\:','',text)
    text = str(text).lower()    #Making Text Lowercase
    text = re.sub('\[.*?\]', '', text)
    #The next 2 lines remove html text
    # text = BeautifulSoup(text, 'lxml').get_text()
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)
    # replacing everything with space except (a-z, A-Z, ".", "?", "!", ",", "'")
    text = re.sub(r"[^a-zA-Z?.!,¿']+", " ", text)
    return text

def clean_contractions(text, mapping):
    '''Clean contraction using contraction mapping'''    
    specials = ["’", "‘", "´", "`"]
    for s in specials:
        text = text.replace(s, "'")
    for word in mapping.keys():
        if ""+word+"" in text:
            text = text.replace(""+word+"", ""+mapping[word]+"")
    #Remove Punctuations
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    # creating a space between a word and the punctuation following it
    # eg: "he is a boy." => "he is a boy ."
    text = re.sub(r"([?.!,¿])", r" \1 ", text)
    text = re.sub(r'[" "]+', " ", text)
    return text

def clean_special_chars(text, punct, mapping):
    '''Cleans special characters present(if any)'''   
    for p in mapping:
        text = text.replace(p, mapping[p])
    
    for p in punct:
        text = text.replace(p, f' {p} ')
    
    specials = {'\u200b': ' ', '…': ' ... ', '\ufeff': '', 'करना': '', 'है': ''}  
    for s in specials:
        text = text.replace(s, specials[s])
    
    return text

def correct_spelling(x, dic):
    '''Corrects common spelling errors'''   
    for word in dic.keys():
        x = x.replace(word, dic[word])
    return x

def remove_space(text):
    '''Removes awkward spaces'''   
    #Removes awkward spaces 
    text = text.strip()
    text = text.split()
    return " ".join(text)

def text_preprocessing_pipeline(text):
    '''Cleaning and parsing the text.'''
    text = clean_text(text)
    text = clean_contractions(text, contraction_mapping)
    text = clean_special_chars(text, punct, punct_mapping)
    text = correct_spelling(text, mispell_dict)
    text = remove_space(text)
    return text

In [296]:
df_train['Text'] = df_train['Text'].apply(text_preprocessing_pipeline)
df_dev['Text'] = df_dev['Text'].apply(text_preprocessing_pipeline)

In [297]:
df_train['sum'] = df_train['Chill'] + df_train['Exciting(Tense)'] + df_train['Funny'] + df_train['Groovy'] + df_train['Peaceful'] + df_train['Romantic'] + df_train['Scary/Creepy/Mysterious'] + df_train['Touching'] + df_train['Uplifting/Hopeful']
df_dev['sum'] = df_dev['Chill'] + df_dev['Exciting(Tense)'] + df_dev['Funny'] + df_dev['Groovy'] + df_dev['Peaceful'] + df_dev['Romantic'] + df_dev['Scary/Creepy/Mysterious'] + df_dev['Touching'] + df_dev['Uplifting/Hopeful']

In [28]:
"""indexNames_train = df_train[df_train['sum'] == 0].index
indexNames_dev = df_dev[df_dev['sum'] == 0].index

df_train.drop(indexNames_train,inplace=True)
df_dev.drop(indexNames_dev,inplace=True)"""

In [298]:
# tmp

indexNames_train = df_train[df_train['sum'] == 1].index
indexNames_dev = df_dev[df_dev['sum'] == 1].index

df_train = df_train.iloc[indexNames_train]
df_dev= df_dev.iloc[indexNames_dev]

In [299]:
def label_df(df, columns):
    df_value = df.values[2:-1]

    return columns[np.argmax(df_value)]

In [300]:
labels = []
for i in range(len(df_train)):
    df = df_train.iloc[i]
    col = label_df(df,df_train.columns[2:-1])
    labels.append(col)

In [301]:
val_labels = []
for i in range(len(df_dev)):
    df = df_dev.iloc[i]
    col = label_df(df,df_dev.columns[2:-1])
    val_labels.append(col)

In [302]:
df_train.drop(np.unique(np.array(labels)),axis=1,inplace=True)
df_train.drop(['ID','sum'],axis=1,inplace=True)

df_dev.drop(np.unique(np.array(labels)),axis=1,inplace=True)
df_dev.drop(['ID','sum'],axis=1,inplace=True)

In [303]:
df_train = df_train.reset_index(drop=True)
df_dev = df_dev.reset_index(drop=True)

In [304]:
train_label = pd.DataFrame(labels,columns=['label'])
valid_label = pd.DataFrame(val_labels,columns=['label'])

In [305]:
df_train = pd.concat([df_train,train_label],axis=1)

df_dev = pd.concat([df_dev,valid_label],axis=1)

In [306]:
df_scary = df_train[df_train['label']=="Scary/Creepy/Mysterious"]
df_scary = df_scary.sample(n=850)
df_scary.reset_index(drop=True)

df_fun = df_train[df_train['label']=="Funny"]
df_fun = df_fun.sample(n=850)
df_fun.reset_index(drop=True)

df_hope = df_train[df_train['label']=="Uplifting/Hopeful"]
df_hope = df_hope.sample(n=850)
df_hope.reset_index(drop=True)

df_groovy = df_train[df_train['label']=="Groovy"]
df_groovy = df_groovy.sample(n=850)
df_groovy.reset_index(drop=True)

df_touch = df_train[df_train['label']=="Touching"]
df_touch = df_touch.sample(n=850)
df_touch.reset_index(drop=True)

df_excite = df_train[df_train['label']=="Exciting(Tense)"]
df_excite = df_excite.sample(n=850)
df_excite.reset_index(drop=True)

df_romance = df_train[df_train['label']=="Romantic"]
df_romance = df_romance.sample(n=850)
df_romance.reset_index(drop=True)

df_peace = df_train[df_train['label']=="Peaceful"]
df_peace = df_peace.sample(n=850)
df_peace.reset_index(drop=True)

df_chill = df_train[df_train['label']=="Chill"]
df_chill.reset_index(drop=True)

Unnamed: 0,Text,label
0,i am just like this glad to know i m not imagi...,Chill
1,so proud our sub is leaking into facebook,Chill
2,i have never been so proud of humanity,Chill
3,still won the race though ha who s a failure now,Chill
4,my man,Chill
...,...,...
56,i am so proud of you,Chill
57,because we re fucking geniuses,Chill
58,it is my masterpiece,Chill
59,of course i love myself because i am awesome,Chill


In [307]:
df_scary_val = df_dev[df_dev['label']=="Scary/Creepy/Mysterious"]
df_scary_val = df_scary.sample(n=850)
df_scary_val.reset_index(drop=True)

df_fun_val = df_dev[df_dev['label']=="Funny"]
df_fun_val = df_fun.sample(n=850)
df_fun_val.reset_index(drop=True)

df_hope_val = df_dev[df_dev['label']=="Uplifting/Hopeful"]
df_hope_val = df_hope.sample(n=850)
df_hope_val.reset_index(drop=True)

df_groovy_val = df_dev[df_dev['label']=="Groovy"]
df_groovy_val = df_groovy.sample(n=850)
df_groovy_val.reset_index(drop=True)

df_touch_val = df_dev[df_dev['label']=="Touching"]
df_touch_val = df_touch.sample(n=850)
df_touch_val.reset_index(drop=True)

df_excite_val = df_dev[df_dev['label']=="Exciting(Tense)"]
df_excite_val = df_excite.sample(n=850)
df_excite_val.reset_index(drop=True)

df_romance_val = df_dev[df_dev['label']=="Romantic"]
df_romance_val = df_romance.sample(n=850)
df_romance_val.reset_index(drop=True)

df_peace_val = df_dev[df_dev['label']=="Peaceful"]
df_peace_val = df_peace.sample(n=850)
df_peace_val.reset_index(drop=True)

df_chill_val = df_dev[df_dev['label']=="Chill"]
df_chill_val.reset_index(drop=True)

Unnamed: 0,Text,label
0,i had a monster panic attack myself and also m...,Chill
1,i am and i have slept with dozens of girls,Chill
2,he is he will make the browns proud,Chill
3,we play some sexy football at times,Chill
4,i feel truly honored that you gave up the jubi...,Chill
5,you cannot arrest me i am attractive,Chill
6,that guy his dad was so proud he threw that st...,Chill
7,i m proud of you for moving out you deserve pe...,Chill
8,everyone has their own preferences i happen to...,Chill
9,that is a proud dad,Chill


In [308]:
df_train = pd.concat([df_hope, df_touch, df_scary,
                      df_romance, df_peace, df_fun,
                      df_groovy,df_excite, df_chill], axis=0)

In [309]:
df_dev = pd.concat([df_hope_val, df_touch_val, df_scary_val,
                      df_romance_val, df_peace_val, df_fun_val,
                      df_groovy_val,df_excite_val, df_chill_val], axis=0)

In [310]:
df_train.reset_index(drop=True).to_csv('/home/keonwoo/anaconda3/envs/bgmRS/data/eng/eng_train.csv', index=False)
df_dev.reset_index(drop=True).to_csv('/home/keonwoo/anaconda3/envs/bgmRS/data/eng/eng_val.csv', index=False)