<a href="https://colab.research.google.com/github/glitch-y/CE888-Project/blob/main/Preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import Modules

In [270]:
#Import modules
import html
import numpy as np
import pandas as pd
import contractions
import emot
import re

# Import Data

In [298]:
task='emotion'

#Import test data for the 'Emotion' task
data = pd.read_csv(f"https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/{task}/test_text.txt", delimiter='\t', dtype=str, header= None)

#Assign column name
data.columns =['content']

#Explore data
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1421 entries, 0 to 1420
Data columns (total 1 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   content  1421 non-null   object
dtypes: object(1)
memory usage: 11.2+ KB


# Preprocessing

Replace misspelled words

In [299]:
#Import misspelling data as dictionary
misspell_data = pd.read_csv("https://raw.githubusercontent.com/glitch-y/CE888-Project/main/Misspelling.txt",sep=":",names=["correction","misspell"])
misspell_data.misspell = misspell_data.misspell.str.strip()
misspell_data.misspell = misspell_data.misspell.str.split(" ")
misspell_data = misspell_data.explode("misspell").reset_index(drop=True)
misspell_data.drop_duplicates("misspell",inplace=True)
miss_corr = dict(zip(misspell_data.misspell, misspell_data.correction))

#Preview misspelling dictionary
{v:miss_corr[v] for v in [list(miss_corr.keys())[k] for k in range(10)]}


{'Steffen': 'Stephen',
 'abilitey': 'ability',
 'abouy': 'about',
 'absorbtion': 'absorption',
 'accidently': 'accidentally',
 'accomodate': 'accommodate',
 'nevade': 'Nevada',
 'presbyterian': 'Presbyterian',
 'rsx': 'RSX',
 'susan': 'Susan'}

In [300]:
#Create misspelling correction function
def misspelled_correction(x):
    for i in x.split(): 
        if i in miss_corr.keys(): 
            x = x.replace(i, miss_corr[i]) 
    return x

#Apply misspelling correction to dataframe as new column
data['content_clean'] = data.content.apply(lambda x : misspelled_correction(x))

In [274]:
#Abbreviated chat words conversion
#Create Dictionary
chat_dictionary = pd.read_csv("https://raw.githubusercontent.com/glitch-y/CE888-Project/main/SlangDictionary.csv",dtype=str, names=["Slang", "Translation"])
chat_dictionary=chat_dictionary.apply(lambda x: x.str.lower())
slang_corr = dict(zip(chat_dictionary.Slang, chat_dictionary.Translation))

#Preview abbreviation dictionary
{v:slang_corr[v] for v in [list(slang_corr.keys())[k] for k in range(10)]}

{'afaik': 'as far as i know',
 'afk': 'away from keyboard',
 'ama': 'ask me anything',
 'asap': 'as soon as possible',
 'atk': 'at the keyboard',
 'atm': 'at the moment',
 'bak': 'back at keyboard',
 'bbl': 'be back later',
 'bbs': 'be back soon',
 'bc': ' because'}

In [307]:
#Create abbreviation replacement function
def abbrev_replace(x):
    for i in x.split(): 
        if i in slang_corr.keys(): 
            x = x.replace(i, slang_corr[i]) 
    return x

#Apply misspelling correction to dataframe as new column
data['content_clean'] = data.content.apply(lambda x : abbrev_replace(x))

#Check
print(data.loc[[13]])
print(data.loc[[682]])


NameError: ignored

In [308]:
#clean HTML charachters such as &amp;, &lt; etc using 'html' module
data.content_clean = data.content_clean.apply(lambda x: html.unescape(x))

#Check
print(data.loc[[12]])
data.head()

                                              content                                      content_clean
12  Yes #depression &amp; #anxiety are real but so...  Yes #depression & #anxiety are real but so is ...


Unnamed: 0,content,content_clean
0,#Deppression is real. Partners w/ #depressed p...,#Deppression is real. Partners with #depresse...
1,@user Interesting choice of words... Are you c...,@user Interesting choice of words... Are you c...
2,My visit to hospital for care triggered #traum...,My visit to hospital for care triggered #traum...
3,@user Welcome to #MPSVT! We are delighted to h...,@user Welcome to #MPSVT! We are delighted to h...
4,What makes you feel #joyful?,What makes you feel #joyful?


In [309]:
#fix contractions; i.e. 'It's' transforms into 'it is'
data.content_clean = data.content_clean.apply(lambda x: contractions.fix(x))
print(data.loc[[13]])
print(data.loc[[54]])
data.head()

                                              content                                      content_clean
13  People who say nmu are the worst, something ha...  People who say not much you? are the worst, so...
                                              content                                      content_clean
54  Let's start all over again.....\n#feels #lover...  let us start all over again.....\n#feels #love...


Unnamed: 0,content,content_clean
0,#Deppression is real. Partners w/ #depressed p...,#Deppression is real. Partners with #depresse...
1,@user Interesting choice of words... Are you c...,@user Interesting choice of words... Are you c...
2,My visit to hospital for care triggered #traum...,My visit to hospital for care triggered #traum...
3,@user Welcome to #MPSVT! We are delighted to h...,@user Welcome to #MPSVT! We are delighted to h...
4,What makes you feel #joyful?,What makes you feel #joyful?


In [310]:
#remove newlines from data and replace '&' with 'and'
data['content_clean'] = data['content_clean'].replace(r'\\n',' ', regex=True)
data['content_clean'] = data['content_clean'].replace(r'&','and', regex=True)
print(data.loc[[34]])
print(data.loc[[12]])
print(data.loc[[326]])
data.head()

                                              content                                      content_clean
34  @user -- can handle myself.\n[Carl yelled back...  @user -- can handle myself. [Carl yelled back ...
                                              content                                      content_clean
12  Yes #depression &amp; #anxiety are real but so...  Yes #depression and #anxiety are real but so i...
                                               content                                      content_clean
326  Why is @user 'busted' bc he spoke w/Russian la...  Why is @user 'busted'  because he spoke w/Russ...


Unnamed: 0,content,content_clean
0,#Deppression is real. Partners w/ #depressed p...,#Deppression is real. Partners with #depresse...
1,@user Interesting choice of words... Are you c...,@user Interesting choice of words... Are you c...
2,My visit to hospital for care triggered #traum...,My visit to hospital for care triggered #traum...
3,@user Welcome to #MPSVT! We are delighted to h...,@user Welcome to #MPSVT! We are delighted to h...
4,What makes you feel #joyful?,What makes you feel #joyful?


In [311]:
#convert emojis into text
def convert_emojis(x):
    for emot in UNICODE_EMO:
        x = x.replace(emot, "_".join(UNICODE_EMO[emot].replace(",","").replace(":","").split()))
    return x

data.content_clean = data.content_clean.apply(lambda x: convert_emojis(x))
print(data.loc[[105]])
print(data.loc[[0]])
data.head()

                     content                                      content_clean
105  @user Wise you mean? 😅   @user Wise you mean? smiling_face_with_open_mo...
                                             content                                      content_clean
0  #Deppression is real. Partners w/ #depressed p...  #Deppression is real. Partners with  #depresse...


Unnamed: 0,content,content_clean
0,#Deppression is real. Partners w/ #depressed p...,#Deppression is real. Partners with #depresse...
1,@user Interesting choice of words... Are you c...,@user Interesting choice of words... Are you c...
2,My visit to hospital for care triggered #traum...,My visit to hospital for care triggered #traum...
3,@user Welcome to #MPSVT! We are delighted to h...,@user Welcome to #MPSVT! We are delighted to h...
4,What makes you feel #joyful?,What makes you feel #joyful?


In [312]:
#Remove punctuation
def punctuation(x): 
  
    punctuations = '''()-[]{};:'"\<>/#$%^&_~'''
  
    for i in x.lower(): 
        if i in punctuations: 
            x = x.replace(i, " ") 
    return x

data.content_clean = data.content_clean.apply(lambda x: punctuation(x))
data.head()

Unnamed: 0,content,content_clean
0,#Deppression is real. Partners w/ #depressed p...,Deppression is real. Partners with depresse...
1,@user Interesting choice of words... Are you c...,@user Interesting choice of words... Are you c...
2,My visit to hospital for care triggered #traum...,My visit to hospital for care triggered traum...
3,@user Welcome to #MPSVT! We are delighted to h...,@user Welcome to MPSVT! We are delighted to h...
4,What makes you feel #joyful?,What makes you feel joyful?


In [313]:
print(data.loc[[105]])

                     content                                      content_clean
105  @user Wise you mean? 😅   @user Wise you mean? smiling face with open mo...


In [314]:
processed_data = data.content_clean
processed_data.head(15)

0      Deppression is real. Partners with   depresse...
1     @user Interesting choice of words... Are you c...
2     My visit to hospital for care triggered  traum...
3     @user Welcome to  MPSVT! We are delighted to h...
4                         What makes you feel  joyful? 
5                                      i am revolting. 
6     Rin might ever appeared gloomy but to be a mel...
7                       In need of a change!  restless 
8     @user @user  cmbyn does screen  August 4 and 6...
9     @user Get Donovan out of your soccer booth. he...
10    @youser how can you have sold so many copies b...
11            Pressured. frowning face with open mouth 
12    Yes  depression and  anxiety are real but so i...
13    People who say not much you? are the worst, so...
14    @user The hatred from the Left ought to concer...
Name: content_clean, dtype: object