In [6]:
# in this notebook, I will explore and clean the dataset as needed
import nltk
import pandas as pd
import os.path as path
import re
from nltk.corpus import stopwords



In [7]:
# Directory that holds the train/valid/test CSV files loaded in the next cell.
SETS_PATH = "../data/data_preparation_phase"
# Fetch the NLTK resources. The stop-word corpus is used by remove_stopwords;
# NOTE(review): nothing visible in this notebook uses wordnet yet -- confirm it is still needed.
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/faisalalmasri/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/faisalalmasri/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [8]:
#step 1: data exploration -- load every split into a dict keyed by split name:
datasets = {
    split: pd.read_csv(path.join(SETS_PATH, f"{split}_set.csv"))
    for split in ('train', 'valid', 'test')
}

In [9]:
#number of samples per class in each split:
for s, frame in datasets.items():
    print(frame['sentiment'].value_counts())
    #great, all the sets are balanced!

sentiment
1    3643
0    3618
Name: count, dtype: int64
sentiment
1    784
0    772
Name: count, dtype: int64
sentiment
1    782
0    775
Name: count, dtype: int64


In [10]:
# index label (as returned by idxmax) of the longest training sample, measured in characters
longest = datasets['train']['content'].map(len).idxmax()

In [11]:
# `longest` comes from idxmax(), which returns an index *label*, so the row is
# looked up with .loc (label-based) rather than .iloc (position-based).
len(datasets['train']['content'].loc[longest])

161

In [12]:
#for every split, show the shortest and longest samples (with their text)
#together with the mean and median number of characters per sample:

for s in datasets:
    samples_lengths = datasets[s]['content'].apply(len)

    # idxmin()/idxmax() return index *labels*, so the rows are fetched with
    # .loc (label-based); .iloc would only agree while the index is 0..n-1.
    shortest_content = datasets[s]['content'].loc[samples_lengths.idxmin()]
    longest_content = datasets[s]['content'].loc[samples_lengths.idxmax()]
    avg_content = samples_lengths.mean()
    median_length_content = samples_lengths.median()

    print(f"{s}_set:")
    print(f"shortest sample length: {len(shortest_content)}\ncontent:{shortest_content}\n")
    print(f"longest sample length: {len(longest_content)}\ncontent:{longest_content}\n")
    print(f"avg number of chars per sample: {avg_content}")
    print(f"median number of chars per sample: {median_length_content}")

#the mean sits only a few characters above the median in every split, so the
#length distribution is roughly symmetric with a mild right skew.


train_set:
shortest sample length: 6
content:lonely

longest sample length: 161
content:@JasonVonBerg thats the crazy part - was for 16:30. but reasons include &quot;full schedule&quot; &amp; &quot;travelling with his daughter&quot;... So. Not. Cool

avg number of chars per sample: 75.2539595097094
median number of chars per sample: 71.0
valid_set:
shortest sample length: 7
content:@bfly13

longest sample length: 157
content:Bored...um...Song of the day: &quot;I caught myself by Paramore&quot; and &quot;No good deed from the musical Wicked&quot;  Oh! And Fully Alive by Flyleaf!!!

avg number of chars per sample: 76.00964010282776
median number of chars per sample: 72.0
test_set:
shortest sample length: 8
content:Headache

longest sample length: 153
content:@MariahCarey &quot;VISION OF LOVE&quot; Play now &quot;i had the vision of luv n it was aaaaaoOUoOuoOUU that u..turn out..toOoo&quot; Mariah this ur part

avg number of chars per sample: 74.79961464354528
median number of chars per s

In [13]:
#step 2: pre_processing pipeline:

def lower_case(text):
    """Return ``text`` converted to lower case."""
    lowered = text.lower()
    return lowered

#why: punctuation is usually not correlated with the target emotions
#(happiness, sadness); symbols such as ! and ? tend to signal other feelings
#like anger or suspicion instead.
def remove_puncitions(text):
    """Replace every character that is neither a word character nor whitespace with a space."""
    return re.sub(pattern=r'[^\w\s]', repl=' ', string=text)

#numbers usually do not contribute to the emotion, except in cases such as
#"I waited 30 years to get that. finally!"
def remove_numbers(text):
    """Delete every ASCII digit (0-9) from ``text``."""
    ascii_digits = str.maketrans('', '', '0123456789')
    return text.translate(ascii_digits)

#done early so that the later tokenization sees single-space separated text
def trim_extra_spaces(text):
    """Collapse each run of whitespace into one space and drop leading/trailing whitespace."""
    pieces = text.split()
    return ' '.join(pieces)


def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, loading the corpus only once."""
    cached = getattr(_english_stopwords, "_cache", None)
    if cached is None:
        cached = frozenset(stopwords.words('english'))
        _english_stopwords._cache = cached
    return cached


def remove_stopwords(text, stopwords=None):
    """Drop stop words from a space-separated string.

    Args:
        text: Input string; it is split on single spaces, so empty tokens
            produced by consecutive spaces are preserved.
        stopwords: Collection of words to remove. When omitted, NLTK's
            English stop-word list is used (loaded lazily and cached).

    Returns:
        The remaining tokens joined back together with single spaces.
    """
    if stopwords is None:
        stopwords = _english_stopwords()
    elif isinstance(stopwords, (list, tuple)):
        # Set membership is O(1); scanning a list for every token is not.
        stopwords = frozenset(stopwords)
    return " ".join(word for word in text.split(" ") if word not in stopwords)

def remove_links(text):
    """Delete URLs from ``text``.

    Three shapes are recognised (case-insensitively):
      * anything starting with ``http://`` or ``https://``,
      * tokens starting with ``www.`` followed by a word character,
      * bare domains ending in a common TLD (com, org, net, edu, gov, ly),
        optionally followed by a path.

    A scheme, a ``www.`` prefix or a known TLD is required so that ordinary
    dotted text such as "Bored...um...Song" or "awww..." is left untouched.

    Args:
        text: Input string.

    Returns:
        ``text`` with every recognised link replaced by the empty string.
    """
    link_pattern = (
        r'https?://\S+'
        r'|\bwww\.\w\S*'
        r'|\b[a-z0-9-]+(?:\.[a-z0-9-]+)*\.(?:com|org|net|edu|gov|ly)\b(?:/\S*)?'
    )
    return re.sub(link_pattern, '', text, flags=re.IGNORECASE)

def remove_repititve_patterns(text):
    """Collapse repetition in ``text`` using three successive regex passes.

    1. A character repeated three or more times in a row is kept once
       ("sooo" -> "so").
    2. A word immediately repeated, separated by whitespace, is kept once
       ("very very" -> "very").
    3. A chunk that starts on a word boundary and is immediately repeated is
       kept once ("hahaha" -> "ha").
    """
    collapse_passes = (
        r'(.)\1{2,}',
        r'\b(\w+)(?:\s+\1)+\b',
        r'\b(.+?)\1+\b',
    )
    for repeated in collapse_passes:
        # every pass keeps only the first captured occurrence
        text = re.sub(repeated, r'\1', text)
    return text

In [14]:
#display the English stop-word list that remove_stopwords uses by default:
stopwords.words('english')
#the list is kept as-is for now; it can be revisited after model development if needed

['a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 "he'd",
 "he'll",
 'her',
 'here',
 'hers',
 'herself',
 "he's",
 'him',
 'himself',
 'his',
 'how',
 'i',
 "i'd",
 'if',
 "i'll",
 "i'm",
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it'd",
 "it'll",
 "it's",
 'its',
 'itself',
 "i've",
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'on

In [15]:
def preprocessing(content, functions):
    """Run ``content`` through each callable in ``functions``, in order.

    The output of one step is the input of the next; the value produced by
    the last step is returned (``content`` itself when ``functions`` is empty).
    """
    result = content
    for step in functions:
        result = step(result)
    return result


In [17]:
# Order matters: links are stripped right after lower-casing, while "://" and
# "." are still present. Once remove_puncitions has replaced punctuation with
# spaces there is nothing left for remove_links to match.
preprocessing_functions = [
    lower_case,
    remove_links,
    remove_puncitions,
    remove_numbers,
    trim_extra_spaces,
    remove_stopwords,
    remove_repititve_patterns,
    nltk.casual.casual_tokenize,
]
# preprocessing("    hello! my naaAAaame is faisal@aLmasri. my id is 12321s11122   ",preprocessing_functions)

In [18]:
#run the preprocessing pipeline (ending with tokenization) over the content of every split:
for s, frame in datasets.items():
    # NOTE: this overwrites the raw text in place, so the cell should only be run once per load.
    frame['content'] = frame['content'].apply(preprocessing, args=(preprocessing_functions,))

In [19]:
#preview the first rows of every split, then write it out for the next phase:
for s, frame in datasets.items():
    print(f"{s}_set:\n{frame.head()}")
    # NOTE(review): to_csv is called with its defaults, so the row index is
    # written as an extra first column -- confirm the downstream readers expect that.
    frame.to_csv(f"../data/preprocessing_phase/{s}_set.csv")

train_set:
   sentiment                                            content
0          1      [english, class, working, interactive, orals]
1          0  [claireyjonesy, lmao, want, reply, bo, tommcfl...
2          0                      [everything, annoying, today]
3          0  [done, geology, really, missing, favorite, sis...
4          0  [kurttheobald, problem, unfortunately, think, ...
valid_set:
   sentiment                                            content
0          1           [nick_carter, aw, poor, know, us, bless]
1          0                  [phone, still, blocked, fav, cll]
2          0                                      [like, dress]
3          1  [alright, need, get, sleep, actually, awake, m...
4          0  [glennbeck, husband, loosing, goverment, job, ...
test_set:
   sentiment                                            content
0          0  [left, cali, morning, dallas, missing, car, much]
1          0  [sparklethots, love, birdy, nest, though, alre...
2       