### Read Data

In [160]:
#pip install langdetect

In [5]:
import pandas as pd
from langdetect import detect, LangDetectException

In [13]:
# language detection
def detect_lang(text):
    try:
        return detect(text)
    except LangDetectException:
        return None

df_texts = pd.read_csv('data/labeled_texts.csv', encoding='utf-8-sig')
df_texts = df_texts['Content']
df_texts.dropna(inplace=True)
df_texts = pd.DataFrame(df_texts)

# detect language and add a new column
df_texts['lang'] = df_texts['Content'].apply(detect_lang)

# remove French texts
df_texts = df_texts[df_texts['lang'] == 'en']['Content']

df_texts

1     In 2013, this dude on YouTube begged everyone ...
2     Christopher Nolan planted 500 acres of corn fo...
3     the one who was the last survivor of genocide ...
4     Kai Cenat room setup for his 50+ hour Red Dead...
5     Reminder ...\r\n\r\nThis tweet is about nothin...
6            There is nothing Catholic about Joe Biden.
7     Your child doesn't have ADHD, hes just been ea...
8     SHOCK REPORT:  Biden declares Easter Sunday 20...
9     #BREAKING: A large barge crashed into the Arka...
10             i thought they already seen the original
11    BIDEN RENAMES EASTER SUNDAY “TRANSGENDER DAY O...
12    Wemby spoiled Jalen Brunson’s 61pt game then t...
13    The last time Larry David stretched out his le...
14                 Interesting method to poach an egg. 
16                   that cat took off like scooby doo 
17                                         Cost of love
18         How these vehicles are painted  ohmytechness
19    Chance Perdomo has passed away at the age 

In [15]:
texts = [text for text in df_texts]
print(texts)
len(texts)

['In 2013, this dude on YouTube begged everyone to buy just $1 worth of bitcoin', 'Christopher Nolan planted 500 acres of corn for Interstellar rather than use CGI.', 'the one who was the last survivor of genocide of his people', 'Kai Cenat room setup for his 50+ hour Red Dead Redemption 2 stream ', "Reminder ...\r\n\r\nThis tweet is about nothing but it'll still get more views than Don Lemon's show. ", 'There is nothing Catholic about Joe Biden.', "Your child doesn't have ADHD, hes just been eating artificial food coloring for breakfast the last 6 years of his life", 'SHOCK REPORT:  Biden declares Easter Sunday 2024 as "TRANSGENDER DAY OF VISIBILITY" \r\n\r\nThis is like declaring \'NATIONAL BACON WEEK\' during the Muslim Ramadan..', '#BREAKING: A large barge crashed into the Arkansas River bridge, causing damage to both the ship and the bridge pillar #Sallisaw | #OklahomaEarlier this evening, just before 1:30 p.m., a large barge crashed into and struck the Arkansas River bridge at th

44

### Text Preprocessing

In [None]:
#pip install tweet-preprocessor

In [46]:
# importing the needed libraries
import numpy as np
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from gensim.parsing.preprocessing import remove_stopwords
import re 
import string
import html
import preprocessor as p

In [50]:
def get_wordnet_tag(tag):
    if tag.startswith('J'):
        return 'a'
    elif tag.startswith('N'):
        return 'n'
    elif tag.startswith('R'):
        return 'r'
    elif tag.startswith('V'):
        return 'v'
    else:
        return None

In [55]:
def clean_tweet(tweet):
    # remove URLs, mentions, reserved words (RT, FAV), emojis, and smileys
    p.set_options(p.OPT.URL, p.OPT.MENTION, p.OPT.RESERVED, p.OPT.EMOJI, p.OPT.SMILEY)
    cleaned_tweet = p.clean(tweet)

    # Replacing '-' with ' '
    sentence_cleaned_v1 = re.sub(r'-', ' ', cleaned_tweet)

    # Removing any character which is not a space, letter or a number
    regular_expression_num_letters = r'[^a-zA-Z0-9 ]'
    sentence_cleaned_v2 = re.sub(regular_expression_num_letters, '', sentence_cleaned_v1)

    # Removing any extra spaces
    sentence_cleaned_v3 = re.sub(r'\s+', ' ', sentence_cleaned_v2)

    # Remove Stopwords
    stop_words_removed = remove_stopwords(sentence_cleaned_v3)

    # Tokenize the tweet
    tokens = word_tokenize(stop_words_removed)

    # handle abbreviations
    tokens_normalized = ["Computer-generated Imagery" if token == "CGI" else token for token in tokens]

    # POS tagging
    pos_tagged = pos_tag(tokens_normalized)

    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    lemmatized_sentence = [word if get_wordnet_tag(tag) is None else lemmatizer.lemmatize(word, get_wordnet_tag(tag)) for word, tag in pos_tagged]

    result = lemmatized_sentence
    
    return result

In [57]:
def get_cleaned_tweets(tweets):
    return [clean_tweet(tweet) for tweet in tweets]

In [58]:
result = get_cleaned_tweets(texts)
result

[['In', '2013', 'dude', 'YouTube', 'beg', 'buy', '1', 'worth', 'bitcoin'],
 ['Christopher',
  'Nolan',
  'plant',
  '500',
  'acre',
  'corn',
  'Interstellar',
  'use',
  'Computer-generated Imagery'],
 ['survivor', 'genocide', 'people'],
 ['Kai',
  'Cenat',
  'room',
  'setup',
  '50',
  'hour',
  'Red',
  'Dead',
  'Redemption',
  '2',
  'stream'],
 ['Reminder', 'This', 'tweet', 'itll', 'view', 'Don', 'Lemons'],
 ['There', 'Catholic', 'Joe', 'Biden'],
 ['Your',
  'child',
  'doesnt',
  'ADHD',
  'hes',
  'eat',
  'artificial',
  'food',
  'color',
  'breakfast',
  '6',
  'year',
  'life'],
 ['SHOCK',
  'REPORT',
  'Biden',
  'declare',
  'Easter',
  'Sunday',
  '2024',
  'TRANSGENDER',
  'DAY',
  'OF',
  'VISIBILITY',
  'This',
  'like',
  'declare',
  'NATIONAL',
  'BACON',
  'WEEK',
  'Muslim',
  'Ramadan'],
 ['BREAKING',
  'A',
  'large',
  'barge',
  'crash',
  'Arkansas',
  'River',
  'bridge',
  'cause',
  'damage',
  'ship',
  'bridge',
  'pillar',
  'Sallisaw',
  'OklahomaEa

### Save Output

In [59]:
import pickle

# Save the dictionary to a file using Pickle
with open('data/preprocessing_output.pkl', 'wb') as file:
    pickle.dump(result, file)

### Load Output

In [60]:
# to read the pkl file:
with open('data/preprocessing_output.pkl', 'rb') as file:
    loaded_list = pickle.load(file)
    
# Print the dictionary
print(loaded_list)

[['In', '2013', 'dude', 'YouTube', 'beg', 'buy', '1', 'worth', 'bitcoin'], ['Christopher', 'Nolan', 'plant', '500', 'acre', 'corn', 'Interstellar', 'use', 'Computer-generated Imagery'], ['survivor', 'genocide', 'people'], ['Kai', 'Cenat', 'room', 'setup', '50', 'hour', 'Red', 'Dead', 'Redemption', '2', 'stream'], ['Reminder', 'This', 'tweet', 'itll', 'view', 'Don', 'Lemons'], ['There', 'Catholic', 'Joe', 'Biden'], ['Your', 'child', 'doesnt', 'ADHD', 'hes', 'eat', 'artificial', 'food', 'color', 'breakfast', '6', 'year', 'life'], ['SHOCK', 'REPORT', 'Biden', 'declare', 'Easter', 'Sunday', '2024', 'TRANSGENDER', 'DAY', 'OF', 'VISIBILITY', 'This', 'like', 'declare', 'NATIONAL', 'BACON', 'WEEK', 'Muslim', 'Ramadan'], ['BREAKING', 'A', 'large', 'barge', 'crash', 'Arkansas', 'River', 'bridge', 'cause', 'damage', 'ship', 'bridge', 'pillar', 'Sallisaw', 'OklahomaEarlier', 'even', '130', 'pm', 'large', 'barge', 'crash', 'struck', 'Arkansas', 'River', 'bridge', 'Kerr', 'Reservoir'], ['think', 'se