### Read Data

In [160]:
#pip install langdetect

In [163]:
import pandas as pd
from langdetect import detect, LangDetectException

In [175]:
# language detection
def detect_lang(text):
    try:
        return detect(text)
    except LangDetectException:
        return None

df_texts = pd.read_csv('data/texts.csv', encoding='cp949')
df_texts = df_texts['Content']
df_texts.dropna(inplace=True)
df_texts = pd.DataFrame(df_texts)

# detect language and add a new column
df_texts['lang'] = df_texts['Content'].apply(detect_lang)

# remove French texts
df_texts = df_texts[df_texts['lang'] != 'fr']['Content']

df_texts

0      There’s renters of every age in this country. ...
1      Young Canadians put a lot of their hard-earned...
2                             i wonder where she is now 
3      The Canada Carbon Rebate puts hundreds of doll...
4      The economy should be recognizing and rewardin...
                             ...                        
495                                          EXCUSE ME?!
496                                    It's over, folks.
497    First peek at the Nostalgia emotion in ‘INSIDE...
498    It is difficult, but I would like to stop usin...
499    I’m writing something about the famous field o...
Name: Content, Length: 422, dtype: object

In [176]:
texts = [text for text in df_texts]
print(texts)
len(texts)

['There’s renters of every age in this country. Our new renter protections ? against unfair rent hikes, renovictions, and bad landlords ? will be there for every one of you.', 'Young Canadians put a lot of their hard-earned money towards rent every month. We think that should count for a lot more ? like towards your credit score.', 'i wonder where she is now ', 'The Canada Carbon Rebate puts hundreds of dollars back in Canadians’ pockets every quarter. Oddly, some Premiers are against that ? and they want to scrap your rebates. \r\n\r\nHere’s what I had to say about that:', 'The economy should be recognizing and rewarding the work of young Canadians ? just as it did for their parents and grandparents.\r\n\r\nThat’s what this Budget is all about.', 'Don’t think, just type.\r\n\r\nWhat is a GREAT monthly salary.', 'Introducing AnimeChain, in collaboration with  and ', 'how is that even possible???', 'Grok has normal mode and fun mode. Tonight, we decided to add an unhinged fun mode. It i

422

### Text Preprocessing

In [None]:
#pip install tweet-preprocessor

In [202]:
# importing the needed libraries
import numpy as np
from nltk.tokenize import TweetTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.tag import pos_tag
from gensim.parsing.preprocessing import remove_stopwords
import re 
import string
import html
import preprocessor as p

In [222]:
def clean_tweet(tweet):
    # remove URLs, mentions, reserved words (RT, FAV), emojis, and smileys
    p.set_options(p.OPT.URL, p.OPT.MENTION, p.OPT.RESERVED, p.OPT.EMOJI, p.OPT.SMILEY)
    tweet = p.clean(tweet)

    # Tokenize the tweet
    tknzr = TweetTokenizer(strip_handles=True, reduce_len=True)
    tokens = tknzr.tokenize(tweet)

    # handle abbreviations
    tokens_normalized = ["to be honest" if token == "tbh" else token for token in tokens]

    # join words back into sentences
    cleaned_sentence = [' '.join(tokens_normalized)]

    # Replacing '-' with ' '
    sentence_cleaned_v1 = re.sub(r'-', ' ', cleaned_sentence[0])

    # Removing any character which is not a space, letter or a number
    regular_expression_num_letters = r'[^a-zA-Z0-9 ]'
    sentence_cleaned_v2 = re.sub(regular_expression_num_letters, '', sentence_cleaned_v1)

    # Removing any extra spaces
    sentence_cleaned_v3 = re.sub(r'\s+', ' ', sentence_cleaned_v2)

    # Remove Stopwords
    stop_words_removed = remove_stopwords(sentence_cleaned_v3)

    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    lemmatized_sentence = [lemmatizer.lemmatize(word) for word in stop_words_removed.split()]

    result = lemmatized_sentence
    
    return result

In [223]:
def get_cleaned_tweets(tweets):
    return [clean_tweet(tweet) for tweet in tweets]

In [224]:
result = get_cleaned_tweets(texts)
result

[['Theres',
  'renter',
  'age',
  'country',
  'Our',
  'new',
  'renter',
  'protection',
  'unfair',
  'rent',
  'hike',
  'renovictions',
  'bad',
  'landlord'],
 ['Young',
  'Canadians',
  'lot',
  'hard',
  'earned',
  'money',
  'rent',
  'month',
  'We',
  'think',
  'count',
  'lot',
  'like',
  'credit',
  'score'],
 ['wonder'],
 ['The',
  'Canada',
  'Carbon',
  'Rebate',
  'put',
  'hundred',
  'dollar',
  'Canadians',
  'pocket',
  'quarter',
  'Oddly',
  'Premiers',
  'want',
  'scrap',
  'rebate',
  'Heres',
  'I'],
 ['The',
  'economy',
  'recognizing',
  'rewarding',
  'work',
  'young',
  'Canadians',
  'parent',
  'grandparent',
  'Thats',
  'Budget'],
 ['Dont', 'think', 'type', 'What', 'GREAT', 'monthly', 'salary'],
 ['Introducing', 'AnimeChain', 'collaboration'],
 ['possible'],
 ['Grok',
  'normal',
  'mode',
  'fun',
  'mode',
  'Tonight',
  'decided',
  'add',
  'unhinged',
  'fun',
  'mode',
  'It',
  'level'],
 ['softest', 'paw'],
 ['Imagine', 'Harry', 'Potter'