### Read Data

In [95]:
#pip install langdetect

In [96]:
import pandas as pd
from langdetect import detect, LangDetectException

In [97]:
# language detection
def detect_lang(text):
    try:
        return detect(text)
    except LangDetectException:
        return None

df_texts_orig = pd.read_csv('data/labeled_texts_1000.csv', encoding='utf-8-sig')
df_texts_orig.dropna(inplace=True)

# detect language and add a new column
df_texts_orig['lang'] = df_texts_orig['Content'].apply(detect_lang)

# select only English texts
df_eng = df_texts_orig[df_texts_orig['lang'] == 'en'].reset_index(drop=True)
df_texts = df_eng['Content']
df_labels = df_eng['label']

df_labels.to_pickle('data/labels.pkl')

In [98]:
texts = [text for text in df_texts]
print(texts)
len(texts)



843

### Text Preprocessing

In [99]:
#emoji and emoticons detection package for Python
#pip install emot

In [100]:
#emoji package for Python
#pip install emoji 

In [101]:
#tweet preprocessing package for Python
#pip install tweet-preprocessor

In [102]:
#replace keywords in sentences
#pip install flashtext

In [103]:
# importing the needed libraries
import numpy as np
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
import re 
import string
import preprocessor as p
import emoji
from emot.emo_unicode import EMOTICONS_EMO
from flashtext import KeywordProcessor

In [104]:
def get_wordnet_tag(tag):
    if tag.startswith('J'):
        return 'a'
    elif tag.startswith('N'):
        return 'n'
    elif tag.startswith('R'):
        return 'r'
    elif tag.startswith('V'):
        return 'v'
    else:
        return None

In [105]:
def convert_emoticons(text):
    ## formatting
    all_emoji_emoticons = {**EMOTICONS_EMO}
    all_emoji_emoticons = {k:v.replace(":","").replace("_"," ").strip() for k,v in all_emoji_emoticons.items()}

    kp_all_emoji_emoticons = KeywordProcessor()
    for k,v in all_emoji_emoticons.items():
        kp_all_emoji_emoticons.add_keyword(k, v)
    output = kp_all_emoji_emoticons.replace_keywords(text)

    return output

In [106]:
def normalize_text(text):
    # handle abbreviations
    normalized_text = re.sub(r'\bfav\b', "favorite", text)
    normalized_text = re.sub(r'\btkt\b', "ticket", normalized_text)
    normalized_text = re.sub(r'\(gm\)', 'good morning', normalized_text)
    
    # remove unnecessary information
    normalized_text = re.sub(r'\([^)]*(via|h/t)[^)]*\)', '', normalized_text)

    # reduce repeated characters
    normalized_text = re.sub(r'([a-zA-Z])\1{2,}', r'\1\1', normalized_text)
    
    return normalized_text

In [107]:
def clean_characters(text):
    # Replacing special characters with space
    sentence_cleaned = re.sub(r'[-_:]', ' ', text)

    # Removing any character which is not a space, letter or a number
    regular_expression_num_letters = r"[^a-zA-Z0-9 ']"
    sentence_cleaned = re.sub(regular_expression_num_letters, '', sentence_cleaned)

    # Removing any extra spaces
    sentence_cleaned = re.sub(r'\s+', ' ', sentence_cleaned)
    
    return sentence_cleaned

In [108]:
def clean_tweet(tweet):
    # remove URLs, mentions, reserved words (RT, FAV)
    p.set_options(p.OPT.URL, p.OPT.MENTION, p.OPT.RESERVED)
    cleaned_tweet = p.clean(tweet)

    # remove emojis
    cleaned_tweet = emoji.demojize(cleaned_tweet)
    
    # convert emoticons to words
    cleaned_tweet = convert_emoticons(cleaned_tweet)

    # handle abbreviations
    normalized_text = normalize_text(cleaned_tweet.lower())
    #return normalized_text

    # clean characters
    sentence_cleaned = clean_characters(normalized_text)

    # Tokenize the tweet
    tokens = word_tokenize(sentence_cleaned)
    
    # POS tagging
    pos_tagged = pos_tag(tokens)

    # Remove Stopwords
    stop_words_removed = [word for word in pos_tagged if word[0] not in stopwords.words('english')]

    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    lemmatized_sentence = [word if get_wordnet_tag(tag) is None else lemmatizer.lemmatize(word, get_wordnet_tag(tag)) for word, tag in stop_words_removed]

    result = lemmatized_sentence
    
    return result

In [109]:
def get_cleaned_tweets(tweets):
    return [clean_tweet(tweet) for tweet in tweets]

In [110]:
result = get_cleaned_tweets(texts)
result

[['binance', 'towel', 'come', 'everywhere', 'include', 'breakfast', 'cooking'],
 ['drop',
  'sol',
  'address',
  'make',
  'sure',
  'follow',
  'dont',
  'ask',
  'backhand',
  'index',
  'point',
  'medium',
  'light',
  'skin',
  'tone'],
 ['take', 'one', 'good', 'altcoin', 'change', 'world', 'sparkle'],
 ['disrespectful', 'one', 'top', '3', 'favorite'],
 ['congressional',
  'republican',
  'call',
  'affordable',
  'connectivity',
  'program',
  'wasteful',
  'call',
  'necessary',
  'time',
  'congress',
  'extend',
  '23',
  'million',
  'household',
  'across',
  'america',
  'rely',
  'program',
  'stay',
  'connect'],
 ['donald',
  'trump',
  'wont',
  'american',
  'president',
  'must',
  'refuse',
  'denounce',
  'political',
  'violence',
  'ill',
  'say',
  'trump',
  'wont',
  'political',
  'violence',
  'never',
  'ever',
  'acceptable',
  'america'],
 ['massive', 'bitcoin', 'breakout', 'come', 'ready'],
 ['ok',
  'vote',
  'ill',
  '25',
  'ticket',
  'giveaway',
  '

### Save Output

In [116]:
import pickle

# Save the dictionary to a file using Pickle
with open('data/preprocessing_output.pkl', 'wb') as file:
    pickle.dump(result, file)

# save dataframe as a CSV file
preprocessed_data = [' '.join(document) for document in result]
df_data = pd.DataFrame(preprocessed_data, columns=['Content'])
df_data['Label'] = df_labels
df_data.to_csv('data/preprocessed_data.csv', index=False, encoding='utf-8-sig')


### Load Output

In [117]:
# to read the pkl file:
with open('data/preprocessing_output.pkl', 'rb') as file:
    loaded_list = pickle.load(file)
    
# Print the dictionary
print(loaded_list)
print(len(loaded_list))

# print the labels
df_labels_loaded = pd.read_pickle('data/labels.pkl')
print(df_labels_loaded)
print(len(df_labels_loaded))

# read the CSV file
df_data_loaded = pd.read_csv('data/preprocessed_data.csv', encoding='utf-8-sig')
print(df_data_loaded)

843
0      1
1      1
2      2
3      2
4      1
      ..
838    2
839    2
840    1
841    2
842    0
Name: label, Length: 843, dtype: int64
843
                                               Content  Label
0    binance towel come everywhere include breakfas...      1
1    drop sol address make sure follow dont ask bac...      1
2           take one good altcoin change world sparkle      2
3                     disrespectful one top 3 favorite      2
4    congressional republican call affordable conne...      1
..                                                 ...    ...
838  shakira 's amazing 12th studio album 'las muje...      2
839  news tesla overtook mercedes second large sell...      2
840  ultimate guide festival celebrate latin music ...      1
841  youre invite see rogueport may 23 papermarioth...      2
842                                       alien mother      0

[843 rows x 2 columns]
