### Read Data

In [95]:
!pip install langdetect

In [9]:
import pandas as pd
from langdetect import detect, LangDetectException

In [None]:
## language detection
#def detect_lang(text):
#    try:
#        return detect(text)
#    except LangDetectException:
#        return None
#
#df_texts_orig = pd.read_csv('data/labeled_texts_1000.csv', encoding='utf-8-sig')
#df_texts_orig.dropna(inplace=True)
#
## detect language and add a new column
#df_texts_orig['lang'] = df_texts_orig['Content'].apply(detect_lang)
#
## select only English texts
#df_eng = df_texts_orig[df_texts_orig['lang'] == 'en'].reset_index(drop=True)
#
#df_eng.to_csv('data/labeled_texts_eng.csv', index=False)

### Load Data

In [47]:
df_eng_loaded = pd.read_csv('data/labeled_texts_eng.csv')

df_labels = df_eng_loaded['label']
df_labels.to_pickle('data/labels.pkl')

df_texts = df_eng_loaded['Content']

texts = [text for text in df_texts]
print(texts)
len(texts)



839

### Text Preprocessing

In [99]:
#emoji and emoticons detection package for Python
!pip install emot

In [100]:
#emoji package for Python
!pip install emoji 

In [101]:
#tweet preprocessing package for Python
!pip install tweet-preprocessor

In [102]:
#replace keywords in sentences
!pip install flashtext

In [2]:
# necessary for BERT tokenizer
!pip install transformers

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.2.1 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [13]:
# importing the needed libraries
import numpy as np
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
import re 
import string
import preprocessor as p
import emoji
from emot.emo_unicode import EMOTICONS_EMO
from flashtext import KeywordProcessor
from transformers import BertTokenizer

In [14]:
def get_wordnet_tag(tag):
    if tag.startswith('J'):
        return 'a'
    elif tag.startswith('N'):
        return 'n'
    elif tag.startswith('R'):
        return 'r'
    elif tag.startswith('V'):
        return 'v'
    else:
        return None

In [15]:
def convert_emoticons(text):
    ## formatting
    all_emoji_emoticons = {**EMOTICONS_EMO}
    all_emoji_emoticons = {k:v.replace(":","").replace("_"," ").strip() for k,v in all_emoji_emoticons.items()}

    kp_all_emoji_emoticons = KeywordProcessor()
    for k,v in all_emoji_emoticons.items():
        kp_all_emoji_emoticons.add_keyword(k, v)
    output = kp_all_emoji_emoticons.replace_keywords(text)

    return output

In [16]:
def normalize_text(text):
    # handle abbreviations
    normalized_text = re.sub(r'\bfav\b', "favorite", text)
    normalized_text = re.sub(r'\btkt\b', "ticket", normalized_text)
    normalized_text = re.sub(r'\(gm\)', 'good morning', normalized_text)
    
    # remove unnecessary information
    normalized_text = re.sub(r'\([^)]*(via|h/t)[^)]*\)', '', normalized_text)

    # reduce repeated characters
    normalized_text = re.sub(r'([a-zA-Z])\1{2,}', r'\1\1', normalized_text)
    
    return normalized_text

In [17]:
def clean_characters(text):
    # Replacing special characters with space
    sentence_cleaned = re.sub(r'[-_:]', ' ', text)

    # Removing any character which is not a space, letter or a number
    regular_expression_num_letters = r"[^a-zA-Z0-9 ']"
    sentence_cleaned = re.sub(regular_expression_num_letters, '', sentence_cleaned)

    # Removing any extra spaces
    sentence_cleaned = re.sub(r'\s+', ' ', sentence_cleaned)
    
    return sentence_cleaned

In [56]:
def clean_tweet(tweet, mode='default'):
    # remove URLs, mentions, reserved words (RT, FAV)
    p.set_options(p.OPT.URL, p.OPT.MENTION, p.OPT.RESERVED)
    cleaned_tweet = p.clean(tweet)

    # remove emojis
    cleaned_tweet = emoji.demojize(cleaned_tweet)
    
    # convert emoticons to words
    cleaned_tweet = convert_emoticons(cleaned_tweet)

    # handle abbreviations
    normalized_text = normalize_text(cleaned_tweet.lower())

    # BERT Tokenizer
    if mode == 'bert':
        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        encoded_input = tokenizer.encode(normalized_text, add_special_tokens=True)
        tokens = tokenizer.convert_ids_to_tokens(encoded_input)
        return tokens

    # clean characters
    sentence_cleaned = clean_characters(normalized_text)

    # Tokenize the tweet
    tokens = word_tokenize(sentence_cleaned)
    
    # POS tagging
    pos_tagged = pos_tag(tokens)

    # Remove Stopwords
    stop_words_removed = [word for word in pos_tagged if word[0] not in stopwords.words('english')]

    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    lemmatized_sentence = [word if get_wordnet_tag(tag) is None else lemmatizer.lemmatize(word, get_wordnet_tag(tag)) for word, tag in stop_words_removed]
    
    return lemmatized_sentence

In [57]:
def get_cleaned_tweets(tweets, mode='default'):
    if mode == 'bert':
        result = []
        for tweet in tweets:
            tokens = clean_tweet(tweet, mode)
            if len(tokens) <= 512:
                result.append(tokens)

        return result

    return [clean_tweet(tweet, mode) for tweet in tweets]

In [58]:
result = get_cleaned_tweets(texts)
result

[['binance', 'towel', 'come', 'everywhere', 'include', 'breakfast', 'cooking'],
 ['drop',
  'sol',
  'address',
  'make',
  'sure',
  'follow',
  'dont',
  'ask',
  'backhand',
  'index',
  'point',
  'medium',
  'light',
  'skin',
  'tone'],
 ['take', 'one', 'good', 'altcoin', 'change', 'world', 'sparkle'],
 ['disrespectful', 'one', 'top', '3', 'favorite'],
 ['congressional',
  'republican',
  'call',
  'affordable',
  'connectivity',
  'program',
  'wasteful',
  'call',
  'necessary',
  'time',
  'congress',
  'extend',
  '23',
  'million',
  'household',
  'across',
  'america',
  'rely',
  'program',
  'stay',
  'connect'],
 ['donald',
  'trump',
  'wont',
  'american',
  'president',
  'must',
  'refuse',
  'denounce',
  'political',
  'violence',
  'ill',
  'say',
  'trump',
  'wont',
  'political',
  'violence',
  'never',
  'ever',
  'acceptable',
  'america'],
 ['massive', 'bitcoin', 'breakout', 'come', 'ready'],
 ['ok',
  'vote',
  'ill',
  '25',
  'ticket',
  'giveaway',
  '

In [59]:
result_bert = get_cleaned_tweets(texts, mode='bert')
result_bert


[['[CLS]',
  'the',
  '#',
  'bin',
  '##ance',
  'towel',
  'comes',
  'everywhere',
  'with',
  'me',
  '.',
  '.',
  '.',
  'including',
  'breakfast',
  ':',
  'cooking',
  ':',
  '[SEP]'],
 ['[CLS]',
  'drop',
  'your',
  '$',
  'sol',
  'address',
  'below',
  'and',
  'make',
  'sure',
  'you',
  'are',
  'following',
  'me',
  'don',
  '’',
  't',
  'ask',
  'why',
  ':',
  'back',
  '##hand',
  '_',
  'index',
  '_',
  'pointing',
  '_',
  'down',
  '_',
  'medium',
  '-',
  'light',
  '_',
  'skin',
  '_',
  'tone',
  ':',
  '[SEP]'],
 ['[CLS]',
  'it',
  'only',
  'takes',
  'one',
  'good',
  'alt',
  '##co',
  '##in',
  'to',
  'change',
  'the',
  'world',
  '.',
  ':',
  'sparkle',
  '##s',
  ':',
  '[SEP]'],
 ['[CLS]',
  'di',
  '##sr',
  '##es',
  '##pe',
  '##ct',
  '##ful',
  'is',
  'one',
  'of',
  'my',
  'top',
  '3',
  'favorite',
  '[SEP]'],
 ['[CLS]',
  'some',
  'congressional',
  'republicans',
  'call',
  'the',
  'affordable',
  'connectivity',
  'program'

### Save Output

In [62]:
import pickle

In [63]:
# Save the list of tokens to a file using Pickle
with open('data/preprocessing_output.pkl', 'wb') as file:
    pickle.dump(result, file)

# save dataframe as a CSV file
preprocessed_data = [' '.join(document) for document in result]
df_data = pd.DataFrame(preprocessed_data, columns=['Content'])
df_data['Label'] = df_labels
df_data.to_csv('data/preprocessed_data.csv', index=False, encoding='utf-8-sig')

### Save Output for BERT

In [67]:
# Save the list of tokens to a file using Pickle
with open('data/preprocessing_output_bert.pkl', 'wb') as file:
    pickle.dump(result_bert, file)

# save dataframe as a CSV file
preprocessed_data_bert = [' '.join(document) for document in result_bert]
df_data = pd.DataFrame(preprocessed_data_bert, columns=['Content'])
df_data['Label'] = df_labels
df_data.to_csv('data/preprocessed_data_bert.csv', index=False, encoding='utf-8-sig')

### Load Output

In [64]:
# to read the pkl file:
with open('data/preprocessing_output.pkl', 'rb') as file:
    loaded_list = pickle.load(file)
    
# Print the dictionary
print(loaded_list)
print(len(loaded_list))

# print the labels
df_labels_loaded = pd.read_pickle('data/labels.pkl')
print(df_labels_loaded)
print(len(df_labels_loaded))

# read the CSV file
df_data_loaded = pd.read_csv('data/preprocessed_data.csv', encoding='utf-8-sig')
print(df_data_loaded)

839
0      1
1      1
2      2
3      2
4      1
      ..
834    1
835    0
836    2
837    2
838    0
Name: label, Length: 839, dtype: int64
839
                                               Content  Label
0    binance towel come everywhere include breakfas...      1
1    drop sol address make sure follow dont ask bac...      1
2           take one good altcoin change world sparkle      2
3                     disrespectful one top 3 favorite      2
4    congressional republican call affordable conne...      1
..                                                 ...    ...
834  ultimate guide festival celebrate latin music ...      1
835                       lady gaga fucking movie star      0
836  te har salir volando como un cohete rocket col...      2
837  youre invite see rogueport may 23 papermarioth...      2
838                                       alien mother      0

[839 rows x 2 columns]


In [65]:
# read the CSV file
df_data_loaded_bert = pd.read_csv('data/preprocessed_data_bert.csv', encoding='utf-8-sig')
print(df_data_loaded_bert)

                                               Content  Label
0    [CLS] the # bin ##ance towel comes everywhere ...      1
1    [CLS] drop your $ sol address below and make s...      1
2    [CLS] it only takes one good alt ##co ##in to ...      2
3    [CLS] di ##sr ##es ##pe ##ct ##ful is one of m...      2
4    [CLS] some congressional republicans call the ...      1
..                                                 ...    ...
834  [CLS] the ultimate guide to festivals celebrat...      1
835      [CLS] lady gaga is a fucking movie star [SEP]      0
836  [CLS] te hare sal ##ir vol ##ando como un co #...      2
837  [CLS] you ’ re invited ! see you in rogue ##po...      2
838                          [CLS] alien mothers [SEP]      0

[839 rows x 2 columns]
