# Part 1 - Data extraction and preprocessing

An existing ARTK Twitter dataset (widely used in sentiment analysis research) is used for this project, because the Twitter API was not accessible (and its cost is prohibitive).

# Import libraries

In [10]:
# Randomisation seed, kept in one place so results can be reproduced.
# NOTE(review): SEED is not referenced anywhere else in the visible notebook - confirm it is
# actually passed to the libraries that need seeding.
SEED = 1234509876 # Set randomisation seed, used for consistency in results 

# Basic libraries
from zipfile import ZipFile
import os, sys
import re
import gc
import time
import datetime

# Base Data processing and plotting
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Text processing specific libraries
import json 
from string import punctuation
import emoji
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Data import-export
import pyarrow as pa
import pyarrow.parquet as pq

# Others
from tqdm import tqdm_notebook #Loads progressbars for various loops
import warnings
warnings.filterwarnings('ignore')


[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Useful settings and pre setup

In [None]:
# Useful matplotlib settings
%matplotlib inline

# Useful pandas settings
pd.set_option('display.max_rows', 400)
pd.set_option('display.max_columns', 160)
pd.set_option('display.max_colwidth', 40)
warnings.filterwarnings("ignore")

# Download required text dictionaries
nltk.download('punkt')
nltk.download('stopwords')


# Custom Functions

Collecting all functions here for easy reference and update

In [8]:
################################################################################################
# Downcasting function for pandas dataframes

def downcast_dtypes(df):
    '''
    Shrink the numeric columns of a dataframe to narrower dtypes.

    Columns whose dtype is `float64` are passed through
    `pd.to_numeric(..., downcast='float')`, columns whose dtype is `int64`
    through `pd.to_numeric(..., downcast='integer')`. All other columns are
    left alone.

    The dataframe is modified in place and the same object is returned.
    '''

    # Bucket the column labels by their current dtype before touching anything
    wide_floats, wide_ints = [], []
    for name in df:
        current = df[name].dtype
        if current == "float64":
            wide_floats.append(name)
        if current == "int64":
            wide_ints.append(name)

    # Let pandas choose the narrowest dtype for each bucket
    df[wide_floats] = df[wide_floats].apply(pd.to_numeric, downcast='float')
    df[wide_ints] = df[wide_ints].apply(pd.to_numeric, downcast='integer')

    return df

################################################################################################
# Check duplication at given level of dataframe

def check_dups(df, cols):
    '''
    Print whether `df` is unique at the level of `cols`.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect. It is not modified.
    cols : label or list of labels
        Column(s) that are expected to identify a row uniquely.

    Returns
    -------
    None. The outcome and the row counts are printed.
    '''

    orig_count_rows = df.shape[0]

    # dropna=False: by default groupby silently drops rows whose key contains NaN,
    # which made frames with missing keys look as if they contained duplicates.
    dedup_count_rows = len(df.groupby(cols, dropna=False).size())

    if orig_count_rows == dedup_count_rows:
        print("No duplicates. Dataframe is unique at given level")
        print("# of unique entries: n=",orig_count_rows)
    else:
        print("Duplicates found. Dataframe is not unique at given level")
        print("# of entries in original dataset: n=", orig_count_rows)
        print("# of unique entries expected in deduped dataset: n=", dedup_count_rows)
        print("# of addational entries: n=", orig_count_rows - dedup_count_rows)

#####################################################################################
# Plotting classification features
def fancy_plot(df):
    '''
    Draw one density panel per column of `df`, comparing the two classes.

    For every column, the kernel density of the rows with `Class == 1` is
    overlaid on the density of the rows with `Class == 0`. Panels are laid
    out on a fixed 8 x 4 grid, so at most 32 columns can be shown.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a `Class` column holding 0/1 labels.

    Raises
    ------
    ValueError
        If `df` has more columns than the grid has panels.
    '''
    n_rows, n_cols = 8, 4

    column_names = list(df.columns.values)
    if len(column_names) > n_rows * n_cols:
        # Fail before drawing anything instead of part-way through the grid
        raise ValueError(
            f"fancy_plot can draw at most {n_rows * n_cols} columns, got {len(column_names)}"
        )

    frauds = df[df['Class'] == 1]
    no_frauds = df[df['Class'] == 0]

    # A single figure: the previous extra plt.figure() call only produced an empty figure
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(16, 28))
    for ax, feature in zip(axes.flat, column_names):
        sns.kdeplot(frauds[feature], ax=ax)
        sns.kdeplot(no_frauds[feature], ax=ax)
        ax.set_xlabel(feature, fontsize=10)
        ax.tick_params(axis='both', which='major', labelsize=12)
    plt.show()

####################################################################################

########################################
#Custom function to apply functions to dataframe with missing values
def impute_missing(df, func, target_col, new_col_name):
    '''
    Apply `func` to the non-missing values of `target_col` and store the
    results in `new_col_name`, modifying `df` in place.

    Rows where `target_col` is null are skipped, so `func` never sees a
    missing value. Nothing is returned.
    '''
    present = df[target_col].notnull()
    df.loc[present, new_col_name] = df.loc[present, target_col].apply(func)


####################################################################################
#text cleaning and stemming function. Modified to cater to text provided

def remove_links(raw):
    '''
    Strip http(s) links from tweet text.

    Parameters
    ----------
    raw : str or iterable of str
        A single tweet, or a collection of tweets (list, pandas Series, ...).

    Returns
    -------
    str for a single tweet, otherwise a list of str in input order.
    The removed links themselves are not returned.
    '''
    # NOTE(review): the trailing '.' makes every match also swallow the one character that
    # directly follows the URL on the same line (a space, but also '#', an emoji, ...).
    # Left unchanged here because it alters the cleaned dataset - confirm whether it is wanted.
    link_expr = 'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*(),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+.'

    # isinstance (not type() ==) so str subclasses such as numpy.str_ are handled as one tweet
    # instead of being iterated character by character
    if isinstance(raw, str):
        return re.sub(link_expr, "", raw)

    return [re.sub(link_expr, "", tweet) for tweet in raw]
    
def remove_hashtags(raw):
    '''
    Remove every hashtag ('#' followed by word characters) from tweet text.

    All hashtags are deleted, wherever they appear in the tweet; nothing is
    substituted for them.

    Parameters
    ----------
    raw : str or iterable of str
        A single tweet, or a collection of tweets (list, pandas Series, ...).

    Returns
    -------
    str for a single tweet, otherwise a list of str in input order.
    '''
    # \w instead of [A-Za-z0-9] so tags containing '_' or non-ASCII letters are removed whole
    # (previously '#not_sarcasm' left '_sarcasm' behind in the text)
    hashtag_expr = r'#\w+'

    # isinstance (not type() ==) so str subclasses such as numpy.str_ are handled as one tweet
    if isinstance(raw, str):
        return re.sub(hashtag_expr, "", raw)

    return [re.sub(hashtag_expr, "", tweet) for tweet in raw]
    
def replace_mentions(raw):
    '''
    Replace every @mention with the placeholder " SOME_ENTITY ".

    Context cannot be built for specific accounts, so each mention is mapped to
    one common tag and the model is left to pick up language patterns around it.
    The placeholder is padded with spaces so it cannot merge with neighbouring
    words.

    Parameters
    ----------
    raw : str or iterable of str
        A single tweet, or a collection of tweets (list, pandas Series, ...).

    Returns
    -------
    str for a single tweet, otherwise a list of str in input order.
    '''
    # \w so handles containing '_' are replaced whole (previously '@some_user' left '_user')
    mention_expr = r'@\w+'
    entity_tag = " SOME_ENTITY "

    # The str branch used to assign one variable and return another, which raised for
    # every single-string input; both branches now return their own result directly.
    if isinstance(raw, str):
        return re.sub(mention_expr, entity_tag, raw)

    return [re.sub(mention_expr, entity_tag, tweet) for tweet in raw]

def trim_extra_space(raw):
    '''
    Collapse every run of whitespace into one space and trim both ends.

    Parameters
    ----------
    raw : str or iterable of str
        A single tweet, or a collection of tweets (list, pandas Series, ...).

    Returns
    -------
    str for a single tweet, otherwise a list of str in input order.
    '''
    # Raw string: '\s' in a normal string literal is an invalid escape sequence
    space_expr = r'\s+'

    # isinstance (not type() ==) so str subclasses such as numpy.str_ are handled as one tweet
    if isinstance(raw, str):
        return re.sub(space_expr, " ", raw).strip(" ")

    return [re.sub(space_expr, " ", tweet).strip(" ") for tweet in raw]

def clean_text(raw):
    '''
    Run the full text clean-up on `raw`.

    The steps are applied in order: links are removed, hashtags are removed,
    mentions are replaced and finally surplus whitespace is trimmed. The
    return value is whatever the last step returns for the given input.
    '''
    pipeline = (remove_links, remove_hashtags, replace_mentions, trim_extra_space)

    cleaned_text = raw
    for step in pipeline:
        cleaned_text = step(cleaned_text)

    return cleaned_text

def simple_emoji_list(text):
    '''
    Return the emojis found in `text` as a list of their demojized names.

    Thin wrapper around `emoji.emoji_list`: the start/end positions it reports
    are dropped and only the `'emoji'` entry of each hit is kept, converted
    with `emoji.demojize`. An empty list is returned when nothing is found.
    '''
    return [emoji.demojize(hit['emoji']) for hit in emoji.emoji_list(text)]
  

# def token_converter():
    # Convert text to tokens
    
#     tokens = nltk.word_tokenize(temp)
    
#     alph_num_tokens = [word for word in tokens if word.isalnum()]
#     non_alph_num_tokens = [word for word in tokens if not word.isalnum()]

#     non_alph_num_tokens = [word.split('-') for word in non_alph_num_tokens]
#     non_alph_num_tokens = nltk.flatten(non_alph_num_tokens)
#     non_alph_num_tokens = [word.split('.') for word in non_alph_num_tokens]
#     non_alph_num_tokens = nltk.flatten(non_alph_num_tokens)

#     alph_num_tokens.extend(non_alph_num_tokens)

#     tokens = nltk.flatten(alph_num_tokens)

#     tokens = [porter.stem(word.lower()) for word in tokens]
#     tokens = [word for word in tokens if word not in stopwords_en]
#     tokens = [word for word in tokens if word.isalnum()]

#     return tokens

    #####################################################
# Generate word clouds

def generate_wordclouds(X, in_X_tfidf, k, in_word_positions,
                        in_y_pred=None, in_cluster_id=None, top_count=50):
    '''
    Draw a word cloud of the highest-weighted tf-idf terms of one cluster.

    Parameters
    ----------
    X, k :
        Not used by this function; kept so existing positional calls keep working.
    in_X_tfidf : matrix, one row per document
        Rows are selected with the boolean mask `in_y_pred == in_cluster_id`.
    in_word_positions : mapping or sequence
        Indexed with a column index to obtain the term shown in the cloud.
    in_y_pred : array-like
        Cluster label of every row of `in_X_tfidf`. Required.
    in_cluster_id :
        Label of the cluster to plot. Required.
    top_count : int, default 50
        Number of highest-weighted terms to include.
        NOTE(review): this was previously an undefined name; 50 is an arbitrary default.

    Raises
    ------
    ValueError
        If `in_y_pred` or `in_cluster_id` is not supplied. Both used to be read from
        names that are never defined, so any call failed with NameError.
    '''
    if in_y_pred is None or in_cluster_id is None:
        raise ValueError("generate_wordclouds requires both in_y_pred and in_cluster_id")

    # Local import: the notebook only imports WordCloud in a later cell
    from wordcloud import WordCloud

    # compute the total tfidf for each term in the cluster
    in_tfidf = in_X_tfidf[np.asarray(in_y_pred) == in_cluster_id]
    # np.sum may return a numpy.matrix; flatten to a 1-d array with one entry per term
    tfidf_sum = np.asarray(np.sum(in_tfidf, axis=0)).reshape(-1)
    top_indices = tfidf_sum.argsort()[-top_count:]
    term_weights = {in_word_positions[in_idx]: tfidf_sum[in_idx] for in_idx in top_indices}

    wc = WordCloud(width=1200, height=800, background_color="white")
    wordcloud = wc.generate_from_frequencies(term_weights)

    fig, ax = plt.subplots(figsize=(10, 6), dpi=100)
    ax.imshow(wordcloud, interpolation='bilinear')
    ax.axis("off")
    fig.suptitle(f"Cluster {in_cluster_id}")
    plt.show()



# Import dataset

Dataset has been uploaded to kaggle repo for easy access

In [None]:
# Only the tweet id and its text are needed downstream
keep_cols = ['id','text']

# Read the csv, keep the two columns and lowercase the tweets before anything else is done
raw_data = (
    pd.read_csv('/kaggle/input/sarcasm/all_twitter_sarcasam.csv')
    .loc[:, keep_cols]
    .assign(text=lambda frame: frame['text'].str.lower())
)

# Label data

In [None]:
# A tweet is labelled "sarcasm" when it carries at least one of these hashtags
sarcasm_tags = set(("#sarcasm","#sarcastic"))
temp = raw_data.loc[:,'text']

labels = [
    "non-sarcasm"
    if sarcasm_tags.isdisjoint(re.findall('#[A-Za-z0-9]+', tweet.lower()))
    else "sarcasm"
    for tweet in temp
]

raw_data['labels'] = labels
raw_data.head()

# Data Cleanup

## Steps done:
1. Cleanup text:
    1. Lowercase text
    1. Remove hashtags
    1. Replace mentions with common tag "SOME_ENTITY"
    1. remove links (usually adverts have them)
1. Extract list of emojis for easy reference.
    1. Replace emojis with their text equivalents
    1. Remove the skin tone color information as that may not help with context
1. De-contract short form words (e.g. it's =>> it is)
1. TODO - Spell checking (not sure yet whether it helps; needs to be tested later)
1. Lemmatization - cannot be used for BERT. Maybe use normal word embeddings for the first layer and BERT for the second; not sure yet

In [None]:
raw_data['clean_text'] = clean_text(raw_data['text'])

# Deduplicate on the cleaned text: there are a lot of ads that only differ in their
# hashtags, so they cannot be matched on the raw text but collapse once it is cleaned.
# The index is rebuilt afterwards so row labels are consecutive again.
raw_data = raw_data.drop_duplicates(subset = 'clean_text').reset_index(drop = True)

## Extract emojis and replace with text
We will dedup and check if we need to standardize emojis and remove tone info

In [None]:
# Emoji names per tweet (taken before the emojis are rewritten), with the skin tone suffix removed
raw_data['emoji_list'] = (
    raw_data['clean_text']
    .map(simple_emoji_list)
    .map(lambda names: [re.sub('_[A-Za-z -]+_skin_tone:', ":", name) for name in names])
)

# Swap every emoji for its text version, then drop the skin tone suffix to standardise them
raw_data['clean_text'] = (
    raw_data['clean_text']
    .map(emoji.demojize)
    .map(lambda tweet: re.sub('_[A-Za-z -]+_skin_tone:', ":", tweet))
)

raw_data.head()

## Convert shortforms to longforms

Solution is courtesy of https://stackoverflow.com/questions/43018030/replace-apostrophe-short-words-in-python

In [None]:
def decontracted(phrase):
    '''
    Expand common English contractions ("it's" -> "it is", "can't" -> "can not").

    Straight, backtick and curly apostrophes are all recognised, because tweets
    are typed on different keyboards.

    Every pattern ends on a word boundary, so an apostrophe that merely opens a
    quoted word is left alone (previously "'so" became " iso" and "'really"
    became " areally").

    Note that "'s" is always expanded to " is", so possessives are expanded too.

    Parameters
    ----------
    phrase : str

    Returns
    -------
    str with the contractions expanded.
    '''
    apos = r"['`’]"

    # Irregular forms first, otherwise the generic "n't" rule would produce "wo not" / "ca not"
    phrase = re.sub(rf"\bwon{apos}t\b", "will not", phrase)
    phrase = re.sub(rf"\bcan{apos}t\b", "can not", phrase)

    # Generic suffixes, applied in this order
    expansions = (
        (rf"n{apos}t\b", " not"),
        (rf"{apos}re\b", " are"),
        (rf"{apos}s\b", " is"),
        (rf"{apos}d\b", " would"),
        (rf"{apos}ll\b", " will"),
        (rf"{apos}t\b", " not"),
        (rf"{apos}ve\b", " have"),
        (rf"{apos}m\b", " am"),
    )
    for pattern, expansion in expansions:
        phrase = re.sub(pattern, expansion, phrase)

    return phrase

# Spot check: show one tweet (row label 11) as it looks before contractions are expanded
print(raw_data.loc[11,'clean_text'])

# Expand contractions in every cleaned tweet
raw_data['clean_text'] = raw_data['clean_text'].map(decontracted)


## Spell checking - RUNS SLOW. Expect 1-2 seconds per tweet. Ignore until final dataset

Too time consuming. May need to run it once and store final data

In [None]:
# SpellChecker was used without ever being imported; it comes from the pyspellchecker package
from spellchecker import SpellChecker

# NOTE(review): pyspellchecker documents the edit distance as 1 or 2. distance=0 is outside
# that range and appears to fall back to the default of 2 (the slowest setting) - confirm,
# and consider distance=1 if the goal is to reduce processing time.
spell_checker = SpellChecker(distance=0)

# Corrections are remembered per distinct word: tweets share most of their vocabulary, so
# each word is looked up once instead of twice on every occurrence.
_correction_cache = {}

def custom_spell_check(word):
    '''Return the spell checker's correction for `word`, or `word` itself when it has none.'''
    if word not in _correction_cache:
        suggestion = spell_checker.correction(word)
        _correction_cache[word] = word if suggestion is None else suggestion
    return _correction_cache[word]

raw_data['clean_text'] = raw_data['clean_text'].map(lambda x:" ".join(custom_spell_check(word) for word in x.split(" ")))

raw_data.head()

In [None]:
# Persist the frame at this point, because the preceding steps take too long to repeat
import pyarrow as pa
import pyarrow.parquet as pq

save_raw_data = pa.Table.from_pandas(df=raw_data)
pq.write_table(
    table=save_raw_data,
    where='/kaggle/working/clean_raw_data.parquet',
)

## Lemmatization

BERT is not expected to benefit from stopword removal and lemmatization, because of how it works. 
We therefore store the lemmatized + stopword-removed text in new columns and check whether performance changes.

In [13]:
# # Import code for resume
# temp = pq.read_table('/kaggle/input/sarcasm/clean_raw_data.parquet')

# raw_data = temp.to_pandas()

In [18]:
lemmatizer = WordNetLemmatizer()
stopword_list = stopwords.words('english')
# Set for constant-time membership tests; the list was scanned once per token before
stopword_set = set(stopword_list)

# Lemmatize word by word (no part-of-speech information is passed; POS could be integrated here)
raw_data['clean_text_lem'] = raw_data['clean_text'].apply(
    lambda tweet: " ".join(lemmatizer.lemmatize(word) for word in tweet.split(" "))
)
# Remove stopwords from the lemmatized text
raw_data['clean_text_lem_stop'] = raw_data['clean_text_lem'].apply(
    lambda tweet: " ".join(word for word in tweet.split(" ") if word not in stopword_set)
)

raw_data.head()

Unnamed: 0,id,text,labels,clean_text,emoji_list,clean_text_lem,clean_text_lem_stop
0,1623471399825293312,@annielayer @repmtg i'm sure the ame...,sarcasm,SOME_ENTITY SOME_ENTITY i am sure th...,[],SOME_ENTITY SOME_ENTITY i am sure th...,SOME_ENTITY SOME_ENTITY sure america...
1,1623470696125923329,get my art printed on awesome produc...,sarcasm,get my art printed on awesome produc...,[],get my art printed on awesome produc...,get art printed awesome product redo...
2,1623467236982947842,trudeau? anyone? #tuckercarlson #unh...,sarcasm,trudeau? anyone,[],trudeau? anyone,trudeau? anyone
3,1623465163792711681,nuh uh. #joebiden told me everything...,sarcasm,nuh uh told me everything was fine.:...,[:woman_shrugging:],nuh uh told me everything wa fine.:w...,nuh uh told everything wa fine.:woma...
4,1623465117869395968,😂 she gave #biden the #chinaballoon ...,sarcasm,:face_with_tears_of_joy: she gave th...,"[:face_with_tears_of_joy:, :rolling_...",:face_with_tears_of_joy: she gave th...,:face_with_tears_of_joy: gave :rolli...


In [19]:
# Write the frame, now including the lemmatized columns, to parquet for easy reuse
save_raw_data = pa.Table.from_pandas(df=raw_data)
pq.write_table(
    table=save_raw_data,
    where='/kaggle/working/clean_raw_data.parquet',
)

# Quick EDA

Check simple word density based on the labelled tweets

In [None]:
from wordcloud import WordCloud

# All cleaned tweets labelled as sarcasm, joined into one long string
sarcasm_tweets = raw_data.loc[raw_data['labels'] == 'sarcasm','clean_text']
sarcasm_all_words = ' '.join(sarcasm_tweets)

# Configure the cloud and generate it from the joined text
wordcloud = WordCloud(
    width=800,
    height=800,
    background_color='white',
    max_words=100,
    colormap='viridis',
    contour_width=3,
    contour_color='black',
)
wordcloud.generate(sarcasm_all_words)

# Show the cloud on a square canvas without axes or padding
plt.figure(figsize=(8,8), facecolor=None)
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.tight_layout(pad=0)
plt.show()

In [None]:
# Word frequency table, used to check whether preprocessing has left any problems behind
tqdm_notebook.pandas(desc = "Spell check progress")

# One row per token of the whole cleaned corpus, each carrying a count of 1 ...
all_word_list = pd.DataFrame(
    {"words": " ".join(raw_data['clean_text']).split(" ")}
).assign(count=1)

# ... summed up per word and shown with the most frequent words first
word_count = all_word_list.groupby(['words'],as_index = False).count()
word_count.sort_values(by = 'count', ascending=False)

# Feature Generation

In [None]:
# Placeholder cell: no feature-generation code exists yet; this only shows any pending figures.
plt.show()

# Notes::

1. Maybe number of hashtags as feature. 