# Import libraries

In [1]:
# Set random seed

SEED = 1234509876

# Importing basic libraries
from zipfile import ZipFile
import os, sys
import re
import gc
import time
import datetime
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json 
from string import punctuation
import pickle as pkl

%matplotlib inline

# Import NLTK
import nltk
nltk.download('punkt')
nltk.download('stopwords')

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# Preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler

from sklearn.decomposition import PCA
from sklearn.metrics import f1_score

from wordcloud import WordCloud
# Import models

import lightgbm as lgb
from sklearn.cluster import KMeans
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB

# Model selection
from sklearn.model_selection import RandomizedSearchCV

# Others

from tqdm import tqdm_notebook #Loads progressbars for various loops

from typing import List
import warnings

warnings.filterwarnings('ignore')

#####################
#Useful pandas settings

pd.set_option('display.max_rows', 400)
pd.set_option('display.max_columns', 160)
pd.set_option('display.max_colwidth', 40)
warnings.filterwarnings("ignore")

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Custom Functions

Collecting all functions here for easy reference and update

In [2]:
################################################################################################

# Downcasting function for pandas dataframes

def downcast_dtypes(df):
  '''
      Changes column types in the dataframe:             
          `float64` type to lowest possible float without data loss
          `int64`   type to lowest possible int wihtout data loss
  '''
  
  # Select columns to downcast
  float_cols = [col for col in df if df[col].dtype == "float64"]
  int_cols =   [col for col in df if df[col].dtype == "int64"]
  
  # Downcast columns using to numeric function
  df[float_cols] = df[float_cols].apply(pd.to_numeric, downcast='float')
  df[int_cols] = df[int_cols].apply(pd.to_numeric, downcast='integer')

  # remove variables from memory to avoid issues
  
  del float_cols
  del int_cols
  
  return df

################################################################################################
# Check duplication at given level of dataframe

def check_dups(df, cols):

  orig_count_rows = df.shape[0]

  temp = df.groupby(cols).size().reset_index(name = 'counts')

  dedup_count_rows = temp.shape[0]

  if orig_count_rows == dedup_count_rows:
    print("No duplicates. Dataframe is unique at given level")
    print("# of unique entries: n=",orig_count_rows)
  else:
    print("Duplicates found. Dataframe is not unique at given level")
    print("# of entries in original dataset: n=", orig_count_rows)
    print("# of unique entries expected in deduped dataset: n=", dedup_count_rows)
    print("# of addational entries: n=", orig_count_rows - dedup_count_rows)

  del orig_count_rows, temp, dedup_count_rows

#####################################################################################
# Plotting classification features
def fancy_plot(df):
  column_names = list(df.columns.values)
  frauds = df[df['Class'] == 1]
  no_frauds = df[df['Class'] == 0]

  plt.figure()
  fig, ax = plt.subplots(8,4,figsize=(16,28))
  i = 0
  for feature in column_names:
      i += 1
      plt.subplot(8,4,i)
      sns.kdeplot(frauds[feature])
      sns.kdeplot(no_frauds[feature])
      plt.xlabel(feature, fontsize=10)
      locs, labels = plt.xticks()
      plt.tick_params(axis='both', which='major', labelsize=12)
  plt.show();

####################################################################################

########################################
#Custom function to apply functions to dataframe with missing values

def impute_missing(df, func, target_col, new_col_name):
  df.loc[~df[target_col].isnull(),new_col_name] = df.loc[~df[target_col].isnull(),target_col].apply(func)


####################################################################################
#text cleaning and stemming function. Modified to cater to text provided

def remove_links(raw):
    # Extracts links from input text. Returns both text and links 
    link_expr = 'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*(),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+.'
    
    # Check if passed object is single string or series
    if type(raw) == str:
        no_link_raw = re.sub(link_expr,"",raw)
        # links = re.findall(link_expr,"",raw)
    else:
        no_link_raw = list()
        # Disabled link extraction for now
        # links = list() 
        
        for tweet in raw:
            no_link_raw.append(re.sub(link_expr, "", tweet))
            
    return no_link_raw
    
def remove_hashtags(raw):
    # Extracts links from input text. Returns both text and links
    # Will remove all trailing hashtags
    # Hashtags in middle of text will be replaced by a "SOME_ENTITY" constant text with hoope to retain context
    
    hashtag_expr = '#[A-Za-z0-9]+'
    middle_tag_expr = '#[A-Za-z0-9]+^[#]'
    # tags = re.findall(hashtag_expr,"",raw)
    
    if type(raw) == str:
        no_tag_raw = re.sub(hashtag_expr,"",raw)
        # links = re.findall(link_expr,"",raw)
    else:
        no_tag_raw = list()
        # Disabled link extraction for now
        # links = list() 
        
        for tweet in raw:
            no_tag_raw.append(re.sub(hashtag_expr, "", tweet))
            
    return no_tag_raw
    
def replace_mentions(raw):
    # Replaces personal mentions with a common entity tag.
    # As we cannot build context on specific persons, we will tag it as entity and let our model identify language patterns
    mention_expr = '@[A-Za-z0-9]+'
    # tags = re.findall(hashtag_expr,"",raw)
    
    if type(raw) == str:
        no_mention_raw = re.sub(mention_expr,"SOME_ENTITY",raw)
        # links = re.findall(link_expr,"",raw)
    else:
        no_tag_raw = list()
        # Disabled link extraction for now
        # links = list() 
        
        for tweet in raw:
            no_tag_raw.append(re.sub(mention_expr,"SOME_ENTITY",tweet))
            
    return no_tag_raw

def trim_extra_space(raw):
    space_expr = '\s+'
    # tags = re.findall(hashtag_expr,"",raw)
    
    if type(raw) == str:
        clean_raw = re.sub(space_expr," ",raw)
        clean_raw = clean_raw.strip(" ") # Remove end trails
        # links = re.findall(link_expr,"",raw)
    else:
        clean_raw = list()
        # Disabled link extraction for now
        # links = list() 
        
        for tweet in raw:
            temp = re.sub(space_expr," ",tweet)
            clean_raw.append(temp.strip(" "))
            
    return clean_raw

def clean_text(raw):
    # Combine all cleaning work
    cleaned_text = remove_links(raw)
    cleaned_text = remove_hashtags(cleaned_text)
    cleaned_text = replace_mentions(cleaned_text)
    cleaned_text = trim_extra_space(cleaned_text)    

    return cleaned_text

    
# def token_converter():
    # Convert text to tokens
    
#     tokens = nltk.word_tokenize(temp)
    
#     alph_num_tokens = [word for word in tokens if word.isalnum()]
#     non_alph_num_tokens = [word for word in tokens if not word.isalnum()]

#     non_alph_num_tokens = [word.split('-') for word in non_alph_num_tokens]
#     non_alph_num_tokens = nltk.flatten(non_alph_num_tokens)
#     non_alph_num_tokens = [word.split('.') for word in non_alph_num_tokens]
#     non_alph_num_tokens = nltk.flatten(non_alph_num_tokens)

#     alph_num_tokens.extend(non_alph_num_tokens)

#     tokens = nltk.flatten(alph_num_tokens)

#     tokens = [porter.stem(word.lower()) for word in tokens]
#     tokens = [word for word in tokens if word not in stopwords_en]
#     tokens = [word for word in tokens if word.isalnum()]

#     return tokens

    #####################################################
# Generate word clouds

def generate_wordclouds(X, in_X_tfidf, k, in_word_positions):
    # compute the total tfidf for each term in the cluster
    in_tfidf = in_X_tfidf[in_y_pred == in_cluster_id]
    # numpy.matrix
    tfidf_sum = np.sum(in_tfidf, axis=0)
    # numpy.array of shape (1, X.shape[1])
    tfidf_sum = np.asarray(tfidf_sum).reshape(-1)
    top_indices = tfidf_sum.argsort()[-top_count:]
    term_weights = {in_word_positions[in_idx]: tfidf_sum[in_idx] for in_idx in top_indices}
    wc = WordCloud(width=1200, height=800, background_color="white")
    wordcloud = wc.generate_from_frequencies(term_weights)
    fig, ax = plt.subplots(figsize=(10, 6), dpi=100)
    ax.imshow(wordcloud, interpolation='bilinear')
    ax.axis("off")
    fig.suptitle(f"Cluster {in_cluster_id}")
    plt.show()



# Import dataset

Dataset has been uploaded to kaggle repo for easy access

In [3]:
raw_data = pd.read_csv('/kaggle/input/sarcasm/all_twitter_sarcasam.csv')

# Remove extra columns from the data
keep_cols = ['id','text']
raw_data = raw_data.loc[:,keep_cols]


# Data Cleanup

May need to label data before doing cleanup

Figure out a way to tag hashtags in middle of text (maybe after cleanup since they

In [4]:
raw_data['clean_text'] = clean_text(raw_data['text'])

# Remove duplicates. There are ton of ads that will be easy to remove after cleanup
# Ads contain different hashtags so cant be deduped raw

raw_data.drop_duplicates(subset = 'clean_text', inplace = True)
raw_data = raw_data.reset_index().drop(columns = 'index')

In [5]:
raw_data[1:20]

Unnamed: 0,id,text,clean_text
1,1623470696125923329,Get my art printed on awesome produc...,Get my art printed on awesome produc...
2,1623467236982947842,Trudeau? anyone? #TuckerCarlson #Unh...,Trudeau? anyone?
3,1623465163792711681,Nuh uh. #joebiden told me everything...,Nuh uh. told me everything was fine....
4,1623465117869395968,😂 she gave #Biden the #ChinaBalloon ...,😂 she gave the 🤣 Af 😂
5,1623464838990295040,He’s afraid of me because of my eye ...,He’s afraid of me because of my eye ...
6,1623463889194344449,How old do I have to be before I hav...,How old do I have to be before I hav...
7,1623463133170868224,https://t.co/DOHN1RsqCg… Very funny ...,Very funny alternative printed gifts
8,1623461020382494720,"Hmm ... Can't attach pics, now, too...","Hmm ... Can't attach pics, now, too?..."
9,1623460933900066816,Looks like this toerag has really an...,Looks like this toerag has really an...
10,1623434796432248833,I had a good laugh today responding ...,I had a good laugh today responding ...


In [6]:
# Puicking a sample for testing
rng = np.random.default_rng(SEED)
sample = rng.integers(low = 0, high = len(raw_data)-1,size =100)

test = raw_data.loc[sample,'text']
test

635     @OV_Matter @samuelr28254292 @sinenom...
85      @TheUfoJoe @SilvaRecord But this has...
964     @SouthlandPost @FightHaven Oh, how l...
177     Excited to share this item from my #...
394     Make yourself the centre of attentio...
714     At the book store\n\n#comics #stripc...
533     Find your #scifi #fantasy #freebooks...
1192    When regulators imposed a historic l...
955     @ChanelRion @OANN Wait. \nIs this on...
148     Go out in style, with a #Logo from B...
938     @Acyn How much education does Watter...
205     @RonFilipkowski 🤔 I noticed Jim didn...
325     Love is blind…. But…. #fun #Sarcasm ...
880     @claudiastellner You don't lack cour...
1183    @Reuters \n\n"Glad to see that the g...
198     @AlboMP About time. My contract expl...
605     @EricPoppen1 Oh great, another day w...
1179    You can't be 'ON' every day https://...
145     #Cartoon of the Day\n\nClick the lin...
302     I'm sure he couldn't wait for the re...
728     @maggiesummer2 @RoseHowe1 @Senat

In [7]:
temp = clean_text(test,return_type = 'token')
temp

TypeError: clean_text() got an unexpected keyword argument 'return_type'

# Notes::

1. Maybe number of hashtags as feature. 