In [1]:
import pandas as pd
import emoji

In [2]:
# Load the labelled tweets from a CSV file addressed relative to the working directory.
df = pd.read_csv(r"Sarcasm Dataset.csv")
# Bare last expression: the frame is rendered as this cell's output.
df

Unnamed: 0,tweet,sarcastic,sarcasm,irony,satire,understatement,overstatement,rhetorical_question
0,The only thing I got from college is a caffein...,1,0.0,1.0,0.0,0.0,0.0,0.0
1,I love it when professors draw a big question ...,1,1.0,0.0,0.0,0.0,0.0,0.0
2,Remember the hundred emails from companies whe...,1,0.0,1.0,0.0,0.0,0.0,0.0
3,Today my pop-pop told me I was not “forced” to...,1,1.0,0.0,0.0,0.0,0.0,0.0
4,@VolphanCarol @littlewhitty @mysticalmanatee I...,1,1.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...
3463,The population spike in Chicago in 9 months is...,0,,,,,,
3464,You'd think in the second to last English clas...,0,,,,,,
3465,I'm finally surfacing after a holiday to Scotl...,0,,,,,,
3466,Couldn't be prouder today. Well done to every ...,0,,,,,,


In [3]:
import re
import emoji

# Non-ASCII characters in the patterns below are written as \u escapes so the
# patterns keep working even if the notebook file itself gets re-encoded.

# E-mail addresses; matched case-insensitively because lowercasing happens later.
_EMAIL_RE = re.compile(r'[a-z0-9._%+-]+@[a-z0-9.-]+\.[a-z]{2,}', re.IGNORECASE)

# The "shrug" emoticon as ONE sequence (macron, backslash, underscore, paren,
# katakana "tsu", paren, underscore, slash, macron), plus the form it takes when
# UTF-8 bytes were mis-decoded as cp1252.
_SHRUG_RE = re.compile(
    r'\u00c2?\u00af'                     # left arm, optionally preceded by a stray U+00C2
    r'\\?_\('                            # the backslash is often missing in real tweets
    r'(?:\u30c4|\u00e3\u0192\u201e)'     # face: real character or its mis-decoded form
    r'\)_/'
    r'\u00c2?\u00af'                     # right arm
)

# "i'm" written with a right single quotation mark (U+2019) or with its
# mis-decoded three-character form; word boundaries keep it from firing inside words.
_I_AM_RE = re.compile(r'\bi\s*(?:\u2019|\u00e2\u20ac\u2122)m\b', re.IGNORECASE)


def preprocess_text(text):
    """
    Normalise a tweet for downstream tokenization.

    Steps, in order: remove URLs, remove e-mail addresses, remove emojis,
    remove the shrug emoticon, remove @mentions, expand "i'm" (typographic
    apostrophe or its mis-decoded form) to "i am", lowercase, drop the "#"
    symbol (the hashtag word itself is kept), collapse runs of whitespace to a
    single space and trim both ends.

    Args:
        text: The input text string.

    Returns:
        The preprocessed text string.
    """

    # Remove URLs
    text = re.sub(r'http\S+', '', text)

    # Remove emails (must run before @mention removal, which would otherwise
    # eat the domain part and leave the local part behind)
    text = _EMAIL_RE.sub('', text)

    # Remove emojis
    text = emoji.replace_emoji(text, replace="")

    # Remove the shrug emoticon as a whole; ordinary parentheses, slashes and
    # underscores elsewhere in the tweet are left untouched
    text = _SHRUG_RE.sub('', text)

    # Remove @mentions
    text = re.sub(r'@\w+', '', text)

    # Expand "i'm" to "i am"
    text = _I_AM_RE.sub('i am', text)

    # Lowercasing
    text = text.lower()

    # Remove the "#" symbol only, keeping the hashtag word
    text = text.replace('#', '')

    # Normalise whitespace last so that gaps left by the removals above are
    # collapsed, and strip the leading/trailing space they may leave behind
    text = re.sub(r'\s+', ' ', text).strip()

    return text



In [4]:
# Count missing tweets before filling them. The count is stored and placed as
# the last expression so the cell actually displays it (an expression in the
# middle of a cell is evaluated and silently discarded).
n_missing_tweets = df['tweet'].isna().sum()

# Assign the filled column back instead of calling fillna(inplace=True) on the
# selected column: in-place mutation through a column selection is deprecated
# in pandas 2.x and has no effect on `df` once Copy-on-Write is enabled.
df['tweet'] = df['tweet'].fillna('')

n_missing_tweets

In [5]:
# Cast every tweet to str so the regex-based preprocess_text can be applied to each value.
df['tweet'] = df['tweet'].astype(str)

In [6]:
# Run the cleaning function over every tweet and keep the result in a new
# column, leaving the raw 'tweet' column untouched.
df['cleaned_tweet'] = df['tweet'].apply(preprocess_text)

In [7]:
# Preview the cleaned text column.
df['cleaned_tweet']

0       the only thing i got from college is a caffein...
1       i love it when professors draw a big question ...
2       remember the hundred emails from companies whe...
3       today my pop-pop told me i was not “forced” to...
4        i did too, and i also reported cancun cruz no...
                              ...                        
3463    the population spike in chicago in 9 months is...
3464    you'd think in the second to last english clas...
3465    i'm finally surfacing after a holiday to scotl...
3466    couldn't be prouder today. well done to every ...
3467    overheard as my 13 year old games with a frien...
Name: cleaned_tweet, Length: 3468, dtype: object

In [8]:
# (rows, columns) of the frame after adding 'cleaned_tweet'.
df.shape

(3468, 9)

In [9]:
# First rows, to compare the raw 'tweet' column with 'cleaned_tweet' side by side.
df.head()

Unnamed: 0,tweet,sarcastic,sarcasm,irony,satire,understatement,overstatement,rhetorical_question,cleaned_tweet
0,The only thing I got from college is a caffein...,1,0.0,1.0,0.0,0.0,0.0,0.0,the only thing i got from college is a caffein...
1,I love it when professors draw a big question ...,1,1.0,0.0,0.0,0.0,0.0,0.0,i love it when professors draw a big question ...
2,Remember the hundred emails from companies whe...,1,0.0,1.0,0.0,0.0,0.0,0.0,remember the hundred emails from companies whe...
3,Today my pop-pop told me I was not “forced” to...,1,1.0,0.0,0.0,0.0,0.0,0.0,today my pop-pop told me i was not “forced” to...
4,@VolphanCarol @littlewhitty @mysticalmanatee I...,1,1.0,0.0,0.0,0.0,0.0,0.0,"i did too, and i also reported cancun cruz no..."


In [10]:
# Class distribution of the binary label:
#   1 = sarcastic
#   0 = not sarcastic
df['sarcastic'].value_counts()

0    2601
1     867
Name: sarcastic, dtype: int64

In [11]:
from imblearn.over_sampling import RandomOverSampler 
import numpy as np

def balance_df(df, text, target, random_state=42):
    """Return a class-balanced two-column frame built by random oversampling.

    The ``text`` and ``target`` columns of ``df`` are passed to
    ``RandomOverSampler``, which (with its default settings) randomly repeats
    rows of the less frequent classes until they match the most frequent one.

    Args:
        df: Source DataFrame; it is not modified.
        text: Name of the feature column to carry over.
        target: Name of the label column to balance on.
        random_state: Seed forwarded to ``RandomOverSampler`` so the resampled
            frame is reproducible from run to run. Pass ``None`` to get the
            previous unseeded behaviour.

    Returns:
        A new DataFrame with exactly the columns ``[text, target]``.
    """
    sampler = RandomOverSampler(random_state=random_state)

    # The sampler wants a 2-D feature matrix, so the single text column becomes
    # shape (n_rows, 1). Labels are passed 1-D, which is the shape the sampler
    # expects for a target vector.
    features = df[text].to_numpy().reshape(-1, 1)
    labels = df[target].to_numpy()

    features_resampled, labels_resampled = sampler.fit_resample(features, labels)

    # Flatten the single feature column back into a plain column of values.
    return pd.DataFrame({text: features_resampled[:, 0], target: labels_resampled})

# Oversample the minority class so both labels are equally frequent.
# (The previous empty-DataFrame placeholder was overwritten immediately and has
# been dropped.)
# NOTE(review): in the cells visible here, `sarcasm_df` is only inspected with
# value_counts(); the train/test split below is built from `df`. If the balanced
# data is meant for training, oversample the training portion only, after
# splitting - oversampling first puts copies of the same tweet on both sides of
# the split.
sarcasm_df = balance_df(df, 'cleaned_tweet', 'sarcastic')

In [12]:
sarcasm_df['sarcastic'].value_counts()

1    2601
0    2601
Name: sarcastic, dtype: int64

In [13]:
from sklearn.model_selection import train_test_split

In [14]:
# Features are the cleaned tweet text; the target is the binary sarcasm label.
# NOTE(review): this uses the original, imbalanced `df`, not the oversampled
# `sarcasm_df` built earlier - confirm that is intended.
X = df['cleaned_tweet']
y = df['sarcastic']

# Hold out 20% for testing. The labels are imbalanced (roughly 3:1), so the
# split is stratified on `y` to keep the same class ratio in both parts;
# random_state pins the shuffle for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

### Downloading the Pretrained DistilBERT Model and Tokenizer

In [15]:
import tensorflow as tf
from transformers import TFDistilBertForSequenceClassification, DistilBertTokenizer

  from .autonotebook import tqdm as notebook_tqdm





In [16]:
from transformers import TFDistilBertForSequenceClassification, DistilBertTokenizer

# Load the 'distilbert-base-uncased' checkpoint into a TensorFlow
# sequence-classification model. As the loader message below reports, the
# pre_classifier/classifier weights are not in the checkpoint and are newly
# initialized, so the classification head is untrained at this point.
model = TFDistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased')
# Tokenizer loaded from the same checkpoint name as the model.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')




Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_projector.bias', 'vocab_transform.weight']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFDistilBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']
You should 

### Tokenization and Encoding (padding, Truncation, max_length)

Tokenization is the process of splitting text into smaller units called tokens, which can be words, phrases, subwords, or characters.

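Padding appends a special padding token to shorter sequences so that every sequence in a batch has the same length as the longest one. Truncation does the opposite: sequences longer than `max_length` are cut down to that length.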
For example, tokenizing the sentence "the cat sat on the mat." at word level gives:

"the", "cat", "sat", "on", "the", "mat".

In [18]:
# Tokenize a small batch with the tokenizer's default call. As the output below
# shows, each sentence keeps its own length (4 vs 5 ids), i.e. nothing is padded,
# and every sequence is wrapped in the ids 101 ... 102.
text = ['hey me', 'how are you']
tokenizer(text)

{'input_ids': [[101, 4931, 2033, 102], [101, 2129, 2024, 2017, 102]], 'attention_mask': [[1, 1, 1, 1], [1, 1, 1, 1, 1]]}