In [48]:
import csv
import json
import pandas as pd
import re
import gensim
from nltk.stem import WordNetLemmatizer
from nltk.tokenize.treebank import TreebankWordDetokenizer
from sklearn.utils import resample
import nltk

# Ensure required NLTK resources are available
nltk.download('wordnet')
nltk.download('punkt')

[nltk_data] Downloading package wordnet to /home/dyang/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /home/dyang/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [49]:
import torch

torch.cuda.is_available()

True

In [50]:
def convert(integer):
    if (integer == 'positive'):
        return 1
    elif (integer == 'negative'):
        return -1
    elif (integer == 'neutral'):
        return 0

In [94]:
# collect our datapoints
sentiment_reddit = pd.read_csv('sentiment_data/Reddit_Data.csv')
sentiment_twitter = pd.read_csv('sentiment_data/Twitter_Data.csv')

sarcasm = []
with open("sarcasm_data/Sarcasm_Headlines_Dataset_v2.json", 'r', encoding='utf-8') as f:
    for line in f:
        sarcasm.append(json.loads(line))
sarcasm_v2 = pd.DataFrame(sarcasm)

sarcasm = []
with open ("sarcasm_data/Sarcasm_Headlines_Dataset.json", 'r', encoding='utf-8') as f:
    for line in f:
        sarcasm.append(json.loads(line))
sarcasm_v1 = pd.DataFrame(sarcasm)

# concactenate the datasets
sentiments = pd.concat([sentiment_reddit, sentiment_twitter])
sarcasm = pd.concat([sarcasm_v1, sarcasm_v2])

print("total datapoints: ", sentiments.shape[0])
print("number of positive sentiments: ", sentiments[sentiments['category'] == 1].shape[0])
print("number of neutral sentiments: ", sentiments[sentiments['category'] == 0].shape[0])
print("number of negative sentiments: ", sentiments[sentiments['category'] == -1].shape[0])

data_majority = sentiments[sentiments['category'] == 1]
data_minority = sentiments[sentiments['category'] == 0]
data_minority2 = sentiments[sentiments['category'] == -1]

data_minority_upsampled = resample(data_minority,
                                   replace=True,
                                   n_samples=data_majority.shape[0], 
                                   random_state=8)

data_minority2_upsampled = resample(data_minority2,
                                    replace=True,
                                    n_samples=data_majority.shape[0],
                                    random_state=8)

sentiments_upsampled = pd.concat([data_majority, data_minority_upsampled, data_minority2_upsampled])

print("After upsampling:")
print(sentiments_upsampled['category'].value_counts())

total datapoints:  200229
number of positive sentiments:  88080
number of neutral sentiments:  68355
number of negative sentiments:  43787
After upsampling:
category
 1.0    88080
 0.0    88080
-1.0    88080
Name: count, dtype: int64


In [95]:
# Now we also need to upsample the sarcasm dataset

print("total datapoints: ", sarcasm.shape[0])
print("number of sarcastic headlines: ", sarcasm[sarcasm['is_sarcastic'] == 1].shape[0])
print("number of non-sarcastic headlines: ", sarcasm[sarcasm['is_sarcastic'] == 0].shape[0])

data_majority = sarcasm[sarcasm['is_sarcastic'] == 0]
data_minority = sarcasm[sarcasm['is_sarcastic'] == 1]


data_minority_upsampled = resample(data_minority,
                                      replace=True,
                                      n_samples=data_majority.shape[0],
                                      random_state=8) 

sarcasm_upsampled = pd.concat([data_majority, data_minority_upsampled])

print("After upsampling:")
print(sarcasm_upsampled['is_sarcastic'].value_counts())


total datapoints:  55328
number of sarcastic headlines:  25358
number of non-sarcastic headlines:  29970
After upsampling:
is_sarcastic
0    29970
1    29970
Name: count, dtype: int64


In [96]:

def depure_data(data):
    if pd.isna(data):
        return ""  # Return empty string for missing values
    data = str(data)  # Convert data to string to ensure compatibility with regex
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    data = url_pattern.sub('', data)
    data = re.sub('\S*@\S*\s?', '', data)
    data = re.sub('\s+', ' ', data)
    data = re.sub("'", "", data)
    return data

def sent_to_words(sentences):
    for sentence in sentences:
        yield gensim.utils.simple_preprocess(str(sentence), deacc=True)

def preprocess(data_frame, column_name='clean_comment', remove_stopwords=True, lemmatize=True):
    temp = data_frame[column_name].apply(depure_data).tolist()
    data_words = list(sent_to_words(temp))
    if remove_stopwords:
        stop_words = gensim.parsing.preprocessing.STOPWORDS
        data_words = [[word for word in doc if word not in stop_words] for doc in data_words]
    if lemmatize:
        lemmatizer = WordNetLemmatizer()
        data_words = [[lemmatizer.lemmatize(word) for word in doc] for doc in data_words]
    data = [TreebankWordDetokenizer().detokenize(words) for words in data_words]
    return data

In [97]:
print(sentiments_upsampled.columns)
print(sarcasm_upsampled.columns)

print(sarcasm_upsampled.head())


Index(['clean_comment', 'category', 'clean_text'], dtype='object')
Index(['article_link', 'headline', 'is_sarcastic'], dtype='object')
                                        article_link  \
0  https://www.huffingtonpost.com/entry/versace-b...   
1  https://www.huffingtonpost.com/entry/roseanne-...   
4  https://www.huffingtonpost.com/entry/jk-rowlin...   
5  https://www.huffingtonpost.com/entry/advancing...   
6  https://www.huffingtonpost.com/entry/how-meat-...   

                                            headline  is_sarcastic  
0  former versace store clerk sues over secret 'b...             0  
1  the 'roseanne' revival catches up to our thorn...             0  
4  j.k. rowling wishes snape happy birthday in th...             0  
5                        advancing the world's women             0  
6     the fascinating case for eating lab-grown meat             0  


In [98]:
sentiment_data = preprocess(sentiments_upsampled)
sarcasm_data = preprocess(sarcasm_upsampled, column_name='headline')

In [99]:
# before
print(sentiments_upsampled['clean_comment'].iloc[3])
print(sarcasm_upsampled['headline'].iloc[3])
# after
print(sarcasm_data[3])
print(sentiment_data[3])

# lengths
print(len(sarcasm_data))
print(len(sentiment_data))

 was teens when discovered zen meditation was then undiagnosed bpd being homeschooled and just gotten 56k modem with web connection where came across link zen meditation tried for couple weeks and the change was palpable felt the most profound sense peace ever felt grades immediately started going had more energy started martial arts just huge positive change all around parents asked something was wrong fundie parents when anything changes this was where naiveté kicked foolishly told them been trying meditation and really calmed down thought they happy that found something that helped but never forget what happened next mother affected this mockingly calm breathy voice she said you can pretend calm and happy all you want but without jesus you never content was that moment that any belief had christian faith all vanished completely realized that she had probably never been happy ever have never felt profoundly sorry for someone did for her that moment 
advancing the world's women
advanc

In [None]:
from transformers import BertTokenizer
from sklearn.model_selection import train_test_split
import torch

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')



In [110]:
# convert the text data into tokenized data (numerical data)
max_len = 120
max_words =tokenizer.vocab_size

assert isinstance(sentiment_data, list), "sentiment_data should be a list of strings"
assert isinstance(sarcasm_data, list), "sarcasm_data should be a list of strings"

data_sentiment = sentiment_data
tokenized_sentiment = tokenizer(data_sentiment, padding=True, truncation=True, max_length=max_len, return_tensors='pt')

data_sarcasm = sarcasm_data
tokenized_sarcasm = tokenizer(data_sarcasm, padding=True, truncation=True, max_length=max_len, return_tensors='pt')

assert tokenized_sentiment['input_ids'].shape[1] == max_len
assert tokenized_sarcasm['input_ids'].shape[1] == max_len, f"Expected {max_len} but got {tokenized_sarcasm['input_ids'].shape[1]} for sarcasm data"

In [111]:
# Make the label tensors
adjusted_labels_sentiment = [label + 1 for label in sentiments_upsampled['category']]
labels_sentiments = torch.tensor(adjusted_labels_sentiment)

adjusted_labels_sarcasm = [label for label in sarcasm_upsampled['is_sarcastic']]
labels_sarcasm = torch.tensor(adjusted_labels_sarcasm)

print(labels_sarcasm, len(labels_sarcasm))
print(labels_sentiments, len(labels_sentiments))

tensor([0, 0, 0,  ..., 1, 1, 1]) 59940
tensor([2., 2., 2.,  ..., 0., 0., 0.]) 264240


In [112]:
print(tokenized_sentiment['input_ids'].shape)
print(len(labels_sarcasm))

torch.Size([264240, 120])
59940


In [113]:
X_train_sentiment_ids, X_test_sentiment_ids, X_train_sentiment_mask, X_test_sentiment_mask, y_train_sentiment, y_test_sentiment = train_test_split(
    tokenized_sentiment['input_ids'], tokenized_sentiment['attention_mask'], labels_sentiments, test_size=0.25)

X_train_sarcasm_ids, X_test_sarcasm_ids, X_train_sarcasm_mask, X_test_sarcasm_mask, y_train_sarcasm, y_test_sarcasm = train_test_split(
    tokenized_sarcasm['input_ids'], tokenized_sarcasm['attention_mask'], labels_sarcasm, test_size=0.2)

print(len(X_train_sentiment_ids), len(X_test_sentiment_ids), len(y_train_sentiment), len(y_test_sentiment))
print(len(X_train_sarcasm_ids), len(X_test_sarcasm_ids), len(y_train_sarcasm), len(y_test_sarcasm))
print(X_train_sarcasm_ids.shape, X_train_sarcasm_mask.shape, y_train_sarcasm.shape)

198180 66060 198180 66060
47952 11988 47952 11988
torch.Size([47952, 120]) torch.Size([47952, 120]) torch.Size([47952])


In [56]:
# import the pytorch dependencies
import torch
import torch.nn as nn 
import torch.nn.functional as F

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

print("using device: ", device)

using device:  cuda


In [114]:
import torch.optim as optim

criteria_sentiment = nn.CrossEntropyLoss()
criteria_sarcasm = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

In [75]:
class SentimentSarcasmModel(nn.Module):
    def __init__(self, nb_words, embedding_dim):
        super(SentimentSarcasmModel, self).__init__()

        # Make our embedding layers
        self.embedding1 = nn.Embedding(nb_words, embedding_dim)
        self.embedding1.weight.requires_grad = True
        self.embedding2 = nn.Embedding(nb_words, embedding_dim)
        self.embedding2.weight.requires_grad = True

        # LSTM layers
        self.lstm1 = nn.LSTM(embedding_dim, embedding_dim, num_layers=2, batch_first=True, bidirectional=True)
        self.lstm2 = nn.LSTM(embedding_dim, embedding_dim, num_layers=2, batch_first=True, bidirectional=True)

        # Sentiment inner branch hidden layers
        self.fc1_1 = nn.Linear(embedding_dim * 2, 256)  # Changed from 20 * 2 to 128 * 2
        self.drop1_1 = nn.Dropout(0.4)
        self.fc1_2 = nn.Linear(256, 32)
        self.drop1_2 = nn.Dropout(0.4)
        self.fc1_3 = nn.Linear(32, 3)

        # Sarcasm inner branch hidden layers
        self.fc2_1 = nn.Linear(embedding_dim * 2, 256)  # Changed from 20 * 2 to 128 * 2
        self.drop2_1 = nn.Dropout(0.4)
        self.fc2_2 = nn.Linear(256, 32)
        self.drop2_2 = nn.Dropout(0.4)
        self.fc2_3 = nn.Linear(32, 2)

    def forward(self, x1, x2):
        x1 = self.embedding1(x1)
        x2 = self.embedding2(x2)

        x1, _ = self.lstm1(x1)
        x2, _ = self.lstm2(x2)

        # Use the output of the last time step for each sequence
        x1 = x1[:, -1, :]
        x2 = x2[:, -1, :]

        x1 = F.relu(self.fc1_1(x1))
        x1 = self.drop1_1(x1)
        x1 = F.relu(self.fc1_2(x1))
        x1 = self.drop1_2(x1)
        x1 = self.fc1_3(x1)

        x2 = F.relu(self.fc2_1(x2))
        x2 = self.drop2_1(x2)
        x2 = F.relu(self.fc2_2(x2))
        x2 = self.drop2_2(x2)
        x2 = self.fc2_3(x2)

        return x1, x2

In [76]:
import torch
import torch.nn.functional as F

# Assuming model is already defined and device is set
model = SentimentSarcasmModel(nb_words=max_words, embedding_dim=128).to(device)
model.train()

# Define number of epochs
num_epochs = 10
batch_size = 32

Y_train_sentiment = y_train_sentiment.long()
Y_train_sarcasm = y_train_sarcasm.long()

num_samples_sentiment = X_train_sentiment_ids.size(0)
indices_sentiment = torch.randperm(num_samples_sentiment)

num_samples_sarcasm = X_train_sarcasm_ids.size(0)
indices_sarcasm = torch.randperm(num_samples_sarcasm)

prev_gradient = None

for epoch in range(num_epochs):
    total_loss_sentiment = 0.0
    total_loss_sarcasm = 0.0
    
    for i in range(0, num_samples_sentiment, batch_size):
        batch_indices = indices_sentiment[i:i+batch_size]
        batch_sentiment = X_train_sentiment_ids[batch_indices].to(device)
        labels_sentiment = y_train_sentiment[batch_indices].to(device)

        output_sentiment, _ = model(batch_sentiment, batch_sarcasm)

        labels_sentiment = labels_sentiment.long()

        loss_sentiment = criteria_sentiment(output_sentiment, labels_sentiment)

        optimizer.zero_grad()
        loss_sentiment.backward()
        optimizer.step()

        total_loss_sentiment += loss_sentiment.item()
    
    for i in range(0, num_samples_sarcasm, batch_size):
        batch_indices = indices_sarcasm[i:i+batch_size]
        batch_sarcasm = X_train_sarcasm_ids[batch_indices].to(device)
        labels_sarcasm = y_train_sarcasm[batch_indices].to(device)

        _, output_sarcasm = model(batch_sentiment, batch_sarcasm)

        labels_sarcasm = labels_sarcasm.long()

        loss_sarcasm = criteria_sarcasm(output_sarcasm, labels_sarcasm)

        optimizer.zero_grad()
        loss_sarcasm.backward()
        optimizer.step()

        total_loss_sarcasm += loss_sarcasm.item()

    # Print average losses for the epoch
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss Sentiment: {total_loss_sentiment/num_samples_sentiment:.4f}, Loss Sarcasm: {total_loss_sarcasm/num_samples_sarcasm:.4f}')



Epoch [1/10], Loss Sentiment: 0.0344, Loss Sarcasm: 0.0218
Epoch [2/10], Loss Sentiment: 0.0344, Loss Sarcasm: 0.0218
Epoch [3/10], Loss Sentiment: 0.0344, Loss Sarcasm: 0.0218


KeyboardInterrupt: 

In [45]:
model.eval()  # Set the model to evaluation mode

batch_size = 32
num_samples_sentiment = X_test_sentiment_ids.size(0)
num_samples_sarcasm = X_test_sarcasm_ids.size(0)

# Initialize counters for correct predictions
correct_sentiment = 0
correct_sarcasm = 0

with torch.no_grad():
    # Iterate over the sentiment test data in batches
    for i in range(0, num_samples_sentiment, batch_size):
        batch_indices = slice(i, min(i + batch_size, num_samples_sentiment))
        batch_sentiment = X_test_sentiment_ids[batch_indices].to(device)
        labels_sentiment = y_test_sentiment[batch_indices].to(device)

        output_sentiment, _ = model(batch_sentiment, torch.zeros_like(batch_sentiment))  # Assuming mock input for sarcasm
        _, predicted_sentiment = torch.max(output_sentiment, 1)
        correct_sentiment += (predicted_sentiment == labels_sentiment).sum().item()

    # Iterate over the sarcasm test data in batches
    for i in range(0, num_samples_sarcasm, batch_size):
        batch_indices = slice(i, min(i + batch_size, num_samples_sarcasm))
        batch_sarcasm = X_test_sarcasm_ids[batch_indices].to(device)
        labels_sarcasm = y_test_sarcasm[batch_indices].to(device)

        _, output_sarcasm = model(torch.zeros_like(batch_sarcasm), batch_sarcasm)  # Assuming mock input for sentiment
        _, predicted_sarcasm = torch.max(output_sarcasm, 1)
        correct_sarcasm += (predicted_sarcasm == labels_sarcasm).sum().item()

# Calculate accuracies
accuracy_sentiment = correct_sentiment / num_samples_sentiment
accuracy_sarcasm = correct_sarcasm / num_samples_sarcasm

print(f'Accuracy Sentiment: {accuracy_sentiment:.4f}, Accuracy Sarcasm: {accuracy_sarcasm:.4f}')


Accuracy Sentiment: 0.3328, Accuracy Sarcasm: 0.5021
