In [2]:
import pandas as pd
import numpy as np 
import matplotlib as plt 

tweets = pd.read_csv('combined.csv', encoding='utf-8')

Deleting irrelevant columns

In [3]:
tweets = tweets.drop(['tweet_date_created'], axis=1)
tweets = tweets.drop(['sentiment_score'], axis=1)

Checking for duplicates and deleting them

In [4]:
duplicates = tweets[tweets.duplicated(subset=['tweet_id'], keep=False)]

if not duplicates.empty:
    print(f"Found {len(duplicates)} duplicate tweet ids. Removing duplicates...")
    tweets.drop_duplicates(subset=['tweet_id'], inplace=True)
else:
    print("No duplicate tweet ids found.")

Found 762643 duplicate tweet ids. Removing duplicates...


In [5]:
tweets = tweets.drop(['tweet_id'], axis=1)

Deleting NULLS

In [6]:
tweets = tweets.dropna()
print(tweets.isnull().sum())

tweet_text    0
language      0
sentiment     0
dtype: int64


Language Check

In [7]:
all_english = (tweets['language'] == 'en').all()

if all_english:
    print("All values in the 'language' column are 'en'")
else:
    print("Not all values in the 'language' column are 'en'")

All values in the 'language' column are 'en'


In [8]:
tweets = tweets.drop(['language'], axis=1)

Sentiment Map 

In [9]:
sentiment_map = {"NEGATIVE": 0, "POSITIVE": 1, "NEUTRAL": 2, "MIXED": 3}

# Map the sentiment labels to their numeric values
tweets['sentiment_values'] = tweets['sentiment'].map(sentiment_map)

# Print the new column that contains the mapped values
print(tweets['sentiment_values'])
tweets.head(5)

0          2
1          2
2          2
3          0
4          2
          ..
5393957    2
5393958    0
5393959    2
5393960    2
5393961    2
Name: sentiment_values, Length: 5012534, dtype: int64


Unnamed: 0,tweet_text,sentiment,sentiment_values
0,Bayer Leverkusen goalkeeper Bernd Leno will no...,NEUTRAL,2
1,Gary Speed v Blackburn at St James in 2001/02 ...,NEUTRAL,2
2,@ChelseaFC Don't make him regret it and start ...,NEUTRAL,2
3,"@LiverpoolFF @AnfieldEdition He's a liar, made...",NEGATIVE,0
4,@theesk @Everton Didn't realise Kenwright is d...,NEUTRAL,2


In [10]:
# Find the indices of rows with Mixed sentiment
mixed_indices = tweets[tweets['sentiment_values'] == 3].index
tweets = tweets.drop(mixed_indices)

In [11]:
negative_tweets = tweets[tweets['sentiment_values'] == 0]
positive_tweets = tweets[tweets['sentiment_values'] == 1]
neutral_tweets = tweets[tweets['sentiment_values'] == 2]

print('No of positive tagged tweets is: {}'.format(len(positive_tweets)))
print('No of negative tagged tweets is: {}'.format(len(negative_tweets)))
print('No of neutral tagged tweets is: {}'.format(len(neutral_tweets)))

No of positive tagged tweets is: 1070334
No of negative tagged tweets is: 354501
No of neutral tagged tweets is: 3549918


In [12]:
import pandas as pd

# Shuffle the DataFrame to ensure that the downsampling is random
tweets = tweets.sample(frac=1, random_state=42)

# Count the number of tweets in each sentiment class
counts = tweets['sentiment_values'].value_counts()

# Find the smallest class size
smallest_size = counts.min()

# Downsample each class to the smallest size
positive_tweets = tweets[tweets['sentiment_values'] == 1].sample(n=smallest_size, random_state=42)
negative_tweets = tweets[tweets['sentiment_values'] == 0].sample(n=smallest_size, random_state=42)
neutral_tweets = tweets[tweets['sentiment_values'] == 2].sample(n=smallest_size, random_state=42)

# Concatenate the downsampled DataFrames
tweets = pd.concat([positive_tweets, negative_tweets, neutral_tweets], ignore_index=True)

# Print the new counts of tweets in each class
print('No of positive tagged tweets is: {}'.format(len(tweets[tweets['sentiment_values'] == 1])))
print('No of negative tagged tweets is: {}'.format(len(tweets[tweets['sentiment_values'] == 0])))
print('No of neutral tagged tweets is: {}'.format(len(tweets[tweets['sentiment_values'] == 2])))


No of positive tagged tweets is: 354501
No of negative tagged tweets is: 354501
No of neutral tagged tweets is: 354501


In [13]:
tweets['tweet_text'] = tweets['tweet_text'].astype('str')

In [14]:
abbreviations = {
    "$" : " dollar ",
    "€" : " euro ",
    "4ao" : "for adults only",
    "a.m" : "before midday",
    "a3" : "anytime anywhere anyplace",
    "aamof" : "as a matter of fact",
    "acct" : "account",
    "adih" : "another day in hell",
    "afaic" : "as far as i am concerned",
    "afaict" : "as far as i can tell",
    "afaik" : "as far as i know",
    "afair" : "as far as i remember",
    "afk" : "away from keyboard",
    "app" : "application",
    "approx" : "approximately",
    "apps" : "applications",
    "asap" : "as soon as possible",
    "asl" : "age, sex, location",
    "atk" : "at the keyboard",
    "ave." : "avenue",
    "aymm" : "are you my mother",
    "ayor" : "at your own risk", 
    "b&b" : "bed and breakfast",
    "b+b" : "bed and breakfast",
    "b.c" : "before christ",
    "b2b" : "business to business",
    "b2c" : "business to customer",
    "b4" : "before",
    "b4n" : "bye for now",
    "b@u" : "back at you",
    "bae" : "before anyone else",
    "bak" : "back at keyboard",
    "bbbg" : "bye bye be good",
    "bbc" : "british broadcasting corporation",
    "bbias" : "be back in a second",
    "bbl" : "be back later",
    "bbs" : "be back soon",
    "be4" : "before",
    "bfn" : "bye for now",
    "blvd" : "boulevard",
    "bout" : "about",
    "brb" : "be right back",
    "bros" : "brothers",
    "brt" : "be right there",
    "bsaaw" : "big smile and a wink",
    "btw" : "by the way",
    "bwl" : "bursting with laughter",
    "c/o" : "care of",
    "cet" : "central european time",
    "cf" : "compare",
    "cia" : "central intelligence agency",
    "csl" : "can not stop laughing",
    "cu" : "see you",
    "cul8r" : "see you later",
    "cv" : "curriculum vitae",
    "cwot" : "complete waste of time",
    "cya" : "see you",
    "cyt" : "see you tomorrow",
    "dae" : "does anyone else",
    "dbmib" : "do not bother me i am busy",
    "diy" : "do it yourself",
    "dm" : "direct message",
    "dwh" : "during work hours",
    "e123" : "easy as one two three",
    "eet" : "eastern european time",
    "eg" : "example",
    "embm" : "early morning business meeting",
    "encl" : "enclosed",
    "encl." : "enclosed",
    "etc" : "and so on",
    "faq" : "frequently asked questions",
    "fawc" : "for anyone who cares",
    "fb" : "facebook",
    "fc" : "fingers crossed",
    "fig" : "figure",
    "fimh" : "forever in my heart", 
    "ft." : "feet",
    "ft" : "featuring",
    "ftl" : "for the loss",
    "ftw" : "for the win",
    "fwiw" : "for what it is worth",
    "fyi" : "for your information",
    "g9" : "genius",
    "gahoy" : "get a hold of yourself",
    "gal" : "get a life",
    "gcse" : "general certificate of secondary education",
    "gfn" : "gone for now",
    "gg" : "good game",
    "gl" : "good luck",
    "glhf" : "good luck have fun",
    "gmt" : "greenwich mean time",
    "gmta" : "great minds think alike",
    "gn" : "good night",
    "g.o.a.t" : "greatest of all time",
    "goat" : "greatest of all time",
    "goi" : "get over it",
    "gps" : "global positioning system",
    "gr8" : "great",
    "gratz" : "congratulations",
    "gyal" : "girl",
    "h&c" : "hot and cold",
    "hp" : "horsepower",
    "hr" : "hour",
    "hrh" : "his royal highness",
    "ht" : "height",
    "ibrb" : "i will be right back",
    "ic" : "i see",
    "icq" : "i seek you",
    "icymi" : "in case you missed it",
    "idc" : "i do not care",
    "idgadf" : "i do not give a damn fuck",
    "idgaf" : "i do not give a fuck",
    "idk" : "i do not know",
    "ie" : "that is",
    "i.e" : "that is",
    "ifyp" : "i feel your pain",
    "IG" : "instagram",
    "iirc" : "if i remember correctly",
    "ilu" : "i love you",
    "ily" : "i love you",
    "imho" : "in my humble opinion",
    "imo" : "in my opinion",
    "imu" : "i miss you",
    "iow" : "in other words",
    "irl" : "in real life",
    "j4f" : "just for fun",
    "jic" : "just in case",
    "jk" : "just kidding",
    "jsyk" : "just so you know",
    "l8r" : "later",
    "lb" : "pound",
    "lbs" : "pounds",
    "ldr" : "long distance relationship",
    "lmao" : "laugh my ass off",
    "lmfao" : "laugh my fucking ass off",
    "lol" : "laughing out loud",
    "ltd" : "limited",
    "ltns" : "long time no see",
    "m8" : "mate",
    "mf" : "motherfucker",
    "mfs" : "motherfuckers",
    "mfw" : "my face when",
    "mofo" : "motherfucker",
    "mph" : "miles per hour",
    "mr" : "mister",
    "mrw" : "my reaction when",
    "ms" : "miss",
    "mte" : "my thoughts exactly",
    "nagi" : "not a good idea",
    "nbc" : "national broadcasting company",
    "nbd" : "not big deal",
    "nfs" : "not for sale",
    "ngl" : "not going to lie",
    "nhs" : "national health service",
    "nrn" : "no reply necessary",
    "nsfl" : "not safe for life",
    "nsfw" : "not safe for work",
    "nth" : "nice to have",
    "nvr" : "never",
    "nyc" : "new york city",
    "oc" : "original content",
    "og" : "original",
    "ohp" : "overhead projector",
    "oic" : "oh i see",
    "omdb" : "over my dead body",
    "omg" : "oh my god",
    "omw" : "on my way",
    "p.a" : "per annum",
    "p.m" : "after midday",
    "pm" : "prime minister",
    "poc" : "people of color",
    "pov" : "point of view",
    "pp" : "pages",
    "ppl" : "people",
    "prw" : "parents are watching",
    "ps" : "postscript",
    "pt" : "point",
    "ptb" : "please text back",
    "pto" : "please turn over",
    "qpsa" : "what happens", 
    "ratchet" : "rude",
    "rbtl" : "read between the lines",
    "rlrt" : "real life retweet", 
    "rofl" : "rolling on the floor laughing",
    "roflol" : "rolling on the floor laughing out loud",
    "rotflmao" : "rolling on the floor laughing my ass off",
    "rt" : "retweet",
    "ruok" : "are you ok",
    "sfw" : "safe for work",
     "sk8" : "skate",
    "smh" : "shake my head",
    "sq" : "square",
    "srsly" : "seriously", 
    "ssdd" : "same stuff different day",
    "tbh" : "to be honest",
    "tbs" : "tablespooful",
    "tbsp" : "tablespooful",
    "tfw" : "that feeling when",
    "thks" : "thank you",
    "tho" : "though",
    "thx" : "thank you",
    "tia" : "thanks in advance",
    "til" : "today i learned",
    "tl;dr" : "too long i did not read",
    "tldr" : "too long i did not read",
    "tmb" : "tweet me back",
    "tntl" : "trying not to laugh",
    "ttyl" : "talk to you later",
    "u" : "you",
    "u2" : "you too",
    "u4e" : "yours for ever",
    "utc" : "coordinated universal time",
    "w/" : "with",
    "w/o" : "without",
    "w8" : "wait",
    "wassup" : "what is up",
    "wb" : "welcome back",
    "wtf" : "what the fuck",
    "wtg" : "way to go",
    "wtpa" : "where the party at",
    "wuf" : "where are you from",
    "wuzup" : "what is up",
    "wywh" : "wish you were here",
    "yd" : "yard",
    "ygtr" : "you got that right",
    "ynk" : "you never know",
    "zzz" : "sleeping bored and tired"
}

In [16]:
# Create a function to replace abbreviations
def replace_abbreviations(text):
    words = text.split()
    new_words = []
    for word in words:
        if word in abbreviations:
            new_words.append(abbreviations[word])
        else:
            new_words.append(word)
    return ' '.join(new_words)

In [14]:
import re
import string 
import pandas as pd
import nltk 
from nltk.corpus import stopwords

english_stopwords = stopwords.words('english')

# Add custom stopwords
custom_stopwords = ['dont', 'shouldve', 'arent', 'couldnt', 'didnt', 'doesnt', 'hadnt', 'havent', 'mustnt', 'shouldnt', 'wasnt', 'werent', 
                    'wont', 'wouldnt']
english_stopwords.extend(custom_stopwords)

def preprocess_text(text):
    # Get the default NLTK English stopwords
    
    # List of words to keep
    #words_to_keep = {"off", "over", "under", "few", "more", "no", "not", "don't", "should", "should've", "aren't", 
    #                 "couldn't", "didn't", "doesn't", "hadn't", "haven't", "mustn't", "shouldn't", "wasn't", "weren't",
    #                 "won't", "wouldn't"}
    # Create a custom stopwords list
    #custom_stopwords = default_stopwords - words_to_keep
    # Convert to lowercase
    text = text.lower()
    # Remove URLs
    text = re.sub(r'http\S+', '', text)
    # Remove mentions
    text = re.sub(r'@\w+', '', text)
    # Remove hashtags
    text = re.sub(r'#\w+', '', text)
    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)
    # Remove whitespace
    text = text.strip()
    # Remove custom stopwords and join the words in a single string
    text = ' '.join([word for word in text.split() if word not in english_stopwords])
    
    return text


In [14]:
import re
from nltk.corpus import stopwords

def preprocess_text(text):
    # List of words to keep
    #words_to_keep = {"off", "over", "under", "few", "more", "no", "not", "don't", "should", "should've", "aren't", 
    #                 "couldn't", "didn't", "doesn't", "hadn't", "haven't", "mustn't", "shouldn't", "wasn't", "weren't",
    #                 "won't", "wouldn't"}
    # Create a custom stopwords list
    #custom_stopwords = default_stopwords - words_to_keep
    # Convert to lowercase
    text = text.lower()
    # Remove URLs
    text = re.sub(r'http\S+', '', text)
    # Remove mentions
    text = re.sub(r'@\w+', '', text)
    # Remove hashtags
    text = re.sub(r'#\w+', '', text)
    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)
    # Remove whitespace
    text = text.strip()
    
    return text


In [15]:
import pandas as pd
from sklearn.utils import shuffle

# Apply the preprocessing function to the 'text' column
tweets['processed_text'] = tweets['tweet_text'].apply(preprocess_text)

tweets =  shuffle(tweets).reset_index(drop=True)

In [16]:
# Filter the rows containing the word 'not' in the 'processed_text' column
rows_with_not = tweets.loc[tweets['processed_text'].str.contains(r'\bwont\b', regex=True)]

# Print the rows containing the word 'not'
print(rows_with_not)


Empty DataFrame
Columns: [tweet_text, sentiment, sentiment_values, processed_text]
Index: []


In [15]:
tweets.head(5)

Unnamed: 0,tweet_text,sentiment,sentiment_values,processed_text
0,@Olatu125 @Amazing02082759 @67Kelechi We wanna...,POSITIVE,1,we wanna see their beautiful jerseys how many ...
1,People need to realise what @JesseLingard brin...,NEUTRAL,2,people need to realise what brings to united ...
2,At the end of the EPL season some players went...,NEGATIVE,0,at the end of the epl season some players went...
3,Ready to roll - #SilkBrowser #AmazonFireStick ...,POSITIVE,1,ready to roll \n\nthe boys are in action th...
4,@LCFC TOTALLY DISAPPOINTED THAT YOU HAVE REFU...,NEGATIVE,0,totally disappointed that you have refused wes...


In [15]:
def remove_nan_na(text):
    # Remove "nan" and "na" and join the words in a single string
    text = ' '.join([word for word in text.split() if word not in ('nan', 'na', 'n/')])
    return text


In [16]:
tweets = tweets.drop(['sentiment'], axis=1)

In [16]:
X = tweets['processed_text']
y = tweets['sentiment_values']

In [18]:
import pandas as pd

total_rows = tweets.shape[0]
print("Total number of rows:", total_rows)

Total number of rows: 1063503


In [19]:
import numpy as np

tweet_lengths = tweets['processed_text'].apply(lambda x: len(x.split()))
mean_length = np.mean(tweet_lengths)
std_length = np.std(tweet_lengths)

print("Average tweet length:", mean_length)
print("Standard deviation of tweet length:", std_length)

Average tweet length: 11.030035646349846
Standard deviation of tweet length: 6.623578287701834


In [17]:
import numpy as np
import pandas as pd
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, Dropout, Bidirectional
from keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import GridSearchCV


In [18]:
max_features = 30000
max_length = 100

tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(X)
X = tokenizer.texts_to_sequences(X)
X = pad_sequences(X, maxlen=max_length)


In [19]:
import numpy as np
from sklearn.model_selection import train_test_split

# Split the dataset into 70% training and 30% combined validation and testing
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)

# Split the temporary dataset (30% of the entire dataset) into 50% validation and 50% testing
# This results in 15% validation and 15% testing of the entire dataset
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

print("Training data size:", len(X_train))
print("Validation data size:", len(X_val))
print("Testing data size:", len(X_test))


Training data size: 744452
Validation data size: 159525
Testing data size: 159526


GPU with LSTM Implementation

In [28]:
# import numpy as np
# import torch
# import torch.nn as nn
# import torch.optim as optim
# from torch.utils.data import DataLoader, Dataset, TensorDataset
# from sklearn.metrics import accuracy_score

# # Load GloVe embeddings
# def load_glove_embeddings(file_path, embedding_dim, word_index, max_features):
#     embeddings_index = {}
#     with open(file_path, 'r', encoding='utf-8') as f:
#         for line in f:
#             values = line.split()
#             word = values[0]
#             coefs = np.asarray(values[1:], dtype='float32')
#             embeddings_index[word] = coefs
    
#     embedding_matrix = np.zeros((max_features, embedding_dim))
#     for word, i in word_index.items():
#         if i >= max_features:
#             continue
#         embedding_vector = embeddings_index.get(word)
#         if embedding_vector is not None:
#             embedding_matrix[i] = embedding_vector
#     return torch.FloatTensor(embedding_matrix)

# # Load the GloVe embeddings matrix
# glove_file_path = 'glove.twitter.27B.200d.txt'  # Update the path to the downloaded GloVe file
# embedding_dim = 200
# embedding_matrix = load_glove_embeddings(glove_file_path, embedding_dim, tokenizer.word_index, max_features)

# # Define the LSTM model with GloVe embeddings
# class LSTMModel(nn.Module):
#     def __init__(self, embedding_matrix, embed_dim=200, lstm_out=256, dropout_rate=0.2, num_classes=3):
#         super().__init__()
#         self.embedding = nn.Embedding.from_pretrained(embedding_matrix, freeze=True)
#         self.lstm = nn.LSTM(embed_dim, lstm_out, bidirectional=True, batch_first=True, dropout=dropout_rate)
#         self.fc = nn.Linear(lstm_out * 2, num_classes)
#         self.dropout = nn.Dropout(dropout_rate)
        
#     def forward(self, x):
#         x = self.embedding(x)
#         x, _ = self.lstm(x)
#         x = self.dropout(x[:, -1, :])  # Get the last hidden state of the LSTM
#         x = self.fc(x)
#         return x

# # Create and train the model with GloVe embeddings
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# model = LSTMModel(embedding_matrix).to(device)

# X_train_torch = torch.LongTensor(X_train).to(device)
# y_train_torch = y_train.to(torch.int64).to(device)  # Change data type to int64
# X_test_torch = torch.LongTensor(X_test).to(device)
# y_test_torch = y_test.to(torch.int64).to(device)  # Change data type to int64

# train_data = TensorDataset(X_train_torch, y_train_torch)
# train_dataloader = DataLoader(train_data, batch_size=128, shuffle=True)

# optimizer = optim.Adam(model.parameters())
# criterion = nn.CrossEntropyLoss()

# num_epochs = 5
# accumulation_steps = 4  # Adjust this value based on your needs
# for epoch in range(num_epochs):
#     model.train()
#     running_loss = 0.0
#     for i, (batch_X, batch_y) in enumerate(train_dataloader):
#         optimizer.zero_grad()
#         outputs = model(batch_X)
#         loss = criterion(outputs, batch_y) / accumulation_steps  # Normalize the loss
#         loss.backward()
        
#         # Accumulate gradients and update weights every accumulation_steps
#         if (i + 1) % accumulation_steps == 0:
#             optimizer.step()
#             optimizer.zero_grad()

#         running_loss += loss.item() * accumulation_steps 
#     print(f"Epoch {epoch+1}/{num_epochs}, Step [{i + 1}/{len(train_dataloader)}], Loss: {loss.item():.4f}")


# # Create DataLoader for the test set
# test_data = TensorDataset(X_test_torch, y_test_torch)
# test_dataloader = DataLoader(test_data, batch_size=32, shuffle=False)

# # Evaluate the model on the test set
# model.eval()
# y_pred = []
# y_true = []

# with torch.no_grad():
#     for batch_X, batch_y in test_dataloader:
#         test_outputs = model(batch_X)
#         _, batch_pred = torch.max(test_outputs, 1)
#         y_pred.extend(batch_pred.cpu().numpy())
#         y_true.extend(batch_y.cpu().numpy())

# test_accuracy = accuracy_score(y_true, y_pred)
# print("Test accuracy:", test_accuracy)

Specified Hyper-parameters Implementation

In [24]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset, TensorDataset
from torch.autograd import Variable
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report, confusion_matrix
import optuna

num_epochs = 5
batch_size = 128
accumulation_steps = 4 

# Load GloVe embeddings
def load_glove_embeddings(file_path, embedding_dim, word_index, max_features):
    embeddings_index = {}
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
    
    embedding_matrix = np.zeros((max_features, embedding_dim))
    for word, i in word_index.items():
        if i >= max_features:
            continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
    return torch.tensor(embedding_matrix, dtype=torch.float32)

# Load the GloVe embeddings matrix
glove_file_path = 'glove.twitter.27B.200d.txt'  # Update the path to the downloaded GloVe file
embedding_dim = 200
embedding_matrix = load_glove_embeddings(glove_file_path, embedding_dim, tokenizer.word_index, max_features)

class CNNTLSTM(nn.Module):
    def __init__(self, embedding_matrix, embed_dim=200, lstm_out=256, dropout_rate=0.2, num_classes=3, num_filters=64, filter_size=5, pool_size=2):
        super(CNNTLSTM, self).__init__()
        self.embedding = nn.Embedding.from_pretrained(embedding_matrix, freeze=True)
        self.conv1d = nn.Conv1d(embed_dim, num_filters, filter_size)
        self.relu = nn.ReLU()
        self.max_pool1d = nn.MaxPool1d(pool_size)
        self.bi_lstm = nn.LSTM(num_filters, lstm_out // 2, batch_first=True, bidirectional=True, dropout=dropout_rate)
        self.dropout = nn.Dropout(dropout_rate)
        self.fc = nn.Linear(lstm_out, num_classes)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        x = self.embedding(x)
        x = x.permute(0, 2, 1)
        x = self.conv1d(x)
        x = self.relu(x)
        x = self.max_pool1d(x)
        x = x.permute(0, 2, 1)
        x, _ = self.bi_lstm(x)
        x = x[:, -1, :]
        x = self.dropout(x)
        x = self.fc(x)
        return self.softmax(x)

X_train_torch = torch.tensor(X_train, dtype=torch.long)
y_train_torch = torch.tensor(y_train.values, dtype=torch.long)
X_val_torch = torch.tensor(X_val, dtype=torch.long)
y_val_torch = torch.tensor(y_val.values, dtype=torch.long)
X_test_torch = torch.tensor(X_test, dtype=torch.long)
y_test_torch = torch.tensor(y_test.values, dtype=torch.long)

y_train_np = y_train.values.reshape(-1, 1)
y_val_np = y_val.values.reshape(-1, 1)  # Added this line
y_test_np = y_test.values.reshape(-1, 1)


hyperparameters_accuracies = []

def objective(trial):
    # Hyperparameters to be optimized
    num_filters = int(trial.suggest_discrete_uniform("num_filters", 64, 256, 64))
    filter_size = 3
    pool_size = int(trial.suggest_discrete_uniform("pool_size", 2, 4, 2))
    lstm_out = int(trial.suggest_discrete_uniform("lstm_out", 64, 512, 64))
    dropout_rate = trial.suggest_float("dropout_rate", 0.2, 0.4, step=0.1)


    model = CNNTLSTM(embedding_matrix, embed_dim=embedding_dim, lstm_out=lstm_out, dropout_rate=dropout_rate, num_classes=3, num_filters=num_filters, filter_size=filter_size, pool_size=pool_size).cuda()

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters())

    # Define DataLoader within the objective function
    train_dataset = TensorDataset(X_train_torch, y_train_torch)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

    # Train the model with the given hyperparameters
    for epoch in range(num_epochs):
        for i, (texts, labels) in enumerate(train_loader):
            texts, labels = texts.cuda(), labels.cuda()

            # Forward pass
            outputs = model(texts)
            loss = criterion(outputs, labels) / accumulation_steps

            # Backward and optimize
            loss.backward()
            if (i + 1) % accumulation_steps == 0:
                optimizer.step()
                optimizer.zero_grad()

    # Evaluate the model on the validation set
    model.eval()
    val_dataset = TensorDataset(X_val_torch, y_val_torch)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

    all_val_predictions = []
    all_val_labels = []

    with torch.no_grad():
        for texts, labels in val_loader:
            texts, labels = texts.cuda(), labels.cuda()
            val_outputs = model(texts)
            _, predicted = torch.max(val_outputs.data, 1)
            all_val_predictions.extend(predicted.cpu().numpy())
            all_val_labels.extend(labels.cpu().numpy())

    # Calculate the validation accuracy
    val_accuracy = accuracy_score(y_val_np, all_val_predictions)

    # Print the accuracy for each set of hyperparameters
    print(f"Validation accuracy: {val_accuracy:.4f} with hyperparameters: {trial.params}")

    # Append the accuracy and hyperparameters to the list
    hyperparameters_accuracies.append((val_accuracy, trial.params))

    return val_accuracy

    # Optimize using Optuna
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=96)

    # Print the best hyperparameters
print("Best trial:")
trial = study.best_trial
print("  Value: ", trial.value)
print("  Params: ")
for key, value in trial.params.items():
        print(f"    {key}: {value}")


  from .autonotebook import tqdm as notebook_tqdm
[32m[I 2023-04-29 13:29:50,226][0m A new study created in memory with name: no-name-b55de5ec-41d2-4d36-aa85-1d366c3a31c1[0m
  num_filters = int(trial.suggest_discrete_uniform("num_filters", 64, 256, 64))
  pool_size = int(trial.suggest_discrete_uniform("pool_size", 2, 4, 2))
  lstm_out = int(trial.suggest_discrete_uniform("lstm_out", 64, 512, 64))
[33m[W 2023-04-29 13:30:36,946][0m Trial 0 failed with parameters: {'num_filters': 256.0, 'pool_size': 4.0, 'lstm_out': 64.0, 'dropout_rate': 0.2} because of the following error: KeyboardInterrupt().[0m
Traceback (most recent call last):
  File "c:\Users\clayt\anaconda3\envs\gpu-ienv\lib\site-packages\optuna\study\_optimize.py", line 200, in _run_trial
    value_or_values = func(trial)
  File "C:\Users\clayt\AppData\Local\Temp\ipykernel_21376\364108414.py", line 99, in objective
    texts, labels = texts.cuda(), labels.cuda()
KeyboardInterrupt
[33m[W 2023-04-29 13:30:36,990][0m Trial 0

KeyboardInterrupt: 

Hyper-parameters Implementation

In [None]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset, TensorDataset
from torch.autograd import Variable
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report, confusion_matrix
import optuna

num_epochs = 5
batch_size = 128
accumulation_steps = 4 

# Load GloVe embeddings
def load_glove_embeddings(file_path, embedding_dim, word_index, max_features):
    embeddings_index = {}
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
    
    embedding_matrix = np.zeros((max_features, embedding_dim))
    for word, i in word_index.items():
        if i >= max_features:
            continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
    return torch.tensor(embedding_matrix, dtype=torch.float32)

# Load the GloVe embeddings matrix
glove_file_path = 'glove.twitter.27B.200d.txt'  # Update the path to the downloaded GloVe file
embedding_dim = 200
embedding_matrix = load_glove_embeddings(glove_file_path, embedding_dim, tokenizer.word_index, max_features)

class CNNTLSTM(nn.Module):
    def __init__(self, embedding_matrix, embed_dim=200, lstm_out=256, dropout_rate=0.2, num_classes=3, num_filters=64, filter_size=5, pool_size=2):
        super(CNNTLSTM, self).__init__()
        self.embedding = nn.Embedding.from_pretrained(embedding_matrix, freeze=True)
        self.conv1d = nn.Conv1d(embed_dim, num_filters, filter_size)
        self.relu = nn.ReLU()
        self.max_pool1d = nn.MaxPool1d(pool_size)
        self.bi_lstm = nn.LSTM(num_filters, lstm_out // 2, batch_first=True, bidirectional=True, dropout=dropout_rate)
        self.dropout = nn.Dropout(dropout_rate)
        self.fc = nn.Linear(lstm_out, num_classes)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        x = self.embedding(x)
        x = x.permute(0, 2, 1)
        x = self.conv1d(x)
        x = self.relu(x)
        x = self.max_pool1d(x)
        x = x.permute(0, 2, 1)
        x, _ = self.bi_lstm(x)
        x = x[:, -1, :]
        x = self.dropout(x)
        x = self.fc(x)
        return self.softmax(x)

X_train_torch = torch.tensor(X_train, dtype=torch.long)
y_train_torch = torch.tensor(y_train.values, dtype=torch.long)
X_val_torch = torch.tensor(X_val, dtype=torch.long)
y_val_torch = torch.tensor(y_val.values, dtype=torch.long)
X_test_torch = torch.tensor(X_test, dtype=torch.long)
y_test_torch = torch.tensor(y_test.values, dtype=torch.long)

y_train_np = y_train.values.reshape(-1, 1)
y_val_np = y_val.values.reshape(-1, 1)  # Added this line
y_test_np = y_test.values.reshape(-1, 1)


hyperparameters_accuracies = []

def objective(trial):
    # Hyperparameters to be optimized
    num_filters = int(trial.suggest_discrete_uniform("num_filters", 32, 256, 32))
    filter_size = int(trial.suggest_discrete_uniform("filter_size", 3, 7, 2))
    pool_size = int(trial.suggest_discrete_uniform("pool_size", 2, 4, 2))
    lstm_out = int(trial.suggest_discrete_uniform("lstm_out", 64, 512, 64))
    dropout_rate = trial.suggest_float("dropout_rate", 0.1, 0.4, step=0.1)

    model = CNNTLSTM(embedding_matrix, embed_dim=embedding_dim, lstm_out=lstm_out, dropout_rate=dropout_rate, num_classes=3, num_filters=num_filters, filter_size=filter_size, pool_size=pool_size).cuda()

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters())

    # Define DataLoader within the objective function
    train_dataset = TensorDataset(X_train_torch, y_train_torch)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

    # Train the model with the given hyperparameters
    for epoch in range(num_epochs):
        for i, (texts, labels) in enumerate(train_loader):
            texts, labels = texts.cuda(), labels.cuda()

            # Forward pass
            outputs = model(texts)
            loss = criterion(outputs, labels) / accumulation_steps

            # Backward and optimize
            loss.backward()
            if (i + 1) % accumulation_steps == 0:
                optimizer.step()
                optimizer.zero_grad()

    # Evaluate the model on the validation set
    model.eval()
    val_dataset = TensorDataset(X_val_torch, y_val_torch)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

    all_val_predictions = []
    all_val_labels = []

    with torch.no_grad():
        for texts, labels in val_loader:
            texts, labels = texts.cuda(), labels.cuda()
            val_outputs = model(texts)
            _, predicted = torch.max(val_outputs.data, 1)
            all_val_predictions.extend(predicted.cpu().numpy())
            all_val_labels.extend(labels.cpu().numpy())

    # Calculate the validation accuracy
    val_accuracy = accuracy_score(y_val_np, all_val_predictions)

    # Print the accuracy for each set of hyperparameters
    print(f"Validation accuracy: {val_accuracy:.4f} with hyperparameters: {trial.params}")

    # Append the accuracy and hyperparameters to the list
    hyperparameters_accuracies.append((val_accuracy, trial.params))

    return val_accuracy

    # Optimize using Optuna
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=50)

    # Print the best hyperparameters
print("Best trial:")
trial = study.best_trial
print("  Value: ", trial.value)
print("  Params: ")
for key, value in trial.params.items():
        print(f"    {key}: {value}")


Validation

In [None]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset, TensorDataset
from torch.autograd import Variable
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report, confusion_matrix

# Load GloVe embeddings
def load_glove_embeddings(file_path, embedding_dim, word_index, max_features):
    embeddings_index = {}
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
    
    embedding_matrix = np.zeros((max_features, embedding_dim))
    for word, i in word_index.items():
        if i >= max_features:
            continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
    return torch.tensor(embedding_matrix, dtype=torch.float32)

# Load the GloVe embeddings matrix
glove_file_path = 'glove.twitter.27B.200d.txt'  # Update the path to the downloaded GloVe file
embedding_dim = 200
embedding_matrix = load_glove_embeddings(glove_file_path, embedding_dim, tokenizer.word_index, max_features)

class CNNTLSTM(nn.Module):
    def __init__(self, embedding_matrix, embed_dim=200, lstm_out=448, dropout_rate=0.2, num_classes=3, num_filters=256, filter_size=3, pool_size=2):
        super(CNNTLSTM, self).__init__()
        self.embedding = nn.Embedding.from_pretrained(embedding_matrix, freeze=True)
        self.conv1d = nn.Conv1d(embed_dim, num_filters, filter_size)
        self.relu = nn.ReLU()
        self.max_pool1d = nn.MaxPool1d(pool_size)
        self.bi_lstm = nn.LSTM(num_filters, lstm_out // 2, batch_first=True, bidirectional=True, dropout=dropout_rate)
        self.dropout = nn.Dropout(dropout_rate)
        self.fc = nn.Linear(lstm_out, num_classes)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        x = self.embedding(x)
        x = x.permute(0, 2, 1)
        x = self.conv1d(x)
        x = self.relu(x)
        x = self.max_pool1d(x)
        x = x.permute(0, 2, 1)
        x, _ = self.bi_lstm(x)
        x = x[:, -1, :]
        x = self.dropout(x)
        x = self.fc(x)
        return self.softmax(x)
    
y_train_np = y_train.values.reshape(-1, 1)
y_val_np = y_val.values.reshape(-1, 1)

# Prepare data for PyTorch
X_train_torch = torch.tensor(X_train, dtype=torch.long)
y_train_torch = torch.tensor(y_train.values, dtype=torch.long)
X_val_torch = torch.tensor(X_val, dtype=torch.long)
y_val_torch = torch.tensor(y_val.values, dtype=torch.long)

# Initialize model, loss function and optimizer
model = CNNTLSTM(embedding_matrix).cuda()
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters())

# Train the model
num_epochs = 5
batch_size = 128

all_predictions = []
all_labels = []

train_dataset = TensorDataset(X_train_torch, y_train_torch)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

accumulation_steps = 4  # Adjust this value based on your GPU memory capacity

for epoch in range(num_epochs):
    running_loss = 0.0
    for i, (texts, labels) in enumerate(train_loader):
        texts, labels = texts.cuda(), labels.cuda()

        # Forward pass
        outputs = model(texts)
        loss = criterion(outputs, labels) / accumulation_steps

        # Calculate accuracy
        _, predicted = torch.max(outputs.data, 1)
        accuracy = (predicted == labels).sum().item() / labels.size(0)

        # Backward and optimize
        loss.backward()
        if (i + 1) % accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()

        running_loss += loss.item() * accumulation_steps

        if (i + 1) % 10 == 0:
            print(f"Epoch [{epoch + 1}/{num_epochs}], Step [{i + 1}/{len(train_loader)}], Loss: {loss.item():.4f}, Accuracy: {accuracy:.4f}")

# Evaluate the model on the test set
model.eval()
test_dataset = TensorDataset(X_val_torch, y_val_torch)
test_loader = DataLoader(test_dataset, batch_size=128, shuffle=False)  # Set a smaller batch size for test dat| qZ\ASWQ1

total_correct = 0 
total_samples = 0

with torch.no_grad():
    for texts, labels in test_loader:
        texts, labels = texts.cuda(), labels.cuda()
        test_outputs = model(texts)
        _, predicted = torch.max(test_outputs.data, 1)
        all_predictions.extend(predicted.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())
        total_correct += (predicted == labels).sum().item()
        total_samples += labels.size(0)

test_accuracy = total_correct / total_samples
print(f"Test accuracy: {test_accuracy:.4f}")

In [24]:
from sklearn.metrics import classification_report, confusion_matrix

print("Classification Report:")
print(classification_report(all_labels, all_predictions))

print("Confusion Matrix:")
print(confusion_matrix(all_labels, all_predictions))

Classification Report:
              precision    recall  f1-score   support

           0       0.81      0.86      0.84     53216
           1       0.84      0.80      0.82     53201
           2       0.76      0.74      0.75     53108

    accuracy                           0.80    159525
   macro avg       0.80      0.80      0.80    159525
weighted avg       0.80      0.80      0.80    159525

Confusion Matrix:
[[45845  2064  5307]
 [ 3191 42577  7433]
 [ 7522  6109 39477]]


In [8]:
import torch
from torch.utils.data import DataLoader, Dataset, TensorDataset

# Save the trained model
model_save_path = "cnntlstm_modelSP.pth"  # Choose the path where you want to save the model
torch.save(model.state_dict(), model_save_path)


NameError: name 'model' is not defined

TO DO MODEL WITH VALIDATION - DO NOT USE ONLY ONCE

In [24]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset, TensorDataset
from torch.autograd import Variable
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report, confusion_matrix

# Load GloVe embeddings
def load_glove_embeddings(file_path, embedding_dim, word_index, max_features):
    embeddings_index = {}
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
    
    embedding_matrix = np.zeros((max_features, embedding_dim))
    for word, i in word_index.items():
        if i >= max_features:
            continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
    return torch.tensor(embedding_matrix, dtype=torch.float32)

# Load the GloVe embeddings matrix
glove_file_path = 'glove.twitter.27B.200d.txt'  # Update the path to the downloaded GloVe file
embedding_dim = 200
embedding_matrix = load_glove_embeddings(glove_file_path, embedding_dim, tokenizer.word_index, max_features)

class CNNTLSTM(nn.Module):
    def __init__(self, embedding_matrix, embed_dim=200, lstm_out=384, dropout_rate=0.2, num_classes=3, num_filters=128, filter_size=3, pool_size=2):
        super(CNNTLSTM, self).__init__()
        self.embedding = nn.Embedding.from_pretrained(embedding_matrix, freeze=True)
        self.conv1d = nn.Conv1d(embed_dim, num_filters, filter_size)
        self.relu = nn.ReLU()
        self.max_pool1d = nn.MaxPool1d(pool_size)
        self.bi_lstm = nn.LSTM(num_filters, lstm_out // 2, batch_first=True, bidirectional=True, dropout=dropout_rate)
        self.dropout = nn.Dropout(dropout_rate)
        self.fc = nn.Linear(lstm_out, num_classes)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        x = self.embedding(x)
        x = x.permute(0, 2, 1)
        x = self.conv1d(x)
        x = self.relu(x)
        x = self.max_pool1d(x)
        x = x.permute(0, 2, 1)
        x, _ = self.bi_lstm(x)
        x = x[:, -1, :]
        x = self.dropout(x)
        x = self.fc(x)
        return self.softmax(x)
    
y_train_np = y_train.values.reshape(-1, 1)
y_test_np = y_test.values.reshape(-1, 1)

# Prepare data for PyTorch
X_train_torch = torch.tensor(X_train, dtype=torch.long)
y_train_torch = torch.tensor(y_train.values, dtype=torch.long)
X_test_torch = torch.tensor(X_val, dtype=torch.long)
y_test_torch = torch.tensor(y_val.values, dtype=torch.long)

# Initialize model, loss function and optimizer
model = CNNTLSTM(embedding_matrix).cuda()
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters())

# Train the model
num_epochs = 5
batch_size = 128

train_dataset = TensorDataset(X_train_torch, y_train_torch)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

accumulation_steps = 4  # Adjust this value based on your GPU memory capacity

for epoch in range(num_epochs):
    running_loss = 0.0
    for i, (texts, labels) in enumerate(train_loader):
        texts, labels = texts.cuda(), labels.cuda()

        # Forward pass
        outputs = model(texts)
        loss = criterion(outputs, labels) / accumulation_steps

        # Calculate accuracy
        _, predicted = torch.max(outputs.data, 1)
        accuracy = (predicted == labels).sum().item() / labels.size(0)

        # Backward and optimize
        loss.backward()
        if (i + 1) % accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()

        running_loss += loss.item() * accumulation_steps

        if (i + 1) % 10 == 0:
            print(f"Epoch [{epoch + 1}/{num_epochs}], Step [{i + 1}/{len(train_loader)}], Loss: {loss.item():.4f}, Accuracy: {accuracy:.4f}")

# Evaluate the model on the test set
model.eval()
test_dataset = TensorDataset(X_test_torch, y_test_torch)
test_loader = DataLoader(test_dataset, batch_size=128, shuffle=False)  # Set a smaller batch size for test dat| qZ\ASWQ1

total_correct = 0 
total_samples = 0

all_predictions = []
all_labels = []

with torch.no_grad():
    for texts, labels in test_loader:
        texts, labels = texts.cuda(), labels.cuda()
        test_outputs = model(texts)
        _, predicted = torch.max(test_outputs.data, 1)
        all_predictions.extend(predicted.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())
        total_correct += (predicted == labels).sum().item()
        total_samples += labels.size(0)

test_accuracy = total_correct / total_samples
print(f"Test accuracy: {test_accuracy:.4f}")



Epoch [1/5], Step [10/5817], Loss: 0.2738, Accuracy: 0.3672
Epoch [1/5], Step [20/5817], Loss: 0.2725, Accuracy: 0.3750
Epoch [1/5], Step [30/5817], Loss: 0.2704, Accuracy: 0.5312
Epoch [1/5], Step [40/5817], Loss: 0.2646, Accuracy: 0.6016
Epoch [1/5], Step [50/5817], Loss: 0.2570, Accuracy: 0.5156
Epoch [1/5], Step [60/5817], Loss: 0.2457, Accuracy: 0.6250
Epoch [1/5], Step [70/5817], Loss: 0.2299, Accuracy: 0.6328
Epoch [1/5], Step [80/5817], Loss: 0.2286, Accuracy: 0.6484
Epoch [1/5], Step [90/5817], Loss: 0.2314, Accuracy: 0.6406
Epoch [1/5], Step [100/5817], Loss: 0.2071, Accuracy: 0.7422
Epoch [1/5], Step [110/5817], Loss: 0.2217, Accuracy: 0.6562
Epoch [1/5], Step [120/5817], Loss: 0.2051, Accuracy: 0.7266
Epoch [1/5], Step [130/5817], Loss: 0.2266, Accuracy: 0.6328
Epoch [1/5], Step [140/5817], Loss: 0.2257, Accuracy: 0.6250
Epoch [1/5], Step [150/5817], Loss: 0.1993, Accuracy: 0.7656
Epoch [1/5], Step [160/5817], Loss: 0.2096, Accuracy: 0.6875
Epoch [1/5], Step [170/5817], Los

GPU with CNN+LSTM Implementation

In [21]:
tweets =  shuffle(tweets).reset_index(drop=True)

In [24]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset, TensorDataset
from torch.autograd import Variable
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report, confusion_matrix

# Load GloVe embeddings
def load_glove_embeddings(file_path, embedding_dim, word_index, max_features):
    embeddings_index = {}
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
    
    embedding_matrix = np.zeros((max_features, embedding_dim))
    for word, i in word_index.items():
        if i >= max_features:
            continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
    return torch.tensor(embedding_matrix, dtype=torch.float32)

# Load the GloVe embeddings matrix
glove_file_path = 'glove.twitter.27B.200d.txt'  # Update the path to the downloaded GloVe file
embedding_dim = 200
embedding_matrix = load_glove_embeddings(glove_file_path, embedding_dim, tokenizer.word_index, max_features)

class CNNTLSTM(nn.Module):
    def __init__(self, embedding_matrix, embed_dim=200, lstm_out=448, dropout_rate=0.2, num_classes=3, num_filters=256, filter_size=3, pool_size=2):
        super(CNNTLSTM, self).__init__()
        self.embedding = nn.Embedding.from_pretrained(embedding_matrix, freeze=True)
        self.conv1d = nn.Conv1d(embed_dim, num_filters, filter_size)
        self.relu = nn.ReLU()
        self.max_pool1d = nn.MaxPool1d(pool_size)
        self.bi_lstm = nn.LSTM(num_filters, lstm_out // 2, batch_first=True, bidirectional=True, dropout=dropout_rate)
        self.dropout = nn.Dropout(dropout_rate)
        self.fc = nn.Linear(lstm_out, num_classes)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        x = self.embedding(x)
        x = x.permute(0, 2, 1)
        x = self.conv1d(x)
        x = self.relu(x)
        x = self.max_pool1d(x)
        x = x.permute(0, 2, 1)
        x, _ = self.bi_lstm(x)
        x = x[:, -1, :]
        x = self.dropout(x)
        x = self.fc(x)
        return self.softmax(x)
    
y_train_np = y_train.values.reshape(-1, 1)
y_test_np = y_test.values.reshape(-1, 1)

# Prepare data for PyTorch
X_train_torch = torch.tensor(X_train, dtype=torch.long)
y_train_torch = torch.tensor(y_train.values, dtype=torch.long)
X_test_torch = torch.tensor(X_test, dtype=torch.long)
y_test_torch = torch.tensor(y_test.values, dtype=torch.long)

# Initialize model, loss function and optimizer
model = CNNTLSTM(embedding_matrix).cuda()
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters())

# Train the model
num_epochs = 10
batch_size = 128

train_dataset = TensorDataset(X_train_torch, y_train_torch)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

accumulation_steps = 4  # Adjust this value based on your GPU memory capacity

for epoch in range(num_epochs):
    running_loss = 0.0
    for i, (texts, labels) in enumerate(train_loader):
        texts, labels = texts.cuda(), labels.cuda()

        # Forward pass
        outputs = model(texts)
        loss = criterion(outputs, labels) / accumulation_steps

        # Calculate accuracy
        _, predicted = torch.max(outputs.data, 1)
        accuracy = (predicted == labels).sum().item() / labels.size(0)

        # Backward and optimize
        loss.backward()
        if (i + 1) % accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()

        running_loss += loss.item() * accumulation_steps

        if (i + 1) % 10 == 0:
            print(f"Epoch [{epoch + 1}/{num_epochs}], Step [{i + 1}/{len(train_loader)}], Loss: {loss.item():.4f}, Accuracy: {accuracy:.4f}")

# Evaluate the model on the test set
model.eval()
test_dataset = TensorDataset(X_test_torch, y_test_torch)
test_loader = DataLoader(test_dataset, batch_size=128, shuffle=False)  # Set a smaller batch size for test dat| qZ\ASWQ1

total_correct = 0 
total_samples = 0

all_predictions = []
all_labels = []

with torch.no_grad():
    for texts, labels in test_loader:
        texts, labels = texts.cuda(), labels.cuda()
        test_outputs = model(texts)
        _, predicted = torch.max(test_outputs.data, 1)
        all_predictions.extend(predicted.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())
        total_correct += (predicted == labels).sum().item()
        total_samples += labels.size(0)

test_accuracy = total_correct / total_samples
print(f"Test accuracy: {test_accuracy:.4f}")



Epoch [1/10], Step [10/5817], Loss: 0.2746, Accuracy: 0.3750
Epoch [1/10], Step [20/5817], Loss: 0.2687, Accuracy: 0.5625
Epoch [1/10], Step [30/5817], Loss: 0.2639, Accuracy: 0.4922
Epoch [1/10], Step [40/5817], Loss: 0.2465, Accuracy: 0.6172
Epoch [1/10], Step [50/5817], Loss: 0.2307, Accuracy: 0.6172
Epoch [1/10], Step [60/5817], Loss: 0.2248, Accuracy: 0.6484
Epoch [1/10], Step [70/5817], Loss: 0.2114, Accuracy: 0.6797
Epoch [1/10], Step [80/5817], Loss: 0.2197, Accuracy: 0.6484
Epoch [1/10], Step [90/5817], Loss: 0.2161, Accuracy: 0.6641
Epoch [1/10], Step [100/5817], Loss: 0.2060, Accuracy: 0.7422
Epoch [1/10], Step [110/5817], Loss: 0.2109, Accuracy: 0.7266
Epoch [1/10], Step [120/5817], Loss: 0.2088, Accuracy: 0.7266
Epoch [1/10], Step [130/5817], Loss: 0.2004, Accuracy: 0.7422
Epoch [1/10], Step [140/5817], Loss: 0.2107, Accuracy: 0.6797
Epoch [1/10], Step [150/5817], Loss: 0.1979, Accuracy: 0.7578
Epoch [1/10], Step [160/5817], Loss: 0.2148, Accuracy: 0.6719
Epoch [1/10], Ste

In [25]:
# Calculate the classification report and confusion matrix
report = classification_report(all_labels, all_predictions)
cm = confusion_matrix(all_labels, all_predictions)

# Print the classification report and confusion matrix
print("Classification Report:")
print(report)
print("Confusion Matrix:")
print(cm)


Classification Report:
              precision    recall  f1-score   support

           0       0.84      0.90      0.87     53286
           1       0.86      0.84      0.85     53235
           2       0.80      0.76      0.77     53005

    accuracy                           0.83    159526
   macro avg       0.83      0.83      0.83    159526
weighted avg       0.83      0.83      0.83    159526

Confusion Matrix:
[[48012  1026  4248]
 [ 2235 44938  6062]
 [ 6924  6051 40030]]


In [26]:
import torch
from torch.utils.data import DataLoader, Dataset, TensorDataset

# Save the trained model
model_save_path = "cnntlstm_modelSP6.pth"  # Choose the path where you want to save the model
torch.save(model.state_dict(), model_save_path)


In [28]:
import torch
torch.cuda.empty_cache()


In [44]:
import pandas as pd
import torch
from torch.utils.data import DataLoader, Dataset, TensorDataset
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Function to load a saved model
def load_saved_model(model_path):
    model = CNNTLSTM(embedding_matrix).cuda()
    model.load_state_dict(torch.load(model_path))
    model.eval()
    return model

# Read the CSV file
csv_file_path = 'WOLVESNP.csv'
df = pd.read_csv(csv_file_path)

df['processed_text'] = df['processed_text'].astype(str)

# Tokenize and pad the preprocessed texts
tokenized_texts = tokenizer.texts_to_sequences(df['processed_text'])
padded_texts = pad_sequences(tokenized_texts, maxlen=100)

# Convert the tokenized and padded texts to PyTorch tensors
texts_torch = torch.tensor(padded_texts, dtype=torch.long)

# Load the saved model
model_path = 'cnntlstm_modelSP6.pth'
loaded_model = load_saved_model(model_path)

# Use the model to predict sentiment labels for the text data
label_map = {0: "NEGATIVE", 1: "POSITIVE", 2: "NEUTRAL"}

# Process the data in smaller batches
batch_size = 128
num_batches = len(texts_torch) // batch_size + (len(texts_torch) % batch_size > 0)
predictions = []

for i in range(num_batches):
    batch_start = i * batch_size
    batch_end = min((i + 1) * batch_size, len(texts_torch))
    
    with torch.no_grad():
        batch_texts = texts_torch[batch_start:batch_end].cuda()
        output_probs = loaded_model(batch_texts)
        _, batch_predictions = torch.max(output_probs.data, 1)
        batch_predictions = batch_predictions.cpu().numpy()
        predictions.extend(batch_predictions)

# Convert the predicted labels back to their string representations
predicted_labels = [label_map[pred] for pred in predictions]

# Add the predicted labels to the original DataFrame and save it to a new CSV file
df['predicted_sentiment'] = predicted_labels
df.to_csv('WOLVESNPg.csv', index=False)




Testing the model

In [None]:
# import pandas as pd
# import torch
# from torch.utils.data import DataLoader, Dataset, TensorDataset
# from tensorflow.keras.preprocessing.sequence import pad_sequences

# # Function to load a saved model
# def load_saved_model(model_path):
#     model = CNNTLSTM(embedding_matrix).cuda()
#     model.load_state_dict(torch.load(model_path))
#     model.eval()
#     return model

# # Read the CSV file
# csv_file_path = 'SampleTweets.csv'
# df = pd.read_csv(csv_file_path)

# # Tokenize and pad the preprocessed texts
# tokenized_texts = tokenizer.texts_to_sequences(df['processed_text'])
# padded_texts = pad_sequences(tokenized_texts, maxlen=100)

# # Convert the tokenized and padded texts to PyTorch tensors
# texts_torch = torch.tensor(padded_texts, dtype=torch.long)

# # Load the saved model
# model_path = 'cnntlstm_model3.pth'
# loaded_model = load_saved_model(model_path)

# # Use the model to predict sentiment labels for the text data
# label_map = {0: "NEGATIVE", 1: "POSITIVE", 2: "NEUTRAL"}

# with torch.no_grad():
#     texts_torch = texts_torch.cuda()
#     output_probs = loaded_model(texts_torch)
#     _, predictions = torch.max(output_probs.data, 1)
#     predictions = predictions.cpu().numpy()

# # Convert the predicted labels back to their string representations
# predicted_labels = [label_map[pred] for pred in predictions]

# # Add the predicted labels to the original DataFrame and save it to a new CSV file
# df['predicted_sentiment'] = predicted_labels
# df.to_csv('predictions5.csv', index=False)


CPU Implementation of CNN+LSTM

In [None]:
# import numpy as np
# from keras.layers import Embedding, Bidirectional, LSTM, Dense
# from keras.models import Sequential
# from keras.callbacks import EarlyStopping, ModelCheckpoint
# from keras.optimizers import Adam

# # Load GloVe embeddings
# def load_glove_embeddings(file_path, embedding_dim, word_index, max_features):
#     embeddings_index = {}
#     with open(file_path, 'r', encoding='utf-8') as f:
#         for line in f:
#             values = line.split()
#             word = values[0]
#             coefs = np.asarray(values[1:], dtype='float32')
#             embeddings_index[word] = coefs
    
#     embedding_matrix = np.zeros((max_features, embedding_dim))
#     for word, i in word_index.items():
#         if i >= max_features:
#             continue
#         embedding_vector = embeddings_index.get(word)
#         if embedding_vector is not None:
#             embedding_matrix[i] = embedding_vector
#     return embedding_matrix

# # Load the GloVe embeddings matrix
# glove_file_path = 'glove.twitter.27B.200d.txt'  # Update the path to the downloaded GloVe file
# embedding_dim = 200
# embedding_matrix = load_glove_embeddings(glove_file_path, embedding_dim, tokenizer.word_index, max_features)

# from keras.layers import Conv1D, MaxPooling1D

# def create_cnn_lstm_model(embedding_matrix, embed_dim=200, lstm_out=256, dropout_rate=0.2, optimizer='adam', num_classes=3, num_filters=64, filter_size=5, pool_size=2):
#     model = Sequential()
#     model.add(Embedding(max_features, embed_dim, weights=[embedding_matrix], input_length=X.shape[1], trainable=False))
#     model.add(Conv1D(num_filters, filter_size, activation='relu'))
#     model.add(MaxPooling1D(pool_size=pool_size))
#     model.add(Bidirectional(LSTM(lstm_out, dropout=dropout_rate, recurrent_dropout=dropout_rate)))
#     model.add(Dense(num_classes, activation='softmax'))
#     model.compile(loss='sparse_categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])
#     return model

# model = create_cnn_lstm_model(embedding_matrix)

# early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
# model_checkpoint = ModelCheckpoint('best_cnn_lstm_model.h5', save_best_only=True, monitor='val_loss', mode='min')

# history = model.fit(X_train, y_train, epochs=5, batch_size=500, validation_split=0.1, callbacks=[early_stopping, model_checkpoint])

# # Evaluate the model on the test set
# loss, accuracy = model.evaluate(X_test, y_test)
# print("Test accuracy:", accuracy)

# # Predict sentiment labels for the test data
# y_pred_probs = model.predict(X_test)
# y_pred = np.argmax(y_pred_probs, axis=1)

# # Calculate the accuracy of the predictions
# correct_predictions = np.sum(y_pred == y_test)
# total_predictions = len(y_test)
# prediction_accuracy = correct_predictions / total_predictions

# print(f"Prediction accuracy: {prediction_accuracy:.4f}")
