In [1]:
import pandas as pd
import numpy as np 
import matplotlib as plt 

tweets = pd.read_csv('combined.csv', encoding='utf-8')
tweets.head(5)

Unnamed: 0,tweet_date_created,tweet_id,tweet_text,language,sentiment,sentiment_score
0,2018-05-08T08:19:09,993767246437666816,Bayer Leverkusen goalkeeper Bernd Leno will no...,en,NEUTRAL,"{""Neutral"":0.7228581905364990234375,""Negative""..."
1,2018-07-02T19:28:00.331000,1013866900772835331,Gary Speed v Blackburn at St James in 2001/02 ...,en,NEUTRAL,"{""Neutral"":0.998256266117095947265625,""Negativ..."
2,2018-09-05T12:54:20.408000,1037323043360657408,@ChelseaFC Don't make him regret it and start ...,en,NEUTRAL,"{""Neutral"":0.912796199321746826171875,""Negativ..."
3,2018-05-08T10:42:17,993803266323550208,"@LiverpoolFF @AnfieldEdition He's a liar, made...",en,NEGATIVE,"{""Neutral"":0.3271420896053314208984375,""Negati..."
4,2018-08-07T07:29:59.136000,1026732168226267136,@theesk @Everton Didn't realise Kenwright is d...,en,NEUTRAL,"{""Neutral"":0.957906246185302734375,""Negative"":..."


Deleting irrelevant columns

In [2]:
tweets = tweets.drop(['tweet_date_created'], axis=1)
tweets = tweets.drop(['sentiment_score'], axis=1)

tweets.head(5)

Unnamed: 0,tweet_id,tweet_text,language,sentiment
0,993767246437666816,Bayer Leverkusen goalkeeper Bernd Leno will no...,en,NEUTRAL
1,1013866900772835331,Gary Speed v Blackburn at St James in 2001/02 ...,en,NEUTRAL
2,1037323043360657408,@ChelseaFC Don't make him regret it and start ...,en,NEUTRAL
3,993803266323550208,"@LiverpoolFF @AnfieldEdition He's a liar, made...",en,NEGATIVE
4,1026732168226267136,@theesk @Everton Didn't realise Kenwright is d...,en,NEUTRAL


Checking for duplicates and deleting them

In [3]:
duplicates = tweets[tweets.duplicated(subset=['tweet_id'], keep=False)]

if not duplicates.empty:
    print(f"Found {len(duplicates)} duplicate tweet ids. Removing duplicates...")
    tweets.drop_duplicates(subset=['tweet_id'], inplace=True)
else:
    print("No duplicate tweet ids found.")

Found 762643 duplicate tweet ids. Removing duplicates...


In [4]:
tweets = tweets.drop(['tweet_id'], axis=1)

Deleting NULLS

In [5]:
tweets = tweets.dropna()
print(tweets.isnull().sum())

tweet_text    0
language      0
sentiment     0
dtype: int64


Language Check

In [6]:
all_english = (tweets['language'] == 'en').all()

if all_english:
    print("All values in the 'language' column are 'en'")
else:
    print("Not all values in the 'language' column are 'en'")

All values in the 'language' column are 'en'


In [7]:
tweets = tweets.drop(['language'], axis=1)

Sentiment Map 

In [8]:
sentiment_map = {"NEGATIVE": 0, "POSITIVE": 1, "NEUTRAL": 2, "MIXED": 3}

# Map the sentiment labels to their numeric values
tweets['sentiment_values'] = tweets['sentiment'].map(sentiment_map)

# Print the new column that contains the mapped values
print(tweets['sentiment_values'])
tweets.head(5)

0          2
1          2
2          2
3          0
4          2
          ..
5393957    2
5393958    0
5393959    2
5393960    2
5393961    2
Name: sentiment_values, Length: 5012534, dtype: int64


Unnamed: 0,tweet_text,sentiment,sentiment_values
0,Bayer Leverkusen goalkeeper Bernd Leno will no...,NEUTRAL,2
1,Gary Speed v Blackburn at St James in 2001/02 ...,NEUTRAL,2
2,@ChelseaFC Don't make him regret it and start ...,NEUTRAL,2
3,"@LiverpoolFF @AnfieldEdition He's a liar, made...",NEGATIVE,0
4,@theesk @Everton Didn't realise Kenwright is d...,NEUTRAL,2


In [9]:
# Find the indices of rows with Mixed sentiment
mixed_indices = tweets[tweets['sentiment_values'] == 3].index
tweets = tweets.drop(mixed_indices)

In [10]:
negative_tweets = tweets[tweets['sentiment_values'] == 0]
positive_tweets = tweets[tweets['sentiment_values'] == 1]
neutral_tweets = tweets[tweets['sentiment_values'] == 2]

print('No of positive tagged tweets is: {}'.format(len(positive_tweets)))
print('No of negative tagged tweets is: {}'.format(len(negative_tweets)))
print('No of neutral tagged tweets is: {}'.format(len(neutral_tweets)))

No of positive tagged tweets is: 1070334
No of negative tagged tweets is: 354501
No of neutral tagged tweets is: 3549918


In [11]:
import pandas as pd

# Shuffle the DataFrame to ensure that the downsampling is random
tweets = tweets.sample(frac=1, random_state=42)

# Count the number of tweets in each sentiment class
counts = tweets['sentiment_values'].value_counts()

# Find the smallest class size
smallest_size = counts.min()

# Downsample each class to the smallest size
positive_tweets = tweets[tweets['sentiment_values'] == 1].sample(n=smallest_size, random_state=42)
negative_tweets = tweets[tweets['sentiment_values'] == 0].sample(n=smallest_size, random_state=42)
neutral_tweets = tweets[tweets['sentiment_values'] == 2].sample(n=smallest_size, random_state=42)

# Concatenate the downsampled DataFrames
tweets = pd.concat([positive_tweets, negative_tweets, neutral_tweets], ignore_index=True)

# Print the new counts of tweets in each class
print('No of positive tagged tweets is: {}'.format(len(tweets[tweets['sentiment_values'] == 1])))
print('No of negative tagged tweets is: {}'.format(len(tweets[tweets['sentiment_values'] == 0])))
print('No of neutral tagged tweets is: {}'.format(len(tweets[tweets['sentiment_values'] == 2])))


No of positive tagged tweets is: 354501
No of negative tagged tweets is: 354501
No of neutral tagged tweets is: 354501


In [12]:
tweets['tweet_text'] = tweets['tweet_text'].astype('str')

In [13]:
import warnings
warnings.filterwarnings('ignore')
import re
import string
import pickle
import nltk 
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

stopword = set(stopwords.words('english'))
urlPattern = r"((http://)[^ ]*|(https://)[^ ]*|( www\.)[^ ]*)"
userPattern = '@[^\s]+'
some = 'amp|today|tomorrow|going|girl'

def process_tweets(tweet):
  # Lower Casing
    tweet = re.sub(r"he's", "he is", tweet)
    tweet = re.sub(r"there's", "there is", tweet)
    tweet = re.sub(r"We're", "We are", tweet)
    tweet = re.sub(r"That's", "That is", tweet)
    tweet = re.sub(r"won't", "will not", tweet)
    tweet = re.sub(r"they're", "they are", tweet)
    tweet = re.sub(r"Can't", "Cannot", tweet)
    tweet = re.sub(r"wasn't", "was not", tweet)
    tweet = re.sub(r"don\x89Ûªt", "do not", tweet)
    tweet = re.sub(r"aren't", "are not", tweet)
    tweet = re.sub(r"isn't", "is not", tweet)
    tweet = re.sub(r"What's", "What is", tweet)
    tweet = re.sub(r"haven't", "have not", tweet)
    tweet = re.sub(r"hasn't", "has not", tweet)
    tweet = re.sub(r"There's", "There is", tweet)
    tweet = re.sub(r"He's", "He is", tweet)
    tweet = re.sub(r"It's", "It is", tweet)
    tweet = re.sub(r"You're", "You are", tweet)
    tweet = re.sub(r"I'M", "I am", tweet)
    tweet = re.sub(r"shouldn't", "should not", tweet)
    tweet = re.sub(r"wouldn't", "would not", tweet)
    tweet = re.sub(r"i'm", "I am", tweet)
    tweet = re.sub(r"I\x89Ûªm", "I am", tweet)
    tweet = re.sub(r"I'm", "I am", tweet)
    tweet = re.sub(r"Isn't", "is not", tweet)
    tweet = re.sub(r"Here's", "here is", tweet)
    tweet = re.sub(r"you've", "you have", tweet)
    tweet = re.sub(r"you\x89Ûªve", "you have", tweet)
    tweet = re.sub(r"we're", "we are", tweet)
    tweet = re.sub(r"what's", "what is", tweet)
    tweet = re.sub(r"couldn't", "could not", tweet)
    tweet = re.sub(r"we've", "we have", tweet)
    tweet = re.sub(r"it\x89Ûªs", "it is", tweet)
    tweet = re.sub(r"doesn\x89Ûªt", "does not", tweet)
    tweet = re.sub(r"It\x89Ûªs", "It is", tweet)
    tweet = re.sub(r"Here\x89Ûªs", "here is", tweet)
    tweet = re.sub(r"who's", "who is", tweet)
    tweet = re.sub(r"I\x89Ûªve", "I have", tweet)
    tweet = re.sub(r"y'all", "you all", tweet)
    tweet = re.sub(r"can\x89Ûªt", "cannot", tweet)
    tweet = re.sub(r"would've", "would have", tweet)
    tweet = re.sub(r"it'll", "it will", tweet)
    tweet = re.sub(r"we'll", "we will", tweet)
    tweet = re.sub(r"wouldn\x89Ûªt", "would not", tweet)
    tweet = re.sub(r"We've", "We have", tweet)
    tweet = re.sub(r"he'll", "he will", tweet)
    tweet = re.sub(r"Y'all", "You all", tweet)
    tweet = re.sub(r"Weren't", "Were not", tweet)
    tweet = re.sub(r"Didn't", "Did not", tweet)
    tweet = re.sub(r"they'll", "they will", tweet)
    tweet = re.sub(r"they'd", "they would", tweet)
    tweet = re.sub(r"DON'T", "DO NOT", tweet)
    tweet = re.sub(r"That\x89Ûªs", "That is", tweet)
    tweet = re.sub(r"they've", "they have", tweet)
    tweet = re.sub(r"i'd", "I would", tweet)
    tweet = re.sub(r"should've", "should have", tweet)
    tweet = re.sub(r"You\x89Ûªre", "You are", tweet)
    tweet = re.sub(r"where's", "where is", tweet)
    tweet = re.sub(r"Don\x89Ûªt", "Do not", tweet)
    tweet = re.sub(r"we'd", "we would", tweet)
    tweet = re.sub(r"i'll", "I will", tweet)
    tweet = re.sub(r"weren't", "were not", tweet)
    tweet = re.sub(r"They're", "They are", tweet)
    tweet = re.sub(r"Can\x89Ûªt", "Cannot", tweet)
    tweet = re.sub(r"you\x89Ûªll", "you will", tweet)
    tweet = re.sub(r"I\x89Ûªd", "I would", tweet)
    tweet = re.sub(r"let's", "let us", tweet)
    tweet = re.sub(r"it's", "it is", tweet)
    tweet = re.sub(r"can't", "cannot", tweet)
    tweet = re.sub(r"don't", "do not", tweet)
    tweet = re.sub(r"you're", "you are", tweet)
    tweet = re.sub(r"i've", "I have", tweet)
    tweet = re.sub(r"that's", "that is", tweet)
    tweet = re.sub(r"i'll", "I will", tweet)
    tweet = re.sub(r"doesn't", "does not", tweet)
    tweet = re.sub(r"i'd", "I would", tweet)
    tweet = re.sub(r"didn't", "did not", tweet)
    tweet = re.sub(r"ain't", "am not", tweet)
    tweet = re.sub(r"you'll", "you will", tweet)
    tweet = re.sub(r"I've", "I have", tweet)
    tweet = re.sub(r"Don't", "do not", tweet)
    tweet = re.sub(r"I'll", "I will", tweet)
    tweet = re.sub(r"I'd", "I would", tweet)
    tweet = re.sub(r"Let's", "Let us", tweet)
    tweet = re.sub(r"you'd", "You would", tweet)
    tweet = re.sub(r"It's", "It is", tweet)
    tweet = re.sub(r"Ain't", "am not", tweet)
    tweet = re.sub(r"Haven't", "Have not", tweet)
    tweet = re.sub(r"Could've", "Could have", tweet)
    tweet = re.sub(r"youve", "you have", tweet)  
    tweet = re.sub(r"donå«t", "do not", tweet)  
    
    tweet = re.sub(r"some1", "someone", tweet)
    tweet = re.sub(r"yrs", "years", tweet)
    tweet = re.sub(r"hrs", "hours", tweet)
    tweet = re.sub(r"2morow|2moro", "tomorrow", tweet)
    tweet = re.sub(r"2day", "today", tweet)
    tweet = re.sub(r"4got|4gotten", "forget", tweet)
    tweet = re.sub(r"b-day|bday", "b-day", tweet)
    tweet = re.sub(r"mother's", "mother", tweet)
    tweet = re.sub(r"mom's", "mom", tweet)
    tweet = re.sub(r"dad's", "dad", tweet)
    tweet = re.sub(r"hahah|hahaha|hahahaha", "haha", tweet)
    tweet = re.sub(r"lmao|lolz|rofl", "lol", tweet)
    tweet = re.sub(r"thanx|thnx", "thanks", tweet)
    tweet = re.sub(r"goood", "good", tweet)
    tweet = re.sub(r"some1", "someone", tweet)
    tweet = re.sub(r"some1", "someone", tweet)
    tweet = tweet.lower()
    tweet=tweet[1:]

    # Removing all URls 
    tweet = re.sub(urlPattern,'',tweet)
    # Removing all @username.
    tweet = re.sub(userPattern,'', tweet) 
    #remove some words
    tweet= re.sub(some,'',tweet)
    #Remove punctuations
    tweet = tweet.translate(str.maketrans("","",string.punctuation))
    #tokenizing words
    tokens = word_tokenize(tweet)
    #tokens = [w for w in tokens if len(w)>2]
    #Removing Stop Words
    final_tokens = [w for w in tokens if w not in stopword]
    #reducing a word to its word stem 
    wordLemm = WordNetLemmatizer()
    finalwords=[]
    for w in final_tokens:
      if len(w)>1:
        word = wordLemm.lemmatize(w)
        finalwords.append(word)
    return ' '.join(finalwords)

In [14]:
abbreviations = {
    "$" : " dollar ",
    "€" : " euro ",
    "4ao" : "for adults only",
    "a.m" : "before midday",
    "a3" : "anytime anywhere anyplace",
    "aamof" : "as a matter of fact",
    "acct" : "account",
    "adih" : "another day in hell",
    "afaic" : "as far as i am concerned",
    "afaict" : "as far as i can tell",
    "afaik" : "as far as i know",
    "afair" : "as far as i remember",
    "afk" : "away from keyboard",
    "app" : "application",
    "approx" : "approximately",
    "apps" : "applications",
    "asap" : "as soon as possible",
    "asl" : "age, sex, location",
    "atk" : "at the keyboard",
    "ave." : "avenue",
    "aymm" : "are you my mother",
    "ayor" : "at your own risk", 
    "b&b" : "bed and breakfast",
    "b+b" : "bed and breakfast",
    "b.c" : "before christ",
    "b2b" : "business to business",
    "b2c" : "business to customer",
    "b4" : "before",
    "b4n" : "bye for now",
    "b@u" : "back at you",
    "bae" : "before anyone else",
    "bak" : "back at keyboard",
    "bbbg" : "bye bye be good",
    "bbc" : "british broadcasting corporation",
    "bbias" : "be back in a second",
    "bbl" : "be back later",
    "bbs" : "be back soon",
    "be4" : "before",
    "bfn" : "bye for now",
    "blvd" : "boulevard",
    "bout" : "about",
    "brb" : "be right back",
    "bros" : "brothers",
    "brt" : "be right there",
    "bsaaw" : "big smile and a wink",
    "btw" : "by the way",
    "bwl" : "bursting with laughter",
    "c/o" : "care of",
    "cet" : "central european time",
    "cf" : "compare",
    "cia" : "central intelligence agency",
    "csl" : "can not stop laughing",
    "cu" : "see you",
    "cul8r" : "see you later",
    "cv" : "curriculum vitae",
    "cwot" : "complete waste of time",
    "cya" : "see you",
    "cyt" : "see you tomorrow",
    "dae" : "does anyone else",
    "dbmib" : "do not bother me i am busy",
    "diy" : "do it yourself",
    "dm" : "direct message",
    "dwh" : "during work hours",
    "e123" : "easy as one two three",
    "eet" : "eastern european time",
    "eg" : "example",
    "embm" : "early morning business meeting",
    "encl" : "enclosed",
    "encl." : "enclosed",
    "etc" : "and so on",
    "faq" : "frequently asked questions",
    "fawc" : "for anyone who cares",
    "fb" : "facebook",
    "fc" : "fingers crossed",
    "fig" : "figure",
    "fimh" : "forever in my heart", 
    "ft." : "feet",
    "ft" : "featuring",
    "ftl" : "for the loss",
    "ftw" : "for the win",
    "fwiw" : "for what it is worth",
    "fyi" : "for your information",
    "g9" : "genius",
    "gahoy" : "get a hold of yourself",
    "gal" : "get a life",
    "gcse" : "general certificate of secondary education",
    "gfn" : "gone for now",
    "gg" : "good game",
    "gl" : "good luck",
    "glhf" : "good luck have fun",
    "gmt" : "greenwich mean time",
    "gmta" : "great minds think alike",
    "gn" : "good night",
    "g.o.a.t" : "greatest of all time",
    "goat" : "greatest of all time",
    "goi" : "get over it",
    "gps" : "global positioning system",
    "gr8" : "great",
    "gratz" : "congratulations",
    "gyal" : "girl",
    "h&c" : "hot and cold",
    "hp" : "horsepower",
    "hr" : "hour",
    "hrh" : "his royal highness",
    "ht" : "height",
    "ibrb" : "i will be right back",
    "ic" : "i see",
    "icq" : "i seek you",
    "icymi" : "in case you missed it",
    "idc" : "i do not care",
    "idgadf" : "i do not give a damn fuck",
    "idgaf" : "i do not give a fuck",
    "idk" : "i do not know",
    "ie" : "that is",
    "i.e" : "that is",
    "ifyp" : "i feel your pain",
    "IG" : "instagram",
    "iirc" : "if i remember correctly",
    "ilu" : "i love you",
    "ily" : "i love you",
    "imho" : "in my humble opinion",
    "imo" : "in my opinion",
    "imu" : "i miss you",
    "iow" : "in other words",
    "irl" : "in real life",
    "j4f" : "just for fun",
    "jic" : "just in case",
    "jk" : "just kidding",
    "jsyk" : "just so you know",
    "l8r" : "later",
    "lb" : "pound",
    "lbs" : "pounds",
    "ldr" : "long distance relationship",
    "lmao" : "laugh my ass off",
    "lmfao" : "laugh my fucking ass off",
    "lol" : "laughing out loud",
    "ltd" : "limited",
    "ltns" : "long time no see",
    "m8" : "mate",
    "mf" : "motherfucker",
    "mfs" : "motherfuckers",
    "mfw" : "my face when",
    "mofo" : "motherfucker",
    "mph" : "miles per hour",
    "mr" : "mister",
    "mrw" : "my reaction when",
    "ms" : "miss",
    "mte" : "my thoughts exactly",
    "nagi" : "not a good idea",
    "nbc" : "national broadcasting company",
    "nbd" : "not big deal",
    "nfs" : "not for sale",
    "ngl" : "not going to lie",
    "nhs" : "national health service",
    "nrn" : "no reply necessary",
    "nsfl" : "not safe for life",
    "nsfw" : "not safe for work",
    "nth" : "nice to have",
    "nvr" : "never",
    "nyc" : "new york city",
    "oc" : "original content",
    "og" : "original",
    "ohp" : "overhead projector",
    "oic" : "oh i see",
    "omdb" : "over my dead body",
    "omg" : "oh my god",
    "omw" : "on my way",
    "p.a" : "per annum",
    "p.m" : "after midday",
    "pm" : "prime minister",
    "poc" : "people of color",
    "pov" : "point of view",
    "pp" : "pages",
    "ppl" : "people",
    "prw" : "parents are watching",
    "ps" : "postscript",
    "pt" : "point",
    "ptb" : "please text back",
    "pto" : "please turn over",
    "qpsa" : "what happens", 
    "ratchet" : "rude",
    "rbtl" : "read between the lines",
    "rlrt" : "real life retweet", 
    "rofl" : "rolling on the floor laughing",
    "roflol" : "rolling on the floor laughing out loud",
    "rotflmao" : "rolling on the floor laughing my ass off",
    "rt" : "retweet",
    "ruok" : "are you ok",
    "sfw" : "safe for work",
     "sk8" : "skate",
    "smh" : "shake my head",
    "sq" : "square",
    "srsly" : "seriously", 
    "ssdd" : "same stuff different day",
    "tbh" : "to be honest",
    "tbs" : "tablespooful",
    "tbsp" : "tablespooful",
    "tfw" : "that feeling when",
    "thks" : "thank you",
    "tho" : "though",
    "thx" : "thank you",
    "tia" : "thanks in advance",
    "til" : "today i learned",
    "tl;dr" : "too long i did not read",
    "tldr" : "too long i did not read",
    "tmb" : "tweet me back",
    "tntl" : "trying not to laugh",
    "ttyl" : "talk to you later",
    "u" : "you",
    "u2" : "you too",
    "u4e" : "yours for ever",
    "utc" : "coordinated universal time",
    "w/" : "with",
    "w/o" : "without",
    "w8" : "wait",
    "wassup" : "what is up",
    "wb" : "welcome back",
    "wtf" : "what the fuck",
    "wtg" : "way to go",
    "wtpa" : "where the party at",
    "wuf" : "where are you from",
    "wuzup" : "what is up",
    "wywh" : "wish you were here",
    "yd" : "yard",
    "ygtr" : "you got that right",
    "ynk" : "you never know",
    "zzz" : "sleeping bored and tired"
}

In [15]:
def convert_abbrev_in_text(tweets):
    t = []
    words = tweets.split() 
    t = [abbreviations[w.lower()] if w.lower() in abbreviations.keys() else w for w in words]

In [16]:
import re

def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove URLs
    text = re.sub(r'http\S+', '', text)
    # Remove mentions
    text = re.sub(r'@\w+', '', text)
    # Remove hashtags
    text = re.sub(r'#\w+', '', text)
    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)
    # Remove whitespace
    text = text.strip()
    return text

In [17]:
import pandas as pd
from sklearn.utils import shuffle

# Apply the preprocessing function to the 'text' column
tweets['processed_text'] = tweets['tweet_text'].apply(preprocess_text)

tweets =  shuffle(tweets).reset_index(drop=True)

In [18]:
tweets.head(5)

Unnamed: 0,tweet_text,sentiment,sentiment_values,processed_text
0,Smith-Rowe: \n\nTo be honest it feels great f...,POSITIVE,1,smithrowe \n\nto be honest it feels great for ...
1,The new king himself don't go back to Madrid #...,NEUTRAL,2,the new king himself dont go back to madrid
2,#Bellerin is not an interesting player for #Ju...,NEGATIVE,0,is not an interesting player for and new r...
3,@HarryMaguire93 you were my MOM last night. In...,POSITIVE,1,you were my mom last night inspiring performan...
4,@ManUtd So this has arrived for my son as usua...,NEGATIVE,0,so this has arrived for my son as usual i see ...


In [19]:
tokenized_tweet=tweets['processed_text'].apply(lambda x: x.split())
tokenized_tweet.head(5)

0    [smithrowe, to, be, honest, it, feels, great, ...
1    [the, new, king, himself, dont, go, back, to, ...
2    [is, not, an, interesting, player, for, and, n...
3    [you, were, my, mom, last, night, inspiring, p...
4    [so, this, has, arrived, for, my, son, as, usu...
Name: processed_text, dtype: object

In [20]:
X = tweets['processed_text']
y = tweets['sentiment_values']

In [21]:
import pandas as pd

total_rows = tweets.shape[0]
print("Total number of rows:", total_rows)

Total number of rows: 1063503


In [22]:
import numpy as np

tweet_lengths = tweets['processed_text'].apply(lambda x: len(x.split()))
mean_length = np.mean(tweet_lengths)
std_length = np.std(tweet_lengths)

print("Average tweet length:", mean_length)
print("Standard deviation of tweet length:", std_length)

Average tweet length: 19.881952378131516
Standard deviation of tweet length: 12.338064194346824


In [23]:
import numpy as np
import pandas as pd
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, Dropout, Bidirectional
from keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import GridSearchCV


In [24]:
max_features = 30000
max_length = 100

tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(X)
X = tokenizer.texts_to_sequences(X)
X = pad_sequences(X, maxlen=max_length)

In [25]:
# from sklearn.model_selection import train_test_split

# X_temp, X_test, y_temp, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.25, random_state=42)

In [26]:
from sklearn.model_selection import train_test_split

X_temp, X_test, y_temp, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=(15/70), random_state=42)

print("Training data size:", len(X_train))
print("Validation data size:", len(X_val))
print("Testing data size:", len(X_test))

Training data size: 584926
Validation data size: 159526
Testing data size: 319051


In [27]:
import numpy as np
from sklearn.model_selection import train_test_split

# Split the dataset into 70% training and 30% combined validation and testing
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)

# Split the temporary dataset (30% of the entire dataset) into 50% validation and 50% testing
# This results in 15% validation and 15% testing of the entire dataset
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

print("Training data size:", len(X_train))
print("Validation data size:", len(X_val))
print("Testing data size:", len(X_test))


Training data size: 744452
Validation data size: 159525
Testing data size: 159526


GPU with LSTM Implementation

In [28]:
# import numpy as np
# import torch
# import torch.nn as nn
# import torch.optim as optim
# from torch.utils.data import DataLoader, Dataset, TensorDataset
# from sklearn.metrics import accuracy_score

# # Load GloVe embeddings
# def load_glove_embeddings(file_path, embedding_dim, word_index, max_features):
#     embeddings_index = {}
#     with open(file_path, 'r', encoding='utf-8') as f:
#         for line in f:
#             values = line.split()
#             word = values[0]
#             coefs = np.asarray(values[1:], dtype='float32')
#             embeddings_index[word] = coefs
    
#     embedding_matrix = np.zeros((max_features, embedding_dim))
#     for word, i in word_index.items():
#         if i >= max_features:
#             continue
#         embedding_vector = embeddings_index.get(word)
#         if embedding_vector is not None:
#             embedding_matrix[i] = embedding_vector
#     return torch.FloatTensor(embedding_matrix)

# # Load the GloVe embeddings matrix
# glove_file_path = 'glove.twitter.27B.200d.txt'  # Update the path to the downloaded GloVe file
# embedding_dim = 200
# embedding_matrix = load_glove_embeddings(glove_file_path, embedding_dim, tokenizer.word_index, max_features)

# # Define the LSTM model with GloVe embeddings
# class LSTMModel(nn.Module):
#     def __init__(self, embedding_matrix, embed_dim=200, lstm_out=256, dropout_rate=0.2, num_classes=3):
#         super().__init__()
#         self.embedding = nn.Embedding.from_pretrained(embedding_matrix, freeze=True)
#         self.lstm = nn.LSTM(embed_dim, lstm_out, bidirectional=True, batch_first=True, dropout=dropout_rate)
#         self.fc = nn.Linear(lstm_out * 2, num_classes)
#         self.dropout = nn.Dropout(dropout_rate)
        
#     def forward(self, x):
#         x = self.embedding(x)
#         x, _ = self.lstm(x)
#         x = self.dropout(x[:, -1, :])  # Get the last hidden state of the LSTM
#         x = self.fc(x)
#         return x

# # Create and train the model with GloVe embeddings
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# model = LSTMModel(embedding_matrix).to(device)

# X_train_torch = torch.LongTensor(X_train).to(device)
# y_train_torch = y_train.to(torch.int64).to(device)  # Change data type to int64
# X_test_torch = torch.LongTensor(X_test).to(device)
# y_test_torch = y_test.to(torch.int64).to(device)  # Change data type to int64

# train_data = TensorDataset(X_train_torch, y_train_torch)
# train_dataloader = DataLoader(train_data, batch_size=128, shuffle=True)

# optimizer = optim.Adam(model.parameters())
# criterion = nn.CrossEntropyLoss()

# num_epochs = 5
# accumulation_steps = 4  # Adjust this value based on your needs
# for epoch in range(num_epochs):
#     model.train()
#     running_loss = 0.0
#     for i, (batch_X, batch_y) in enumerate(train_dataloader):
#         optimizer.zero_grad()
#         outputs = model(batch_X)
#         loss = criterion(outputs, batch_y) / accumulation_steps  # Normalize the loss
#         loss.backward()
        
#         # Accumulate gradients and update weights every accumulation_steps
#         if (i + 1) % accumulation_steps == 0:
#             optimizer.step()
#             optimizer.zero_grad()

#         running_loss += loss.item() * accumulation_steps 
#     print(f"Epoch {epoch+1}/{num_epochs}, Step [{i + 1}/{len(train_dataloader)}], Loss: {loss.item():.4f}")


# # Create DataLoader for the test set
# test_data = TensorDataset(X_test_torch, y_test_torch)
# test_dataloader = DataLoader(test_data, batch_size=32, shuffle=False)

# # Evaluate the model on the test set
# model.eval()
# y_pred = []
# y_true = []

# with torch.no_grad():
#     for batch_X, batch_y in test_dataloader:
#         test_outputs = model(batch_X)
#         _, batch_pred = torch.max(test_outputs, 1)
#         y_pred.extend(batch_pred.cpu().numpy())
#         y_true.extend(batch_y.cpu().numpy())

# test_accuracy = accuracy_score(y_true, y_pred)
# print("Test accuracy:", test_accuracy)

Specified Hyper-parameters Implementation

Hyper-parameters Implementation

In [34]:
# import numpy as np
# import torch
# import torch.nn as nn
# import torch.optim as optim
# from torch.utils.data import DataLoader, Dataset, TensorDataset
# from sklearn.metrics import accuracy_score
# import optuna

# # Load GloVe embeddings
# def load_glove_embeddings(file_path, embedding_dim, word_index, max_features):
#     embeddings_index = {}
#     with open(file_path, 'r', encoding='utf-8') as f:
#         for line in f:
#             values = line.split()
#             word = values[0]
#             coefs = np.asarray(values[1:], dtype='float32')
#             embeddings_index[word] = coefs

#     embedding_matrix = np.zeros((max_features, embedding_dim))
#     for word, i in word_index.items():
#         if i >= max_features:
#             continue
#         embedding_vector = embeddings_index.get(word)
#         if embedding_vector is not None:
#             embedding_matrix[i] = embedding_vector
#     return torch.tensor(embedding_matrix, dtype=torch.float32)

# # Load the GloVe embeddings matrix
# glove_file_path = 'glove.twitter.27B.200d.txt'  # Update the path to the downloaded GloVe file
# embedding_dim = 200
# embedding_matrix = load_glove_embeddings(glove_file_path, embedding_dim, tokenizer.word_index, max_features)

# class CNNTLSTM(nn.Module):
#     def __init__(self, embedding_matrix, embed_dim=200, lstm_out=256, dropout_rate=0.4, num_classes=3, num_filters=64, filter_size=5, pool_size=2):
#         super(CNNTLSTM, self).__init__()
#         self.embedding = nn.Embedding.from_pretrained(embedding_matrix, freeze=True)
#         self.conv1d = nn.Conv1d(embed_dim, num_filters, filter_size)
#         self.relu = nn.ReLU()
#         self.max_pool1d = nn.MaxPool1d(pool_size)
#         self.bi_lstm = nn.LSTM(num_filters, lstm_out // 2, batch_first=True, bidirectional=True, dropout=dropout_rate)
#         self.dropout = nn.Dropout(dropout_rate)
#         self.fc = nn.Linear(lstm_out, num_classes)
#         self.softmax = nn.Softmax(dim=1)

#     def forward(self, x):
#         x = self.embedding(x)
#         x = x.permute(0, 2, 1)
#         x = self.conv1d(x)
#         x = self.relu(x)
#         x = self.max_pool1d(x)
#         x = x.permute(0, 2, 1)
#         x, _ = self.bi_lstm(x)
#         x = x[:, -1, :]
#         x = self.dropout(x)
#         x = self.fc(x)
#         return self.softmax(x)

# X_train_torch = torch.tensor(X_train, dtype=torch.long)
# y_train_torch = torch.tensor(y_train.values, dtype=torch.long)
# X_val_torch = torch.tensor(X_val, dtype=torch.long)
# y_val_torch = torch.tensor(y_val.values, dtype=torch.long)
# X_test_torch = torch.tensor(X_test, dtype=torch.long)
# y_test_torch = torch.tensor(y_test.values, dtype=torch.long)

# y_train_np = y_train.values.reshape(-1, 1)
# y_val_np = y_val.values.reshape(-1, 1)
# y_test_np = y_test.values.reshape(-1, 1)

# hyperparameters_accuracies = []

# def objective(trial):
#     # Hyperparameters to be optimized
#     num_filters = int(trial.suggest_discrete_uniform("num_filters", 32, 256, 32))
#     filter_size = int(trial.suggest_discrete_uniform("filter_size", 3, 7, 2))
#     pool_size = int(trial.suggest_discrete_uniform("pool_size", 2, 4, 2))
#     lstm_out = int(trial.suggest_discrete_uniform("lstm_out", 64, 512, 64))
#     dropout_rate = trial.suggest_float("dropout_rate", 0.1, 0.5, step=0.1)
#     learning_rate = trial.suggest_float("learning_rate", 1e-5, 1e-3, log=True)

#     num_epochs = 5
#     batch_size = 128

#     train_dataset = TensorDataset(X_train_torch, y_train_torch)
#     train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

#     model = CNNTLSTM(embedding_matrix, embed_dim=embedding_dim, lstm_out=lstm_out, dropout_rate=dropout_rate, num_classes=3, num_filters=num_filters, filter_size=filter_size, pool_size=pool_size).cuda()

#     criterion = nn.CrossEntropyLoss()
#     optimizer = optim.Adam(model.parameters(), lr=learning_rate)

#         # Train the model with the given hyperparameters
#     for epoch in range(num_epochs):
#         for _, (texts, labels) in enumerate(train_loader):
#             texts, labels = texts.cuda(), labels.cuda()

#             # Forward pass
#             outputs = model(texts)
#             loss = criterion(outputs, labels)

#             # Backward and optimize
#             optimizer.zero_grad()
#             loss.backward()
#             optimizer.step()

#         # Evaluate the model on the validation set
#         model.eval()
#         val_dataset = TensorDataset(X_val_torch, y_val_torch)
#         val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

#         all_val_predictions = []
#         all_val_labels = []

#         with torch.no_grad():
#             for texts, labels in val_loader:
#                 texts, labels = texts.cuda(), labels.cuda()
#                 val_outputs = model(texts)
#                 _, predicted = torch.max(val_outputs.data, 1)
#                 all_val_predictions.extend(predicted.cpu().numpy())
#                 all_val_labels.extend(labels.cpu().numpy())

#         # Calculate the validation accuracy
#         val_accuracy = accuracy_score(y_val_np, all_val_predictions)

#         # Print the accuracy for each set of hyperparameters
#         print(f"Validation accuracy at epoch {epoch + 1}: {val_accuracy:.4f} with hyperparameters: {trial.params}")

#         # Append the accuracy and hyperparameters to the list
#         hyperparameters_accuracies.append((val_accuracy, trial.params))

#         return val_accuracy

# # Optimize using Optuna
# study = optuna.create_study(direction="maximize")
# study.optimize(objective, n_trials=50)

# # Print the best hyperparameters
# print("Best trial:")
# trial = study.best_trial
# print("  Value: ", trial.value)
# print("  Params: ")
# for key, value in trial.params.items():
#     print(f"    {key}: {value}")

[32m[I 2023-04-12 20:59:35,251][0m A new study created in memory with name: no-name-65df27b8-807e-4a86-a5ed-1d9ad644d4e3[0m
[32m[I 2023-04-12 21:00:11,535][0m Trial 0 finished with value: 0.805447422034164 and parameters: {'num_filters': 32.0, 'filter_size': 3.0, 'pool_size': 2.0, 'lstm_out': 192.0, 'dropout_rate': 0.30000000000000004, 'learning_rate': 0.0009515742023502816}. Best is trial 0 with value: 0.805447422034164.[0m


Validation accuracy at epoch 1: 0.8054 with hyperparameters: {'num_filters': 32.0, 'filter_size': 3.0, 'pool_size': 2.0, 'lstm_out': 192.0, 'dropout_rate': 0.30000000000000004, 'learning_rate': 0.0009515742023502816}


[33m[W 2023-04-12 21:00:13,225][0m Trial 1 failed with parameters: {'num_filters': 64.0, 'filter_size': 7.0, 'pool_size': 4.0, 'lstm_out': 448.0, 'dropout_rate': 0.2, 'learning_rate': 2.4335279061898935e-05} because of the following error: KeyboardInterrupt().[0m
Traceback (most recent call last):
  File "c:\Users\clayt\anaconda3\envs\gpu-ienv\lib\site-packages\optuna\study\_optimize.py", line 200, in _run_trial
    value_or_values = func(trial)
  File "C:\Users\clayt\AppData\Local\Temp\ipykernel_18884\1620370818.py", line 97, in objective
    outputs = model(texts)
  File "c:\Users\clayt\anaconda3\envs\gpu-ienv\lib\site-packages\torch\nn\modules\module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "C:\Users\clayt\AppData\Local\Temp\ipykernel_18884\1620370818.py", line 52, in forward
    x, _ = self.bi_lstm(x)
  File "c:\Users\clayt\anaconda3\envs\gpu-ienv\lib\site-packages\torch\nn\modules\module.py", line 1501, in _call_impl
    return forward_call(

KeyboardInterrupt: 

In [36]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset, TensorDataset
from torch.autograd import Variable
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report, confusion_matrix
import optuna

# Load GloVe embeddings
def load_glove_embeddings(file_path, embedding_dim, word_index, max_features):
    embeddings_index = {}
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
    
    embedding_matrix = np.zeros((max_features, embedding_dim))
    for word, i in word_index.items():
        if i >= max_features:
            continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
    return torch.tensor(embedding_matrix, dtype=torch.float32)

# Load the GloVe embeddings matrix
glove_file_path = 'glove.twitter.27B.200d.txt'  # Update the path to the downloaded GloVe file
embedding_dim = 200
embedding_matrix = load_glove_embeddings(glove_file_path, embedding_dim, tokenizer.word_index, max_features)

class CNNTLSTM(nn.Module):
    def __init__(self, embedding_matrix, embed_dim=200, lstm_out=256, dropout_rate=0.4, num_classes=3, num_filters=64, filter_size=5, pool_size=2):
        super(CNNTLSTM, self).__init__()
        self.embedding = nn.Embedding.from_pretrained(embedding_matrix, freeze=True)
        self.conv1d = nn.Conv1d(embed_dim, num_filters, filter_size)
        self.relu = nn.ReLU()
        self.max_pool1d = nn.MaxPool1d(pool_size)
        self.bi_lstm = nn.LSTM(num_filters, lstm_out // 2, batch_first=True, bidirectional=True, dropout=dropout_rate)
        self.dropout = nn.Dropout(dropout_rate)
        self.fc = nn.Linear(lstm_out, num_classes)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        x = self.embedding(x)
        x = x.permute(0, 2, 1)
        x = self.conv1d(x)
        x = self.relu(x)
        x = self.max_pool1d(x)
        x = x.permute(0, 2, 1)
        x, _ = self.bi_lstm(x)
        x = x[:, -1, :]
        x = self.dropout(x)
        x = self.fc(x)
        return self.softmax(x)

X_train_torch = torch.tensor(X_train, dtype=torch.long)
y_train_torch = torch.tensor(y_train.values, dtype=torch.long)
X_val_torch = torch.tensor(X_val, dtype=torch.long)
y_val_torch = torch.tensor(y_val.values, dtype=torch.long)
X_test_torch = torch.tensor(X_test, dtype=torch.long)
y_test_torch = torch.tensor(y_test.values, dtype=torch.long)

y_train_np = y_train.values.reshape(-1, 1)
y_val_np = y_val.values.reshape(-1, 1)  # Added this line
y_test_np = y_test.values.reshape(-1, 1)


hyperparameters_accuracies = []

def objective(trial):
    # Hyperparameters to be optimized
    num_filters = int(trial.suggest_discrete_uniform("num_filters", 32, 256, 32))
    filter_size = int(trial.suggest_discrete_uniform("filter_size", 3, 7, 2))
    pool_size = int(trial.suggest_discrete_uniform("pool_size", 2, 4, 2))
    lstm_out = int(trial.suggest_discrete_uniform("lstm_out", 64, 512, 64))
    dropout_rate = trial.suggest_float("dropout_rate", 0.1, 0.5, step=0.1)

    model = CNNTLSTM(embedding_matrix, embed_dim=embedding_dim, lstm_out=lstm_out, dropout_rate=dropout_rate, num_classes=3, num_filters=num_filters, filter_size=filter_size, pool_size=pool_size).cuda()

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters())

    # Train the model with the given hyperparameters
    for epoch in range(num_epochs):
        for i, (texts, labels) in enumerate(train_loader):
            texts, labels = texts.cuda(), labels.cuda()

            # Forward pass
            outputs = model(texts)
            loss = criterion(outputs, labels) / accumulation_steps

            # Backward and optimize
            loss.backward()
            if (i + 1) % accumulation_steps == 0:
                optimizer.step()
                optimizer.zero_grad()

    # Evaluate the model on the validation set
    model.eval()
    val_dataset = TensorDataset(X_val_torch, y_val_torch)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

    all_val_predictions = []
    all_val_labels = []

    with torch.no_grad():
        for texts, labels in val_loader:
            texts, labels = texts.cuda(), labels.cuda()
            val_outputs = model(texts)
            _, predicted = torch.max(val_outputs.data, 1)
            all_val_predictions.extend(predicted.cpu().numpy())
            all_val_labels.extend(labels.cpu().numpy())

    # Calculate the validation accuracy
    val_accuracy = accuracy_score(y_val_np, all_val_predictions)

    # Print the accuracy for each set of hyperparameters
    print(f"Validation accuracy: {val_accuracy:.4f} with hyperparameters: {trial.params}")

    # Append the accuracy and hyperparameters to the list
    hyperparameters_accuracies.append((val_accuracy, trial.params))

    return val_accuracy

    # Optimize using Optuna
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=50)

    # Print the best hyperparameters
print("Best trial:")
trial = study.best_trial
print("  Value: ", trial.value)
print("  Params: ")
for key, value in trial.params.items():
        print(f"    {key}: {value}")
    
# Train the final model using the best hyperparameters
best_params = study.best_params
best_lstm_out = int(best_params["lstm_out"])
best_dropout_rate = best_params["dropout_rate"]
best_num_filters = int(best_params["num_filters"])
best_filter_size = best_params["filter_size"]
best_pool_size = best_params["pool_size"]
best_learning_rate = best_params["learning_rate"]

num_epochs = 5
batch_size = 128

final_model = CNNTLSTM(embedding_matrix, lstm_out=best_lstm_out, dropout_rate=best_dropout_rate, num_filters=best_num_filters, filter_size=best_filter_size, pool_size=best_pool_size).cuda()
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(final_model.parameters(), lr=best_learning_rate)

train_dataset = TensorDataset(X_train_torch, y_train_torch)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

accumulation_steps = 4  # Adjust this value based on your GPU memory capacity

for epoch in range(num_epochs):
    running_loss = 0.0
    for i, (texts, labels) in enumerate(train_loader):
        texts, labels = texts.cuda(), labels.cuda()

        # Forward pass
        outputs = final_model(texts)
        loss = criterion(outputs, labels) / accumulation_steps

        # Backward and optimize
        loss.backward()
        if (i + 1) % accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()

        running_loss += loss.item() * accumulation_steps

# Evaluate the final model on the test set
final_model.eval()
test_dataset = TensorDataset(X_test_torch, y_test_torch)
test_loader = DataLoader(test_dataset, batch_size=128, shuffle=False)  # Set a smaller batch size for test data

total_correct = 0
total_samples = 0

all_predictions = []
all_labels = []

with torch.no_grad():
    for texts, labels in test_loader:
        texts, labels = texts.cuda(), labels.cuda()
        test_outputs = final_model(texts)
        _, predicted = torch.max(test_outputs.data, 1)
        all_predictions.extend(predicted.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())
        total_correct += (predicted == labels).sum().item()
        total_samples += labels.size(0)

test_accuracy = total_correct / total_samples
print(f"Test accuracy: {test_accuracy:.4f}")



[32m[I 2023-04-12 21:04:05,071][0m A new study created in memory with name: no-name-590ee9aa-d543-44c5-9326-4d50e43d9f75[0m
[33m[W 2023-04-12 21:04:05,111][0m Trial 0 failed with parameters: {'num_filters': 224.0, 'filter_size': 5.0, 'pool_size': 4.0, 'lstm_out': 128.0, 'dropout_rate': 0.5} because of the following error: NameError("name 'num_epochs' is not defined").[0m
Traceback (most recent call last):
  File "c:\Users\clayt\anaconda3\envs\gpu-ienv\lib\site-packages\optuna\study\_optimize.py", line 200, in _run_trial
    value_or_values = func(trial)
  File "C:\Users\clayt\AppData\Local\Temp\ipykernel_18884\1915710152.py", line 88, in objective
    for epoch in range(num_epochs):
NameError: name 'num_epochs' is not defined
[33m[W 2023-04-12 21:04:05,113][0m Trial 0 failed with value None.[0m


NameError: name 'num_epochs' is not defined

In [39]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset, TensorDataset
from torch.autograd import Variable
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report, confusion_matrix
import optuna

# Load GloVe embeddings
def load_glove_embeddings(file_path, embedding_dim, word_index, max_features):
    embeddings_index = {}
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
    
    embedding_matrix = np.zeros((max_features, embedding_dim))
    for word, i in word_index.items():
        if i >= max_features:
            continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
    return torch.tensor(embedding_matrix, dtype=torch.float32)

# Load the GloVe embeddings matrix
glove_file_path = 'glove.twitter.27B.200d.txt'  # Update the path to the downloaded GloVe file
embedding_dim = 200
embedding_matrix = load_glove_embeddings(glove_file_path, embedding_dim, tokenizer.word_index, max_features)

class CNNTLSTM(nn.Module):
    def __init__(self, embedding_matrix, embed_dim=200, lstm_out=256, dropout_rate=0.4, num_classes=3, num_filters=64, filter_size=5, pool_size=2):
        super(CNNTLSTM, self).__init__()
        self.embedding = nn.Embedding.from_pretrained(embedding_matrix, freeze=True)
        self.conv1d = nn.Conv1d(embed_dim, num_filters, filter_size)
        self.relu = nn.ReLU()
        self.max_pool1d = nn.MaxPool1d(pool_size)
        self.bi_lstm = nn.LSTM(num_filters, lstm_out // 2, batch_first=True, bidirectional=True, dropout=dropout_rate)
        self.dropout = nn.Dropout(dropout_rate)
        self.fc = nn.Linear(lstm_out, num_classes)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        x = self.embedding(x)
        x = x.permute(0, 2, 1)
        x = self.conv1d(x)
        x = self.relu(x)
        x = self.max_pool1d(x)
        x = x.permute(0, 2, 1)
        x, _ = self.bi_lstm(x)
        x = x[:, -1, :]
        x = self.dropout(x)
        x = self.fc(x)
        return self.softmax(x)

X_train_torch = torch.tensor(X_train, dtype=torch.long)
y_train_torch = torch.tensor(y_train.values, dtype=torch.long)
X_val_torch = torch.tensor(X_val, dtype=torch.long)
y_val_torch = torch.tensor(y_val.values, dtype=torch.long)
X_test_torch = torch.tensor(X_test, dtype=torch.long)
y_test_torch = torch.tensor(y_test.values, dtype=torch.long)

y_train_np = y_train.values.reshape(-1, 1)
y_val_np = y_val.values.reshape(-1, 1)  # Added this line
y_test_np = y_test.values.reshape(-1, 1)

# Define batch_size and accumulation_steps
batch_size = 32
accumulation_steps = 4

# Create DataLoaders
train_dataset = TensorDataset(X_train_torch, y_train_torch)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

hyperparameters_accuracies = []

def objective(trial):
    # Hyperparameters to be optimized
    num_filters = int(trial.suggest_discrete_uniform("num_filters", 32, 256, 32))
    filter_size = int(trial.suggest_discrete_uniform("filter_size", 3, 7, 2))
    pool_size = int(trial.suggest_discrete_uniform("pool_size", 2, 4, 2))
    lstm_out = int(trial.suggest_discrete_uniform("lstm_out", 64, 512, 64))
    dropout_rate = trial.suggest_float("dropout_rate", 0.1, 0.5, step=0.1)

    model = CNNTLSTM(embedding_matrix, embed_dim=embedding_dim, lstm_out=lstm_out, dropout_rate=dropout_rate, num_classes=3, num_filters=num_filters, filter_size=filter_size, pool_size=pool_size).cuda()

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters())

    num_epochs = 5  # Define num_epochs here

    # Train the model with the given hyperparameters
    for epoch in range(num_epochs):
        for i, (texts, labels) in enumerate(train_loader):
            texts, labels = texts.cuda(), labels.cuda()

            # Forward pass
            outputs = model(texts)
            loss = criterion(outputs, labels) / accumulation_steps

            # Backward and optimize
            loss.backward()
            if (i + 1) % accumulation_steps == 0:
                optimizer.step()
                optimizer.zero_grad()

    # Evaluate the model on the validation set
    model.eval()
    val_dataset = TensorDataset(X_val_torch, y_val_torch)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

    all_val_predictions = []
    all_val_labels = []

    with torch.no_grad():
        for texts, labels in val_loader:
            texts, labels = texts.cuda(), labels.cuda()
            val_outputs = model(texts)
            _, predicted = torch.max(val_outputs.data, 1)
            all_val_predictions.extend(predicted.cpu().numpy())
            all_val_labels.extend(labels.cpu().numpy())

    # Calculate the validation accuracy
    val_accuracy = accuracy_score(y_val_np, all_val_predictions)

    # Print the accuracy for each set of hyperparameters
    print(f"Validation accuracy: {val_accuracy:.4f} with hyperparameters: {trial.params}")

    # Append the accuracy and hyperparameters to the list
    hyperparameters_accuracies.append((val_accuracy, trial.params))

    return val_accuracy

# Optimize using Optuna
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=50)

# Print the best hyperparameters
print("Best trial:")
trial = study.best_trial
print("  Value: ", trial.value)
print("  Params: ")
for key, value in trial.params.items():
    print(f"    {key}: {value}")

[32m[I 2023-04-12 21:19:59,383][0m A new study created in memory with name: no-name-ba8c3993-9e54-4680-8445-145858e6261f[0m
[32m[I 2023-04-12 21:25:38,250][0m Trial 0 finished with value: 0.8180159849553361 and parameters: {'num_filters': 128.0, 'filter_size': 7.0, 'pool_size': 2.0, 'lstm_out': 64.0, 'dropout_rate': 0.1}. Best is trial 0 with value: 0.8180159849553361.[0m


Validation accuracy: 0.8180 with hyperparameters: {'num_filters': 128.0, 'filter_size': 7.0, 'pool_size': 2.0, 'lstm_out': 64.0, 'dropout_rate': 0.1}


[33m[W 2023-04-12 21:31:13,362][0m Trial 1 failed with parameters: {'num_filters': 160.0, 'filter_size': 7.0, 'pool_size': 2.0, 'lstm_out': 64.0, 'dropout_rate': 0.2} because of the following error: KeyboardInterrupt().[0m
Traceback (most recent call last):
  File "c:\Users\clayt\anaconda3\envs\gpu-ienv\lib\site-packages\optuna\study\_optimize.py", line 200, in _run_trial
    value_or_values = func(trial)
  File "C:\Users\clayt\AppData\Local\Temp\ipykernel_18884\3863011002.py", line 98, in objective
    for i, (texts, labels) in enumerate(train_loader):
  File "c:\Users\clayt\anaconda3\envs\gpu-ienv\lib\site-packages\torch\utils\data\dataloader.py", line 634, in __next__
    data = self._next_data()
  File "c:\Users\clayt\anaconda3\envs\gpu-ienv\lib\site-packages\torch\utils\data\dataloader.py", line 678, in _next_data
    data = self._dataset_fetcher.fetch(index)  # may raise StopIteration
  File "c:\Users\clayt\anaconda3\envs\gpu-ienv\lib\site-packages\torch\utils\data\_utils\fetc

KeyboardInterrupt: 

GPU with CNN+LSTM Implementation

In [None]:
# import numpy as np
# import torch
# import torch.nn as nn
# import torch.optim as optim
# from torch.utils.data import DataLoader, Dataset, TensorDataset
# from torch.autograd import Variable
# from sklearn.metrics import accuracy_score
# from sklearn.metrics import classification_report, confusion_matrix

# # Load GloVe embeddings
# def load_glove_embeddings(file_path, embedding_dim, word_index, max_features):
#     embeddings_index = {}
#     with open(file_path, 'r', encoding='utf-8') as f:
#         for line in f:
#             values = line.split()
#             word = values[0]
#             coefs = np.asarray(values[1:], dtype='float32')
#             embeddings_index[word] = coefs
    
#     embedding_matrix = np.zeros((max_features, embedding_dim))
#     for word, i in word_index.items():
#         if i >= max_features:
#             continue
#         embedding_vector = embeddings_index.get(word)
#         if embedding_vector is not None:
#             embedding_matrix[i] = embedding_vector
#     return torch.tensor(embedding_matrix, dtype=torch.float32)

# # Load the GloVe embeddings matrix
# glove_file_path = 'glove.twitter.27B.200d.txt'  # Update the path to the downloaded GloVe file
# embedding_dim = 200
# embedding_matrix = load_glove_embeddings(glove_file_path, embedding_dim, tokenizer.word_index, max_features)

# class CNNTLSTM(nn.Module):
#     def __init__(self, embedding_matrix, embed_dim=200, lstm_out=448, dropout_rate=0.2, num_classes=3, num_filters=256, filter_size=7, pool_size=2):
#         super(CNNTLSTM, self).__init__()
#         self.embedding = nn.Embedding.from_pretrained(embedding_matrix, freeze=True)
#         self.conv1d = nn.Conv1d(embed_dim, num_filters, filter_size)
#         self.relu = nn.ReLU()
#         self.max_pool1d = nn.MaxPool1d(pool_size)
#         self.bi_lstm = nn.LSTM(num_filters, lstm_out // 2, batch_first=True, bidirectional=True, dropout=dropout_rate)
#         self.dropout = nn.Dropout(dropout_rate)
#         self.fc = nn.Linear(lstm_out, num_classes)
#         self.softmax = nn.Softmax(dim=1)

#     def forward(self, x):
#         x = self.embedding(x)
#         x = x.permute(0, 2, 1)
#         x = self.conv1d(x)
#         x = self.relu(x)
#         x = self.max_pool1d(x)
#         x = x.permute(0, 2, 1)
#         x, _ = self.bi_lstm(x)
#         x = x[:, -1, :]
#         x = self.dropout(x)
#         x = self.fc(x)
#         return self.softmax(x)
    
# y_train_np = y_train.values.reshape(-1, 1)
# y_test_np = y_test.values.reshape(-1, 1)

# # Prepare data for PyTorch
# X_train_torch = torch.tensor(X_train, dtype=torch.long)
# y_train_torch = torch.tensor(y_train.values, dtype=torch.long)
# X_test_torch = torch.tensor(X_test, dtype=torch.long)
# y_test_torch = torch.tensor(y_test.values, dtype=torch.long)

# # Initialize model, loss function and optimizer
# model = CNNTLSTM(embedding_matrix).cuda()
# criterion = nn.CrossEntropyLoss()
# optimizer = optim.Adam(model.parameters())

# # Train the model
# num_epochs = 10
# batch_size = 128

# train_dataset = TensorDataset(X_train_torch, y_train_torch)
# train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# accumulation_steps = 4  # Adjust this value based on your GPU memory capacity

# for epoch in range(num_epochs):
#     running_loss = 0.0
#     for i, (texts, labels) in enumerate(train_loader):
#         texts, labels = texts.cuda(), labels.cuda()

#         # Forward pass
#         outputs = model(texts)
#         loss = criterion(outputs, labels) / accumulation_steps

#         # Calculate accuracy
#         _, predicted = torch.max(outputs.data, 1)
#         accuracy = (predicted == labels).sum().item() / labels.size(0)

#         # Backward and optimize
#         loss.backward()
#         if (i + 1) % accumulation_steps == 0:
#             optimizer.step()
#             optimizer.zero_grad()

#         running_loss += loss.item() * accumulation_steps

#         if (i + 1) % 10 == 0:
#             print(f"Epoch [{epoch + 1}/{num_epochs}], Step [{i + 1}/{len(train_loader)}], Loss: {loss.item():.4f}, Accuracy: {accuracy:.4f}")

# # Evaluate the model on the test set
# model.eval()
# test_dataset = TensorDataset(X_test_torch, y_test_torch)
# test_loader = DataLoader(test_dataset, batch_size=128, shuffle=False)  # Set a smaller batch size for test dat| qZ\ASWQ1

# total_correct = 0 
# total_samples = 0

# all_predictions = []
# all_labels = []

# with torch.no_grad():
#     for texts, labels in test_loader:
#         texts, labels = texts.cuda(), labels.cuda()
#         test_outputs = model(texts)
#         _, predicted = torch.max(test_outputs.data, 1)
#         all_predictions.extend(predicted.cpu().numpy())
#         all_labels.extend(labels.cpu().numpy())
#         total_correct += (predicted == labels).sum().item()
#         total_samples += labels.size(0)

# test_accuracy = total_correct / total_samples
# print(f"Test accuracy: {test_accuracy:.4f}")

Epoch [1/10], Step [10/5817], Loss: 18.2356, Accuracy: 0.4453
Epoch [1/10], Step [20/5817], Loss: 18.2141, Accuracy: 0.3438
Epoch [1/10], Step [30/5817], Loss: 18.1861, Accuracy: 0.2891
Epoch [1/10], Step [40/5817], Loss: 18.1652, Accuracy: 0.3594
Epoch [1/10], Step [50/5817], Loss: 18.1333, Accuracy: 0.4922
Epoch [1/10], Step [60/5817], Loss: 18.1101, Accuracy: 0.5781
Epoch [1/10], Step [70/5817], Loss: 18.0791, Accuracy: 0.5859
Epoch [1/10], Step [80/5817], Loss: 18.0510, Accuracy: 0.6797
Epoch [1/10], Step [90/5817], Loss: 18.0353, Accuracy: 0.6641
Epoch [1/10], Step [100/5817], Loss: 18.0193, Accuracy: 0.6797
Epoch [1/10], Step [110/5817], Loss: 18.0167, Accuracy: 0.6094
Epoch [1/10], Step [120/5817], Loss: 17.9922, Accuracy: 0.7031
Epoch [1/10], Step [130/5817], Loss: 17.9667, Accuracy: 0.7422
Epoch [1/10], Step [140/5817], Loss: 17.9683, Accuracy: 0.7344
Epoch [1/10], Step [150/5817], Loss: 17.9724, Accuracy: 0.6797
Epoch [1/10], Step [160/5817], Loss: 17.9445, Accuracy: 0.7422
E

KeyboardInterrupt: 

In [None]:
print("Classification Report:")
print(classification_report(all_labels, all_predictions))

print("Confusion Matrix:")
print(confusion_matrix(all_labels, all_predictions))

Classification Report:
              precision    recall  f1-score   support

           0       0.82      0.89      0.86     70907
           1       0.83      0.85      0.84     70828
           2       0.80      0.71      0.75     70966

    accuracy                           0.82    212701
   macro avg       0.82      0.82      0.82    212701
weighted avg       0.82      0.82      0.82    212701

Confusion Matrix:
[[63250  1986  5671]
 [ 3286 60478  7064]
 [10270  9988 50708]]


In [None]:
# Save the trained model
model_save_path = "cnntlstm_model565.pth"  # Choose the path where you want to save the model
torch.save(model.state_dict(), model_save_path)


Testing the model

In [None]:
# import pandas as pd
# import torch
# from torch.utils.data import DataLoader, Dataset, TensorDataset
# from tensorflow.keras.preprocessing.sequence import pad_sequences

# # Function to load a saved model
# def load_saved_model(model_path):
#     model = CNNTLSTM(embedding_matrix).cuda()
#     model.load_state_dict(torch.load(model_path))
#     model.eval()
#     return model

# # Read the CSV file
# csv_file_path = 'SampleTweets.csv'
# df = pd.read_csv(csv_file_path)

# # Tokenize and pad the preprocessed texts
# tokenized_texts = tokenizer.texts_to_sequences(df['processed_text'])
# padded_texts = pad_sequences(tokenized_texts, maxlen=100)

# # Convert the tokenized and padded texts to PyTorch tensors
# texts_torch = torch.tensor(padded_texts, dtype=torch.long)

# # Load the saved model
# model_path = 'cnntlstm_model3.pth'
# loaded_model = load_saved_model(model_path)

# # Use the model to predict sentiment labels for the text data
# label_map = {0: "NEGATIVE", 1: "POSITIVE", 2: "NEUTRAL"}

# with torch.no_grad():
#     texts_torch = texts_torch.cuda()
#     output_probs = loaded_model(texts_torch)
#     _, predictions = torch.max(output_probs.data, 1)
#     predictions = predictions.cpu().numpy()

# # Convert the predicted labels back to their string representations
# predicted_labels = [label_map[pred] for pred in predictions]

# # Add the predicted labels to the original DataFrame and save it to a new CSV file
# df['predicted_sentiment'] = predicted_labels
# df.to_csv('predictions5.csv', index=False)


CPU Implementation of CNN+LSTM

In [None]:
# import numpy as np
# from keras.layers import Embedding, Bidirectional, LSTM, Dense
# from keras.models import Sequential
# from keras.callbacks import EarlyStopping, ModelCheckpoint
# from keras.optimizers import Adam

# # Load GloVe embeddings
# def load_glove_embeddings(file_path, embedding_dim, word_index, max_features):
#     embeddings_index = {}
#     with open(file_path, 'r', encoding='utf-8') as f:
#         for line in f:
#             values = line.split()
#             word = values[0]
#             coefs = np.asarray(values[1:], dtype='float32')
#             embeddings_index[word] = coefs
    
#     embedding_matrix = np.zeros((max_features, embedding_dim))
#     for word, i in word_index.items():
#         if i >= max_features:
#             continue
#         embedding_vector = embeddings_index.get(word)
#         if embedding_vector is not None:
#             embedding_matrix[i] = embedding_vector
#     return embedding_matrix

# # Load the GloVe embeddings matrix
# glove_file_path = 'glove.twitter.27B.200d.txt'  # Update the path to the downloaded GloVe file
# embedding_dim = 200
# embedding_matrix = load_glove_embeddings(glove_file_path, embedding_dim, tokenizer.word_index, max_features)

# from keras.layers import Conv1D, MaxPooling1D

# def create_cnn_lstm_model(embedding_matrix, embed_dim=200, lstm_out=256, dropout_rate=0.2, optimizer='adam', num_classes=3, num_filters=64, filter_size=5, pool_size=2):
#     model = Sequential()
#     model.add(Embedding(max_features, embed_dim, weights=[embedding_matrix], input_length=X.shape[1], trainable=False))
#     model.add(Conv1D(num_filters, filter_size, activation='relu'))
#     model.add(MaxPooling1D(pool_size=pool_size))
#     model.add(Bidirectional(LSTM(lstm_out, dropout=dropout_rate, recurrent_dropout=dropout_rate)))
#     model.add(Dense(num_classes, activation='softmax'))
#     model.compile(loss='sparse_categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])
#     return model

# model = create_cnn_lstm_model(embedding_matrix)

# early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
# model_checkpoint = ModelCheckpoint('best_cnn_lstm_model.h5', save_best_only=True, monitor='val_loss', mode='min')

# history = model.fit(X_train, y_train, epochs=5, batch_size=500, validation_split=0.1, callbacks=[early_stopping, model_checkpoint])

# # Evaluate the model on the test set
# loss, accuracy = model.evaluate(X_test, y_test)
# print("Test accuracy:", accuracy)

# # Predict sentiment labels for the test data
# y_pred_probs = model.predict(X_test)
# y_pred = np.argmax(y_pred_probs, axis=1)

# # Calculate the accuracy of the predictions
# correct_predictions = np.sum(y_pred == y_test)
# total_predictions = len(y_test)
# prediction_accuracy = correct_predictions / total_predictions

# print(f"Prediction accuracy: {prediction_accuracy:.4f}")
