In [1]:
import pandas as pd
import numpy as np 
import matplotlib as plt 

tweets = pd.read_csv('combined.csv', encoding='utf-8')

In [2]:
tweets = tweets.drop(['tweet_date_created'], axis=1)

In [3]:
duplicates = tweets[tweets.duplicated(subset=['tweet_id'], keep=False)]

if not duplicates.empty:
    print(f"Found {len(duplicates)} duplicate tweet ids. Removing duplicates...")
    tweets.drop_duplicates(subset=['tweet_id'], inplace=True)
else:
    print("No duplicate tweet ids found.")

Found 762643 duplicate tweet ids. Removing duplicates...


In [4]:
tweets = tweets.drop(['sentiment_score'], axis=1)

In [5]:
tweets = tweets.dropna()
print(tweets.isnull().sum())

all_english = (tweets['language'] == 'en').all()


if all_english:
    print("All values in the 'language' column are 'en'")
else:
    print("Not all values in the 'language' column are 'en'")

tweet_id      0
tweet_text    0
language      0
sentiment     0
dtype: int64
All values in the 'language' column are 'en'


In [6]:
tweets = tweets.drop(['language'], axis=1)

In [7]:
tweets = tweets.drop(['tweet_id'], axis=1)

unique_sentiments = tweets['sentiment'].unique()

# Print the unique sentiment values
print("Unique sentiment values in the 'sentiment' column:")
for sentiment in unique_sentiments:
    print(sentiment)

# Define a dictionary that maps sentiment labels to numeric values
sentiment_map = {"NEUTRAL": 0, "POSITIVE": 1, "MIXED": 2, "NEGATIVE": -1}

# Map the sentiment labels to their numeric values
tweets['sentiment_values'] = tweets['sentiment'].map(sentiment_map)

# Print the new column that contains the mapped values
print(tweets['sentiment_values'])

Unique sentiment values in the 'sentiment' column:
NEUTRAL
NEGATIVE
POSITIVE
MIXED
0          0
1          0
2          0
3         -1
4          0
          ..
5393957    0
5393958   -1
5393959    0
5393960    0
5393961    0
Name: sentiment_values, Length: 5012534, dtype: int64


In [8]:
tweets = tweets.drop(['sentiment'], axis=1)

In [9]:
tweets['sentiment_values']=tweets['sentiment_values'].replace(-1, 3)

In [10]:
# Find the indices of rows with Mixed sentiment
mixed_indices = tweets[tweets['sentiment_values'] == 2].index

# Delete the rows with Mixed sentiment
tweets = tweets.drop(mixed_indices)

# Print the resulting DataFrame
print(tweets.head())

                                          tweet_text  sentiment_values
0  Bayer Leverkusen goalkeeper Bernd Leno will no...                 0
1  Gary Speed v Blackburn at St James in 2001/02 ...                 0
2  @ChelseaFC Don't make him regret it and start ...                 0
3  @LiverpoolFF @AnfieldEdition He's a liar, made...                 3
4  @theesk @Everton Didn't realise Kenwright is d...                 0


In [11]:
positive_tweets = tweets[tweets['sentiment_values'] == 1]
negative_tweets = tweets[tweets['sentiment_values'] == 3]
neutral_tweets = tweets[tweets['sentiment_values'] == 0]
mixed_tweets = tweets[tweets['sentiment_values'] == 2]

print('No of positive tagged tweets is: {}'.format(len(positive_tweets)))
print('No of negative tagged tweets is: {}'.format(len(negative_tweets)))
print('No of neutral tagged tweets is: {}'.format(len(neutral_tweets)))
print('No of mixed tagged tweets is: {}'.format(len(mixed_tweets)))

No of positive tagged tweets is: 1070334
No of negative tagged tweets is: 354501
No of neutral tagged tweets is: 3549918
No of mixed tagged tweets is: 0


In [12]:
import pandas as pd

# Shuffle the DataFrame to ensure that the downsampling is random
tweets = tweets.sample(frac=1, random_state=42)

# Count the number of tweets in each sentiment class
counts = tweets['sentiment_values'].value_counts()

# Find the smallest class size
smallest_size = counts.min()

# Downsample each class to the smallest size
positive_tweets = tweets[tweets['sentiment_values'] == 1].sample(n=smallest_size, random_state=42)
negative_tweets = tweets[tweets['sentiment_values'] == 3].sample(n=smallest_size, random_state=42)
neutral_tweets = tweets[tweets['sentiment_values'] == 0].sample(n=smallest_size, random_state=42)

# Concatenate the downsampled DataFrames
tweets = pd.concat([positive_tweets, negative_tweets, neutral_tweets], ignore_index=True)

# Print the new counts of tweets in each class
print('No of positive tagged tweets is: {}'.format(len(tweets[tweets['sentiment_values'] == 1])))
print('No of negative tagged tweets is: {}'.format(len(tweets[tweets['sentiment_values'] == 3])))
print('No of neutral tagged tweets is: {}'.format(len(tweets[tweets['sentiment_values'] == 0])))


No of positive tagged tweets is: 354501
No of negative tagged tweets is: 354501
No of neutral tagged tweets is: 354501


In [15]:
tweets['tweet_text'] = tweets['tweet_text'].astype('str')

In [13]:
import nltk 
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

stopword = set(stopwords.words('english'))

In [14]:
abbreviations = {
    "$" : " dollar ",
    "€" : " euro ",
    "4ao" : "for adults only",
    "a.m" : "before midday",
    "a3" : "anytime anywhere anyplace",
    "aamof" : "as a matter of fact",
    "acct" : "account",
    "adih" : "another day in hell",
    "afaic" : "as far as i am concerned",
    "afaict" : "as far as i can tell",
    "afaik" : "as far as i know",
    "afair" : "as far as i remember",
    "afk" : "away from keyboard",
    "app" : "application",
    "approx" : "approximately",
    "apps" : "applications",
    "asap" : "as soon as possible",
    "asl" : "age, sex, location",
    "atk" : "at the keyboard",
    "ave." : "avenue",
    "aymm" : "are you my mother",
    "ayor" : "at your own risk", 
    "b&b" : "bed and breakfast",
    "b+b" : "bed and breakfast",
    "b.c" : "before christ",
    "b2b" : "business to business",
    "b2c" : "business to customer",
    "b4" : "before",
    "b4n" : "bye for now",
    "b@u" : "back at you",
    "bae" : "before anyone else",
    "bak" : "back at keyboard",
    "bbbg" : "bye bye be good",
    "bbc" : "british broadcasting corporation",
    "bbias" : "be back in a second",
    "bbl" : "be back later",
    "bbs" : "be back soon",
    "be4" : "before",
    "bfn" : "bye for now",
    "blvd" : "boulevard",
    "bout" : "about",
    "brb" : "be right back",
    "bros" : "brothers",
    "brt" : "be right there",
    "bsaaw" : "big smile and a wink",
    "btw" : "by the way",
    "bwl" : "bursting with laughter",
    "c/o" : "care of",
    "cet" : "central european time",
    "cf" : "compare",
    "cia" : "central intelligence agency",
    "csl" : "can not stop laughing",
    "cu" : "see you",
    "cul8r" : "see you later",
    "cv" : "curriculum vitae",
    "cwot" : "complete waste of time",
    "cya" : "see you",
    "cyt" : "see you tomorrow",
    "dae" : "does anyone else",
    "dbmib" : "do not bother me i am busy",
    "diy" : "do it yourself",
    "dm" : "direct message",
    "dwh" : "during work hours",
    "e123" : "easy as one two three",
    "eet" : "eastern european time",
    "eg" : "example",
    "embm" : "early morning business meeting",
    "encl" : "enclosed",
    "encl." : "enclosed",
    "etc" : "and so on",
    "faq" : "frequently asked questions",
    "fawc" : "for anyone who cares",
    "fb" : "facebook",
    "fc" : "fingers crossed",
    "fig" : "figure",
    "fimh" : "forever in my heart", 
    "ft." : "feet",
    "ft" : "featuring",
    "ftl" : "for the loss",
    "ftw" : "for the win",
    "fwiw" : "for what it is worth",
    "fyi" : "for your information",
    "g9" : "genius",
    "gahoy" : "get a hold of yourself",
    "gal" : "get a life",
    "gcse" : "general certificate of secondary education",
    "gfn" : "gone for now",
    "gg" : "good game",
    "gl" : "good luck",
    "glhf" : "good luck have fun",
    "gmt" : "greenwich mean time",
    "gmta" : "great minds think alike",
    "gn" : "good night",
    "g.o.a.t" : "greatest of all time",
    "goat" : "greatest of all time",
    "goi" : "get over it",
    "gps" : "global positioning system",
    "gr8" : "great",
    "gratz" : "congratulations",
    "gyal" : "girl",
    "h&c" : "hot and cold",
    "hp" : "horsepower",
    "hr" : "hour",
    "hrh" : "his royal highness",
    "ht" : "height",
    "ibrb" : "i will be right back",
    "ic" : "i see",
    "icq" : "i seek you",
    "icymi" : "in case you missed it",
    "idc" : "i do not care",
    "idgadf" : "i do not give a damn fuck",
    "idgaf" : "i do not give a fuck",
    "idk" : "i do not know",
    "ie" : "that is",
    "i.e" : "that is",
    "ifyp" : "i feel your pain",
    "IG" : "instagram",
    "iirc" : "if i remember correctly",
    "ilu" : "i love you",
    "ily" : "i love you",
    "imho" : "in my humble opinion",
    "imo" : "in my opinion",
    "imu" : "i miss you",
    "iow" : "in other words",
    "irl" : "in real life",
    "j4f" : "just for fun",
    "jic" : "just in case",
    "jk" : "just kidding",
    "jsyk" : "just so you know",
    "l8r" : "later",
    "lb" : "pound",
    "lbs" : "pounds",
    "ldr" : "long distance relationship",
    "lmao" : "laugh my ass off",
    "lmfao" : "laugh my fucking ass off",
    "lol" : "laughing out loud",
    "ltd" : "limited",
    "ltns" : "long time no see",
    "m8" : "mate",
    "mf" : "motherfucker",
    "mfs" : "motherfuckers",
    "mfw" : "my face when",
    "mofo" : "motherfucker",
    "mph" : "miles per hour",
    "mr" : "mister",
    "mrw" : "my reaction when",
    "ms" : "miss",
    "mte" : "my thoughts exactly",
    "nagi" : "not a good idea",
    "nbc" : "national broadcasting company",
    "nbd" : "not big deal",
    "nfs" : "not for sale",
    "ngl" : "not going to lie",
    "nhs" : "national health service",
    "nrn" : "no reply necessary",
    "nsfl" : "not safe for life",
    "nsfw" : "not safe for work",
    "nth" : "nice to have",
    "nvr" : "never",
    "nyc" : "new york city",
    "oc" : "original content",
    "og" : "original",
    "ohp" : "overhead projector",
    "oic" : "oh i see",
    "omdb" : "over my dead body",
    "omg" : "oh my god",
    "omw" : "on my way",
    "p.a" : "per annum",
    "p.m" : "after midday",
    "pm" : "prime minister",
    "poc" : "people of color",
    "pov" : "point of view",
    "pp" : "pages",
    "ppl" : "people",
    "prw" : "parents are watching",
    "ps" : "postscript",
    "pt" : "point",
    "ptb" : "please text back",
    "pto" : "please turn over",
    "qpsa" : "what happens", 
    "ratchet" : "rude",
    "rbtl" : "read between the lines",
    "rlrt" : "real life retweet", 
    "rofl" : "rolling on the floor laughing",
    "roflol" : "rolling on the floor laughing out loud",
    "rotflmao" : "rolling on the floor laughing my ass off",
    "rt" : "retweet",
    "ruok" : "are you ok",
    "sfw" : "safe for work",
     "sk8" : "skate",
    "smh" : "shake my head",
    "sq" : "square",
    "srsly" : "seriously", 
    "ssdd" : "same stuff different day",
    "tbh" : "to be honest",
    "tbs" : "tablespooful",
    "tbsp" : "tablespooful",
    "tfw" : "that feeling when",
    "thks" : "thank you",
    "tho" : "though",
    "thx" : "thank you",
    "tia" : "thanks in advance",
    "til" : "today i learned",
    "tl;dr" : "too long i did not read",
    "tldr" : "too long i did not read",
    "tmb" : "tweet me back",
    "tntl" : "trying not to laugh",
    "ttyl" : "talk to you later",
    "u" : "you",
    "u2" : "you too",
    "u4e" : "yours for ever",
    "utc" : "coordinated universal time",
    "w/" : "with",
    "w/o" : "without",
    "w8" : "wait",
    "wassup" : "what is up",
    "wb" : "welcome back",
    "wtf" : "what the fuck",
    "wtg" : "way to go",
    "wtpa" : "where the party at",
    "wuf" : "where are you from",
    "wuzup" : "what is up",
    "wywh" : "wish you were here",
    "yd" : "yard",
    "ygtr" : "you got that right",
    "ynk" : "you never know",
    "zzz" : "sleeping bored and tired"
}

In [16]:
def convert_abbrev_in_text(tweets):
    t = []
    words = tweets.split() 
    t = [abbreviations[w.lower()] if w.lower() in abbreviations.keys() else w for w in words]

In [17]:
import re

def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove URLs
    text = re.sub(r'http\S+', '', text)
    # Remove mentions
    text = re.sub(r'@\w+', '', text)
    # Remove hashtags
    text = re.sub(r'#\w+', '', text)
    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)
    # Remove whitespace
    text = text.strip()
    return text

In [18]:
import pandas as pd
from sklearn.utils import shuffle

# Apply the preprocessing function to the 'text' column
tweets['processed_text'] = tweets['tweet_text'].apply(preprocess_text)

tweets =  shuffle(tweets).reset_index(drop=True)
tweets.head(5)

Unnamed: 0,tweet_text,sentiment_values,processed_text
0,Wu Lei got 20 in 28 last year (Chinese seasons...,0,wu lei got 20 in 28 last year chinese seasons ...
1,@Ultra_Suristic @Mateo_Kova23 @realmadrid Does...,0,doesnt seem like he wants to come back
2,@MorataAm9 @ChelseaFC Still waiting you to fol...,0,still waiting you to follow me back dude
3,"No luck today for @mbatshuayi , clearance off ...",1,no luck today for clearance off the line hit...
4,#nufc #newcastle FRAMED Bob Moncur Newcastle U...,0,framed bob moncur newcastle united hand signed...


In [19]:
tokenized_tweet=tweets['processed_text'].apply(lambda x: x.split())
tokenized_tweet.head(5)

from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import RegexpTokenizer

token = RegexpTokenizer(r'[a-zA-Z0-9]+')
cv = CountVectorizer(stop_words='english',tokenizer = token.tokenize)
text_counts = cv.fit_transform(tweets['processed_text'].values.astype('U'))

tweets.head(5)



Unnamed: 0,tweet_text,sentiment_values,processed_text
0,Wu Lei got 20 in 28 last year (Chinese seasons...,0,wu lei got 20 in 28 last year chinese seasons ...
1,@Ultra_Suristic @Mateo_Kova23 @realmadrid Does...,0,doesnt seem like he wants to come back
2,@MorataAm9 @ChelseaFC Still waiting you to fol...,0,still waiting you to follow me back dude
3,"No luck today for @mbatshuayi , clearance off ...",1,no luck today for clearance off the line hit...
4,#nufc #newcastle FRAMED Bob Moncur Newcastle U...,0,framed bob moncur newcastle united hand signed...


In [19]:
tokenized_tweet=tweets['processed_text'].apply(lambda x: x.split())
tokenized_tweet.head(5)

0    [the, problems, in, this, club, are, ed, woodw...
1    [dont, know, about, any, other, united, fans, ...
2    [are, you, kidding, 42, minuites, to, give, so...
3    [hello, its, the, world, here, ive, officially...
4                       [customer, service, is, worse]
Name: processed_text, dtype: object

In [30]:
tokenized_tweet.head(5)

0    [the, problems, in, this, club, are, ed, woodw...
1    [dont, know, about, any, other, united, fans, ...
2    [are, you, kidding, 42, minuites, to, give, so...
3    [hello, its, the, world, here, ive, officially...
4                       [customer, service, is, worse]
Name: processed_text, dtype: object

In [20]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import FunctionTransformer
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import string
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform
import nltk
import dill

In [21]:
# Split the dataset into input (X) and output (y)
X = tweets['processed_text'].values
y = tweets['sentiment_values'].values

# Split the dataset into 70% training and 30% combined validation and testing
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)

# Split the temporary dataset (30% of the entire dataset) into 50% validation and 50% testing
# This results in 15% validation and 15% testing of the entire dataset
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

print("Training data size:", len(X_train))
print("Validation data size:", len(X_val))
print("Testing data size:", len(X_test))

Training data size: 744452
Validation data size: 159525
Testing data size: 159526


In [24]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Assuming your dataset is stored in the 'processed_text' column of the DataFrame 'tweets'
documents = tweets['processed_text'].values

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(documents)

# The number of rows in the DTM corresponds to the number of documents
number_of_documents = X.shape[0]

print("Total number of documents in the dataset:", number_of_documents)


Total number of documents in the dataset: 1063503


In [27]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Top 5 tweets from the 'processed_text' column
top_5_tweets = tweets.loc[:4, 'processed_text'].tolist()

# Create the TfidfVectorizer object
vectorizer = TfidfVectorizer(stop_words='english')

# Fit the vectorizer to the data and transform the data into a document-term matrix (DTM)
dtm = vectorizer.fit_transform(top_5_tweets)

# Convert the DTM to an array and print it
dtm_array = dtm.toarray()
print("Document-Term Matrix:\n", dtm_array)

# Print the feature names
feature_names = vectorizer.get_feature_names_out()
print("\nFeature Names:\n", feature_names)

# Print the TF-IDF values for each term in each tweet
for i, tweet in enumerate(top_5_tweets):
    print("\nTweet {}: {}".format(i + 1, tweet))
    for j, feature in enumerate(feature_names):
        print("  {}: {:.4f}".format(feature, dtm_array[i][j]))


AttributeError: 'list' object has no attribute 'loc'