In [1]:
import zipfile
import pandas as pd
from nltk.corpus import stopwords
import re
from nltk.stem import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import accuracy_score

# nltk.download('punkt')
# nltk.download('omw-1.4')
# nltk.download('stopwords')

# Read Training Data
Data source: https://www.kaggle.com/datasets/yasserh/twitter-tweets-sentiment-dataset

Columns:
1. textID - unique ID for each piece of text
2. text - the text of the tweet
3. sentiment - the general sentiment of the tweet

In [2]:
archive = zipfile.ZipFile("data.zip", "r")
data = pd.read_csv(archive.open("train_multiclass.csv"), encoding_errors="replace")
# drop preprocessed text of the dataset to perform our own preprocessing
data.drop(columns="selected_text", inplace=True)
# renaming columns for easier access
data.columns = ["id", "raw_text", "label"]
data

Unnamed: 0,id,raw_text,label
0,cb774db0d1,"I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,negative
2,088c60f138,my boss is bullying me...,negative
3,9642c003ef,what interview! leave me alone,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...",negative
...,...,...,...
27476,4eac33d1c0,wish we could come see u on Denver husband l...,negative
27477,4f4c4fc327,I`ve wondered about rake to. The client has ...,negative
27478,f67aae2310,Yay good for both of you. Enjoy the break - y...,positive
27479,ed167662a5,But it was worth it ****.,positive


Number of tweets per label:

In [3]:
data["label"].value_counts()

neutral     11118
positive     8582
negative     7781
Name: label, dtype: int64

We change the label to numerical values for our model

In [4]:
data.loc[data["label"]=="negative", "label"] = 0
data.loc[data["label"]=="neutral", "label"] = 1
data.loc[data["label"]=="positive", "label"] = 2
data

Unnamed: 0,id,raw_text,label
0,cb774db0d1,"I`d have responded, if I were going",1
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,0
2,088c60f138,my boss is bullying me...,0
3,9642c003ef,what interview! leave me alone,0
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...",0
...,...,...,...
27476,4eac33d1c0,wish we could come see u on Denver husband l...,0
27477,4f4c4fc327,I`ve wondered about rake to. The client has ...,0
27478,f67aae2310,Yay good for both of you. Enjoy the break - y...,2
27479,ed167662a5,But it was worth it ****.,2


# Data Preprocessing

Turning everything into lowercase characters

In [5]:
data["raw_text"] = data["raw_text"].astype(str)
data["text"] = [entry.lower() for entry in data["raw_text"]]
data.head()

Unnamed: 0,id,raw_text,label,text
0,cb774db0d1,"I`d have responded, if I were going",1,"i`d have responded, if i were going"
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,0,sooo sad i will miss you here in san diego!!!
2,088c60f138,my boss is bullying me...,0,my boss is bullying me...
3,9642c003ef,what interview! leave me alone,0,what interview! leave me alone
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...",0,"sons of ****, why couldn`t they put them on t..."


Removing Stopwords (Common words like "my", "he", "is", ...)

In [6]:
stop = stopwords.words('english')
# we think 'no' and 'not' might be important words for the sentiment and don't want them to be removed
stop.remove("no")
stop.remove("not")

# data['text'] = data['text'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))

Removing links, tags and several punctuations from tweets using regular expressions

In [7]:
data["text"] = data["text"].apply(lambda x: re.sub("http[s]?://\S+", "", x))
data["text"] = data["text"].apply(lambda x: re.sub("@\S+", "", x))
data["text"] = data["text"].apply(lambda x: re.sub("-|\.|,|'|\?|\!|`|\*", "", x))

data.head()

Unnamed: 0,id,raw_text,label,text
0,cb774db0d1,"I`d have responded, if I were going",1,id have responded if i were going
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,0,sooo sad i will miss you here in san diego
2,088c60f138,my boss is bullying me...,0,my boss is bullying me
3,9642c003ef,what interview! leave me alone,0,what interview leave me alone
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...",0,sons of why couldnt they put them on the rel...


Word-Stemming

In [8]:
sstemmer = SnowballStemmer("english")
pstemmer = PorterStemmer()

data['text'] = data['text'].apply(lambda x: ' '.join([pstemmer.stem(word) for word in x.split()]))

Tokenize Words

In [9]:
data["text"] = [str(word_tokenize(entry)) for entry in data["text"]]

Our finished Data:

In [10]:
data.head()

Unnamed: 0,id,raw_text,label,text
0,cb774db0d1,"I`d have responded, if I were going",1,"['id', 'have', 'respond', 'if', 'i', 'were', '..."
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,0,"['sooo', 'sad', 'i', 'will', 'miss', 'you', 'h..."
2,088c60f138,my boss is bullying me...,0,"['my', 'boss', 'is', 'bulli', 'me']"
3,9642c003ef,what interview! leave me alone,0,"['what', 'interview', 'leav', 'me', 'alon']"
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...",0,"['son', 'of', 'whi', 'couldnt', 'they', 'put',..."


In [11]:
data["label"] = data["label"].astype(int)
data["label"]

0        1
1        0
2        0
3        0
4        0
        ..
27476    0
27477    0
27478    2
27479    2
27480    1
Name: label, Length: 27481, dtype: int32

# Model Training

Split Training/Test-Data and Transform into a vectorized form

In [12]:
X_train, X_test, y_train, y_test = model_selection.train_test_split(data["text"], data["label"], test_size=0.2, random_state=42)

In [13]:
Tfidf_vect = TfidfVectorizer(analyzer="word", strip_accents="unicode", stop_words=stop, min_df=10)
Data_Tfidf = Tfidf_vect.fit_transform(data["text"])
X_train_Tfidf = Tfidf_vect.transform(X_train)
X_test_Tfidf = Tfidf_vect.transform(X_test)

In [14]:
print("# of Features:", len(Tfidf_vect.get_feature_names_out()))

# of Features: 2197


Naive Bayes Classifier

In [15]:
# model_selection.cross_validate(naive_bayes.MultinomialNB(), Data_Tfidf, data["label"], cv=10)

In [16]:
NB = naive_bayes.MultinomialNB()
NB.fit(X_train_Tfidf, y_train)

predictions_NB = NB.predict(X_test_Tfidf)
print(f"Naive Bayes Accuracy Score: {round(accuracy_score(predictions_NB, y_test)*100, 3)}%")

Naive Bayes Accuracy Score: 64.217%


Support Vector Machine

In [17]:
SVM = svm.SVC(C=1, kernel='linear', cache_size=2000, break_ties=True, decision_function_shape="ovr")
SVM.fit(X_train_Tfidf, y_train)

predictions_SVM = SVM.predict(X_test_Tfidf)
print(f"Support Vector Machine Accuracy Score: {round(accuracy_score(predictions_SVM, y_test)*100, 3)}%")

Support Vector Machine Accuracy Score: 71.239%


In [18]:
# from sklearn.model_selection import GridSearchCV

# params = {
#     "C": [1, 10, 100],
#     "kernel": ["linear", "rbf"],
#     "cache_size": 2000
# }

# GridSearch = GridSearchCV()
# GridSearch.fit

# Test on self-labelled game related tweets

In [27]:
testdata = pd.read_excel("testdata_games.xlsx", usecols=["text", "multiclass_label"])
# drop unlabelled data:
testdata = testdata.dropna()
testdata["multiclass_label"] = testdata["multiclass_label"].astype(int)

testdata

Unnamed: 0,multiclass_label,text
0,0,"Okay, someone explain to me why Fall Guys and ..."
1,1,Look at this key I discovered on the Sea of Th...
2,1,Going Live ! Solo Slooping on the Sea of Thiev...
3,1,@KirkRooster Why isn’t gow boat travel as real...
4,2,@ItsCsteph @JAAY_ROCK_ Sea of Thieves let's yo...
...,...,...
502,2,evo13 played Grand Theft Auto V (Xbox One) in ...
503,1,Grand Theft Auto: Vice City Stories (2006) htt...
504,2,BadassDutchMan is streaming Grand Theft Auto V...
557,1,Grand Theft Auto V - PlayStation 3 [MU0GCS8]\n...


In [28]:
print(f"Negative Sentiment (0): {testdata['multiclass_label'].value_counts()[0]} tweets")
print(f"Neutral Sentiment (1): {testdata['multiclass_label'].value_counts()[1]} tweets")
print(f"Positive Sentiment (2): {testdata['multiclass_label'].value_counts()[2]} tweets")

Negative Sentiment (0): 36 tweets
Neutral Sentiment (1): 335 tweets
Positive Sentiment (2): 136 tweets


Without our Data Preprocessing:

In [29]:
testdata_tfidf = Tfidf_vect.transform(testdata["text"])
predictions_NB = NB.predict(testdata_tfidf)
predictions_SVM = SVM.predict(testdata_tfidf)
print(f"Naive Bayes Accuracy Score: {round(accuracy_score(predictions_NB, testdata['multiclass_label'])*100, 3)}%")
print(f"Support Vector Machine Accuracy Score: {round(accuracy_score(predictions_SVM, testdata['multiclass_label'])*100, 3)}%")

Naive Bayes Accuracy Score: 62.525%
Support Vector Machine Accuracy Score: 67.456%


Performing Preprocessing

In [30]:
# testdata["processed_test"] = data["raw_text"].astype(str)
testdata["text"] = [entry.lower() for entry in testdata["text"]]

testdata["text"] = testdata["text"].apply(lambda x: re.sub("http[s]?://\S+", "", x))
testdata["text"] = testdata["text"].apply(lambda x: re.sub("@\S+", "", x))
testdata["text"] = testdata["text"].apply(lambda x: re.sub("-|\.|,|'|\?|\!|`|\*", "", x))

testdata["text"] = [str(word_tokenize(entry)) for entry in testdata["text"]]

testdata.head()

Unnamed: 0,multiclass_label,text
0,0,"['okay', 'someone', 'explain', 'to', 'me', 'wh..."
1,1,"['look', 'at', 'this', 'key', 'i', 'discovered..."
2,1,"['going', 'live', 'solo', 'slooping', 'on', 't..."
3,1,"['why', 'isn', '’', 't', 'gow', 'boat', 'trave..."
4,2,"['sea', 'of', 'thieves', 'lets', 'you', 'do', ..."


In [31]:
testdata_tfidf = Tfidf_vect.transform(testdata["text"])
predictions_NB = NB.predict(testdata_tfidf)
predictions_SVM = SVM.predict(testdata_tfidf)
print(f"Naive Bayes Accuracy Score: {round(accuracy_score(predictions_NB, testdata['multiclass_label'])*100, 3)}%")
print(f"Support Vector Machine Accuracy Score: {round(accuracy_score(predictions_SVM, testdata['multiclass_label'])*100, 3)}%")

Naive Bayes Accuracy Score: 66.272%
Support Vector Machine Accuracy Score: 67.85%
