In [1]:
import zipfile
import pandas as pd
# import nltk
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
# nltk.download('punkt')
# nltk.download('omw-1.4')
# nltk.download('stopwords')
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import accuracy_score

# Read Training Data
Data source: https://www.kaggle.com/datasets/kazanova/sentiment140

- target:   The polarity of the tweet (0 = negative, 2 = neutral, 4 = positive)

- ids:      The id of the tweet

- date:     The date of the tweet (Sat May 16 23:58:44 UTC 2009)

- flag:     The query (lyx). If there is no query, then this value is NO_QUERY.

- user:     The user that tweeted

- text:     The text of the tweet

In [None]:
# Read the Sentiment140 training CSV directly out of the zip archive.
# Both the archive and the member stream are opened as context managers so
# their file handles are released as soon as the frame is in memory.
with zipfile.ZipFile("training_data.zip", "r") as archive:
    with archive.open("training.1600000.processed.noemoticon.csv") as csv_file:
        # header=None makes pandas number the columns (0, 1, ...) instead of
        # taking names from the first row; encoding_errors="replace" swaps
        # undecodable bytes for a placeholder character instead of raising.
        data = pd.read_csv(csv_file, header=None, encoding_errors="replace")

There are no neutral-labelled tweets in the training data,
so we convert the labels to binary form
(0 = negative,
1 = positive).

In [None]:
# Column 0 holds the polarity; positive tweets are coded as 4, so remap them
# to 1 to obtain a {0, 1} label.
is_positive = data[0].eq(4)
data.loc[is_positive, 0] = 1
data.head(n=5)

For model training we only need the label and text columns

In [None]:
# Discard the id, date, query flag and user columns (1-4); what is left is the
# label and the tweet text, which get readable names.
data = data.drop(columns=[1, 2, 3, 4])
data.columns = ["label", "raw_text"]
data.head(n=5)

# Data Preprocessing

Turning everything into lowercase characters

In [None]:
# Lower-case every tweet into a new "text" column, leaving "raw_text" untouched.
# The vectorised .str accessor replaces the Python-level loop and propagates
# missing values instead of raising on a non-string entry.
# (To experiment without lower-casing, assign data["raw_text"] to data["text"] instead.)
data["text"] = data["raw_text"].str.lower()

Preparing the stopword list and removing links, user tags and punctuation

In [None]:
# Start from NLTK's English stopword list but keep the negations: "no" and
# "not" may carry sentiment, so they are taken out of the removal list.
stop = stopwords.words('english')
for negation in ("no", "not"):
    stop.remove(negation)

# Disabled: filtering the stopwords out of the text column directly.
# data['text'] = data['text'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))

In [None]:
# Strip tokens that carry no sentiment from the lower-cased text.
# The patterns are raw strings so that regex escapes such as \S and \. are
# passed to the regex engine verbatim instead of being treated as (invalid)
# Python string escapes, which emit a warning on recent Python versions.
data["text"] = (
    data["text"]
    .str.replace(r"http[s]?://\S+", "", regex=True)  # links
    .str.replace(r"@\S+", "", regex=True)            # user tags (@name)
    .str.replace(r"-|\.|,|'|\?|\!", "", regex=True)  # basic punctuation
)

data.head()

Word-Stemming

In [None]:
# Stemming is currently disabled: every line in this cell is commented out and
# kept for reference only (Porter and Snowball stemmers were both considered).
# from nltk.stem import PorterStemmer
# from nltk.stem.snowball import SnowballStemmer

# sstemmer = SnowballStemmer("english")
# pstemmer = PorterStemmer()

# data['text'] = data['text'].apply(lambda x: ' '.join([sstemmer.stem(word) for word in x.split()]))

Tokenize Words

In [2]:
# Tokenising the whole corpus takes a few minutes, so the result is cached in
# 'tokenized_data.csv'. If the cache exists it is simply read back; otherwise
# the tokenisation runs once and writes it. No manual (un)commenting is needed
# and the cell works on a fresh kernel.
try:
    data = pd.read_csv('tokenized_data.csv')
except FileNotFoundError:
    # First run: tokenise the cleaned text, persist it, then reload from disk
    # so `data` has the same representation whether or not the cache existed
    # (the CSV round-trip stores each token list as its string form).
    data["text"] = [word_tokenize(entry) for entry in data["text"]]
    data.to_csv('tokenized_data.csv', index=False)
    data = pd.read_csv('tokenized_data.csv')

Our finished data:

In [3]:
# Preview the first five rows of the prepared frame.
data.head(n=5)

Unnamed: 0,label,raw_text,text
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...","['awww', 'thats', 'a', 'bummer', 'you', 'shoul..."
1,0,is upset that he can't update his Facebook by ...,"['is', 'upset', 'that', 'he', 'cant', 'update'..."
2,0,@Kenichan I dived many times for the ball. Man...,"['i', 'dived', 'many', 'times', 'for', 'the', ..."
3,0,my whole body feels itchy and like its on fire,"['my', 'whole', 'body', 'feels', 'itchy', 'and..."
4,0,"@nationwideclass no, it's not behaving at all....","['no', 'its', 'not', 'behaving', 'at', 'all', ..."


# Model Training

In [None]:
# Hold out 20% of the tweets for testing; the fixed random_state keeps the
# split reproducible between runs.
Train_X, Test_X, Train_Y, Test_Y = model_selection.train_test_split(
    data["text"],
    data["label"],
    test_size=0.20,
    random_state=11,
)

In [4]:
# Stopword list handed to the TF-IDF vectorizer: NLTK's English list minus the
# negations "no" and "not", which may matter for sentiment and are therefore kept.
stop = stopwords.words('english')
for negation in ("no", "not"):
    stop.remove(negation)

In [5]:
# Word-level TF-IDF features: accents are stripped, the custom stopword list is
# excluded, and terms found in fewer than 10 documents are ignored.
Tfidf_vect = TfidfVectorizer(
    analyzer="word",
    strip_accents="unicode",
    stop_words=stop,
    min_df=10,
)
# Learn the vocabulary and IDF weights on the whole corpus and vectorise it in
# the same pass.
Data_Tfidf = Tfidf_vect.fit_transform(data["text"])

In [None]:
# Vectorise the full corpus with the fitted TF-IDF vectorizer.
Data_Tfidf = Tfidf_vect.transform(raw_documents=data["text"])
# Disabled: separate feature matrices for the train/test split.
# Train_X_Tfidf = Tfidf_vect.transform(Train_X)
# Test_X_Tfidf = Tfidf_vect.transform(Test_X)

In [6]:
# Report the size of the vocabulary learned by the vectorizer.
feature_names = Tfidf_vect.get_feature_names_out()
print("# of Features:", len(feature_names))

# of Features: 35016


In [8]:
# Naive Bayes Classifier
# Multinomial Naive Bayes with default hyper-parameters. Only the instance is
# created here; the hold-out fit/predict/score steps below are disabled.
Naive = naive_bayes.MultinomialNB()
# Naive.fit(Train_X_Tfidf, Train_Y)

# predictions_NB = Naive.predict(Test_X_Tfidf)
# print("Naive Bayes Accuracy Score -> ",accuracy_score(predictions_NB, Test_Y)*100)

In [10]:
# 10-fold cross-validation of a fresh Multinomial Naive Bayes model on the
# TF-IDF matrix; the returned dict (fit/score times and per-fold test scores)
# is the cell's displayed output.
model_selection.cross_validate(
    estimator=naive_bayes.MultinomialNB(),
    X=Data_Tfidf,
    y=data["label"],
    cv=10,
)

{'fit_time': array([0.44991994, 0.4312098 , 0.46318412, 0.40697336, 0.41754484,
        0.41999984, 0.43296814, 0.4352262 , 0.44605184, 0.41123199]),
 'score_time': array([0.02795792, 0.03200078, 0.02700353, 0.03006029, 0.02797222,
        0.03000474, 0.02901101, 0.03000593, 0.02795315, 0.02800059]),
 'test_score': array([0.76240625, 0.7641125 , 0.7528875 , 0.76394375, 0.76575625,
        0.756375  , 0.76336875, 0.774675  , 0.7667    , 0.75955625])}

In [None]:
# Support Vector Machine Classifier
# Disabled experiment: every line below is commented out. It would fit an
# RBF-kernel SVC on Train_X_Tfidf and score it on Test_X_Tfidf, so those
# matrices have to be created before this can be re-enabled.
# SVM = svm.SVC(C=0.9, kernel='rbf', degree=3, gamma='auto', cache_size= 1000, max_iter=2500, decision_function_shape="ovo", random_state=10)
# SVM.fit(Train_X_Tfidf,Train_Y)

# predictions_SVM = SVM.predict(Test_X_Tfidf)
# print("SVM Accuracy Score -> ", accuracy_score(predictions_SVM, Test_Y)*100)