<a href="https://colab.research.google.com/github/eeyvee-0x4d/cs-thesis/blob/main/Notebook/sentiment_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

<h1>Clone repository</h1>

In [None]:
# Clone the entire repo into a local directory named "thesis".
# NOTE(review): -l / -s (--local / --shared) are documented for clones of a
# repository on the local machine; confirm they are intended for an https URL.
!git clone -l -s https://github.com/eeyvee-0x4d/cs-thesis thesis
# Make the cloned directory the notebook's working directory, then list its
# contents as a quick check that the clone succeeded.
%cd thesis
!ls

In [None]:
# Install the third-party packages imported further down in this cell:
# stopwordsiso (used for the Tagalog stop-word list) and imbalanced-learn
# (imported as `imblearn`, provides RandomOverSampler).
# NOTE(review): versions are not pinned, so results may drift between runs.
!pip install stopwordsiso
!pip install imbalanced-learn

from sklearn.feature_extraction.text import TfidfVectorizer
from imblearn.over_sampling import RandomOverSampler
from collections import Counter

import nltk
import stopwordsiso

from nltk.stem import *
from nltk.corpus import stopwords
from nltk.util import ngrams

# Fetch the NLTK data the preprocessing cells rely on: tokenizer models for
# nltk.word_tokenize and the English stop-word list for nltk.corpus.stopwords.
# NLTK 3.9+ loads the tokenizer from the "punkt_tab" package instead of the
# pickled "punkt" models, so both are requested to keep the notebook working
# on old and new NLTK releases.
nltk.download("punkt")
nltk.download("punkt_tab")
nltk.download("stopwords")

from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from sklearn.model_selection import ShuffleSplit

import re
import json
import pandas as pd

<h1>Text Preprocessing</h1>
<ul>
  <li>Import dataset</li>
  <li>Remove urls</li>
  <li>Remove special characters</li>
  <li>Convert text data to lowercase</li>
  <li>Remove words of one to three characters</li>
</ul>

In [77]:
df = pd.read_csv('/content/thesis/Dataset/training_data.csv')  # read csv


def clean_text(text):
    """Normalise one raw tweet before tokenisation.

    Steps, in order:
      1. drop URLs (anything starting with ``http`` up to the next whitespace)
         and convert to lowercase;
      2. turn every whitespace character (newline, tab, ...) into a space;
      3. delete every character that is not a letter, digit or space;
      4. delete words that are one to three characters long.

    Step 2 must run before step 3: otherwise the special-character filter
    deletes line breaks outright and the words on either side are fused.

    Args:
        text: The raw tweet text.

    Returns:
        The cleaned, lowercase text.
    """
    text = re.sub(r'http\S+', '', text).lower()
    text = re.sub(r'\s', ' ', text)
    text = re.sub(r'[^a-zA-Z0-9 ]', '', text)
    return re.sub(r'\b\w{1,3}\b', '', text)


# remove urls, remove special chars, convert to lowercase
# Missing tweets become empty strings so a blank CSV field does not abort the
# run; .map works on any index, unlike positional df.at[i, ...] writes.
df['Text'] = df['Text'].fillna('').astype(str).map(clean_text)

<h1>Natural Language Toolkit NLTK</h1>
<p>
NLTK will be used to preprocess the corpus.
</p>

<h1>Stemming words using Porter Stemmer</h1>
<p>
Apply the Porter stemmer to each token, then rebuild the tokens into a sentence.
</p>
<hr>
<h1>Stop words removal</h1>
<p>
Remove stop words in English and Tagalog.
</p>

In [78]:
stemmer = PorterStemmer()  # Porter Stemmer

stopwords_eng = set(stopwords.words('english'))  # English stopwords
stopwords_tl = set(stopwordsiso.stopwords('tl'))  # Tagalog stopwords
# One combined set gives a single membership test per token.
all_stopwords = stopwords_eng | stopwords_tl


def remove_stopwords_and_stem(document):
    """Drop English/Tagalog stop words from a document and stem what is left.

    Args:
        document: One cleaned tweet as a single string.

    Returns:
        The remaining tokens, Porter-stemmed and re-joined with single spaces.
    """
    tokens = nltk.word_tokenize(document)
    return " ".join(
        stemmer.stem(token) for token in tokens if token not in all_stopwords
    )


# Rewrite the whole column in one pass; .map does not depend on the frame
# having a 0..n-1 index the way positional df.loc[i, ...] writes do.
df['Text'] = df['Text'].map(remove_stopwords_and_stem)

# Create n-grams from 1-4

Create n-grams from 1 to 4 for experimental purposes. Pass `ngram_range=(1,1)`, `ngram_range=(1,2)`, `ngram_range=(1,3)`, or `ngram_range=(1,4)` to the vectorizer, e.g. `TfidfVectorizer(ngram_range=(1,1))`.
Default is `ngram_range=(1,1)`



---


# Perform TF-IDF on the corpus


In [79]:
X = df['Text']  # tweets
y = df['Sentiment']  # labels

# X_train / y_train: training tweets and their labels
# X_test  / y_test : held-out tweets and their labels (30% of the data)
#
# random_state pins the shuffle so every "Restart & Run All" produces the same
# partition (and therefore comparable metrics); stratify=y keeps the class
# proportions of the full dataset in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

In [80]:
# Unigram TF-IDF features. min_df=10 drops terms that occur in fewer than ten
# training documents. Widen ngram_range to (1, 2), (1, 3) or (1, 4) to add
# longer n-grams.
tfidf_options = {"ngram_range": (1, 1), "min_df": 10}
vectorizer = TfidfVectorizer(**tfidf_options)

# The vocabulary and IDF weights are learned from the training split only;
# the test split is projected onto that same vocabulary.
train_data_features = vectorizer.fit_transform(X_train)
test_data_features = vectorizer.transform(X_test)

In [59]:
# Define the oversampling strategy: randomly duplicate minority-class rows of
# the training features until the classes are balanced. random_state fixes
# which rows are duplicated so the resampled set is reproducible across runs.
oversample = RandomOverSampler(sampling_strategy='minority', random_state=42)
x_train_oversampled, y_train_oversampled = oversample.fit_resample(train_data_features, y_train)

In [None]:
# Smoothing values to search over. A wider sweep that can be swapped in:
# [1, 0.1, 0.01, 0.001, 0.0001, 0.00001]
parameters = dict(alpha=[1])

# Metrics reported for every cross-validation fold, keyed by the name that
# appears in cv_results_ (mean_test_<name>, rank_test_<name>, ...).
metric_functions = {
    'accuracy': accuracy_score,
    'f1': f1_score,
    'precision': precision_score,
    'recall': recall_score,
}
scorers = {name: make_scorer(metric) for name, metric in metric_functions.items()}

mnb = MultinomialNB()
# 10-fold cross-validated search; the final model is refit on the whole
# training set using the candidate with the best mean accuracy.
classifier = GridSearchCV(
    estimator=mnb,
    param_grid=parameters,
    scoring=scorers,
    refit='accuracy',
    cv=10,
    return_train_score=False,
)
# Trains on the original (not oversampled) features; pass x_train_oversampled
# and y_train_oversampled instead to train on the balanced set.
classifier.fit(train_data_features, y_train)

In [None]:
# One row per parameter candidate, with the cross-validated mean of each metric.
results = pd.DataFrame(classifier.cv_results_)

summary_columns = [
    'param_alpha',
    'mean_test_accuracy',
    'mean_test_f1',
    'mean_test_precision',
    'mean_test_recall',
    'rank_test_accuracy',
]
results[summary_columns]

In [None]:
# Score the refit model on the held-out split.
y_true = y_test  # ground truth
y_pred = classifier.predict(test_data_features)  # classifier predictions

accuracy, f1, precision, recall = (
    metric(y_true, y_pred)
    for metric in (accuracy_score, f1_score, precision_score, recall_score)
)

print(f"Accuracy: {accuracy} F1 Score: {f1} Precision: {precision} Recall: {recall}")

# For two classes the matrix is [[tn, fp], [fn, tp]]; ravel() flattens it
# row by row into exactly that order.
test_matrix = confusion_matrix(y_true, y_pred)
tn, fp, fn, tp = test_matrix.ravel()
print(f"\nConfusion Matrix: \nTrue Negative: {tn} False Positive: {fp} False Negative: {fn} True Positive: {tp}")

In [None]:
# Cross-validation row of the candidate that GridSearchCV refit on (best mean
# accuracy). best_index_ always identifies exactly one row, whereas filtering
# on rank_test_accuracy == 1 returns several rows when candidates tie, and
# float() on a multi-row Series raises TypeError.
best = results.loc[[classifier.best_index_]]
best_row = best.iloc[0]

# Mean cross-validated scores as percentages rounded to two decimals.
model_stats = {
  "accuracy": round(float(best_row['mean_test_accuracy'] * 100), 2),
  "precision": round(float(best_row['mean_test_precision'] * 100), 2),
  "recall": round(float(best_row['mean_test_recall'] * 100), 2),
  "f1score": round(float(best_row['mean_test_f1'] * 100), 2)
}

# Write the stats as JSON; model_stats itself stays a dict.
with open("model_stats.json", "w") as outfile:
    json.dump(model_stats, outfile)

## Confusion Matrix

In [73]:
# Confusion matrix of the fitted classifier on the training split (the same
# data it was fit on), as opposed to the held-out test split.
pred = classifier.predict(train_data_features)
train_matrix = confusion_matrix(y_train, pred)

# Flattened row by row: [[tn, fp], [fn, tp]] -> tn, fp, fn, tp.
tn, fp, fn, tp = train_matrix.ravel()
# NOTE(review): "Postive" is misspelled in the printed label; left unchanged
# so the output still matches previously recorded results.
print(f"True Negative: {tn} False Positive: {fp} False Negative: {fn} True Postive: {tp}")

True Negative: 193 False Positive: 73 False Negative: 30 True Postive: 709


In [None]:
import pickle

# Persist the fitted grid-search object and the fitted TF-IDF vectorizer; both
# are needed to score new text later. The context managers guarantee each file
# is flushed and closed even if pickling fails part-way.
# NOTE: only unpickle these files from a trusted source - pickle.load can
# execute arbitrary code.
with open('model.pkl', 'wb') as model_file:
    pickle.dump(classifier, model_file)

with open('vectorizer.pkl', 'wb') as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)