In [162]:
import nltk
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer
sid = SentimentIntensityAnalyzer()
import pandas as pd
import glob
import os
from collections import defaultdict
import numpy as np
from textblob import TextBlob
from sklearn.metrics import f1_score, classification_report
import re


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\nathan\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [163]:
def categorize_vader(sentiment):
    if sentiment > 0.05:
        return 1
    elif sentiment < -0.05:
        return -1
    else:
       return 0

def categorize_textblob(sentiment):
    if sentiment> 0.05:
        return 1
    elif sentiment < -0.05:
        return -1
    else:
       return 0


In [164]:
labeled_tweets = pd.read_csv("manual_labeled_data.csv")

In [165]:
labeled_tweets = labeled_tweets[labeled_tweets["label"] != np.nan]

In [166]:
labeled_tweets = labeled_tweets.dropna()

In [167]:
good_tweets = labeled_tweets[(labeled_tweets["label"] == "1") | (labeled_tweets["label"] == "0") | (labeled_tweets["label"] == "-1" )]

In [168]:
good_tweets["label"] = good_tweets["label"].apply(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  good_tweets["label"] = good_tweets["label"].apply(int)


In [169]:
good_tweets = good_tweets.set_index("tweet_id")

# Process tweets to remove @ and urls

In [170]:
def process_tweet(text):
    text = re.sub(r"(?:\@+|https?\://)\S+|", "", text)
    return text


In [171]:
good_tweets["tweet"] = good_tweets["orig_tweet"].apply(lambda x : process_tweet(x))

# run vader to find sentiment on tweets


In [172]:
good_tweets["vader"] = good_tweets["tweet"].apply(sid.polarity_scores)
good_tweets["vader"] = good_tweets["vader"].apply(lambda x : x["compound"])

In [176]:
good_tweets[(good_tweets["vader"] < -0.9) & (good_tweets["label"] == 1)].to_csv("vader_big_wrong.csv")

In [116]:
good_tweets["vader_category"] = good_tweets["vader"].apply(categorize_vader)

In [117]:
good_tweets["vader_category"].value_counts()

 1    699
-1    492
 0    398
Name: vader_category, dtype: int64

In [118]:
print(classification_report(good_tweets["label"], good_tweets["vader_category"]))

              precision    recall  f1-score   support

          -1       0.67      0.60      0.63       556
           0       0.36      0.40      0.38       353
           1       0.60      0.62      0.61       680

    accuracy                           0.56      1589
   macro avg       0.54      0.54      0.54      1589
weighted avg       0.57      0.56      0.57      1589



# run textblob to find sentiment

In [119]:
good_tweets["textblob"] = good_tweets["tweet"].apply(lambda tweet: TextBlob(tweet).sentiment.polarity)

In [120]:
good_tweets["textblob_category"] = good_tweets["textblob"].apply(categorize_textblob)

In [121]:
good_tweets["textblob_category"].value_counts()

 1    700
 0    594
-1    295
Name: textblob_category, dtype: int64

In [122]:
print(classification_report(good_tweets["label"], good_tweets["textblob_category"]))

              precision    recall  f1-score   support

          -1       0.59      0.31      0.41       556
           0       0.25      0.42      0.31       353
           1       0.53      0.54      0.53       680

    accuracy                           0.44      1589
   macro avg       0.46      0.43      0.42      1589
weighted avg       0.49      0.44      0.44      1589



# bag of words vectorization naieve bayes 


In [123]:
#from https://www.analyticsvidhya.com/blog/2022/07/sentiment-analysis-using-python/
#Loading the Dataset
data = good_tweets

#Pre-Prcoessing and Bag of Word Vectorization using Count Vectorizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.tokenize import RegexpTokenizer
token = RegexpTokenizer(r'[a-zA-Z0-9]+')

cv = CountVectorizer(stop_words='english',ngram_range = (1,1),tokenizer = token.tokenize)
#cv = TfidfVectorizer(stop_words='english',ngram_range = (1,1),tokenizer = token.tokenize)

text_counts = cv.fit_transform(data['tweet'])
#Splitting the data into trainig and testing
from sklearn.model_selection import train_test_split
import random
X_train, X_test, Y_train, Y_test = train_test_split(text_counts, data['label'], test_size=0.25, random_state=random.randint(0,100000))



In [151]:
#Training the model
from sklearn.naive_bayes import MultinomialNB
MNB = MultinomialNB()
MNB.fit(X_train, Y_train)
#Caluclating the accuracy score of the model
from sklearn import metrics
bayes_pred = MNB.predict(X_test)
accuracy_score = metrics.accuracy_score(bayes_pred, Y_test)

In [152]:
print(classification_report(Y_test, bayes_pred))

              precision    recall  f1-score   support

          -1       0.70      0.82      0.75       125
           0       0.61      0.31      0.41        98
           1       0.69      0.81      0.75       175

    accuracy                           0.69       398
   macro avg       0.67      0.64      0.64       398
weighted avg       0.68      0.69      0.66       398



In [126]:
good_tweets["bayes_category"] = MNB.predict(cv.transform(good_tweets["tweet"]))

In [127]:
#run linear regression with same tokenized data
from sklearn.linear_model import LogisticRegression
lr_model = LogisticRegression()
regression = lr_model.fit(X_train, Y_train)
lr_predications = lr_model.predict(X_test)

In [130]:
print(classification_report(Y_test, lr_predications))

              precision    recall  f1-score   support

          -1       0.72      0.70      0.71       125
           0       0.57      0.42      0.48        98
           1       0.71      0.83      0.77       175

    accuracy                           0.69       398
   macro avg       0.67      0.65      0.65       398
weighted avg       0.68      0.69      0.68       398



In [133]:
token

RegexpTokenizer(pattern='[a-zA-Z0-9]+', gaps=False, discard_empty=True, flags=re.UNICODE|re.MULTILINE|re.DOTALL)

In [136]:
from sklearn.svm import SVC
clf = SVC().fit(X_train, Y_train)
svc_pref = clf.predict(X_test)

In [137]:
print(classification_report(Y_test, svc_pref))

              precision    recall  f1-score   support

          -1       0.80      0.66      0.72       125
           0       0.55      0.28      0.37        98
           1       0.64      0.90      0.75       175

    accuracy                           0.67       398
   macro avg       0.66      0.61      0.61       398
weighted avg       0.67      0.67      0.65       398



In [156]:
from sklearn.model_selection import cross_val_score 
svc_crossval = cross_val_score(SVC(), text_counts, data["label"], cv=20, scoring='f1_weighted')
sum(svc_crossval) / len(svc_crossval)

0.6296699073230545

In [155]:
MNB_crossval = cross_val_score(MNB, text_counts, data["label"], cv=20, scoring='f1_weighted')
sum(MNB_crossval) / len(MNB_crossval)

0.6559774168362332

In [157]:
lr_crossval = cross_val_score(lr_model, text_counts, data["label"], cv=20, scoring='f1_weighted')
sum(MNB_crossval) / len(MNB_crossval)

0.6559774168362332

In [131]:
good_tweets.to_csv("tweets_with_label_and_pretrained.csv")