In [1]:
import pandas as pd
import nltk
nltk.download("vader_lexicon")
import re
from nltk.sentiment.vader import SentimentIntensityAnalyzer
sent_analyzer = SentimentIntensityAnalyzer()
from sklearn.metrics import accuracy_score, classification_report

In [4]:
data = pd.read_csv('0819_UkraineCombinedTweetsDeduped.csv')

In [37]:
df_cleaned = data.copy()

# define regex pattern to keep alphanumeric characters and spaces
pattern = re.compile(r'[^a-zA-Z0-9\s]+')

# loop through each row of the dataframe
for i, row in df_cleaned.iterrows():
    # extract the text from the row
    text = row['text']
    
    # remove non-alphanumeric characters and lowercase the text
    clean_text = re.sub(pattern, '', text).lower()
    
    # update the clean_text column in the dataframe
    df_cleaned.at[i, 'clean_text'] = clean_text

In [43]:
polarity_scores = []
for text in df_cleaned["clean_text"]:
    score = sent_analyzer.polarity_scores(text)["compound"]
    polarity_scores.append(score)
df_cleaned["polarity_score"] = polarity_scores

# Create a new column with the sentiment categories
sentiment_categories = []
for score in df_cleaned["polarity_score"]:
    if score >= 0.05:
        category = "positive"
    elif score <= -0.05:
        category = "negative"
    else:
        category = "neutral"
    sentiment_categories.append(category)
df_cleaned["sentiment_category"] = sentiment_categories

In [45]:
sentiments = []
for i in range(len(df_cleaned)):
    text = df_cleaned.iloc[i]["clean_text"]
    scores = sent_analyzer.polarity_scores(text)
    max_score = max(scores, key=scores.get)
    if max_score == "pos":
        sentiments.append("positive")
    elif max_score == "neg":
        sentiments.append("negative")
    else:
        sentiments.append("neutral")
df_cleaned["dominant_sentiment"] = sentiments


In [46]:
df_cleaned.head()

Unnamed: 0.1,Unnamed: 0,userid,username,acctdesc,location,following,followers,totaltweets,usercreatedts,tweetid,...,in_reply_to_screen_name,is_quote_status,quoted_status_id,quoted_status_userid,quoted_status_username,extractedts,polarity_score,sentiment_category,dominant_sentiment,clean_text
0,0,173212647,JoeMokolobetsi,Yeshua Hamashiach is THE answer | Romans 10:9-...,Afrika Borwa,219,197,4789,2010-07-31 19:09:22.000000,1560416252937617411,...,,False,0,0,,2022-08-19 08:07:26.836769,-0.6757,negative,neutral,dear vaccine advocate\n\ndo take the covid19 m...
1,1,335041409,XclusivasPuebla,Somos el periódico #ExclusivasPuebla| Investi...,"Puebla, México",1419,6402,70267,2011-07-14 02:02:24.000000,1560416256179707904,...,,False,0,0,,2022-08-19 07:51:50.523048,0.0,neutral,neutral,mundo \n\nal menos 6 muertos y 16 heridos en b...
2,2,1512400441103032323,ShelterAnimalUA,Shelter for abandoned dogs and cats. 1400 dogs...,Ukraine,782,109,1198,2022-04-08 12:02:47.000000,1560416257752666113,...,,False,0,0,,2022-08-19 05:12:06.194216,0.8225,positive,neutral,animal shelter dogs and cats we need your help...
3,3,1356632630662430722,DogandCatHelpe1,Shelter for abandoned dogs and cats. 1400 dogs...,Ukraine,5,39,690,2021-02-02 15:57:12.000000,1560416257790382081,...,,False,0,0,,2022-08-19 11:22:26.824532,0.6908,positive,neutral,welcome to our shelter\nlocated in ukraine kyi...
4,4,20297125,ElMananaOnline,Las mejores noticias de los dos Laredos y el m...,Nuevo Laredo,2269,17978,56188,2009-02-07 06:32:49.000000,1560416257937051648,...,,False,0,0,,2022-08-19 11:52:29.448634,0.0,neutral,neutral,tensin debido a que rusia sigue en pie en la p...


In [48]:
df_cleaned['dominant_sentiment'].value_counts()

neutral     47118
negative      772
positive      104
Name: dominant_sentiment, dtype: int64

In [49]:
accuracy = accuracy_score(df_cleaned['dominant_sentiment'], df_cleaned['sentiment_category'])

print("Accuracy: {}\n".format(accuracy))

# Show the classification report
print(classification_report(df_cleaned['dominant_sentiment'], df_cleaned['sentiment_category']))

Accuracy: 0.4993540859274076

              precision    recall  f1-score   support

    negative       0.02      0.35      0.04       772
     neutral       0.98      0.50      0.66     47118
    positive       0.01      0.99      0.02       104

    accuracy                           0.50     47994
   macro avg       0.34      0.61      0.24     47994
weighted avg       0.96      0.50      0.65     47994

