In [1]:
import nltk
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer
sid = SentimentIntensityAnalyzer()
import pandas as pd
import glob
import os
from collections import defaultdict
import numpy as np
from textblob import TextBlob
from sklearn.metrics import f1_score, classification_report


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\nathan\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [2]:
def categorize_vader(sentiment):
    if sentiment['compound'] > 0.05:
        return 1
    elif sentiment['compound'] < -0.05:
        return -1
    else:
       return 0

def categorize_textblob(sentiment):
    if sentiment> 0.05:
        return 1
    elif sentiment < -0.05:
        return -1
    else:
       return 0


In [3]:
labeled_tweets = pd.read_csv("manual_labeled_data.csv")

In [4]:
labeled_tweets = labeled_tweets[labeled_tweets["label"] != np.nan]

In [5]:
labeled_tweets = labeled_tweets.dropna()

In [6]:
good_tweets = labeled_tweets[(labeled_tweets["label"] == "1") | (labeled_tweets["label"] == "0") | (labeled_tweets["label"] == "-1" )]

In [7]:
good_tweets["label"] = good_tweets["label"].apply(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  good_tweets["label"] = good_tweets["label"].apply(int)


In [8]:
good_tweets = good_tweets.set_index("tweet_id")

# run vader to find sentiment on tweets


In [9]:
good_tweets["vader"] = good_tweets["tweet"].apply(sid.polarity_scores)

In [10]:
good_tweets["vader_category"] = good_tweets["vader"].apply(categorize_vader)

In [11]:
good_tweets["vader_category"].value_counts()

 1    704
-1    488
 0    397
Name: vader_category, dtype: int64

In [12]:
print(classification_report(good_tweets["label"], good_tweets["vader_category"]))

              precision    recall  f1-score   support

          -1       0.64      0.56      0.60       556
           0       0.37      0.41      0.39       353
           1       0.59      0.61      0.60       680

    accuracy                           0.55      1589
   macro avg       0.53      0.53      0.53      1589
weighted avg       0.56      0.55      0.55      1589



# run textblob to find sentiment

In [13]:
good_tweets["textblob"] = good_tweets["tweet"].apply(lambda tweet: TextBlob(tweet).sentiment.polarity)

In [14]:
good_tweets["textblob_category"] = good_tweets["textblob"].apply(categorize_textblob)

In [15]:
good_tweets["textblob_category"].value_counts()

 1    654
 0    640
-1    295
Name: textblob_category, dtype: int64

In [16]:
print(classification_report(good_tweets["label"], good_tweets["textblob_category"]))

              precision    recall  f1-score   support

          -1       0.54      0.28      0.37       556
           0       0.24      0.44      0.31       353
           1       0.53      0.51      0.52       680

    accuracy                           0.41      1589
   macro avg       0.43      0.41      0.40      1589
weighted avg       0.47      0.41      0.42      1589



# bag of words vectorization naieve bayes 


In [17]:
#from https://www.analyticsvidhya.com/blog/2022/07/sentiment-analysis-using-python/
#Loading the Dataset
data = good_tweets

#Pre-Prcoessing and Bag of Word Vectorization using Count Vectorizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.tokenize import RegexpTokenizer
token = RegexpTokenizer(r'[a-zA-Z0-9]+')

cv = CountVectorizer(stop_words='english',ngram_range = (1,1),tokenizer = token.tokenize)
#cv = TfidfVectorizer(stop_words='english',ngram_range = (1,1),tokenizer = token.tokenize)

text_counts = cv.fit_transform(data['tweet'])
#Splitting the data into trainig and testing
from sklearn.model_selection import train_test_split
import random
X_train, X_test, Y_train, Y_test = train_test_split(text_counts, data['label'], test_size=0.25, random_state=random.randint(0,100000))
#Training the model
from sklearn.naive_bayes import MultinomialNB
MNB = MultinomialNB()
MNB.fit(X_train, Y_train)
#Caluclating the accuracy score of the model
from sklearn import metrics
predicted = MNB.predict(X_test)
accuracy_score = metrics.accuracy_score(predicted, Y_test)
print(classification_report(Y_test, predicted))

              precision    recall  f1-score   support

          -1       0.81      0.76      0.79       145
           0       0.59      0.44      0.50        89
           1       0.69      0.82      0.75       164

    accuracy                           0.71       398
   macro avg       0.70      0.67      0.68       398
weighted avg       0.71      0.71      0.71       398



In [18]:
print(classification_report(Y_test, predicted))

              precision    recall  f1-score   support

          -1       0.81      0.76      0.79       145
           0       0.59      0.44      0.50        89
           1       0.69      0.82      0.75       164

    accuracy                           0.71       398
   macro avg       0.70      0.67      0.68       398
weighted avg       0.71      0.71      0.71       398



In [19]:
good_tweets["bayes_category"] = MNB.predict(cv.transform(good_tweets["tweet"]))

In [20]:
good_tweets.to_csv("tweets_with_label_and_pretrained.csv")

In [21]:
"""#from https://www.analyticsvidhya.com/blog/2022/07/sentiment-analysis-using-python/
#Importing necessary libraries
import nltk
import pandas as pd
from textblob import Word
from nltk.corpus import stopwords
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
from keras.models import Sequential
from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from sklearn.model_selection import train_test_split 
#Loading the dataset
data = good_tweets
#Pre-Processing the text 
def cleaning(df, stop_words):
    df['tweet'] = df['tweet'].apply(lambda x: ' '.join(x.lower() for x in x.split()))
    # Replacing the digits/numbers
    df['tweet'] = df['tweet'].str.replace('d', '')
    # Removing stop words
    df['tweet'] = df['tweet'].apply(lambda x: ' '.join(x for x in x.split() if x not in stop_words))
    # Lemmatization
    df['tweet'] = df['tweet'].apply(lambda x: ' '.join([Word(x).lemmatize() for x in x.split()]))
    return df
stop_words = stopwords.words('english')
data_cleaned = cleaning(data, stop_words)
#Generating Embeddings using tokenizer
tokenizer = Tokenizer(num_words=500, split=' ') 
tokenizer.fit_on_texts(data_cleaned['tweet'].values)
X = tokenizer.texts_to_sequences(data_cleaned['tweet'].values)
X = pad_sequences(X)
#Model Building
model = Sequential()
model.add(Embedding(700, 120, input_length = X.shape[1]))
model.add(SpatialDropout1D(0.4))
model.add(LSTM(704, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(352, activation='LeakyReLU'))
model.add(Dense(3, activation='softmax'))
model.compile(loss = 'sparse_categorical_crossentropy', optimizer='adam', metrics = ['accuracy'])
print(model.summary())
#Model Training
import random
X_train1, X_test1, Y_train1, Y_test1 = train_test_split(X, data['label']+np.ones(data["label"].shape), test_size=0.25, random_state=random.randint(0,100000))
model.fit(X_train1, Y_train1, epochs = 10, batch_size=32, verbose =1)
"""

'#from https://www.analyticsvidhya.com/blog/2022/07/sentiment-analysis-using-python/\n#Importing necessary libraries\nimport nltk\nimport pandas as pd\nfrom textblob import Word\nfrom nltk.corpus import stopwords\nfrom sklearn.preprocessing import LabelEncoder\nfrom sklearn.metrics import classification_report,confusion_matrix,accuracy_score\nfrom keras.models import Sequential\nfrom keras.preprocessing.text import Tokenizer\nfrom keras_preprocessing.sequence import pad_sequences\nfrom keras.layers import Dense, Embedding, LSTM, SpatialDropout1D\nfrom sklearn.model_selection import train_test_split \n#Loading the dataset\ndata = good_tweets\n#Pre-Processing the text \ndef cleaning(df, stop_words):\n    df[\'tweet\'] = df[\'tweet\'].apply(lambda x: \' \'.join(x.lower() for x in x.split()))\n    # Replacing the digits/numbers\n    df[\'tweet\'] = df[\'tweet\'].str.replace(\'d\', \'\')\n    # Removing stop words\n    df[\'tweet\'] = df[\'tweet\'].apply(lambda x: \' \'.join(x for x in x.sp

In [22]:
token

RegexpTokenizer(pattern='[a-zA-Z0-9]+', gaps=False, discard_empty=True, flags=re.UNICODE|re.MULTILINE|re.DOTALL)

In [28]:
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt
import pandas as pd
 
# Reads 'Youtube04-Eminem.csv' file 
df = good_tweets
 
comment_words = ''
stopwords = set(STOPWORDS)
 
# iterate through the csv file
for val in df.tweet:
     
    # typecaste each val to string
    val = str(val)
 
    tokenizer = RegexpTokenizer(r'[a-zA-Z0-9]+')
    tokens = tokenizer.tokenize(val)
     
    comment_words += " ".join(tokens)+" "
print(comment_words)
wordcloud = WordCloud(width = 800, height = 800,
                background_color ='white',
                stopwords = stopwords,
                min_font_size = 10,
                font_path="D:\coding\classes\Rhino-Sentiment-Analysis-Project\Arial.ttf")
cloud = wordcloud.generate(comment_words)
 
# plot the WordCloud image                       
plt.figure(figsize = (8, 8), facecolor = None)
plt.imshow(cloud)
plt.axis("off")
plt.tight_layout(pad = 0)
 
plt.show()

dallaszoo wish mean fish mightijamie skip poach ceain game farm evidence caught camera publish peta sheldricktrust dame daphne sheldrick spent lifetime dedicate protect wildlife found matriarch pioneer milk form citysfelephants southafrica auction white auction ranch together world big concen suespurgin shock trophy hunt reveals southafrican safari rep urge endanger hunt specie go extinct disg aificial insemination expect help white avoid extinction fun fact horn animal detail tycoon john hume auction breeding project represent estimate world white population least unveiled opening bid set include white brevardzoo year care successful conception turn aificial insemination help female monicalasky oxpeckers help evade poachersalarmcalling oxpeckers significantly improve rate distance tha arvindinn themossadil mean say last isnt really nohern white secondlast isnt really year old girlim shock shock tell originalkoolala last male nohern white die yesterday specie exist least year survive p

ValueError: Only supported for TrueType fonts

In [None]:
"""#Model Testing
model.evaluate(X_test1,Y_test1)"""

'#Model Testing\nmodel.evaluate(X_test1,Y_test1)'