In [10]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import nltk
from wordcloud import WordCloud
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import WordPunctTokenizer

from googletrans import Translator
from deep_translator import GoogleTranslator
import string
import emoji
from bs4 import BeautifulSoup

from dotenv import load_dotenv
import os

import itertools
from html import unescape
import re 

import keras

from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [11]:
df = pd.read_csv('/Users/alenjose/Desktop/data/project_data/self-scraped/processed/BLM_english_tweets.csv')

In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 93976 entries, 0 to 93975
Data columns (total 20 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Unnamed: 0             93976 non-null  int64  
 1   date                   93976 non-null  object 
 2   year                   93976 non-null  int64  
 3   month                  93976 non-null  int64  
 4   tweet_text_with_sw     93946 non-null  object 
 5   tweet_text_without_sw  93866 non-null  object 
 6   tweet_id               93976 non-null  float64
 7   user_name              93976 non-null  object 
 8   display_name           93967 non-null  object 
 9   user_id                93976 non-null  float64
 10  user_description       82560 non-null  object 
 11  user_follower_count    93976 non-null  int64  
 12  user_friend_count      93976 non-null  int64  
 13  user_statuses_count    93976 non-null  int64  
 14  tweet_reply_count      93976 non-null  int64  
 15  tw

In [13]:
df = df.drop(['Unnamed: 0','tweet_id','user_description','display_name'],axis=1)
df.head(2)

Unnamed: 0,date,year,month,tweet_text_with_sw,tweet_text_without_sw,user_name,user_id,user_follower_count,user_friend_count,user_statuses_count,tweet_reply_count,tweet_retweet_count,tweet_like_count,tweet_quote_count,tweet_language,tweet_mentioned_users
0,2022-04-22 17:26:15.207532763,2022,4,sjshsjjsh who crying me oh my god sis namjoon ...,sjshsjjsh who crying me oh god sis namjoon gon...,nadealeine,1.27835e+18,16,113,43640,0,0,0,0,id,
1,2021-03-23 10:27:38.540305625,2021,3,someone needs to check the keepers betway account,someone needs check keepers betway account,ArsenalBLM,1.29348e+18,181,512,229,0,0,0,0,en,


In [14]:
df_main = df

In [15]:
# removing translated tweets, because the tweets are too complex for the model to predict
df = df[df['tweet_language']== 'en']
print(len(df))
print(' ')
print(df.year.value_counts())
print(df.month.value_counts())

41782
 
2022    20139
2021    20112
2023     1531
Name: year, dtype: int64
1     5056
8     3435
10    3398
12    3397
7     3396
5     3392
3     3389
4     3319
6     3318
9     3295
11    3292
2     3095
Name: month, dtype: int64


In [16]:
# from langdetect import detect, LangDetectException

# def remove_non_english_words(text):
#     words = text.split()
#     english_words = []
#     for word in words:
#         try:
#             if detect(word) == 'en':
#                 english_words.append(word)
#         except LangDetectException:
#             pass
#     return ' '.join(english_words)

# df['tweet_without_username'] = df['tweet_without_username'].apply(remove_non_english_words)

In [17]:
# remove nulls
df_train = df[df['tweet_text_with_sw'].notnull()]

# load model
model_path = '/Users/alenjose/Desktop/data/project_data/model_info/my_lstm_model.h5'
model = keras.models.load_model(model_path)

# compile model with optimizer
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [18]:
# preprocess tweets
vocab_size = 7000  # as used during training
oov_tok = '<OOV>'
max_length = 200   # as used during training
padding_type='post'
trunc_type='post'

tokenizer = Tokenizer(num_words = vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(df_train['tweet_text_with_sw'])

tweet_sequences = tokenizer.texts_to_sequences(df_train['tweet_text_with_sw'])
tweet_padded = pad_sequences(tweet_sequences, padding=padding_type, maxlen=max_length, truncating=trunc_type)

predictions = model.predict(tweet_padded)

sentiments = []

for pred in predictions:
    if pred > 0.5:
        sentiments.append('positive')
    elif pred < 0.5:
        sentiments.append('negative')
    else:
        sentiments.append('neutral')

df_train['sentiments'] = sentiments



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train['sentiments'] = sentiments


In [19]:
df_train[['tweet_text_with_sw','sentiments']].sentiments.value_counts()

negative    20933
positive    20842
Name: sentiments, dtype: int64

In [11]:
# save the updated dataframe to CSV file
# df.to_csv('/Users/alenjose/Desktop/data/project_data/model_info/LSTM_results', index=False)

In [20]:
for i in df_train[df_train['sentiments'] == 'positive']['tweet_text_with_sw'].iloc[:10]:
    print(i)

someone needs to check the keepers betway account
consequences imagine that except at blm antifa riots sorry protests
during the blm riots in the summer of and beyond eric adams de blasio city council members didnt say or do anything to try to control the calamity that was occurring on daily basis in fact remember ds saying that riots are the voice of the unheard
guess yours has seen some shit its in his line of work
do me favor while youre sedition hunting track down the rapists arsonists looters and murderers who committed actual crimes during the st george floyd summer of love riots blm and anti fa stole lives and ruined livelihoods please thank you
he never touching anyone in my family or their medical decisions
if nothing happens soon to stop this madness biden and his gang won allow elections they will do anything and everything to stay in power that what irs agents fbi released prisoners military aged illegals antifa blm and ms is for their private army
antifa blm will with trum

In [23]:
df_train.to_csv('/Users/alenjose/Desktop/data/project_data/model_info/LSTM_BLM_prediction')