<a href="https://colab.research.google.com/github/guna-20/twitter/blob/main/twitter_sentiment_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
import re
from bs4 import BeautifulSoup
from google.colab import drive
from tensorflow.keras.models import Sequential
import pandas as pd


In [None]:
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

In [6]:
drive.mount("/content/drive")

Mounted at /content/drive


In [None]:
cols=['sentiment','id','date','query','user','text']
df=pd.read_csv("/content/drive/MyDrive/Projects/Twitter_sentiment_analysis/training.1600000.processed.noemoticon.csv",
               engine='python',
               header=None,
               names=cols,
               encoding="Latin1")
#   Latin1 encoding because it accept any possible byte as input, convert into unicode character

df.drop(['id','date','query','user'],axis=1,inplace=True)
df.tail()

Unnamed: 0,sentiment,text
1599995,4,Just woke up. Having no school is the best fee...
1599996,4,TheWDB.com - Very cool to hear old Walt interv...
1599997,4,Are you ready for your MoJo Makeover? Ask me f...
1599998,4,Happy 38th Birthday to my boo of alll time!!! ...
1599999,4,happy #charitytuesday @theNSPCC @SparksCharity...


In [None]:
def clean_data(tweet):
  tweet = BeautifulSoup(tweet,"lxml").get_text()
  #  "<html><head><title>test<body><h1>page title</h3>" ----> test page title
  
  tweet = re.sub(r"@[a-zA-Z0-9]+",'',tweet)
  #  "@hello i am good" -----> "i am good"
  
  tweet = re.sub(r"https?://[a-zA-Z0-9./]+",'',tweet)
  #  "https://zindi.africa/competitions" -----> ""
  
  tweet = re.sub(r"[^a-zA-Z.!?']+"," ",tweet)
  #   "*hello i am good 900" -----> hollo i am good
  
  tweet = re.sub(r" +"," ",tweet)
  #   double spaces are removed
  return tweet

In [None]:
cleaned_data=[clean_data(tweet) for tweet in df.text]

In [None]:
df_labels=df.sentiment.values
df_labels[df_labels==4]=1

In [None]:
cleaned_data[0:2]

[" Awww that's a bummer. You shoulda got David Carr of Third Day to do it. D",
 "is upset that he can't update his Facebook by texting it... and might cry as a result School today also. Blah!"]

In [None]:

vocab_size = 2000000
embedding_dim = 16
max_length = 200
trunc_type='post'
padding_type='post'
oov_tok = "<OOV>"

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(cleaned_data, df_labels, test_size=0.30, random_state=42)

In [None]:
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
# num_words is no. of words it should take into account. and num_words should ve as much as possible (without stop words)

tokenizer.fit_on_texts(cleaned_data)
word_index = tokenizer.word_index
#  "The cat sat on the mat."  -----> {'cat': 2, 'mat': 5, 'on': 4, 'sat': 3, 'the': 1}
# lesser the word_index value, it has occured frequently in the context

training_sequences = tokenizer.texts_to_sequences(X_train)
# values are assigned in place of words based on the word index
training_padded = pad_sequences(training_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)
# trunc removing extra words
# adding "0"s to sequence

testing_sequences = tokenizer.texts_to_sequences(X_test)
testing_padded = pad_sequences(testing_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

In [None]:
#We store this tokenizer in a file to use later in web app
import pickle
# saving
with open('/content/drive/MyDrive/Projects/Twitter sentiment analysis/tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPool1D, Dense, Dropout

In [None]:
VOCAB_SIZE =vocab_size
EMB_DIM = 200
nb_filters  = 50  
FFN_units = 200
NB_CLASSES = 2
dropout_rate = 0.2
BATCH_SIZE = 32
NB_EPOCHS = 2

In [None]:
model1 = Sequential()
#  it does not allow you to create models that share layers or have multiple inputs or outputs

model1.add(tf.keras.layers.Embedding( vocab_size, embedding_dim, input_length=max_length))
# Word embeddings provide a dense representation of words and their relative meanings.
# input_dim: This is the size of the vocabulary in the text data.
# embeding_dim : a vector space of 16 dimensions
# input_length : length of each input document 

model1.add(Conv1D(filters=nb_filters,
                                kernel_size=2,
                                padding = 'valid',
                                activation = "relu"))
# "valid" applies padding to the input sequence so the output size shrinks by filter_size - 1. No padding occurs.
# relu because return positive values and negative values as zero.
model1.add(Conv1D(filters = nb_filters,
                                 kernel_size = 3,
                                 padding = "valid",
                                 activation = "relu"))
model1.add(Conv1D(filters = nb_filters,
                                 kernel_size = 4,
                                 padding = "valid",
                                activation = 'relu'))
model1.add(GlobalMaxPool1D())
# Downsamples the input representation by taking the maximum value over the time dimension.
# shape = input_length,input_channel

model1.add(Dense(units = FFN_units,activation = "relu"))
model1.add(Dropout(rate = dropout_rate))
# dropout layer is to prevent overfitting.

model1.add(Dense(1, activation='sigmoid'))
# "sigmoid" because we want output ranging between 0 and 1


In [None]:
model1.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 200, 16)           32000000  
_________________________________________________________________
conv1d (Conv1D)              (None, 199, 50)           1650      
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 197, 50)           7550      
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 194, 50)           10050     
_________________________________________________________________
global_max_pooling1d (Global (None, 50)                0         
_________________________________________________________________
dense (Dense)                (None, 200)               10200     
_________________________________________________________________
dropout (Dropout)            (None, 200)               0

In [None]:
model1.compile(loss = "binary_crossentropy",
               optimizer = 'adam',
               metrics = ['accuracy'])

# binary_crossentropy because its a binary__classificartion problem (  -(1/N) sum(y * log(yhat) + (1 -y) * log(1 - yhat)) )
# optimizer deals with learning rate and when should update weight.

In [None]:

checkpoint_path = "/content/drive/MyDrive/Projects/Twitter sentiment analysis/checkpoint/"

ckpt = tf.train.Checkpoint(model1)

ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=1)

if ckpt_manager.latest_checkpoint:
    ckpt.restore(ckpt_manager.latest_checkpoint)
    print("Latest checkpoint restored!!")

In [None]:
model1.fit(training_padded,y_train,validation_data= (testing_padded,y_test),batch_size = 2**8, epochs = NB_EPOCHS, verbose =1) 
ckpt_manager.save()

Epoch 1/2
Epoch 2/2


'/content/drive/MyDrive/Projects/Twitter sentiment analysis/checkpoint/ckpt-1'

In [None]:
predict_1 = model1.predict(testing_padded,batch_size = 2**8,verbose=1)



In [None]:
ori = lambda x:0 if x<0.5 else 1

In [None]:
predict_1_y = list(map(ori,predict_1))

In [None]:
from sklearn.metrics import classification_report
print(classification_report(y_test,predict_1_y))

              precision    recall  f1-score   support

           0       0.81      0.83      0.82    239361
           1       0.83      0.81      0.82    240639

    accuracy                           0.82    480000
   macro avg       0.82      0.82      0.82    480000
weighted avg       0.82      0.82      0.82    480000



In [None]:
sen = ["i love him"]
tokenizer=tfds.deprecated.text.Tokenizer()
input_data1 = [tokenizer.tokenize(i) for i in sen]

In [None]:
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
sen = tokenizer.texts_to_sequences(input_data1)
#sen= pad_sequences(sen, maxlen=max_length, padding=padding_type, truncating=trunc_type)

In [None]:
vocab_size

1120000

In [None]:
reviews = ['they made me cry', 'I hate spaghetti',
           "he couldn't make it", 
                'Everything was good',
                'he is a theif', 
                'Everything was green', 
                'the host seated us immediately',
                'they gave us free chocolate cake', 
                'not sure about the wilted flowers on the table',
                'only works when I stand on tippy toes', 
              "everyone was happy "]

ori = lambda x:0 if x<0.5 else 1
# Create the sequences
padding_type='post'
sample_data=[clean_data(tweet) for tweet in reviews]
#sample_stop = [stop_remove(i) for i in sample_data]
sample_sequences = tokenizer.texts_to_sequences(sample_data)
reviews_padded = pad_sequences(sample_sequences, padding=padding_type, maxlen=max_length)           
classes = model1.predict(reviews_padded)
classes_y=list( map(ori,classes))
for i in range(len(reviews)):
  print(reviews[i],classes_y[i])


they made me cry 0
I hate spaghetti 0
he couldn't make it 0
Everything was good 1
he is a theif 1
Everything was green 1
the host seated us immediately 1
they gave us free chocolate cake 1
not sure about the wilted flowers on the table 1
only works when I stand on tippy toes 0
everyone was happy  1


In [None]:
model1.save("/content/drive/MyDrive/Projects/Twitter sentiment analysis/sentient_model1.h5")

**For web app**

In [2]:
!pip install streamlit

Collecting streamlit
[?25l  Downloading https://files.pythonhosted.org/packages/b7/98/4725661dc5719c05ba7e3f9744407ce91e2d982cb6c9601de2bbb62e2dd0/streamlit-0.81.0-py2.py3-none-any.whl (8.2MB)
[K     |████████████████████████████████| 8.2MB 17.2MB/s 
Collecting gitpython
[?25l  Downloading https://files.pythonhosted.org/packages/a6/99/98019716955ba243657daedd1de8f3a88ca1f5b75057c38e959db22fb87b/GitPython-3.1.14-py3-none-any.whl (159kB)
[K     |████████████████████████████████| 163kB 45.8MB/s 
Collecting pydeck>=0.1.dev5
[?25l  Downloading https://files.pythonhosted.org/packages/d6/bc/f0e44828e4290367c869591d50d3671a4d0ee94926da6cb734b7b200308c/pydeck-0.6.2-py2.py3-none-any.whl (4.2MB)
[K     |████████████████████████████████| 4.2MB 36.5MB/s 
Collecting watchdog; platform_system != "Darwin"
[?25l  Downloading https://files.pythonhosted.org/packages/d2/b2/b4ebe23174fd00ec94ac3f58ebf85f1090c49858feab1ca62ed7ea4d2f2f/watchdog-2.0.3-py3-none-manylinux2014_x86_64.whl (74kB)
[K     |█

In [22]:
import pickle
from tensorflow.keras.models import load_model
import re
from tensorflow.keras.preprocessing.sequence import pad_sequences
from bs4 import BeautifulSoup
import streamlit as st

In [2]:
def clean_data(tweet):
  tweet = BeautifulSoup(tweet,"lxml").get_text()
  tweet = re.sub(r"@[a-zA-Z0-9]+",'',tweet)
  tweet = re.sub(r"https?://[a-zA-Z0-9./]+",'',tweet)
  tweet = re.sub(r"[^a-zA-Z.!?']+"," ",tweet)
  tweet = re.sub(r" +"," ",tweet)

  return tweet

In [7]:
with open("/content/drive/MyDrive/Projects/Twitter_sentiment_analysis/tokenizer.pickle", "rb") as handle:
 tokenizer = pickle.load(handle)
model=load_model('/content/drive/MyDrive/Projects/Twitter_sentiment_analysis/sentient_model1.h5')

In [8]:
reviews = ['they made me cry', 'I hate spaghetti',
           "he couldn't make it", 
                'Everything was good',
                'he is a theif', 
                'Everything was green', 
                'the host seated us immediately',
                'they gave us free chocolate cake', 
                'not sure about the wilted flowers on the table',
                'only works when I stand on tippy toes', 
              "everyone was happy "]

ori = lambda x:1 if x >0.6 else 0
# Create the sequences
padding_type='post'
sample_data=[clean_data(tweet) for tweet in reviews]
#sample_stop = [stop_remove(i) for i in sample_data]
sample_sequences = tokenizer.texts_to_sequences(sample_data)
reviews_padded = pad_sequences(sample_sequences, padding=padding_type, maxlen=200)           
classes = model.predict(reviews_padded)
classes_y=list(map(ori,classes))
for i in range(len(reviews)):
  print(reviews[i],classes_y[i])






they made me cry 0
I hate spaghetti 0
he couldn't make it 0
Everything was good 1
he is a theif 1
Everything was green 1
the host seated us immediately 1
they gave us free chocolate cake 1
not sure about the wilted flowers on the table 1
only works when I stand on tippy toes 0
everyone was happy  1


In [9]:
def predict(message):
 model=load_model('/content/drive/MyDrive/Projects/Twitter_sentiment_analysis/sentient_model1.h5')
 with open("/content/drive/MyDrive/Projects/Twitter_sentiment_analysis/tokenizer.pickle", "rb") as handle:
  tokenizer = pickle.load(handle)
 x_1 = tokenizer.texts_to_sequences([message])
 x_1 = pad_sequences(x_1, maxlen=500)
 predictions = model.predict(x_1)[0][0]
 ori = 1 if predictions >0.6 else 0
 return (predictions,ori)

In [10]:
predict("i have to much of workload ")





(0.22874278, 0)

In [13]:
st.title("Movie Review Sentiment Analyzer")
message = st.text_area("Enter Review",'Type Here ..')
if st.button('Analyze'):
  with st.spinner('Analyzing the text …'):
    prediction=predict(message)
    if prediction > 0.6:
      st.success('Positive review with {:.2f} confidence'.format(prediction))
      st.balloons()
    elif prediction <0.4:
      st.error('Negative review with {:.2f} confidence'.format(1-prediction))
    else:
      st.warning('Not sure! Try to add some more words')

  command:

    streamlit run /usr/local/lib/python3.7/dist-packages/ipykernel_launcher.py [ARGUMENTS]


In [29]:
%%writefile app.py
import pickle
from tensorflow.keras.models import load_model
import re
from tensorflow.keras.preprocessing.sequence import pad_sequences
import streamlit as st

def predict(message):
 model=load_model('/content/drive/MyDrive/Projects/Twitter_sentiment_analysis/sentient_model1.h5')
 with open("/content/drive/MyDrive/Projects/Twitter_sentiment_analysis/tokenizer.pickle", "rb") as handle:
  tokenizer = pickle.load(handle)
 x_1 = tokenizer.texts_to_sequences([message])
 x_1 = pad_sequences(x_1, maxlen=500)
 predictions = model.predict(x_1)[0][0]
 ori = 1 if predictions >0.6 else 0
 return (predictions,ori)
st.title("Movie Review Sentiment Analyzer")
message = st.text_area("Enter Review",'Type Here ..')
if st.button('Analyze'):
  with st.spinner('Analyzing the text …'):
    prediction=predict(message)
    if prediction > 0.6:
      st.success('Positive review with {:.2f} confidence'.format(prediction))
      st.balloons()
    elif prediction <0.4:
      st.error('Negative review with {:.2f} confidence'.format(1-prediction))
    else:
      st.warning('Not sure! Try to add some more words')


Overwriting app.py


In [30]:
run app.py 

In [31]:
%%writefile requirements.txt
pickle
tensorflow
streamlit

Writing requirements.txt
