# Sentiment Analysis of Twitter Data

<img src="https://hub.packtpub.com/wp-content/uploads/2018/03/Sentiment-Analysis-Tw.png">

# Stage 1: Importing dependencies

In [6]:
import numpy as np
import math
import re
import pandas as pd
from bs4 import BeautifulSoup
from google.colab import drive
try:
    %tensorflow_version 2.x
except Exception:
    pass
import tensorflow as tf

from tensorflow.keras import layers
import tensorflow_datasets as tfds

Colab only includes TensorFlow 2.x; %tensorflow_version has no effect.


# Stage 2: Data preprocessing

## Downloading Data
The data we gonna use in this exercise is hand­curated twitter sentiment dataset published by Sander’s Lab. It contains tweets from 2007­-2011 that mention one of four major Tech companies. Sander’s Lab manually assigned labels for each tweet as either “Positive”, “Negative”, “Neutral”, or “Irrelevant”.<br>

“Positive” and “Negative” indicated whether or not the tweet showed a generally favorable or
unfavorable opinion toward the mentioned company. A “Neutral” labelling indicated that the tweet
was either purely informative or the opinion of the tweet was otherwise ambiguous. <br>An “Irrelevant” labelling indicated that the tweet could not be determined to fit into any of the other
classes. <br>This often indicated that the tweet was not written in english or that it was clearly spam.

[You can read more about the data in the following link:](http://cs229.stanford.edu/proj2013/TatumSanchez-TwitterSentimentAnalysis.pdf)

In [7]:
%%capture
!wget http://cs.stanford.edu/people/alecmgo/trainingandtestdata.zip
!unzip trainingandtestdata.zip

### Read train and test sets with Pandas

In [8]:
cols = ["sentiment", "id", "date", "query", "user", "text"]
train_data = pd.read_csv(
    "/content/training.1600000.processed.noemoticon.csv",
    header=None,
    names=cols,
    engine="python",
    encoding="latin1"
)
test_data = pd.read_csv(
   "/content/testdata.manual.2009.06.14.csv",
   header=None,
   names=cols,
   engine="python",
   encoding="latin1"
)

In [9]:
train_data.head()

Unnamed: 0,sentiment,id,date,query,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


## Preprocessing

In [10]:
data = train_data

### Cleaning

In [11]:
data.drop(["id", "date", "query", "user"],
          axis=1,
          inplace=True)

In [12]:
len(data)

1600000

### Function - clean single tweet


*   Use BeautifulSoup tools to deal with different formats of the text (lxml). And it retuen a single (text) format.
*   Use regex to get rid of unwanted subexpressions.



In [13]:
def clean_tweet(tweet):
    tweet = BeautifulSoup(tweet, "lxml").get_text()
    tweet = re.sub(r"@[A-Za-z0-9]+", ' ', tweet)
    tweet = re.sub(r"https?://[A-Za-z0-9./]+", ' ', tweet)
    tweet = re.sub(r"[^a-zA-Z.!?']", ' ', tweet)
    tweet = re.sub(r" +", ' ', tweet)
    return tweet

In [14]:
data_clean = data.text.apply(clean_tweet)  # take about 7 mins

In [20]:
data_clean.head()

0     Awww that's a bummer. You shoulda got David C...
1    is upset that he can't update his Facebook by ...
2     I dived many times for the ball. Managed to s...
3      my whole body feels itchy and like its on fire 
4     no it's not behaving at all. i'm mad. why am ...
Name: text, dtype: object

### Data labeling

In [15]:
data_labels = data.sentiment.values
data_labels[data_labels == 4] = 1

In [16]:
set(data_labels)

{0, 1}

### Tokenization
In this part we will create the Tokenizer Encoder.<br>
target_vocab_size=2**16 = ~64,000 - The number of all words in the English language

In [17]:
tokenizer = tfds.deprecated.text.SubwordTextEncoder.build_from_corpus(
    data_clean, target_vocab_size=2**16 
)  # takes about 8 - 9 mins

### Encode the tokens

In [18]:
data_inputs = data_clean.apply(tokenizer.encode) # takes about 6 mins

In [19]:
print(data_inputs[0])

[65316, 1570, 113, 65323, 10, 6, 3553, 1, 135, 5262, 50, 1484, 38165, 16, 13337, 606, 2, 49, 33, 1, 65352]


### Padding
We want all sentences to be the same length. Therefore, we will consider the longest sentence and define it as the desired length for all sentences. For every other sentence that is shorter than the longest sentence, we will pad the empty spaces with zeros.

In [21]:
MAX_LEN = max([len(sentence) for sentence in data_inputs])
data_inputs = tf.keras.preprocessing.sequence.pad_sequences(data_inputs,
                                                            value=0,
                                                            padding="post",
                                                            maxlen=MAX_LEN)

In [22]:
print(data_inputs[0])

[65316  1570   113 65323    10     6  3553     1   135  5262    50  1484
 38165    16 13337   606     2    49    33     1 65352     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0]


### Spliting into training/testing set

In [23]:
test_idx = np.random.randint(0, 800000, 8000)
test_idx = np.concatenate((test_idx, test_idx+800000))

In [24]:
test_inputs = data_inputs[test_idx]
test_labels = data_labels[test_idx]
train_inputs = np.delete(data_inputs, test_idx, axis=0)
train_labels = np.delete(data_labels, test_idx)

In [26]:
test_inputs.shape

(16000, 73)

In [27]:
train_inputs.shape

(1584100, 73)

# Stage 3: Model building

<figure>
<center>
<img src='https://i.stack.imgur.com/YREV2.png' width=50% hight=50%/>
<figcaption>Model Architecture</figcaption></center>
</figure>

In [17]:
class DCNN(tf.keras.Model):

    def __init__(self,
                 vocab_size,
                 emb_dim=128,
                 nb_filters=50,
                 FFN_units=512,
                 nb_classes=2,
                 dropout_rate=0.1,
                 training=False,
                 name="dcnn"):
        super(DCNN, self).__init__(name=name)

        self.embedding = layers.Embedding(vocab_size,
                                          emb_dim)
        self.bigram = layers.Conv1D(filters=nb_filters,
                                    kernel_size=2,
                                    padding="valid",
                                    activation="relu")
        self.pool_1 = layers.GlobalMaxPool1D()
        self.trigram = layers.Conv1D(filters=nb_filters,
                                     kernel_size=3,
                                     padding="valid",
                                     activation="relu")
        self.pool_2 = layers.GlobalMaxPool1D()
        self.fourgram = layers.Conv1D(filters=nb_filters,
                                      kernel_size=4,
                                      padding="valid",
                                      activation="relu")
        self.pool_3 = layers.GlobalMaxPool1D()
        self.dense_1 = layers.Dense(units=FFN_units, activation="relu")
        self.dropout = layers.Dropout(rate=dropout_rate)
        if nb_classes == 2:
            self.last_dense = layers.Dense(units=1,
                                           activation="sigmoid")
        else:
            self.last_dense = layers.Dense(units=nb_classes,
                                           activation="softmax")
    
    def call(self, inputs, training):
        x = self.embedding(inputs)
        x_1 = self.bigram(x)
        x_1 = self.pool_1(x_1)
        x_2 = self.trigram(x)
        x_2 = self.pool_2(x_2)
        x_3 = self.fourgram(x)
        x_3 = self.pool_3(x_3)

        merged = tf.concat([x_1, x_2, x_3], axis=-1) # shape = (batch_size, 3 * nb_filters)
        merged = self.dense_1(merged)
        merged = self.dropout(merged, training)
        output = self.last_dense(merged)

        return output

# Stage 4: Application

## Config



In [18]:
VOCAB_SIZE = tokenizer.vocab_size

EMB_DIM = 200
NB_FILTERS = 100
FFN_UNITS = 256
NB_CLASSES = len(set(train_labels))

DROPOUT_RATE = 0.2

BATCH_SIZE = 32
NB_EPOCHS = 5

## Training

In [19]:
Dcnn = DCNN(vocab_size=VOCAB_SIZE,
            emb_dim=EMB_DIM,
            nb_filters=NB_FILTERS,
            FFN_units=FFN_UNITS,
            nb_classes=NB_CLASSES,
            dropout_rate=DROPOUT_RATE)

In [20]:
if NB_CLASSES == 2:
    Dcnn.compile(loss="binary_crossentropy",
                 optimizer="adam",
                 metrics=["accuracy"])
else:
    Dcnn.compile(loss="sparse_categorical_crossentropy",
                 optimizer="adam",
                 metrics=["sparse_categorical_accuracy"])

### Save checkpoints during training
You can use a trained model without having to retrain it, or pick-up training where you left off in case the training process was interrupted. The `tf.train.CheckpointManager` callback allows you to continually save the model both *during* and at *the end* of training.

In [21]:
checkpoint_path = "ckpt/"

ckpt = tf.train.Checkpoint(Dcnn=Dcnn)

ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=1)

if ckpt_manager.latest_checkpoint:
    ckpt.restore(ckpt_manager.latest_checkpoint)
    print("Latest checkpoint restored!")

In [22]:
Dcnn.fit(train_inputs,
         train_labels,
         batch_size=BATCH_SIZE,
         epochs=NB_EPOCHS)     # 1 epoch = 15 to 18 mins
ckpt_manager.save()  

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


'ckpt/ckpt-1'

## Evaluation

In [23]:
results = Dcnn.evaluate(test_inputs, test_labels, batch_size=BATCH_SIZE)
print(results)

[0.5150845050811768, 0.8241875171661377]


### Making a single prediction

In [46]:
def get_single_prediction(given_text):
  token_and_encode = [tokenizer.encode(given_text)]
  result = Dcnn(np.array(token_and_encode), training=False).numpy()

  if (result[0][0] > 0.5) == 1:
    prediction = f'Positive (-:\nprobability of {result[0][0]}'
  else:
    prediction = f'Negative )-:\nprobability of {result[0][0]}'
  print(prediction)

### Let's try the trained model

In [47]:
get_single_prediction("i love deep learning very much")

Positive (-:
probability of 0.9955477714538574


In [65]:
get_single_prediction("the sun will shine againe tomorrow")

Positive (-:
probability of 0.8977813720703125


In [48]:
get_single_prediction("i wasn't expecting to have such a great time")

Positive (-:
probability of 0.7917581796646118


In [50]:
get_single_prediction("i hate rainy days ")

Negative )-:
probability of 0.01484485063701868


In [64]:
get_single_prediction("i wish i will have to do that again")

Negative )-:
probability of 0.12742596864700317


In [49]:
get_single_prediction("at the end of this course i have to defend my project")

Negative )-:
probability of 0.09978639334440231
