# Sentiment Analysis
Sentiment analysis of tweets. <br />
The models are trained and evaluated on the Kaggle Tweet Sentiment Extraction dataset: https://www.kaggle.com/c/tweet-sentiment-extraction

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Read the Kaggle training split and preview its first rows.
df = pd.read_csv(filepath_or_buffer = 'data/tweet-sentiment-extraction/train.csv')
df.head(n = 5)

Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative
2,088c60f138,my boss is bullying me...,bullying me,negative
3,9642c003ef,what interview! leave me alone,leave me alone,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative


In [3]:
# Only the tweet text and its sentiment label are needed from here on.
data = df[['text', 'sentiment']]
data.head(n = 5)

Unnamed: 0,text,sentiment
0,"I`d have responded, if I were going",neutral
1,Sooo SAD I will miss you here in San Diego!!!,negative
2,my boss is bullying me...,negative
3,what interview! leave me alone,negative
4,"Sons of ****, why couldn`t they put them on t...",negative


# Using the VADER SentimentIntensityAnalyzer
Let's measure how well VADER performs on the labelled tweets before using it on scraped tweets.

In [4]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [5]:
# A single analyser instance, reused for every tweet scored in the loop below.
# NOTE(review): NLTK presumably needs the `vader_lexicon` resource to be
# downloaded before this call succeeds — confirm on a fresh environment.
sentiment_analyser = SentimentIntensityAnalyzer()

In [6]:
# Score every tweet with VADER and bucket its `compound` score into one of
# the three dataset classes:
#   compound >= pos_cutoff  -> 'positive'
#   compound <= neg_cutoff  -> 'negative'
#   otherwise               -> 'neutral'
pos_cutoff, neg_cutoff = 0.05, -0.05

# Ground-truth labels, in the same row order as the predictions below.
actual = data['sentiment'].tolist()

pred = []
# Iterate the column directly instead of `iterrows()`, which builds a full
# Series per row just to read one field.
for text in data['text']:
    # str() keeps the original coercion of any non-string cell to text.
    compound = sentiment_analyser.polarity_scores(str(text))['compound']

    if compound >= pos_cutoff:
        pred.append('positive')
    elif compound <= neg_cutoff:
        pred.append('negative')
    else:
        pred.append('neutral')

In [7]:
# Convert both label lists to NumPy arrays for the scikit-learn metrics.
actual, pred = np.asarray(actual), np.asarray(pred)

In [8]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [9]:
# Rows are the true labels and columns the predicted ones, both in sorted
# label order (negative, neutral, positive).
cm = confusion_matrix(y_true = actual, y_pred = pred)
print(cm)

[[4638 1426 1717]
 [1664 5245 4209]
 [ 340  779 7463]]


In [10]:
# Fraction of tweets whose VADER label matches the dataset label.
acc = accuracy_score(y_true = actual, y_pred = pred)
print(acc)

0.6311997380008005


In [11]:
# Per-class precision / recall / F1 for the VADER baseline.
report = classification_report(y_true = actual, y_pred = pred)
print(report)

              precision    recall  f1-score   support

    negative       0.70      0.60      0.64      7781
     neutral       0.70      0.47      0.56     11118
    positive       0.56      0.87      0.68      8582

    accuracy                           0.63     27481
   macro avg       0.65      0.65      0.63     27481
weighted avg       0.66      0.63      0.62     27481



# Using Deep Learning
Let's now train a deep learning model on the same dataset and check its performance on a held-out split before using it on scraped tweets.

In [12]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows for validation. The fixed random_state makes the
# shuffle, and therefore every downstream number, reproducible across
# kernel restarts.
train, test = train_test_split(data, test_size = 0.1, random_state = 42)

In [13]:
# Preview the training split; rows are shuffled but keep their original index.
train.head()

Unnamed: 0,text,sentiment
10290,Ahh well he can only spend short amounts of t...,neutral
1307,Hey Charicee! How are u? Are you going to com...,neutral
4834,'Her`s before mine' hon. It`s one of few rule...,neutral
10727,Is youtube not working properly for anyone els...,neutral
3031,Friday!!!!!!!!!! Wooo and nothing to do. Nugg...,neutral


In [14]:
# Flatten each split into plain Python lists: tweet texts (coerced to str)
# and their sentiment labels, in matching row order.
training_sentences = [str(tweet) for tweet in train['text']]
training_labels = list(train['sentiment'])

testing_sentences = [str(tweet) for tweet in test['text']]
testing_labels = list(test['sentiment'])

In [15]:
def encode_labels(labels):
    """One-hot encode sentiment labels.

    Parameters
    ----------
    labels : iterable of str
        Each entry must be 'negative', 'neutral' or 'positive'. Any
        finite iterable is accepted (list, NumPy array, pandas Series with
        an arbitrary index).

    Returns
    -------
    numpy.ndarray
        Float array of shape (len(labels), 3). Column 0 marks 'negative',
        column 1 'neutral' and column 2 'positive'.

    Raises
    ------
    ValueError
        If a label is not one of the three known classes. Previously such
        labels were silently encoded as 'positive'.
    """
    class_index = {'negative': 0, 'neutral': 1, 'positive': 2}

    # Materialise once so positional access never goes through a pandas index.
    labels = list(labels)
    labels_final = np.zeros((len(labels), len(class_index)))

    for row, label in enumerate(labels):
        try:
            col = class_index[label]
        except (KeyError, TypeError):
            raise ValueError(
                'Unknown sentiment label at position {}: {!r}'.format(row, label)
            ) from None
        labels_final[row, col] = 1

    return labels_final

In [16]:
# One-hot targets for both splits, in the same order as the sentence lists.
training_labels_final, testing_labels_final = map(
    encode_labels, (training_labels, testing_labels)
)

In [17]:
# Text-pipeline and model hyper-parameters, consumed by the cells below.

# Passed to Tokenizer(num_words=...) and used as the Embedding input size.
vocab_size = 1000
# Width of each embedding vector (Embedding output dimension).
embedding_dim = 16
# Every tweet is padded/truncated to this many tokens (pad_sequences maxlen).
max_length = 90
# Passed to pad_sequences(truncating=...).
trunc_type = 'post'
# Passed to Tokenizer(oov_token=...).
oov_tok = "<oov>"

In [18]:
import tensorflow
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Fit the vocabulary on the training sentences only; the test sentences are
# encoded with this same fitted tokenizer.
tokenizer = Tokenizer(oov_token = oov_tok, num_words = vocab_size)
tokenizer.fit_on_texts(training_sentences)
word_index = tokenizer.word_index

# Training split: texts -> integer sequences -> fixed-length matrix.
sequences = tokenizer.texts_to_sequences(training_sentences)
padded = pad_sequences(sequences, truncating = trunc_type, maxlen = max_length)

# Test split: the same two steps.
testing_sequences = tokenizer.texts_to_sequences(testing_sentences)
testing_padded = pad_sequences(testing_sequences, truncating = trunc_type, maxlen = max_length)

In [19]:
# Embedding -> bidirectional LSTM -> small dense layer -> 3-way softmax,
# assembled one layer at a time.
model = tensorflow.keras.models.Sequential()
model.add(tensorflow.keras.layers.Embedding(vocab_size, embedding_dim, input_length = max_length))
model.add(tensorflow.keras.layers.Bidirectional(tensorflow.keras.layers.LSTM(20)))
model.add(tensorflow.keras.layers.Dense(10, activation = 'relu'))
model.add(tensorflow.keras.layers.Dense(3, activation = 'softmax'))

In [20]:
# Categorical cross-entropy matches the one-hot targets and softmax output.
model.compile(
    optimizer = 'adam',
    loss = 'categorical_crossentropy',
    metrics = ['accuracy'],
)
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 90, 16)            16000     
_________________________________________________________________
bidirectional (Bidirectional (None, 40)                5920      
_________________________________________________________________
dense (Dense)                (None, 10)                410       
_________________________________________________________________
dense_1 (Dense)              (None, 3)                 33        
Total params: 22,363
Trainable params: 22,363
Non-trainable params: 0
_________________________________________________________________


In [21]:
# Train for a fixed number of epochs, validating on the held-out split
# after each one.
num_epochs = 10
history = model.fit(
    x = padded,
    y = training_labels_final,
    epochs = num_epochs,
    validation_data = (testing_padded, testing_labels_final),
)

Train on 24732 samples, validate on 2749 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


This trained model can be used to predict the sentiments of the scraped tweets.