# Sentiment analysis of tweets - opinions on air carriers from February 2015

### Step 1: import of tweets data

In [2]:
import re
import numpy as np
import pandas as pd
from keras_preprocessing.sequence import pad_sequences
from keras_preprocessing.text import Tokenizer
import matplotlib.pyplot as plt
%matplotlib inline

# Limit on the size of the vocabulary dictionary built from the tweets.
# It is passed as num_words to the Tokenizer and as input_dim to the Embedding layer below.
vocab_size = 500

# Load the tweets from the CSV file in the working directory and preview the first rows.
data = pd.read_csv('Tweets.csv')
data.head()

Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,airline,text
0,5.703061e+17,neutral,1.0,Virgin America,@VirginAmerica What @dhepburn said.
1,5.703011e+17,positive,0.3486,Virgin America,@VirginAmerica plus you've added commercials t...
2,5.703011e+17,neutral,0.6837,Virgin America,@VirginAmerica I didn't today... Must mean I n...
3,5.70301e+17,negative,1.0,Virgin America,@VirginAmerica it's really aggressive to blast...
4,5.703008e+17,negative,1.0,Virgin America,@VirginAmerica and it's a really big bad thing...


### Step 2: data transformation and splitting into training and test sets

In [3]:
# Fit the tokenizer (vocabulary capped by vocab_size) on the tweet texts, then
# encode every tweet as a sequence of word indices padded to a common length.
tok = Tokenizer(num_words=vocab_size, split=' ')
tok.fit_on_texts(data['text'].values)
X = pad_sequences(tok.texts_to_sequences(data['text'].values))

# One-hot encode the sentiment column: the dummy column names are the class
# labels, and the underlying array is the target matrix.
categories = pd.get_dummies(data['airline_sentiment'])
labels = categories.columns
Y = categories.to_numpy()

#splitting into training and test sets
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.10, random_state=42
)

# Report the shape of each split (caption first, then the array's shape).
for _caption, _split in (
    ('train_features shape: ', X_train),
    ('test_features shape: ', X_test),
    ('train_labels shape: ', Y_train),
    ('test_labels shape: ', Y_test),
):
    print(_caption, _split.shape)


train_features shape:  (13174, 30)
test_features shape:  (1464, 30)
train_labels shape:  (13174, 3)
test_labels shape:  (1464, 3)


### Step 3: Definition of neural network

In [None]:
#definition of model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.optimizers import SGD, Adam
from tensorflow.keras.metrics import CategoricalAccuracy

# model = Sequential()
# exercise 1
# Add layers to the model:
# - Embedding - it should take word indices drawn from a dictionary of size vocab_size on the input and transform them into vectors of length 32
# - 1 LSTM layer with number of units equal to 10
# - Dense - the classification layer (how many outputs should it have?)

# - the learning process should use the categorical_crossentropy loss function
# - choose 'sgd' as a method for model optimization
# - model should return accuracy metric (categorical_accuracy)
# ------------------------------------------------------------------------

# exercise 2 - change the optimizer
# Change model optimization method to 'adam'.
# Compare results with those obtained with 'sgd' and explain differences.
# ------------------------------------------------------------------------

# exercise 3
# Add additional LSTM layer with number of units equal to 10.
# Perform learning process with 'adam' and 'sgd' methods.


## -- beginning of your solution

def create_model(optimizer, two_layers=False, embedding_dim=32, lstm_units=10):
    """Build and compile an Embedding -> LSTM -> Dense sentiment classifier.

    Relies on the notebook-level names ``vocab_size`` (embedding input
    dimension) and ``Y`` (one-hot targets; its second axis gives the number
    of output classes).

    Parameters
    ----------
    optimizer
        Optimizer passed straight to ``model.compile`` (e.g. ``SGD()`` or
        ``Adam()``).
    two_layers : bool, default False
        When True, an additional LSTM layer that returns full sequences is
        stacked in front of the final LSTM layer.
    embedding_dim : int, default 32
        Length of the vector each word index is embedded into.
    lstm_units : int, default 10
        Number of units in every LSTM layer.

    Returns
    -------
    The compiled ``Sequential`` model (categorical cross-entropy loss,
    categorical accuracy metric).
    """
    model = Sequential()
    # `input_length` is deliberately not passed: Keras 3 ignores it and emits a
    # deprecation warning on every call. The sequence length is taken from the
    # data the first time the model is called (e.g. in fit()).
    model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim))

    if two_layers:
        # The first LSTM must return the whole sequence so that the next LSTM
        # layer receives one vector per time step.
        model.add(LSTM(units=lstm_units, return_sequences=True))

    model.add(LSTM(units=lstm_units))
    # One softmax output per sentiment class.
    model.add(Dense(units=Y.shape[1], activation="softmax"))
    model.compile(optimizer=optimizer,
                  loss="categorical_crossentropy",
                  metrics=[CategoricalAccuracy()])
    return model

# Train every optimizer / depth combination for 5 epochs with batches of 32,
# using the test split as validation data after each epoch.
print("Training model with SGD optimizer...")
model_sgd = create_model(optimizer=SGD())
history_sgd = model_sgd.fit(X_train, Y_train, epochs=5, batch_size=32, validation_data=(X_test, Y_test))

print("Training model with Adam optimizer...")
model_adam = create_model(optimizer=Adam())
history_adam = model_adam.fit(X_train, Y_train, epochs=5, batch_size=32, validation_data=(X_test, Y_test))

print("Training model with additional LSTM layer and Adam optimizer...")
model_extra_adam = create_model(optimizer=Adam(), two_layers=True)
history_extra_adam = model_extra_adam.fit(X_train, Y_train, epochs=5, batch_size=32, validation_data=(X_test, Y_test))

print("Training model with additional LSTM layer and SGD optimizer...")
model_extra_sgd = create_model(optimizer=SGD(), two_layers=True)
history_extra_sgd = model_extra_sgd.fit(X_train, Y_train, epochs=5, batch_size=32, validation_data=(X_test, Y_test))

# model.summary() prints the table itself and returns None, so it must not be
# wrapped in print(): doing so emitted a stray "None" after every summary.
for _heading, _trained in (
    ("Model summary with SGD optimizer:", model_sgd),
    ("Model summary with Adam optimizer:", model_adam),
    ("Model summary with additional LSTM layer and Adam optimizer:", model_extra_adam),
    ("Model summary with additional LSTM layer and SGD optimizer:", model_extra_sgd),
):
    print(_heading)
    _trained.summary()

## -- end of your solution

# print(model.summary())


### Step 4: Learning process

In [15]:
# Add:
# - network learning on X_train, Y_train with parameters: batch_size = 16 and number of epochs = 5
# - accuracy checking on test data: X_test,Y_test

## -- beginning of your solution

batch_size = 16
epochs = 5


def _train_and_score(banner, optimizer, two_layers=False):
    """Announce, build, fit and evaluate one model configuration.

    Prints ``banner``, creates a fresh model via ``create_model``, fits it on
    the training split (validating on the test split) and returns a tuple of
    the fitted model, its training history, and the list returned by
    ``model.evaluate`` on the test split.
    """
    print(banner)
    net = create_model(optimizer=optimizer, two_layers=two_layers)
    log = net.fit(
        X_train,
        Y_train,
        batch_size=batch_size,
        epochs=epochs,
        validation_data=(X_test, Y_test),
    )
    return net, log, net.evaluate(X_test, Y_test, verbose=0)


# In each evaluate() result, index 1 is the compiled metric (categorical accuracy).
model_sgd, history_sgd, accuracy_sgd = _train_and_score(
    "Training model with SGD optimizer...", SGD()
)
print(f"SGD Optimizer - Test Accuracy: {accuracy_sgd[1]:.4f}")

model_adam, history_adam, accuracy_adam = _train_and_score(
    "Training model with Adam optimizer...", Adam()
)
print(f"Adam Optimizer - Test Accuracy: {accuracy_adam[1]:.4f}")

model_extra_sgd, history_extra_sgd, accuracy_extra_sgd = _train_and_score(
    "Training model with additional LSTM layer and SGD optimizer...", SGD(), two_layers=True
)
print(f"Extra LSTM Layer (SGD) - Test Accuracy: {accuracy_extra_sgd[1]:.4f}")

model_extra_adam, history_extra_adam, accuracy_extra_adam = _train_and_score(
    "Training model with additional LSTM layer and Adam optimizer...", Adam(), two_layers=True
)
print(f"Extra LSTM Layer (Adam) - Test Accuracy: {accuracy_extra_adam[1]:.4f}")

## -- end of your solution


Training model with SGD optimizer...
Epoch 1/5
[1m824/824[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 4ms/step - categorical_accuracy: 0.6218 - loss: 0.9656 - val_categorical_accuracy: 0.6045 - val_loss: 0.9327
Epoch 2/5
[1m824/824[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step - categorical_accuracy: 0.6297 - loss: 0.8977 - val_categorical_accuracy: 0.6113 - val_loss: 0.8689
Epoch 3/5
[1m824/824[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step - categorical_accuracy: 0.6393 - loss: 0.8427 - val_categorical_accuracy: 0.6325 - val_loss: 0.8393
Epoch 4/5
[1m824/824[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step - categorical_accuracy: 0.6514 - loss: 0.8200 - val_categorical_accuracy: 0.6332 - val_loss: 0.8272
Epoch 5/5
[1m824/824[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step - categorical_accuracy: 0.6537 - loss: 0.8083 - val_categorical_accuracy: 0.6434 - val_loss: 0.8121
SGD Optimizer - Test Accuracy: 0.6434


### Step 5: Sentiment prediction on example tweets

In [20]:
# prediction on example tweets
def predict(tweet, model):
    """Print the sentiment the given model predicts for a single tweet.

    The tweet is encoded with the notebook-level tokenizer ``tok`` and padded
    to the same sequence length as the training matrix ``X``. The class with
    the highest score is looked up in ``labels`` and printed together with
    its score. Nothing is returned.
    """
    encoded = tok.texts_to_sequences([tweet])
    batch = pad_sequences(encoded, maxlen=X.shape[1])
    # model.predict returns one row of scores per input; only one tweet was passed.
    class_scores = model.predict(batch)[0]
    best = np.argmax(class_scores)
    print(f'Tweet:\"{tweet}\"')
    print(f'predicted sentiment: {labels[best]}, confidence: {class_scores[best]}\n')

model = model_adam

# Example tweets, each annotated with the sentiment we expect the model to predict.
_example_tweets = (
    # expected prediction: negative
    "@united been up since 4am cheers for this delay and then cancellation of the flight",
    # expected prediction: positive
    "@united Terrific. Many thanks. Looking forward to being back on UA tomorrow. Had a great flight up to Vancouver.",
    # expected prediction: neutral
    "Dallas, Texas to Marrakesh, Morocco for only $442 roundtrip with @FlySWISS & @united.",
)
for _example in _example_tweets:
    predict(_example, model)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
Tweet:"@united been up since 4am cheers for this delay and then cancellation of the flight"
predicted sentiment: negative, confidence: 0.972137987613678

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
Tweet:"@united Terrific. Many thanks. Looking forward to being back on UA tomorrow. Had a great flight up to Vancouver."
predicted sentiment: positive, confidence: 0.9457683563232422

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
Tweet:"Dallas, Texas to Marrakesh, Morocco for only $442 roundtrip with @FlySWISS & @united."
predicted sentiment: neutral, confidence: 0.6631367206573486

