<a href="https://colab.research.google.com/github/faizanurrahman/temp_data/blob/master/Sentiment_Analysis_DEEP_Learning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [0]:
#import some library
import numpy as np
import pandas as pd
np.random.seed(0)
import tensorflow as tf
from keras.models import Model
from keras.layers import Dense, Input, Dropout, LSTM, Activation
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.initializers import glorot_uniform
np.random.seed(1)


Using TensorFlow backend.


In [0]:
# read csv file
df = pd.read_csv('/content/gdrive/My Drive/wordtovec_dataset/Tweets/Tweets.csv')
df.head()

Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,570306133677760513,neutral,1.0,,,Virgin America,,cairdin,,0,@VirginAmerica What @dhepburn said.,,2015-02-24 11:35:52 -0800,,Eastern Time (US & Canada)
1,570301130888122368,positive,0.3486,,0.0,Virgin America,,jnardino,,0,@VirginAmerica plus you've added commercials t...,,2015-02-24 11:15:59 -0800,,Pacific Time (US & Canada)
2,570301083672813571,neutral,0.6837,,,Virgin America,,yvonnalynn,,0,@VirginAmerica I didn't today... Must mean I n...,,2015-02-24 11:15:48 -0800,Lets Play,Central Time (US & Canada)
3,570301031407624196,negative,1.0,Bad Flight,0.7033,Virgin America,,jnardino,,0,@VirginAmerica it's really aggressive to blast...,,2015-02-24 11:15:36 -0800,,Pacific Time (US & Canada)
4,570300817074462722,negative,1.0,Can't Tell,1.0,Virgin America,,jnardino,,0,@VirginAmerica and it's a really big bad thing...,,2015-02-24 11:14:45 -0800,,Pacific Time (US & Canada)


In [0]:
df.airline_sentiment.value_counts()

negative    9178
neutral     3099
positive    2363
Name: airline_sentiment, dtype: int64

In [0]:
# feature extract
df = df[['text', 'airline_sentiment']].copy()
df.columns = ['Tweet', 'Label']
df.head()

Unnamed: 0,Tweet,Label
0,@VirginAmerica What @dhepburn said.,neutral
1,@VirginAmerica plus you've added commercials t...,positive
2,@VirginAmerica I didn't today... Must mean I n...,neutral
3,@VirginAmerica it's really aggressive to blast...,negative
4,@VirginAmerica and it's a really big bad thing...,negative


In [0]:
def Label_encode(x):
  if x == 'neutral':
    x = 0
  elif x == 'positive':
    x = 1
  elif x == 'negative':
    x = 2
  return x
df['Tweet'] = df['Tweet'].str.replace('[^\w\s]','')
df['Label'] = df['Label'].apply(lambda x: Label_encode(x))
df.head()

Unnamed: 0,Tweet,Label
0,VirginAmerica What dhepburn said,0
1,VirginAmerica plus youve added commercials to ...,1
2,VirginAmerica I didnt today Must mean I need t...,0
3,VirginAmerica its really aggressive to blast o...,2
4,VirginAmerica and its a really big bad thing a...,2


In [0]:
df.to_csv('hotel_rev_data.csv')

In [0]:
# splitting dataset into train and validation set.
msk = np.random.rand(len(df)) < 0.8
train = df[msk]

test = df[~msk]

print('train shape: '+str(train.shape))
print('test shape: '+ str(test.shape))
X_train = np.asarray(train['Tweet'])
Y_train = np.asarray(train['Label'])
X_test = np.asarray(test['Tweet'])
Y_test = np.asarray(test['Label'])

train shape: (11716, 2)
test shape: (2924, 2)


In [0]:
#reading glove file
from pathlib import Path
glove_folder = Path('/content/gdrive/My Drive/wordtovec_dataset/glove.6B')
glove_file = glove_folder / 'glove.6B.50d.txt'

with open(glove_file, 'r') as glovefile:
  word = set()
  word_to_vec_map = {}
  for line in glovefile:
    line = line.strip().split()
    curr_word = line[0]
    word.add(curr_word)
    word_to_vec_map[curr_word] = [float(x) for x in line[1:]]
  
  i = 1
  index_to_word = {}
  word_to_index = {}
  for w in sorted(word):
    word_to_index[w] = i
    index_to_word[i] = w
    i = i + 1

In [0]:
# max input length to model.
max_len = max(map(lambda x: len(x), df.Tweet.str.split()))
print(max_len)

35


In [0]:
# convert target to one-hot encoding
def one_hot(Y, C):
  Y = np.eye(C)[Y.reshape(-1)]
  return Y

# sentance to indices
def sentance_to_index(X, word_to_index, max_len):
  m = X.shape[0]
  X_indices = np.zeros((m, max_len))
  for i in range(m):
    word = X[i].lower().split()
    j = 0
    for w in word:
      X_indices[i, j] = word_to_index.get(w,0)
      j = j + 1
  return X_indices

# checking sentance_to_index
X = np.array(['hello i am faizanur rahman'])
X_ind = sentance_to_index(X, word_to_index, 5)
X_ind

array([[176469., 185458.,  52944.,      0., 299298.]])

In [0]:
def pretrained_embedding_layer(word_to_vec_map, word_to_index):
  vocab_len = len(word_to_index) + 1
  emb_dim = len(word_to_vec_map['cucumber'])
  emb_matrix = np.zeros((vocab_len, emb_dim))
  for word, index in word_to_index.items():
    emb_matrix[index, :] = word_to_vec_map[word]
  embedding_layer = Embedding(input_dim=vocab_len, output_dim=emb_dim, trainable=False)
  embedding_layer.build((None, ))
  embedding_layer.set_weights([emb_matrix])
  return embedding_layer


In [0]:
def sentiment_model(input_shape, word_to_vec_map, word_to_index):
  
  sentence_indices = Input(shape=input_shape, dtype=np.int32)

  embedding_layer = pretrained_embedding_layer(word_to_vec_map, word_to_index)
  embeddings = embedding_layer(sentence_indices)   
  X = LSTM(128, return_sequences=True)(embeddings)
  X = Dropout(rate = 0.5)(X)
  X = LSTM(128, return_sequences=True)(X)
  X = Dropout(rate = 0.5)(X)
  X = LSTM(64, return_sequences=False)(X)
  X = Dropout(rate = 0.5)(X)
  X = Dense(activation='softmax', units=3)(X)
  X = Activation('softmax')(X)

  # Create Model instance which converts sentence_indices into X.
  model = Model(inputs=sentence_indices, outputs=X)


  return model

#creating sentment_model instance
model = sentiment_model((max_len,), word_to_vec_map, word_to_index)
model.summary()

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 35)                0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 35, 50)            20000100  
_________________________________________________________________
lstm_1 (LSTM)                (None, 35, 128)           91648     
_________________________________________________________________
dropout_1 (Dropout)          (None, 35, 128)           0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 35, 128)           131584    
_________________________________________________________________
dropout_2 (Dropout)  

In [0]:
model.compile(loss='categorical_crossentropy', optimizer='RMSprop', metrics=['accuracy'])
X_train_indices = sentance_to_index(X_train, word_to_index, max_len)
Y_train_oh = one_hot(Y_train, C = 3)


In [0]:

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
X_train_indices = sentance_to_index(X_train, word_to_index, max_len)
Y_train_oh = one_hot(Y_train, C = 3)
model.fit(X_train_indices, Y_train_oh, epochs = 50, batch_size = 64, shuffle=True)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x7f9169be55c0>

In [0]:
model.save_weights("Hotel_reviews_weights.h5")
# load weights from file (can call without model.fit)
#model.load_weights("/content/gdrive/My Drive/twitter_sentiment_weights.h5")
X_test_indices = sentance_to_index(X_test, word_to_index, max_len = max_len)
Y_test_oh = one_hot(Y_test, C = 3)
loss, acc = model.evaluate(X_test_indices, Y_test_oh)
print()
print("Test accuracy = ", acc)


Test accuracy =  0.7633378933783921


In [0]:
!pip install emoji
import emoji
emoji_dictionary = {"1": ":smile:",    # :heart
                    "0": ":thumbsup:",
                    "2": ":disappointed:",
                    }

def label_to_emoji(label):
    return emoji.emojize(emoji_dictionary.get(str(label), emoji_dictionary['2']), use_aliases=True)
              



In [0]:
# test on custom sentence.
x_test = np.array(['i adore you'])
X_test_indices = sentance_to_index(x_test, word_to_index, max_len)
print(x_test[0] +' '+  label_to_emoji(np.argmax(model.predict(X_test_indices))))

i adore you 😄
