In [1]:
# Mount Google Drive inside the Colab VM so that the files the later cells read
# and write under the 'gdrive' folder are reachable as ordinary paths.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [2]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
# Apply matplotlib's 'ggplot' style sheet to the figures drawn in this notebook.
plt.style.use('ggplot')

def plot_history(history, metric='accuracy'):
    """Plot per-epoch curves for one metric (left panel) and the loss (right panel).

    Parameters
    ----------
    history : object exposing a ``history`` dict
        Maps a series name to its per-epoch list of values (for example the
        object returned by ``model.fit``).
    metric : str, default 'accuracy'
        Key of the training metric to draw. Its validation counterpart is
        looked up under ``'val_' + metric``.

    Raises
    ------
    KeyError
        If ``metric`` or ``'loss'`` is missing from ``history.history``.

    Notes
    -----
    Validation curves are drawn only when the corresponding ``val_*`` series
    is present, so a history recorded without validation data can still be
    plotted instead of failing with ``KeyError``.
    """
    logs = history.history
    train_metric = logs[metric]
    train_loss = logs['loss']
    # .get() returns None when the run recorded no validation series.
    val_metric = logs.get('val_' + metric)
    val_loss = logs.get('val_loss')
    epochs = range(1, len(train_metric) + 1)
    # Keep the short legend label 'acc' for the default metric.
    short_name = 'acc' if metric == 'accuracy' else metric

    plt.figure(figsize=(12, 5))

    # Left panel: the chosen metric.
    plt.subplot(1, 2, 1)
    plt.plot(epochs, train_metric, 'b', label='Training ' + short_name)
    if val_metric is not None:
        plt.plot(epochs, val_metric, 'r', label='Validation ' + short_name)
    plt.title('Training and validation ' + metric)
    plt.legend()

    # Right panel: the loss.
    plt.subplot(1, 2, 2)
    plt.plot(epochs, train_loss, 'b', label='Training loss')
    if val_loss is not None:
        plt.plot(epochs, val_loss, 'r', label='Validation loss')
    plt.title('Training and validation loss')
    plt.legend()

In [3]:
# Folder on the mounted Drive that holds the GloVe vectors (and, later, the saved model).
glove_dir = os.path.join(os.getcwd(),'gdrive','My Drive','Colab Notebooks','models')
glove_path = os.path.join(glove_dir,'glove.twitter.27B.100d.txt')

# Each well-formed line is: <word> followed by glove_dim float components.
glove_dim = 100

# word -> float32 vector of length glove_dim
glove_index = {}

# 'with' closes the file even if parsing raises; the encoding is given
# explicitly so decoding does not depend on the platform default.
with open(glove_path, encoding='utf-8') as glove_file:
  for line in glove_file:
    array = line.split()
    # Skip blank or malformed lines (wrong number of fields) instead of
    # crashing on array[0] or storing a vector of the wrong length.
    if len(array) != glove_dim + 1:
      continue
    glove_index[array[0]] = np.asarray(array[1:], dtype='float32')

In [4]:
# Paths of the two tweet CSV files inside the mounted Drive folder.
data_dir = os.path.join(os.getcwd(), 'gdrive', 'My Drive', 'TwitterData')
train_path = os.path.join(data_dir, 'clean_twitter_sentiment_train.csv')
test_path = os.path.join(data_dir, 'clean_twitter_sentiment_test.csv')

# Read the training split and discard every row that contains a missing value.
train = pd.read_csv(train_path).dropna()
train.head()

Unnamed: 0,label,tweet_id,text,clean_text,text2
0,negative,1467810672,is upset that he can't update his Facebook by ...,is upset that he can't update his Facebook by ...,be upset that he can't update he Facebook by t...
1,negative,1467810917,@Kenichan I dived many times for the ball. Man...,I dived many times for the ball. Managed to s...,I dive many time for the ball. Managed to save...
2,negative,1467811184,my whole body feels itchy and like its on fire,my whole body feels itchy and like its on fire,i whole body feel itchy and like it on fire
3,negative,1467811193,"@nationwideclass no, it's not behaving at all....","no, it's not behaving at all. i'm mad. why am...","no, it's not behave at all. i'm mad. why be i ..."
4,negative,1467811372,@Kwesidei not the whole crew,not the whole crew,not the whole crew


In [5]:
# Read the test split and discard every row that contains a missing value.
test = pd.read_csv(test_path).dropna()
test.head()

Unnamed: 0,label,tweet_id,text,clean_text,text2
0,positive,1957713300,@IdeasCulture Brochure looks great Yvonne.,Brochure looks great Yvonne.,Brochure look great Yvonne.
1,positive,1993603045,I THINK I NEED A BREAK...my butt is startin 2 ...,I THINK I NEED A BREAK...my butt is startin 2 ...,ITHINK I NEED A BREAK...my butt be startin 2 h...
2,negative,2257868396,I hate it when my days are turned upside down....,I hate it when my days are turned upside down....,Ihate it when i day be turn upside down.. It's...
3,negative,2195924253,i can't wait for the Beautiful video!!!!!!!! I...,i can't wait for the Beautiful video!!!!!!!! I...,ican't wait for the Beautiful video!!!!!!!! It...
4,positive,1972935776,pool again...with sushi this time,pool again...with sushi this time,pool again...with sushi this time


In [6]:
# Model input: the 'text2' column of the training frame.
X_train = train['text2']

# Binary targets: 0 for the label 'negative', 1 for any other label.
y_train = np.array(
    [0 if label == 'negative' else 1 for label in train['label'].tolist()]
)

In [7]:
# Model input: the 'text2' column of the test frame.
X_test = test['text2']

# Binary targets: 0 for the label 'negative', 1 for any other label.
y_test = np.array(
    [0 if label == 'negative' else 1 for label in test['label'].tolist()]
)

In [8]:
from keras.backend import clear_session 
#clear_session()

In [9]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.backend import clear_session

# Build the vocabulary from the training texts only; the same fitted tokenizer
# is then applied to both splits.
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(X_train)
word_index = tokenizer.word_index

print(type(word_index))

# Every sequence is brought to this fixed length.
maxlen = 280

# Texts -> integer sequences -> fixed-length arrays padded at the end ('post').
X_train = pad_sequences(
    tokenizer.texts_to_sequences(X_train), padding='post', maxlen=maxlen
)
X_test = pad_sequences(
    tokenizer.texts_to_sequences(X_test), padding='post', maxlen=maxlen
)

<class 'dict'>


In [10]:
# One 100-dimensional row per tokenizer index (plus row 0), initialised with
# random values; rows of words that have a GloVe vector are overwritten below.
word_embedding_matrix = np.random.random((len(word_index) + 1, 100))

for word, index in word_index.items():
  glove_vector = glove_index.get(word)
  # Words absent from GloVe keep their random row.
  if glove_vector is not None:
    word_embedding_matrix[index] = glove_vector

In [11]:
from tensorflow.keras.models import Sequential
from tensorflow.keras import layers
from tensorflow.keras.metrics import AUC
from tensorflow.keras.backend import clear_session

embedding_dim = 100
vocab_size = len(tokenizer.word_index) + 1

# Embedding (initialised from word_embedding_matrix, still trainable)
# -> two Conv1D/MaxPooling1D stages -> Flatten -> Dense(128) -> sigmoid unit.
model = Sequential([
    layers.Embedding(
        input_dim=vocab_size,
        output_dim=embedding_dim,
        weights=[word_embedding_matrix],
        input_length=maxlen,
        trainable=True,
    ),
    layers.Conv1D(128, 5, activation='relu', kernel_initializer="he_normal"),
    layers.MaxPooling1D(5),
    layers.Conv1D(128, 5, activation='relu', kernel_initializer="he_normal"),
    layers.MaxPooling1D(35),
    layers.Flatten(),
    layers.Dense(128, activation='relu', kernel_initializer="he_normal"),
    layers.Dense(1, activation='sigmoid'),
])
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 280, 100)          24798300  
_________________________________________________________________
conv1d (Conv1D)              (None, 276, 128)          64128     
_________________________________________________________________
max_pooling1d (MaxPooling1D) (None, 55, 128)           0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 51, 128)           82048     
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 1, 128)            0         
_________________________________________________________________
flatten (Flatten)            (None, 128)               0         
_________________________________________________________________
dense (Dense)                (None, 128)               1

In [12]:
# The model's last layer applies a sigmoid, so its outputs are probabilities,
# not logits. The AUC metric must therefore be built with from_logits=False,
# in line with the 'binary_crossentropy' loss string, which also treats
# predictions as probabilities. With from_logits=True the metric would apply a
# second sigmoid to the predictions and report a distorted AUC.
model.compile(
    optimizer='rmsprop',
    loss='binary_crossentropy',
    metrics=[AUC(from_logits=False)],
)

# Train on the padded training sequences; the returned History object holds
# the per-epoch loss and metric values.
history = model.fit(X_train, y_train, epochs=9, verbose=1, batch_size=32)

Epoch 1/9
Epoch 2/9
Epoch 3/9
Epoch 4/9
Epoch 5/9
Epoch 6/9
Epoch 7/9
Epoch 8/9
Epoch 9/9


In [13]:
# Score the trained model on the test split. The returned list holds the loss
# first, followed by the metric given to compile().
model.evaluate(X_test,y_test)



[0.42463377118110657, 0.8928125500679016]

In [14]:
# Persist the trained network in the same Drive folder as the GloVe file.
# NOTE(review): the log printed by this cell reports assets written under
# ".../final-twitter-covnet.hd5/assets", i.e. the target is created as a
# directory rather than a single file - confirm this is the intended format.
model_file = os.path.join(glove_dir,"final-twitter-covnet.hd5")
model.save(model_file)

INFO:tensorflow:Assets written to: /content/gdrive/My Drive/Colab Notebooks/models/final-twitter-covnet.hd5/assets


In [15]:
from tensorflow import keras

# Reload the model from the path it was just saved to; the cells below use the
# reloaded copy for a single-sample prediction.
loaded_model = keras.models.load_model(model_file)

In [18]:
# Shape of one padded test sequence (a 1-D vector of length maxlen).
X_test[0].shape

(280,)

In [19]:
# Take the first padded test sequence as a single sample.
temp = X_test[0]

In [21]:
# Turn the single sequence into a batch of one: shape (1, 280).
temp = np.reshape(temp, (1, 280))

In [22]:
# Confirm the sample now carries a leading batch axis.
temp.shape

(1, 280)

In [23]:
# Predict with the reloaded model on the one-sample batch. The value comes from
# the final sigmoid unit; targets were encoded as 0 for 'negative' and 1 otherwise.
loaded_model.predict(temp)

array([[0.9038533]], dtype=float32)

In [24]:
# True label of the same first test sample, for comparison with the prediction.
y_test[0]

1