**Dataset**

This is the sentiment140 dataset. It contains 1,600,000 tweets extracted using the twitter api . The tweets have been annotated (0 = negative, 4 = positive) and they can be used to detect sentiment .

Content
It contains the following 6 fields:

*   target: the polarity of the tweet (0 = negative, 2 = neutral, 4 = positive)
*   ids: The id of the tweet ( 2087)
*   date: the date of the tweet (Sat May 16 23:58:44 UTC 2009)
*   flag: The query (lyx). If there is no query, then this value is NO_QUERY.
*   user: the user that tweeted (robotickilldozr)
*   text: the text of the tweet (Lyx is cool)

In [1]:
import json
import tensorflow as tf
import csv
import random
import numpy as np

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras import regularizers

embedding_dim = 100
max_length = 16
trunc_type='post'
padding_type='post'
oov_tok = "<OOV>"
training_size = 160000 #Your dataset size here. Experiment using smaller values (i.e. 16000), but don't forget to train on at least 160000 to see the best effects
test_portion=.1

corpus = []

!wget --no-check-certificate \
    https://storage.googleapis.com/laurencemoroney-blog.appspot.com/training_cleaned.csv \
    -O /tmp/training_cleaned.csv

--2020-08-06 21:01:55--  https://storage.googleapis.com/laurencemoroney-blog.appspot.com/training_cleaned.csv
Resolving storage.googleapis.com (storage.googleapis.com)... 172.217.218.128, 173.194.69.128, 108.177.126.128, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|172.217.218.128|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 238942690 (228M) [application/octet-stream]
Saving to: ‘/tmp/training_cleaned.csv’


2020-08-06 21:02:01 (45.5 MB/s) - ‘/tmp/training_cleaned.csv’ saved [238942690/238942690]



In [5]:
num_sentences = 0
with open("/tmp/training_cleaned.csv") as csvfile:
  reader = csv.reader(csvfile, delimiter=',')
  for row in reader:
      list_item=['','']
      list_item[0] = row[5]
      list_item[1] = row[0]
      num_sentences = num_sentences + 1
      corpus.append(list_item)

print(num_sentences)
print(len(corpus))
print(corpus[20])
corpus[:3]

1600000
4800000
["@alydesigns i was out most of the day so didn't get much done ", '0']


[["@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer.  You shoulda got David Carr of Third Day to do it. ;D",
  '0'],
 ["is upset that he can't update his Facebook by texting it... and might cry as a result  School today also. Blah!",
  '0'],
 ['@Kenichan I dived many times for the ball. Managed to save 50%  The rest go out of bounds',
  '0']]

In [6]:
sentences=[]
labels=[]
random.shuffle(corpus)
#Label 0 or 1
for x in range(training_size):
    #print(corpus[x])
    sentences.append(corpus[x][0])
    label = 1
    if corpus[x][1] == '0':
      label = 0
    labels.append(label)

tokenizer = Tokenizer() #num_words = vocab_size, oov_token=oov_tok
tokenizer.fit_on_texts(sentences)

word_index = tokenizer.word_index
vocab_size=len(word_index)

sequences = tokenizer.texts_to_sequences(sentences)
padded = pad_sequences(sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

split = int(test_portion * training_size)

test_sequences = np.asarray(padded[:split])
training_sequences = np.asarray(padded[split:training_size])
test_labels = np.asarray(labels[:split])
training_labels = np.asarray(labels[split:training_size])

In [9]:
print(set(labels))
print(vocab_size)
print(word_index['i'])

{0, 1}
134605
1


In [10]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size+1, embedding_dim, input_length=max_length),
    # YOUR CODE HERE - experiment with combining different types, such as convolutions and LSTMs
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(24, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
model.summary()
num_epochs = 10
history = model.fit(training_sequences, training_labels, 
                    epochs=num_epochs, 
                    validation_data=(test_sequences, test_labels))
plot_accuracy_loss(history)

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 16, 100)           13460600  
_________________________________________________________________
global_average_pooling1d (Gl (None, 100)               0         
_________________________________________________________________
dense (Dense)                (None, 24)                2424      
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 25        
Total params: 13,463,049
Trainable params: 13,463,049
Non-trainable params: 0
_________________________________________________________________
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


NameError: ignored

# TXT file

In [None]:
# Note this is the 100 dimension version of GloVe from Stanford
# I unzipped and hosted it on my site to make this notebook easier
!wget --no-check-certificate \
    https://storage.googleapis.com/laurencemoroney-blog.appspot.com/glove.6B.100d.txt \
    -O /tmp/glove.6B.100d.txt

In [None]:
embeddings_index = {};

with open('/tmp/glove.6B.100d.txt') as f:
    for line in f:
        values = line.split();
        word = values[0];
        coefs = np.asarray(values[1:], dtype='float32');
        embeddings_index[word] = coefs;

embeddings_matrix = np.zeros((vocab_size+1, embedding_dim));
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word);
    if embedding_vector is not None:
        embeddings_matrix[i] = embedding_vector;

print(len(embeddings_matrix))
# Expected Output
# 138859
print(len(set(training_labels)))

In [None]:
import matplotlib.image  as mpimg
import matplotlib.pyplot as plt

def plot_accuracy_loss(history):
  #-----------------------------------------------------------
  # Retrieve a list of list results on training and test data
  # sets for each training epoch
  #-----------------------------------------------------------
  acc=history.history['accuracy']
  val_acc=history.history['val_accuracy']
  loss=history.history['loss']
  val_loss=history.history['val_loss']

  epochs=range(len(acc)) # Get number of epochs

  #------------------------------------------------
  # Plot training and validation accuracy per epoch
  #------------------------------------------------
  plt.plot(epochs, acc, 'b')
  plt.plot(epochs, val_acc, 'r')
  plt.title('Training and validation accuracy')
  plt.xlabel("Epochs")
  plt.ylabel("Accuracy")
  plt.legend(["Accuracy", "Validation Accuracy"])

  plt.figure();

  #------------------------------------------------
  # Plot training and validation loss per epoch
  #------------------------------------------------
  plt.plot(epochs, loss, 'b')
  plt.plot(epochs, val_loss, 'r')
  plt.title('Training and validation loss')
  plt.xlabel("Epochs")
  plt.ylabel("Loss")
  plt.legend(["Loss", "Validation Loss"])

  plt.figure();

# Word embedding from GloVe - Transfer Learning

In [None]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size+1, embedding_dim, input_length=max_length, weights=[embeddings_matrix], trainable=False),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(24, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])