<a href="https://www.kaggle.com/code/florianpierre/sentiment-classification-with-glove-embeddings?scriptVersionId=95413636" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [None]:
import tensorflow as tf
import numpy as np
import pandas as pd

In [None]:
# Load the Sentiment140 CSV into a dataframe; the column names are supplied
# explicitly through `names`.
path = '../input/sentiment140/training.1600000.processed.noemoticon.csv'
df = pd.read_csv(
    path,
    names=('target', 'id', 'date', 'flag', 'username', 'tweet'),
    encoding='Latin-1',
)

In [None]:
# Preview the first ten rows of the loaded dataframe.
df.head(n=10)

In [None]:
# Remove usernames and urls

import re

# Patterns are compiled once here instead of on every call. Raw strings are
# used so that no backslash sequence is interpreted by the string literal.
_USERNAME_PATTERN = re.compile(r'@[A-Za-z0-9_]+')
_URL_PATTERN = re.compile(r'https?://\S+')


def clean_text(text):
    """Strip @mentions and http(s) URLs from a tweet.

    Args:
        text: Raw tweet text.

    Returns:
        The text with every ``@handle`` and every ``http://`` / ``https://``
        URL removed. Surrounding whitespace is left untouched.
    """
    # "_" is part of the handle pattern; without it "@some_user" would leave
    # "_user" behind in the text.
    text = _USERNAME_PATTERN.sub('', text)
    # A URL is taken to run until the next whitespace, so characters such as
    # "-", "?", "=" and "_" inside a link are removed together with it.
    text = _URL_PATTERN.sub('', text)
    return text

In [None]:
# Derive the two columns used for modelling: the cleaned tweet text and a
# binary label (a target of 0 is kept, every other target value becomes 1).
df['clean_text'] = df['tweet'].map(clean_text)
df['sentiment'] = df['target'].map(lambda target: 1 if target != 0 else target)

In [None]:
# Shuffle dataset
#
# The shuffle is seeded so that the positional train/test slices taken from
# `df` in later cells select the same rows on every "Restart & Run All".
SEED = 42

df = df.sample(frac=1, random_state=SEED)

In [None]:
# Take the first TRAIN_SIZE rows of the shuffled dataframe as training data:
# cleaned texts as inputs, binary sentiment as targets.

TRAIN_SIZE = 100000

texts = df['clean_text'].iloc[:TRAIN_SIZE].values
labels = df['sentiment'].iloc[:TRAIN_SIZE].values
print(texts.shape, labels.shape, sep='\n')

In [None]:
# Build the tokenizer and fit it on the training corpus
from tensorflow.keras.preprocessing.text import Tokenizer

VOCABULARY_SIZE = 10000

tokenizer = Tokenizer(num_words=VOCABULARY_SIZE)
tokenizer.fit_on_texts(texts)

# word_index maps each word to its integer index; its length is reported as
# the vocabulary size of the corpus.
word_index = tokenizer.word_index
corpus_vocabulary_size = len(word_index)
print(corpus_vocabulary_size)

In [None]:
# Peek at the first five (word, index) pairs of the tokenizer vocabulary.
for word, index in list(word_index.items())[:5]:
    print(word, index)

In [None]:
# Invert the vocabulary so that a token index maps back to its word.
reverse_word_index = {index: word for word, index in word_index.items()}

In [None]:
# Get GloVe embeddings
!wget 'https://nlp.stanford.edu/data/glove.6B.zip'
!unzip ./glove.6B.zip

In [None]:
# Build embeddings matrix
#
# Row i receives the GloVe vector of the word whose tokenizer index is i.
# Rows for which the GloVe file has no matching word stay all-zero.

embeddings_path = './glove.6B.100d.txt'
EMBEDDINGS_DIM = 100

embeddings_matrix = np.zeros((corpus_vocabulary_size+1,EMBEDDINGS_DIM))

# The encoding is passed explicitly so that reading does not depend on the
# platform's default locale encoding.
with open(embeddings_path, encoding='utf-8') as f:
    for line in f:
        # Split from the right: the last EMBEDDINGS_DIM fields are the vector
        # components and everything before them is the token. This keeps a
        # token intact even if it contains whitespace characters itself.
        split_line = line.rstrip().rsplit(' ', EMBEDDINGS_DIM)
        if len(split_line) != EMBEDDINGS_DIM + 1:
            # Blank or malformed line: nothing usable to store.
            continue
        word = split_line[0]
        index = word_index.get(word)
        if index is not None:
            embeddings_matrix[index] = np.array([float(x) for x in split_line[1:]])

In [None]:
# Sanity-check the matrix: its content, its shape, row 0 and one sample row.
print(embeddings_matrix)
print(embeddings_matrix.shape)
print(embeddings_matrix[0])
# The number of rows follows the vocabulary of the sampled tweets, so a fixed
# row number can fall outside the matrix; clamp it to the last valid row
# instead of risking an IndexError.
SAMPLE_ROW = min(58762, embeddings_matrix.shape[0] - 1)
print(embeddings_matrix[SAMPLE_ROW])

In [None]:
# Turn each training text into a sequence of token ids, then pad the
# sequences with maxlen=MAX_LENGTH so they can be fed to the model.

MAX_LENGTH = 20

tokens = tokenizer.texts_to_sequences(texts)
padded = tf.keras.preprocessing.sequence.pad_sequences(
    tokens,
    maxlen=MAX_LENGTH,
)
print(padded)

In [None]:
# Binary sentiment classifier, assembled layer by layer:
#   frozen embedding (initialised from embeddings_matrix, trainable=False)
#   -> bidirectional LSTM -> dropout -> dense ReLU -> single sigmoid output.
model = tf.keras.models.Sequential()
model.add(tf.keras.layers.Embedding(
    corpus_vocabulary_size+1,
    EMBEDDINGS_DIM,
    input_length=MAX_LENGTH,
    weights=[embeddings_matrix],
    trainable=False,
))
model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)))
model.add(tf.keras.layers.Dropout(0.2))
model.add(tf.keras.layers.Dense(32, activation='relu'))
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

In [None]:
# NOTE(review): clear_session() runs here, after `model` has already been
# built in the previous cell; presumably it was meant to run before the model
# is constructed - confirm.
tf.keras.backend.clear_session()

# Binary cross-entropy matches the single sigmoid output unit of the model.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [None]:
# Train on the padded training sequences, holding out a fraction of them for
# validation; the returned History object is kept in `history`.
EPOCHS = 10
VALIDATION_SPLIT = 0.2

history = model.fit(
    x=padded,
    y=labels,
    epochs=EPOCHS,
    validation_split=VALIDATION_SPLIT,
)

In [None]:
# Build test data: the TEST_SIZE rows that directly follow the training rows
# in the shuffled dataframe, so they do not overlap with the training slice.
TEST_SIZE = 10000
test_texts = df['clean_text'].iloc[TRAIN_SIZE:TRAIN_SIZE+TEST_SIZE].values
test_labels = df['sentiment'].iloc[TRAIN_SIZE:TRAIN_SIZE+TEST_SIZE].values
print(test_texts.shape, test_labels.shape, sep='\n')

# Tokenize and pad the test texts with the tokenizer fitted on the training
# corpus and the same MAX_LENGTH as the training sequences.
test_tokens = tokenizer.texts_to_sequences(test_texts)
test_padded = tf.keras.preprocessing.sequence.pad_sequences(
    test_tokens,
    maxlen=MAX_LENGTH,
)
print(test_padded)

In [None]:
# Evaluate the trained model on the held-out test sequences; the value
# returned by evaluate() is shown as the cell output.
model.evaluate(x=test_padded, y=test_labels)