**1. Preprocess**

---



1.1. Import libraries

In [None]:
import pandas as pd
import numpy as np
import re
import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


1.2. Read data

In [None]:
from google.colab import drive
drive.mount('/content/drive/')

df = pd.read_csv('/content/drive/MyDrive/sentiment140.csv', encoding='latin1')
df[:10]

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


Unnamed: 0,text,date,user,sentiment,query
0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",Mon Apr 06 22:19:45 PDT 2009,_TheSpecialOne_,0,NO_QUERY
1,is upset that he can't update his Facebook by ...,Mon Apr 06 22:19:49 PDT 2009,scotthamilton,0,NO_QUERY
2,@Kenichan I dived many times for the ball. Man...,Mon Apr 06 22:19:53 PDT 2009,mattycus,0,NO_QUERY
3,my whole body feels itchy and like its on fire,Mon Apr 06 22:19:57 PDT 2009,ElleCTF,0,NO_QUERY
4,"@nationwideclass no, it's not behaving at all....",Mon Apr 06 22:19:57 PDT 2009,Karoli,0,NO_QUERY
5,@Kwesidei not the whole crew,Mon Apr 06 22:20:00 PDT 2009,joy_wolf,0,NO_QUERY
6,Need a hug,Mon Apr 06 22:20:03 PDT 2009,mybirch,0,NO_QUERY
7,@LOLTrish hey long time no see! Yes.. Rains a...,Mon Apr 06 22:20:03 PDT 2009,coZZ,0,NO_QUERY
8,@Tatiana_K nope they didn't have it,Mon Apr 06 22:20:05 PDT 2009,2Hood4Hollywood,0,NO_QUERY
9,@twittera que me muera ?,Mon Apr 06 22:20:09 PDT 2009,mimismo,0,NO_QUERY


1.3. Convert labels

In [None]:
df['sentiment'] = df['sentiment'].map({0: 0, 4: 1})
df[:10]

Unnamed: 0,text,date,user,sentiment,query
0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",Mon Apr 06 22:19:45 PDT 2009,_TheSpecialOne_,0,NO_QUERY
1,is upset that he can't update his Facebook by ...,Mon Apr 06 22:19:49 PDT 2009,scotthamilton,0,NO_QUERY
2,@Kenichan I dived many times for the ball. Man...,Mon Apr 06 22:19:53 PDT 2009,mattycus,0,NO_QUERY
3,my whole body feels itchy and like its on fire,Mon Apr 06 22:19:57 PDT 2009,ElleCTF,0,NO_QUERY
4,"@nationwideclass no, it's not behaving at all....",Mon Apr 06 22:19:57 PDT 2009,Karoli,0,NO_QUERY
5,@Kwesidei not the whole crew,Mon Apr 06 22:20:00 PDT 2009,joy_wolf,0,NO_QUERY
6,Need a hug,Mon Apr 06 22:20:03 PDT 2009,mybirch,0,NO_QUERY
7,@LOLTrish hey long time no see! Yes.. Rains a...,Mon Apr 06 22:20:03 PDT 2009,coZZ,0,NO_QUERY
8,@Tatiana_K nope they didn't have it,Mon Apr 06 22:20:05 PDT 2009,2Hood4Hollywood,0,NO_QUERY
9,@twittera que me muera ?,Mon Apr 06 22:20:09 PDT 2009,mimismo,0,NO_QUERY


1.4. Replace URLs, mentions, hashtags and punctuations

In [None]:
def preprocess_text(text):
    text = re.sub(r'http\S+', 'URL', text)
    text = re.sub(r'@\w+', 'MENTION', text)
    text = re.sub(r'#\w+', 'HASHTAG', text)
    text = re.sub(r'[^\w\s]', '', text)
    return text

df['text'] = df['text'].apply(preprocess_text)
df[:10]

Unnamed: 0,text,date,user,sentiment,query
0,MENTION URL Awww thats a bummer You shoulda ...,Mon Apr 06 22:19:45 PDT 2009,_TheSpecialOne_,0,NO_QUERY
1,is upset that he cant update his Facebook by t...,Mon Apr 06 22:19:49 PDT 2009,scotthamilton,0,NO_QUERY
2,MENTION I dived many times for the ball Manage...,Mon Apr 06 22:19:53 PDT 2009,mattycus,0,NO_QUERY
3,my whole body feels itchy and like its on fire,Mon Apr 06 22:19:57 PDT 2009,ElleCTF,0,NO_QUERY
4,MENTION no its not behaving at all im mad why ...,Mon Apr 06 22:19:57 PDT 2009,Karoli,0,NO_QUERY
5,MENTION not the whole crew,Mon Apr 06 22:20:00 PDT 2009,joy_wolf,0,NO_QUERY
6,Need a hug,Mon Apr 06 22:20:03 PDT 2009,mybirch,0,NO_QUERY
7,MENTION hey long time no see Yes Rains a bit ...,Mon Apr 06 22:20:03 PDT 2009,coZZ,0,NO_QUERY
8,MENTION nope they didnt have it,Mon Apr 06 22:20:05 PDT 2009,2Hood4Hollywood,0,NO_QUERY
9,MENTION que me muera,Mon Apr 06 22:20:09 PDT 2009,mimismo,0,NO_QUERY


1.5. Tokenize

In [5]:
df['text'] = df['text'].apply(word_tokenize)
df[:10]

Unnamed: 0,text,date,user,sentiment,query
0,"[MENTION, URL, Awww, thats, a, bummer, You, sh...",Mon Apr 06 22:19:45 PDT 2009,_TheSpecialOne_,0,NO_QUERY
1,"[is, upset, that, he, cant, update, his, Faceb...",Mon Apr 06 22:19:49 PDT 2009,scotthamilton,0,NO_QUERY
2,"[MENTION, I, dived, many, times, for, the, bal...",Mon Apr 06 22:19:53 PDT 2009,mattycus,0,NO_QUERY
3,"[my, whole, body, feels, itchy, and, like, its...",Mon Apr 06 22:19:57 PDT 2009,ElleCTF,0,NO_QUERY
4,"[MENTION, no, its, not, behaving, at, all, im,...",Mon Apr 06 22:19:57 PDT 2009,Karoli,0,NO_QUERY
5,"[MENTION, not, the, whole, crew]",Mon Apr 06 22:20:00 PDT 2009,joy_wolf,0,NO_QUERY
6,"[Need, a, hug]",Mon Apr 06 22:20:03 PDT 2009,mybirch,0,NO_QUERY
7,"[MENTION, hey, long, time, no, see, Yes, Rains...",Mon Apr 06 22:20:03 PDT 2009,coZZ,0,NO_QUERY
8,"[MENTION, nope, they, didnt, have, it]",Mon Apr 06 22:20:05 PDT 2009,2Hood4Hollywood,0,NO_QUERY
9,"[MENTION, que, me, muera]",Mon Apr 06 22:20:09 PDT 2009,mimismo,0,NO_QUERY


1.6. Lemmatize

In [6]:
lemmatizer = WordNetLemmatizer()
df['text'] = df['text'].apply(lambda x: [lemmatizer.lemmatize(word) for word in x])

1.7. Display first 20 tweets

In [7]:
df.head(20)

Unnamed: 0,text,date,user,sentiment,query
0,"[MENTION, URL, Awww, thats, a, bummer, You, sh...",Mon Apr 06 22:19:45 PDT 2009,_TheSpecialOne_,0,NO_QUERY
1,"[is, upset, that, he, cant, update, his, Faceb...",Mon Apr 06 22:19:49 PDT 2009,scotthamilton,0,NO_QUERY
2,"[MENTION, I, dived, many, time, for, the, ball...",Mon Apr 06 22:19:53 PDT 2009,mattycus,0,NO_QUERY
3,"[my, whole, body, feel, itchy, and, like, it, ...",Mon Apr 06 22:19:57 PDT 2009,ElleCTF,0,NO_QUERY
4,"[MENTION, no, it, not, behaving, at, all, im, ...",Mon Apr 06 22:19:57 PDT 2009,Karoli,0,NO_QUERY
5,"[MENTION, not, the, whole, crew]",Mon Apr 06 22:20:00 PDT 2009,joy_wolf,0,NO_QUERY
6,"[Need, a, hug]",Mon Apr 06 22:20:03 PDT 2009,mybirch,0,NO_QUERY
7,"[MENTION, hey, long, time, no, see, Yes, Rains...",Mon Apr 06 22:20:03 PDT 2009,coZZ,0,NO_QUERY
8,"[MENTION, nope, they, didnt, have, it]",Mon Apr 06 22:20:05 PDT 2009,2Hood4Hollywood,0,NO_QUERY
9,"[MENTION, que, me, muera]",Mon Apr 06 22:20:09 PDT 2009,mimismo,0,NO_QUERY


1.8. Split data

In [8]:
train_data, test_data = train_test_split(df, test_size=0.2, random_state=42)
print("Training data size:", len(train_data))
print("Test data size:", len(test_data))

Training data size: 1280000
Test data size: 320000


**2. Vectorization**

---



2.1. Assign num to words

In [9]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_data['text'])

2.2. Replace words with num

In [11]:
train_sequences = tokenizer.texts_to_sequences(train_data['text'])
test_sequences = tokenizer.texts_to_sequences(test_data['text'])

2.3. Pad sequence

In [12]:
max_sequence_len = max([len(seq) for seq in train_sequences])
train_padded = pad_sequences(train_sequences, maxlen=max_sequence_len, padding='post')
test_padded = pad_sequences(test_sequences, maxlen=max_sequence_len, padding='post')

**3. Word embeding**

---



3.1. Load word2vec

In [13]:
import gensim.downloader as api
from gensim.models import Word2Vec
w2v = api.load('word2vec-google-news-300')



3.2. Create embeding matrix

In [14]:
embedding_dim = 300
vocab_size = len(tokenizer.word_index) + 1
embedding_matrix = np.zeros((vocab_size, embedding_dim))

for word, i in tokenizer.word_index.items():
    if word in w2v:
        embedding_matrix[i] = w2v[word]

**4. Create model**

---



4.1. Import libraries

In [15]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense

4.2. Define model

In [16]:
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_sequence_len, weights=[embedding_matrix], trainable=False))
model.add(SimpleRNN(64, return_sequences=False))
model.add(Dense(1, activation='linear'))

4.3. Set Optimizer and loss

In [17]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

4.4. Train model

In [19]:
batch_size = 512
epochs = 5
history = model.fit(train_padded, train_data['sentiment'], batch_size=batch_size, epochs=epochs, validation_split=0.2)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


4.5. Test model

In [20]:
loss, accuracy = model.evaluate(test_padded, test_data['sentiment'])
print(f'Test Loss: {loss}')
print(f'Test Accuracy: {accuracy}')

Test Loss: 0.6723041534423828
Test Accuracy: 0.599428117275238


4.6. Report precision, recall and f1-score

In [21]:
from sklearn.metrics import classification_report

y_pred = (model.predict(test_padded) > 0.5).astype("int32")
print(classification_report(test_data['sentiment'], y_pred))

              precision    recall  f1-score   support

           0       0.61      0.56      0.58    159494
           1       0.59      0.64      0.61    160506

    accuracy                           0.60    320000
   macro avg       0.60      0.60      0.60    320000
weighted avg       0.60      0.60      0.60    320000

