In [1]:
import tensorflow_text
import tensorflow_hub as hub

# Loading a transformer from Tensorflow Hub
# We are using the Universal-sentence-encoder-multilingual-large Transformer
use = hub.load("https://tfhub.dev/google/universal-sentence-encoder-multilingual-large/3")

In [2]:
import pandas as pd

# Importing the cleaned data
df = pd.read_csv('./data/proj/clean_dataset.csv')
df = df.dropna()
display(df)

Unnamed: 0.1,Unnamed: 0,tweet_id,sentiment,content
0,3,1956967789,positive,want hang friend soon
1,14,1956970860,positive,get news
2,21,1956972097,positive,"wonder I be awake 7am , write new song , plot ..."
3,41,1956977187,positive,<3 gon na twitter ;) cause amazing lol . com...
4,56,1956981427,positive,"bed ... sorta . today good , sara strep think ..."
...,...,...,...,...
37642,39974,1753904674,love,pretty lady happy mother be day ! ! ! she be...
37643,39989,1753918822,love,"snore annoying n keep sleep ( like right now ,..."
37644,39996,1753919001,love,happy mother day love
37645,39997,1753919005,love,"happy mother be day mommy there , woman man lo..."


In [3]:
df['sentiment'].unique()

array(['positive', 'negative', 'neutral', 'worry', 'happiness', 'sadness',
       'love'], dtype=object)

In [4]:
#dropping the nutral label
df = df.loc[ df['sentiment'] != 'neutral' ]
display(df)

Unnamed: 0.1,Unnamed: 0,tweet_id,sentiment,content
0,3,1956967789,positive,want hang friend soon
1,14,1956970860,positive,get news
2,21,1956972097,positive,"wonder I be awake 7am , write new song , plot ..."
3,41,1956977187,positive,<3 gon na twitter ;) cause amazing lol . com...
4,56,1956981427,positive,"bed ... sorta . today good , sara strep think ..."
...,...,...,...,...
37642,39974,1753904674,love,pretty lady happy mother be day ! ! ! she be...
37643,39989,1753918822,love,"snore annoying n keep sleep ( like right now ,..."
37644,39996,1753919001,love,happy mother day love
37645,39997,1753919005,love,"happy mother be day mommy there , woman man lo..."


In [5]:
# Replacing the labels with integers

from sklearn.preprocessing import LabelEncoder

col = 'sentiment'
le = LabelEncoder();


df[col] = le.fit_transform(df[col])
le_name_mapping = dict(zip(le.classes_, le.transform(le.classes_)))
print("Our Mapped Labels ", le_name_mapping)

df = df.dropna()
display(df)


Our Mapped Labels  {'happiness': 0, 'love': 1, 'negative': 2, 'positive': 3, 'sadness': 4, 'worry': 5}


Unnamed: 0.1,Unnamed: 0,tweet_id,sentiment,content
0,3,1956967789,3,want hang friend soon
1,14,1956970860,3,get news
2,21,1956972097,3,"wonder I be awake 7am , write new song , plot ..."
3,41,1956977187,3,<3 gon na twitter ;) cause amazing lol . com...
4,56,1956981427,3,"bed ... sorta . today good , sara strep think ..."
...,...,...,...,...
37642,39974,1753904674,1,pretty lady happy mother be day ! ! ! she be...
37643,39989,1753918822,1,"snore annoying n keep sleep ( like right now ,..."
37644,39996,1753919001,1,happy mother day love
37645,39997,1753919005,1,"happy mother be day mommy there , woman man lo..."


In [6]:
# in tensorflow, the output data must be mapped into arrays so
# one-hot encode the lables

from sklearn.preprocessing import OneHotEncoder

enc = OneHotEncoder(sparse=False)

type_one_hot = enc.fit_transform(
  df.sentiment.to_numpy().reshape(-1, 1)
)

display(type_one_hot)


array([[0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 1., 0., 0.],
       ...,
       [0., 1., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0.]])

In [11]:
# Splitting the dataset into, Train and Test sets


from sklearn.model_selection import train_test_split


train_text, test_text, y_train, y_test =\
  train_test_split(
    df.content,
    type_one_hot,
    test_size=.1,
    random_state=526
  )

In [12]:
display(y_train)

array([[0., 0., 0., 0., 0., 1.],
       [0., 1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1.],
       ...,
       [0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 0., 1.],
       [0., 1., 0., 0., 0., 0.]])

In [13]:
from tqdm import tqdm
import tensorflow as tf
import numpy as np

# Extracting features from the training text using the transformer and saving it to the new array
X_train = []

for r in tqdm(train_text):

    emb = use(r)

    train_emb = tf.reshape(emb, [-1]).numpy()
    X_train.append(train_emb)
X_train = np.array(X_train)



100%|██████████| 26055/26055 [13:27<00:00, 32.28it/s]


In [14]:

# Extracting features from the test text using the transformer and saving it to the new array

X_test = []
for r in tqdm(test_text):
    emb = use(r)
    test_emb = tf.reshape(emb, [-1]).numpy()
    X_test.append(test_emb)
X_test = np.array(X_test)


100%|██████████| 2895/2895 [01:32<00:00, 31.17it/s]


In [15]:
# Creating our NN
# with 3 Layers


import keras


model = keras.Sequential()
model.add(
  keras.layers.Dense(
    units=256,
    input_shape=(X_train.shape[1], ),
    activation='relu'
  )
)

model.add(
  keras.layers.Dropout(rate=0.2)
)

model.add(
  keras.layers.Dense(
    units=128,
    activation='relu'
  )
)


model.add(
  keras.layers.Dropout(rate=0.2)
)
model.add(keras.layers.Dense(6, activation='softmax'))
model.compile(
    loss='categorical_crossentropy',
    optimizer=keras.optimizers.Adam(0.01),
    metrics=['accuracy']
)

model.summary()


Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 256)               131328    
_________________________________________________________________
dropout (Dropout)            (None, 256)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 128)               32896     
_________________________________________________________________
dropout_1 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 6)                 774       
Total params: 164,998
Trainable params: 164,998
Non-trainable params: 0
_________________________________________________________________


In [16]:
display(X_train.shape)
display(y_train.shape)

(26055, 512)

(26055, 6)

In [18]:
# Training the module for 50 cycles

history = model.fit(
    X_train, y_train,
    epochs=50,
    batch_size=16,
    validation_split=0.1,
    verbose=1,
    shuffle=True
)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [19]:
Y_pred = model.predict_classes(X_test,batch_size = 16)



In [23]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix


df_test = pd.DataFrame({'true': y_test.tolist(), 'pred':Y_pred})
df_test['true'] = df_test['true'].apply(lambda x: np.argmax(x))
print(classification_report(df_test.true, df_test.pred))

              precision    recall  f1-score   support

           0       0.34      0.55      0.42       519
           1       0.43      0.25      0.32       378
           2       0.24      0.08      0.12       158
           3       0.24      0.20      0.22       419
           4       0.32      0.33      0.33       546
           5       0.43      0.42      0.42       875

    accuracy                           0.36      2895
   macro avg       0.33      0.31      0.30      2895
weighted avg       0.35      0.36      0.34      2895



In [None]:
print(classification_report(df_test.true, df_test.pred))