In [10]:
import pandas as pd
import numpy as np

In [82]:
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras import losses

### Dataset reading and annotations

In [37]:
dataset = pd.read_csv('Bhaav-Dataset.csv')

In [38]:
dataset

Unnamed: 0,Sentences,Annotation
0,रमजान के पूरे तीस रोजों के बाद ईद आयी है,1
1,"कितना मनोहर, कितना सुहावना प्रभाव है",1
2,"वृक्षों पर अजीब हरियाली है, खेतों में कुछ अजीब...",1
3,"आज का सूर्य देखो, कितना प्यारा, कितना शीतल है,...",1
4,गाँव में कितनी हलचल है,1
...,...,...
20299,फिर यहाँ सर्दी हो जाती है,4
20300,दिन-भर मैं यह देखती रहती हूँ कि धूप का टुकड़ा क...,4
20301,"पार्क का कोई ऐसा कोना नहीं, जहाँ मैं घड़ी-आधा घ...",4
20302,लेकिन यह बेंच मुझे सबसे अच्छी लगती है,1


In [39]:
dataset['Annotation'].value_counts()

4    11697
2     3168
1     2463
3     1512
0     1464
Name: Annotation, dtype: int64

In [40]:
dataset['Emotion'] = np.where(
    dataset['Annotation'] == 0, 'Anger', np.where(
        dataset['Annotation'] == 1, 'Joy', np.where(
            dataset['Annotation'] == 2, 'Sad', np.where(
                dataset['Annotation'] == 3, 'Suspense', 'Neutral'))))

In [41]:
dataset['Emotion'].value_counts()

Neutral     11697
Sad          3168
Joy          2463
Suspense     1512
Anger        1464
Name: Emotion, dtype: int64

### Convert to TF Dataset and Train/Test Split

In [47]:
full_dataset = (
    tf.data.Dataset.from_tensor_slices(
        (
            tf.cast(dataset['Sentences'].values, tf.string),
            tf.cast(dataset['Annotation'].values, tf.int32)
        )
    )
)

In [52]:
train_size = int(dataset.shape[0]*0.8)

In [54]:
train_dataset = full_dataset.take(train_size)
test_dataset = full_dataset.skip(train_size)

In [89]:
batch_size = 16

In [90]:
train_dataset = train_dataset.batch(batch_size)
test_dataset = test_dataset.batch(batch_size)

### Preprocessing
* Tokenisation
* Punctuation Removal
* Embed words to ints

In [83]:
max_features = 10000
sequence_length = 100

vectorize_layer = layers.TextVectorization(
    standardize="strip_punctuation",
    max_tokens=max_features,
    output_mode='int',
    output_sequence_length=sequence_length)

In [84]:
train_text = train_dataset.map(lambda x, y: x)

In [87]:
vectorize_layer.adapt(train_text)

In [88]:
def vectorize_text(text, label):
    text = tf.expand_dims(text, -1)
    return vectorize_layer(text), label

In [113]:
text_batch, label_batch = next(iter(train_dataset))
first_review, first_label = text_batch[0], label_batch[0]
print("Review", first_review)
print("Label", first_label)
print("Vectorized review", vectorize_text(first_review, first_label))

Review tf.Tensor(b'\xe0\xa4\xb0\xe0\xa4\xae\xe0\xa4\x9c\xe0\xa4\xbe\xe0\xa4\xa8 \xe0\xa4\x95\xe0\xa5\x87 \xe0\xa4\xaa\xe0\xa5\x82\xe0\xa4\xb0\xe0\xa5\x87 \xe0\xa4\xa4\xe0\xa5\x80\xe0\xa4\xb8 \xe0\xa4\xb0\xe0\xa5\x8b\xe0\xa4\x9c\xe0\xa5\x8b\xe0\xa4\x82 \xe0\xa4\x95\xe0\xa5\x87 \xe0\xa4\xac\xe0\xa4\xbe\xe0\xa4\xa6 \xe0\xa4\x88\xe0\xa4\xa6 \xe0\xa4\x86\xe0\xa4\xaf\xe0\xa5\x80 \xe0\xa4\xb9\xe0\xa5\x88', shape=(), dtype=string)
Label tf.Tensor(1, shape=(), dtype=int32)
Vectorized review (<tf.Tensor: shape=(1, 100), dtype=int64, numpy=
array([[   1,    2,  839, 2833,    1,    2,   78, 2223,  320,    7,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0, 

In [99]:
train_ds = train_dataset.map(vectorize_text)
val_ds = test_dataset.map(vectorize_text)

In [100]:
embedding_dim = 64

### Model definition

In [129]:
model = tf.keras.Sequential([
  layers.Embedding(max_features + 1, embedding_dim),
  layers.Dropout(0.2),
  layers.GlobalAveragePooling1D(),
  layers.Dropout(0.2),
  layers.Dense(5, activation='softmax')])

model.summary()

Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_5 (Embedding)     (None, None, 64)          640064    
                                                                 
 dropout_10 (Dropout)        (None, None, 64)          0         
                                                                 
 global_average_pooling1d_5   (None, 64)               0         
 (GlobalAveragePooling1D)                                        
                                                                 
 dropout_11 (Dropout)        (None, 64)                0         
                                                                 
 dense_5 (Dense)             (None, 5)                 325       
                                                                 
Total params: 640,389
Trainable params: 640,389
Non-trainable params: 0
________________________________________________

In [130]:
model.compile(loss=losses.SparseCategoricalCrossentropy(),
              optimizer='adam',
              metrics=tf.metrics.SparseCategoricalAccuracy())

In [132]:
epochs = 10
history = model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=epochs)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [136]:
val_softmax = model.predict(val_ds)

In [137]:
val_labels = np.argmax(val_softmax, axis=1)

In [141]:
val_pd = dataset[train_size:]

In [142]:
val_pd['pred_labels'] = val_labels

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  val_pd['pred_labels'] = val_labels


In [143]:
val_pd

Unnamed: 0,Sentences,Annotation,Emotion,pred_labels
16243,हमारे पास एक मोटा ताजा ऊंट है और हम भूखे मर रह...,2,Sad,4
16244,"चीते ने ठंडी सांस भरी, 'क्या करें",4,Neutral,4
16245,शेर ने उसे अभयदान जो दे रखा है,4,Neutral,4
16246,देखो तो ऊंट की पीठ का कूबड़ कितना बड़ा हो गया है,4,Neutral,4
16247,चर्बी ही चर्बी भरी है इसमें,4,Neutral,4
...,...,...,...,...
20299,फिर यहाँ सर्दी हो जाती है,4,Neutral,4
20300,दिन-भर मैं यह देखती रहती हूँ कि धूप का टुकड़ा क...,4,Neutral,4
20301,"पार्क का कोई ऐसा कोना नहीं, जहाँ मैं घड़ी-आधा घ...",4,Neutral,4
20302,लेकिन यह बेंच मुझे सबसे अच्छी लगती है,1,Joy,4


In [144]:
val_pd['pred_labels'].value_counts()

4    3999
2      47
1      11
0       2
3       2
Name: pred_labels, dtype: int64

In [145]:
val_pd['Annotation'].value_counts()

4    2699
2     625
1     416
0     183
3     138
Name: Annotation, dtype: int64

### LSTM

In [151]:
model = tf.keras.Sequential([
    layers.Embedding(max_features + 1, embedding_dim, mask_zero=True),
    tf.keras.layers.LSTM(32),
    layers.Dense(16, activation='relu'),
    layers.Dense(5)
])

In [152]:
model.compile(loss=losses.SparseCategoricalCrossentropy(),
              optimizer='adam',
              metrics=tf.metrics.SparseCategoricalAccuracy())

In [153]:
epochs = 10
history = model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=epochs)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
