In [5]:
import numpy as np
import pandas as pd
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
nltk.download('wordnet')
import re
from tensorflow.keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Input, Dropout, SpatialDropout1D
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [6]:
data = pd.read_csv("/content/data.csv", names=["sentence","sentiment"])[1:]

In [7]:
data.head()

Unnamed: 0,sentence,sentiment
1,The GeoSolutions technology will leverage Bene...,positive
2,"$ESI on lows, down $1.50 to $2.50 BK a real po...",negative
3,"For the last quarter of 2010 , Componenta 's n...",positive
4,According to the Finnish-Russian Chamber of Co...,neutral
5,The Swedish buyout firm has sold its remaining...,neutral


In [8]:
data.shape

(5842, 2)

In [9]:
data.drop_duplicates(subset=['sentence'], inplace=True)
data.dropna(axis=0, inplace=True)

In [10]:
data.replace('', np.nan, inplace=True)
data.dropna(axis=0, inplace=True)

In [11]:
data.shape

(5322, 2)

In [12]:
data['sentence'] = data['sentence'].apply(lambda x:x.lower())

In [13]:
data['sentence'] = data['sentence'].apply(lambda x: re.sub(r'"', "", x))

In [14]:
data['sentence'] = data['sentence'].apply(lambda x: re.sub(r'\([^)]*\)', "", x))

In [15]:
data['sentence'] = data['sentence'].apply(lambda x:re.sub("[^a-zA-Z]", " ", x))

In [16]:
data['sentence'] = data['sentence'].apply(lambda x: re.sub(r'\bs\b', "", x))

In [17]:
stop_words = stopwords.words('english')
data['sentence'] = data['sentence'].apply(lambda x: ' '.join(x for x in x.split() if x not in stop_words))

In [18]:
stemmer = PorterStemmer()
data['sentence'] = data['sentence'].apply(lambda x: ''.join(stemmer.stem(word) for word in x))

In [19]:
data.head()

Unnamed: 0,sentence,sentiment
1,geosolutions technology leverage benefon gps s...,positive
2,esi lows bk real possibility,negative
3,last quarter componenta net sales doubled eur ...,positive
4,according finnish russian chamber commerce maj...,neutral
5,swedish buyout firm sold remaining percent sta...,neutral


In [20]:
data['sentence'] = data['sentence'].apply(lambda x: re.sub(r'\s\s*', " ", x))

In [21]:
data.replace('', np.nan, inplace=True)
data.dropna(axis=0,inplace=True)

In [22]:
data['sentence_len'] = data['sentence'].apply(lambda x: len(str(x).split()))
val = max(data['sentence_len'].values)
val

41

In [23]:
tokenizer = Tokenizer(num_words=500)
tokenizer.fit_on_texts(data['sentence'])
X = tokenizer.texts_to_sequences(data['sentence'])
X = pad_sequences(X)

In [24]:
X.shape

(5319, 31)

In [25]:
label = {'positive':0,'neutral':1,'negative':2}
data['sentiment'] = data['sentiment'].apply(lambda x:label[x])
data.head()

Unnamed: 0,sentence,sentiment,sentence_len
1,geosolutions technology leverage benefon gps s...,0,21
2,esi lows bk real possibility,2,5
3,last quarter componenta net sales doubled eur ...,0,20
4,according finnish russian chamber commerce maj...,1,11
5,swedish buyout firm sold remaining percent sta...,1,14


In [26]:
X_train, X_val, y_train, y_val = train_test_split(X,data['sentiment'],test_size=0.25,random_state=21)

In [27]:
X_train[0]

array([  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,  14,   7,   9,   4, 156,  18,
        56, 492,   1,   1,  31], dtype=int32)

In [43]:
model = Sequential([
    Input(shape=(31,)),
    Embedding(500,120),
    SpatialDropout1D(0.4),
    LSTM(784, dropout=0.3, recurrent_dropout=0.3),
    Dense(300, activation="relu"),
    Dense(3, activation="softmax")
])

In [44]:
model.compile(
    optimizer = "rmsprop",
    loss = "sparse_categorical_crossentropy",
    metrics = ['accuracy']
)

In [45]:
model.summary()

In [46]:
model.fit(X_train, y_train, epochs = 20)

Epoch 1/20
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 68ms/step - accuracy: 0.5234 - loss: 1.0546
Epoch 2/20
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 56ms/step - accuracy: 0.6212 - loss: 0.8373
Epoch 3/20
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 56ms/step - accuracy: 0.6676 - loss: 0.7514
Epoch 4/20
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 58ms/step - accuracy: 0.6863 - loss: 0.7577
Epoch 5/20
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 70ms/step - accuracy: 0.7025 - loss: 0.6814
Epoch 6/20
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 57ms/step - accuracy: 0.7148 - loss: 0.7010
Epoch 7/20
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 68ms/step - accuracy: 0.7359 - loss: 0.6324
Epoch 8/20
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 68ms/step - accuracy: 0.7419 - loss: 0.6182
Epoch 9/20
[1m125/125[0m [

<keras.src.callbacks.history.History at 0x7d8e4fa77760>

In [47]:
model.evaluate(X_val,y_val)

[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 16ms/step - accuracy: 0.7154 - loss: 0.7640


[0.8085481524467468, 0.704511284828186]

In [37]:
model.evaluate(X_val,y_val)

[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 24ms/step - accuracy: 0.7046 - loss: 0.7616


[0.8029329776763916, 0.699999988079071]

In [49]:
y_pred = model.predict(X_val)

[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 15ms/step


In [51]:
y_classes = [np.argmax(x) for x in y_pred]

In [53]:
print(classification_report(y_val,y_classes))

              precision    recall  f1-score   support

           0       0.72      0.61      0.66       469
           1       0.71      0.87      0.78       712
           2       0.56      0.20      0.30       149

    accuracy                           0.70      1330
   macro avg       0.66      0.56      0.58      1330
weighted avg       0.69      0.70      0.68      1330



To be Continued to increase the f1-score of class 2 to increase the overall accuracy.