In [1]:
import numpy as np
import pandas as pd
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
nltk.download('wordnet')
import re
from tensorflow.keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Input, Dropout, SpatialDropout1D
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
data = pd.read_csv("/content/data.csv", names=["sentence","sentiment"])[1:]

In [None]:
data.head()

Unnamed: 0,sentence,sentiment
1,The GeoSolutions technology will leverage Bene...,positive
2,"$ESI on lows, down $1.50 to $2.50 BK a real po...",negative
3,"For the last quarter of 2010 , Componenta 's n...",positive
4,According to the Finnish-Russian Chamber of Co...,neutral
5,The Swedish buyout firm has sold its remaining...,neutral


In [None]:
data.shape

(5842, 2)

In [None]:
data.drop_duplicates(subset=['sentence'], inplace=True)
data.dropna(axis=0, inplace=True)

In [None]:
data.replace('', np.nan, inplace=True)
data.dropna(axis=0, inplace=True)

In [None]:
data.shape

(5322, 2)

In [None]:
data['sentence'] = data['sentence'].apply(lambda x:x.lower())

In [None]:
data['sentence'] = data['sentence'].apply(lambda x: re.sub(r'"', "", x))

In [None]:
data['sentence'] = data['sentence'].apply(lambda x: re.sub(r'\([^)]*\)', "", x))

In [None]:
data['sentence'] = data['sentence'].apply(lambda x:re.sub("[^a-zA-Z]", " ", x))

In [None]:
data['sentence'] = data['sentence'].apply(lambda x: re.sub(r'\bs\b', "", x))

In [None]:
stop_words = stopwords.words('english')
data['sentence'] = data['sentence'].apply(lambda x: ' '.join(x for x in x.split() if x not in stop_words))

In [None]:
stemmer = PorterStemmer()
data['sentence'] = data['sentence'].apply(lambda x: ''.join(stemmer.stem(word) for word in x))

In [None]:
data.head()

Unnamed: 0,sentence,sentiment
1,geosolutions technology leverage benefon gps s...,positive
2,esi lows bk real possibility,negative
3,last quarter componenta net sales doubled eur ...,positive
4,according finnish russian chamber commerce maj...,neutral
5,swedish buyout firm sold remaining percent sta...,neutral


In [None]:
data['sentence'] = data['sentence'].apply(lambda x: re.sub(r'\s\s*', " ", x))

In [None]:
data.replace('', np.nan, inplace=True)
data.dropna(axis=0,inplace=True)

In [None]:
data['sentence_len'] = data['sentence'].apply(lambda x: len(str(x).split()))
val = max(data['sentence_len'].values)
val

41

In [None]:
tokenizer = Tokenizer(num_words=500)
tokenizer.fit_on_texts(data['sentence'])
X = tokenizer.texts_to_sequences(data['sentence'])
X = pad_sequences(X)

In [None]:
X.shape

(5319, 31)

In [None]:
label = {'positive':0,'neutral':1,'negative':2}
data['sentiment'] = data['sentiment'].apply(lambda x:label[x])
data.head()

Unnamed: 0,sentence,sentiment,sentence_len
1,geosolutions technology leverage benefon gps s...,0,21
2,esi lows bk real possibility,2,5
3,last quarter componenta net sales doubled eur ...,0,20
4,according finnish russian chamber commerce maj...,1,11
5,swedish buyout firm sold remaining percent sta...,1,14


In [None]:
X_train, X_val, y_train, y_val = train_test_split(X,data['sentiment'],test_size=0.25,random_state=21)

In [None]:
X_train[0]

array([  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,  14,   7,   9,   4, 156,  18,
        56, 492,   1,   1,  31], dtype=int32)

In [None]:
model = Sequential([
    Input(shape=(31,)),
    Embedding(500,120),
    SpatialDropout1D(0.4),
    LSTM(784, dropout=0.3, recurrent_dropout=0.3),
    Dense(300, activation="relu"),
    Dense(3, activation="softmax")
])

In [None]:
model.compile(
    optimizer = "rmsprop",
    loss = "sparse_categorical_crossentropy",
    metrics = ['accuracy']
)

In [None]:
model.summary()

In [None]:
model.fit(X_train, y_train, epochs = 20)

Epoch 1/20
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 68ms/step - accuracy: 0.5234 - loss: 1.0546
Epoch 2/20
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 56ms/step - accuracy: 0.6212 - loss: 0.8373
Epoch 3/20
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 56ms/step - accuracy: 0.6676 - loss: 0.7514
Epoch 4/20
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 58ms/step - accuracy: 0.6863 - loss: 0.7577
Epoch 5/20
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 70ms/step - accuracy: 0.7025 - loss: 0.6814
Epoch 6/20
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 57ms/step - accuracy: 0.7148 - loss: 0.7010
Epoch 7/20
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 68ms/step - accuracy: 0.7359 - loss: 0.6324
Epoch 8/20
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 68ms/step - accuracy: 0.7419 - loss: 0.6182
Epoch 9/20
[1m125/125[0m [

<keras.src.callbacks.history.History at 0x7d8e4fa77760>

In [None]:
model.evaluate(X_val,y_val)

[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 16ms/step - accuracy: 0.7154 - loss: 0.7640


[0.8085481524467468, 0.704511284828186]

In [None]:
model.evaluate(X_val,y_val)

[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 24ms/step - accuracy: 0.7046 - loss: 0.7616


[0.8029329776763916, 0.699999988079071]

In [None]:
y_pred = model.predict(X_val)

[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 15ms/step


In [None]:
y_classes = [np.argmax(x) for x in y_pred]

In [None]:
print(classification_report(y_val,y_classes))

              precision    recall  f1-score   support

           0       0.72      0.61      0.66       469
           1       0.71      0.87      0.78       712
           2       0.56      0.20      0.30       149

    accuracy                           0.70      1330
   macro avg       0.66      0.56      0.58      1330
weighted avg       0.69      0.70      0.68      1330



To be Continued to increase the f1-score of class 2 to increase the overall accuracy.

# **Transformer Application**

In [1]:
import numpy as np
import pandas as pd
data = pd.read_csv("/content/data.csv", names=["sentence","sentiment"])[1:]

In [2]:
reviews = data[:300]

In [3]:
phrase = reviews['sentence'].tolist()

In [4]:
reviews['sentiment'].value_counts()

Unnamed: 0_level_0,count
sentiment,Unnamed: 1_level_1
neutral,158
positive,95
negative,47


In [5]:
from transformers import AutoTokenizer

checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [6]:
inputs = tokenizer(phrase, padding=True, truncation=True, return_tensors="pt")

In [7]:
inputs

{'input_ids': tensor([[  101,  1996, 20248,  ...,     0,     0,     0],
        [  101,  1002,  9686,  ...,     0,     0,     0],
        [  101,  2005,  1996,  ...,     0,     0,     0],
        ...,
        [  101,  1001, 26060,  ...,     0,     0,     0],
        [  101,  1996,  2171,  ...,     0,     0,     0],
        [  101,  5658,  4341,  ...,     0,     0,     0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}

In [8]:
inputs['input_ids'].shape

torch.Size([300, 80])

In [9]:
from transformers import AutoModelForSequenceClassification

label = {'positive':0,'neutral':1,'negative':2}
checkpoint = "distilbert-base-uncased"
model = AutoModelForSequenceClassification.from_pretrained(checkpoint,num_labels=3,id2label=label)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
outputs = model(**inputs)

In [11]:
outputs.logits.shape

torch.Size([300, 3])

In [12]:
import torch

predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
print(predictions)

tensor([[0.3546, 0.3010, 0.3443],
        [0.3505, 0.2921, 0.3574],
        [0.3620, 0.2966, 0.3414],
        [0.3464, 0.3113, 0.3423],
        [0.3555, 0.2895, 0.3550],
        [0.3471, 0.3047, 0.3482],
        [0.3470, 0.2961, 0.3569],
        [0.3571, 0.2941, 0.3488],
        [0.3502, 0.3074, 0.3424],
        [0.3567, 0.3016, 0.3417],
        [0.3594, 0.2997, 0.3409],
        [0.3478, 0.2885, 0.3637],
        [0.3680, 0.2918, 0.3402],
        [0.3512, 0.3043, 0.3445],
        [0.3544, 0.2973, 0.3483],
        [0.3411, 0.2969, 0.3621],
        [0.3530, 0.2896, 0.3574],
        [0.3412, 0.3048, 0.3539],
        [0.3513, 0.2919, 0.3568],
        [0.3566, 0.2865, 0.3569],
        [0.3338, 0.3136, 0.3526],
        [0.3570, 0.2903, 0.3527],
        [0.3498, 0.3097, 0.3405],
        [0.3493, 0.2931, 0.3575],
        [0.3632, 0.2972, 0.3396],
        [0.3384, 0.3172, 0.3444],
        [0.3429, 0.2967, 0.3603],
        [0.3468, 0.3019, 0.3514],
        [0.3358, 0.3100, 0.3541],
        [0.339

In [13]:
model.config.id2label

{'positive': 0, 'neutral': 1, 'negative': 2}

In [14]:
preds = predictions.tolist()

In [25]:
preds

[[0.3546488285064697, 0.30104413628578186, 0.344307005405426],
 [0.3504679501056671, 0.2920888066291809, 0.35744327306747437],
 [0.36198848485946655, 0.2965872287750244, 0.34142428636550903],
 [0.3464478850364685, 0.3112509250640869, 0.3423011600971222],
 [0.35549119114875793, 0.289540559053421, 0.35496821999549866],
 [0.34709879755973816, 0.30471736192703247, 0.34818384051322937],
 [0.3470361828804016, 0.2960652709007263, 0.35689854621887207],
 [0.3570735454559326, 0.29408058524131775, 0.348845899105072],
 [0.3501541018486023, 0.30744239687919617, 0.34240350127220154],
 [0.35674721002578735, 0.3015626072883606, 0.34169021248817444],
 [0.3594134449958801, 0.2996963858604431, 0.34089016914367676],
 [0.3477589786052704, 0.2884998023509979, 0.3637412190437317],
 [0.3679579198360443, 0.2918426990509033, 0.34019938111305237],
 [0.3511579632759094, 0.30434054136276245, 0.3445014953613281],
 [0.35441210865974426, 0.297277569770813, 0.34831029176712036],
 [0.3410503566265106, 0.296864062547683

In [15]:
out_labels = [np.argmax(pred) for pred in preds]

In [19]:
reviews['labels'] = reviews['sentiment'].apply(lambda x:label[x])
y = reviews['labels'].tolist()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  reviews['labels'] = reviews['sentiment'].apply(lambda x:label[x])


In [23]:
y[:5]

[0, 2, 0, 1, 1]

In [26]:
out_labels[:5]

[0, 2, 0, 0, 0]