In [16]:
import pandas as pd
df = pd.read_csv('../data/flipkart_product.csv', encoding='latin1')
df['Review'] = df['Review'].astype(str).str.replace('\n', ' ').str.strip()
df['Summary'] = df['Summary'].astype(str).str.replace('\n', ' ').str.strip()
df['full_text'] = df['Summary'] + ' ' + df["Review"]

def sentiment_label(rate):
    if rate ==2 or rate < 2: return 0 #negative
    elif rate == 3: return 1 #neutral
    else: return 2 # positive

df['Rate'] = pd.to_numeric(df['Rate'], errors = 'coerce')    
df['sentiment'] = df['Rate'].apply(sentiment_label)

df['fake_label'] = ((df['full_text'].str.len() < 30) & (df['Rate'] >= 4)).astype(int)

df.to_csv('../data/cleaned.csv', index = False)

print(df.head())

                                         ProductName     Price  Rate  \
0  Candes 12 L Room/Personal Air Cooler?ÿ?ÿ(White...  ??3,999   5.0   
1  Candes 12 L Room/Personal Air Cooler?ÿ?ÿ(White...  ??3,999   5.0   
2  Candes 12 L Room/Personal Air Cooler?ÿ?ÿ(White...  ??3,999   3.0   
3  Candes 12 L Room/Personal Air Cooler?ÿ?ÿ(White...  ??3,999   1.0   
4  Candes 12 L Room/Personal Air Cooler?ÿ?ÿ(White...  ??3,999   3.0   

            Review                                            Summary  \
0           Super!  Great cooler.. excellent air flow and for this...   
1          Awesome             Best budget 2 fit cooler. Nice cooling   
2             Fair  The quality is good but the power of air is de...   
3  Useless product                 Very bad product it's a only a fan   
4             Fair                                      Ok ok product   

                                           full_text  sentiment  fake_label  
0  Great cooler.. excellent air flow and for this.

In [None]:
from sklearn.model_selection import train_test_split

X = df['full_text'].values
y_sentiment = df['sentiment'].values
y_fake = df['fake_label'].values

X_train, X_test, y_sent_train, y_sent_test, y_fake_train, y_fake_test = train_test_split(
    X, y_sentiment, y_fake, test_size=0.2, random_state=42
)



In [30]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

max_words = 5000 #consider top 5000 words
max_len = 100 # max sequence length

tokenizer = Tokenizer(num_words = max_words, oov_token='<OOV>')
tokenizer.fit_on_texts(X_train)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

X_train_pad = pad_sequences(X_train_seq, maxlen=max_len, padding='post')
X_test_pad = pad_sequences(X_test_seq, maxlen=max_len, padding='post')

In [26]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

model_sentiment = Sequential([
    Embedding(input_dim=max_words, output_dim=64, input_length=max_len),
    LSTM(64, dropout=0.2, recurrent_dropout=0.2),
    Dense(32, activation='relu'),
    Dropout(0.2),
    Dense(3, activation='softmax') #3 classes: negative, neutral, positive
])

model_sentiment.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model_sentiment.summary()

2025-09-16 13:32:41.993602: E external/local_xla/xla/stream_executor/cuda/cuda_platform.cc:51] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: UNKNOWN ERROR (303)


In [27]:
history_sent = model_sentiment.fit(X_train_pad, y_sent_train, 
                                   epochs=5,
                                   batch_size=64,
                                   validation_data=(X_test_pad, y_sent_test))

Epoch 1/5
[1m2374/2374[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m164s[0m 68ms/step - accuracy: 0.7807 - loss: 0.6798 - val_accuracy: 0.7837 - val_loss: 0.6656
Epoch 2/5
[1m2374/2374[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m180s[0m 76ms/step - accuracy: 0.7807 - loss: 0.6748 - val_accuracy: 0.7838 - val_loss: 0.6662
Epoch 3/5
[1m2374/2374[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m176s[0m 74ms/step - accuracy: 0.9609 - loss: 0.1297 - val_accuracy: 0.9821 - val_loss: 0.0629
Epoch 4/5
[1m2374/2374[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m185s[0m 78ms/step - accuracy: 0.9821 - loss: 0.0672 - val_accuracy: 0.9835 - val_loss: 0.0577
Epoch 5/5
[1m2374/2374[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m176s[0m 74ms/step - accuracy: 0.9835 - loss: 0.0599 - val_accuracy: 0.9836 - val_loss: 0.0577


In [31]:
model_fake = Sequential([
    Embedding(input_dim=max_words, output_dim=64, input_length=max_len),
    LSTM(64, dropout=0.2, recurrent_dropout=0.2),
    Dense(32, activation='relu'),
    Dropout(0.2),
    Dense(1, activation='sigmoid') #binary classification
])

model_fake.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

history_fake = model_fake.fit(X_train_pad, y_fake_train,
                              epochs = 5,
                              batch_size = 64,
                              validation_data = (X_test_pad, y_fake_test))

Epoch 1/5
[1m2374/2374[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m183s[0m 76ms/step - accuracy: 0.6195 - loss: 0.6660 - val_accuracy: 0.6150 - val_loss: 0.6668
Epoch 2/5
[1m2374/2374[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m190s[0m 80ms/step - accuracy: 0.6197 - loss: 0.6647 - val_accuracy: 0.6150 - val_loss: 0.6664
Epoch 3/5
[1m2374/2374[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m194s[0m 82ms/step - accuracy: 0.6197 - loss: 0.6643 - val_accuracy: 0.6150 - val_loss: 0.6664
Epoch 4/5
[1m2374/2374[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m191s[0m 80ms/step - accuracy: 0.6197 - loss: 0.6641 - val_accuracy: 0.6150 - val_loss: 0.6662
Epoch 5/5
[1m2374/2374[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m223s[0m 94ms/step - accuracy: 0.8563 - loss: 0.2994 - val_accuracy: 0.9638 - val_loss: 0.1100


In [32]:
model_sentiment.save('../models/sentiment_model.h5')
model_fake.save('../models/fake_model.h5')

