In [1]:
# Word Embeddings
# Word embeddings convert words into dense vectors that capture semantic meaning

import numpy as np
import pandas as pd
import re
import os
from textblob import TextBlob
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix

# 1) Load data
# Read the reviews CSV and rename the 'text_' column to 'text' in one chained
# expression, so `df` is bound exactly once.
raw_reviews_path = "Data/fake reviews dataset.csv"
df = pd.read_csv(raw_reviews_path).rename(columns={'text_': 'text'})

# 2) Clean text
def clean_text(text):
    """Normalize a raw review string.

    Lowercases the text, removes anything shaped like an HTML tag (``<...>``),
    drops every character that is not a lowercase ASCII letter or whitespace,
    and trims leading/trailing whitespace.

    Parameters
    ----------
    text : str or any
        Raw review text. Any non-string value (for example ``None`` or a
        float ``NaN`` standing in for a missing cell) is treated as empty.

    Returns
    -------
    str
        The cleaned text, or ``""`` when ``text`` is not a string.
    """
    # Non-strings have no .lower(); return an empty review instead of
    # raising AttributeError in the middle of a column-wide apply.
    if not isinstance(text, str):
        return ""
    text = text.lower()
    text = re.sub(r'<.*?>', '', text)      # strip tag-like spans
    text = re.sub(r'[^a-z\s]', '', text)   # keep only a-z and whitespace
    return text.strip()

# Derive the model inputs from the raw columns: the normalized text, its word
# count, its TextBlob polarity score, and a numeric target.
cleaned_reviews = df['text'].map(clean_text)
df['cleaned_text'] = cleaned_reviews
df['review_length'] = cleaned_reviews.map(lambda review: len(review.split()))
df['sentiment'] = cleaned_reviews.map(lambda review: TextBlob(review).sentiment.polarity)
# Encode the target: 'CG' -> 1, 'OR' -> 0 (any other value becomes NaN).
label_codes = {'CG': 1, 'OR': 0}
df['label'] = df['label'].map(label_codes)

# 3) Load pre-trained GloVe vectors (downloaded from https://nlp.stanford.edu/projects/glove/)
glove_path = "Data/glove.6B.300d.txt"  # 300-dimensional variant
embeddings_index = {}
with open(glove_path, encoding="utf-8") as glove_file:
    # Each line is whitespace-separated: the token first, then its coefficients.
    for entry in glove_file:
        fields = entry.split()
        embeddings_index[fields[0]] = np.asarray(fields[1:], dtype='float32')
print("Loaded word vectors:", len(embeddings_index))

# 4) Convert cleaned review text to vector average
def get_average_vector(text, embeddings_index, dim=300):
    """Embed a text as the mean of its known word vectors.

    Parameters
    ----------
    text : str
        Whitespace-separated tokens.
    embeddings_index : mapping of str -> numpy.ndarray
        Word-vector lookup table; tokens missing from it are ignored.
    dim : int, default 300
        Length of the zero vector returned when no token is found. It should
        equal the length of the vectors stored in ``embeddings_index``.

    Returns
    -------
    numpy.ndarray
        The element-wise mean of the matched vectors. When nothing matches,
        a float32 zero vector of length ``dim``.
    """
    matched = [embeddings_index[token] for token in text.split() if token in embeddings_index]
    if matched:
        return np.mean(matched, axis=0)
    # The vectors loaded in this notebook are float32; use the same dtype for
    # the fallback so a row's dtype does not depend on vocabulary hits.
    return np.zeros(dim, dtype=np.float32)


# Embed every cleaned review, then stack the per-row vectors into one 2-D array.
df['embedding'] = df['cleaned_text'].apply(
    lambda review: get_average_vector(review, embeddings_index, dim=300)
)
embedding_matrix = np.stack(df['embedding'].to_numpy())


# 5) Combine embeddings with numerical features
numeric_features = df[['review_length', 'rating', 'sentiment']].to_numpy()
X = np.hstack((embedding_matrix, numeric_features))
y = df['label'].to_numpy()

# 6) Train/test split
# Hold out 20% of the rows, stratified on the label, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# 7) Train model
# The scaler lives inside the pipeline, so it is fitted together with the
# classifier on whatever data the pipeline is fitted on.
scaler = StandardScaler()
forest = RandomForestClassifier(n_estimators=100, random_state=42)
pipeline = Pipeline(steps=[("scale", scaler), ("clf", forest)])

# 5-fold cross-validated F1 on the training split only
cv_scores = cross_val_score(pipeline, X_train, y_train, scoring="f1", cv=5)
print("5-fold F1 scores:", cv_scores)
print("Mean F1 score  :", cv_scores.mean())

# Fit on the full training split, then predict the held-out test split
y_pred = pipeline.fit(X_train, y_train).predict(X_test)

# 8) Print performance metrics
class_names = ["Real (OR)", "Fake (CG)"]  # order follows label codes 0, 1
print("\n*** Test Set Classification Report ***")
print(classification_report(y_test, y_pred, target_names=class_names))
print("*** Test Set Confusion Matrix ***")
print(confusion_matrix(y_test, y_pred))


Loaded word vectors: 400000
5-fold F1 scores: [0.85270837 0.84387137 0.83741996 0.845867   0.84414984]
Mean F1 score  : 0.8448033059970991

*** Test Set Classification Report ***
              precision    recall  f1-score   support

   Real (OR)       0.85      0.85      0.85      4044
   Fake (CG)       0.85      0.85      0.85      4043

    accuracy                           0.85      8087
   macro avg       0.85      0.85      0.85      8087
weighted avg       0.85      0.85      0.85      8087

*** Test Set Confusion Matrix ***
[[3430  614]
 [ 608 3435]]


# BERT


In [2]:
import pandas as pd
import re
import numpy as np
from datasets import Dataset   # Hugginf Face Transformers & Datasets
from transformers import (
    DistilBertTokenizerFast,
    DistilBertForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
)
from sklearn.metrics import classification_report, confusion_matrix # For evaluation metrics

# 1) Load and clean dataset
# Read the reviews CSV and rename the 'text_' column to 'text' in one chained
# expression, so `df` is bound exactly once.
raw_reviews_path = "Data/fake reviews dataset.csv"
df = pd.read_csv(raw_reviews_path).rename(columns={'text_': 'text'})

def clean_text(text):
    """Normalize a raw review string.

    Lowercases the text, removes anything shaped like an HTML tag (``<...>``),
    drops every character that is not a lowercase ASCII letter or whitespace,
    and trims leading/trailing whitespace.

    Parameters
    ----------
    text : str or any
        Raw review text. Any non-string value (for example ``None`` or a
        float ``NaN`` standing in for a missing cell) is treated as empty.

    Returns
    -------
    str
        The cleaned text, or ``""`` when ``text`` is not a string.
    """
    # Non-strings have no .lower(); return an empty review instead of
    # raising AttributeError in the middle of a column-wide apply.
    if not isinstance(text, str):
        return ""
    text = text.lower()
    text = re.sub(r'<.*?>', '', text)      # strip tag-like spans
    text = re.sub(r'[^a-z\s]', '', text)   # keep only a-z and whitespace
    return text.strip()

# Keep only what the model consumes: the normalized text and a numeric target
# (1 = fake 'CG', 0 = real 'OR').
label_codes = {'CG': 1, 'OR': 0}
df = pd.DataFrame({
    'cleaned_text': df['text'].map(clean_text),
    'label': df['label'].map(label_codes),
})

# 2) HuggingFace Dataset
# Convert the frame and carve out a 20% test split with a fixed seed.
hf_dataset = Dataset.from_pandas(df).train_test_split(test_size=0.2, seed=42)

# 3) Tokenize
checkpoint_name = "distilbert-base-uncased"
tokenizer = DistilBertTokenizerFast.from_pretrained(checkpoint_name)

def tokenize(example):
    """Tokenize the ``"cleaned_text"`` entry of a dataset row or batch.

    Text is truncated to at most 128 tokens. Padding is deliberately NOT
    applied here: the ``DataCollatorWithPadding`` handed to the ``Trainer``
    in this notebook pads each batch when it is assembled, so padding every
    example to 128 up front would only add pad tokens for the model to
    process and leave the collator with nothing to do.

    Parameters
    ----------
    example : mapping
        Row or batch exposing a ``"cleaned_text"`` entry (a string, or a list
        of strings when called through ``map(..., batched=True)``).

    Returns
    -------
    The encoding produced by the module-level ``tokenizer``.
    """
    return tokenizer(example["cleaned_text"], truncation=True, max_length=128)
# Run `tokenize` over both splits, one batch of rows at a time
tokenized_dataset = hf_dataset.map(tokenize, batched=True)

# 4) Load DistilBERT
# Two output labels: 0 = real (OR), 1 = fake (CG)
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=2,
)

# 5) Legacy-compatible TrainingArguments
# NOTE(review): no evaluation strategy is configured here and the recorded run
# logs only training loss, so `do_eval=True` does not appear to trigger
# evaluation during training -- confirm against the installed transformers version.
training_config = {
    "output_dir": "./results",          # where results are written
    "do_train": True,                   # training flag
    "do_eval": True,                    # evaluation flag (see note above)
    "per_device_train_batch_size": 8,   # training batch size per device
    "per_device_eval_batch_size": 8,    # evaluation batch size per device
    "num_train_epochs": 2,              # train for 2 epochs
    "logging_dir": "./logs",            # directory to save logs
}
training_args = TrainingArguments(**training_config)

# 6) Collator that pads the examples of each batch to a common length
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Trainer wires together the model, its arguments, the two splits, and the collator.
# NOTE(review): the recorded run emitted a warning pointing at this call;
# it may concern the `tokenizer=` argument -- confirm for the installed version.
train_split = tokenized_dataset["train"]
eval_split = tokenized_dataset["test"]
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    tokenizer=tokenizer,
    train_dataset=train_split,
    eval_dataset=eval_split,
)

# 7) Train the model
trainer.train()

# 8) Evaluate
# Predict on the held-out split; the predicted class is the index of the largest score per row.
predictions = trainer.predict(tokenized_dataset["test"])
y_true = predictions.label_ids
y_pred = predictions.predictions.argmax(axis=1)

# Print performance & confusion matrix
class_names = ["Real (OR)", "Fake (CG)"]  # order follows label codes 0, 1
print("\n=== Test Set Classification Report ===")
print(classification_report(y_true, y_pred, target_names=class_names))
print("=== Test Set Confusion Matrix ===")
print(confusion_matrix(y_true, y_pred))


  from .autonotebook import tqdm as notebook_tqdm
Map: 100%|██████████| 32345/32345 [00:01<00:00, 17869.22 examples/s]
Map: 100%|██████████| 8087/8087 [00:00<00:00, 19277.24 examples/s]
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss
500,0.3436
1000,0.2186
1500,0.1688
2000,0.1679
2500,0.161
3000,0.1488
3500,0.1359
4000,0.0964
4500,0.0634
5000,0.0609



=== Test Set Classification Report ===
              precision    recall  f1-score   support

   Real (OR)       0.99      0.96      0.97      4027
   Fake (CG)       0.96      0.99      0.97      4060

    accuracy                           0.97      8087
   macro avg       0.97      0.97      0.97      8087
weighted avg       0.97      0.97      0.97      8087

=== Test Set Confusion Matrix ===
[[3852  175]
 [  57 4003]]


Why DistilBERT Performs Better
- Context-aware embeddings: Unlike TF-IDF or GloVe, BERT-based models understand the context of words in a sentence.
- Pretrained on massive data: DistilBERT was pretrained on billions of tokens, enabling it to generalize better to fake vs. real sentiment patterns.
- Fine-tuned end-to-end: The classifier is trained with gradient updates through the entire transformer, not just on top of static features.