In [1]:
# Import Libraries
import pandas as pd
import re
from textblob import TextBlob
from sklearn.model_selection import train_test_split # For splitting data
from sklearn.linear_model import LogisticRegression # ML model
from sklearn.metrics import classification_report, confusion_matrix #Evaluation metrics

# 1-2) Read the raw CSV and, in the same expression, expose its 'text_' column
#      under the shorter name 'text' (rename returns a new frame; nothing is
#      mutated in place)
df = pd.read_csv("Data/fake reviews dataset.csv").rename(columns={'text_': 'text'})

# 3) Define a text-cleaning function
def clean_text(text):
    """Normalize one raw review string for feature extraction.

    Steps, in order: lowercase, remove anything shaped like an HTML tag
    (``<...>``), drop every character that is not a lowercase ASCII letter or
    whitespace (punctuation, digits, accents), then trim the ends.

    Parameters
    ----------
    text : str or any
        Raw review text. A non-string value (``None``, or the float NaN that
        pandas produces for an empty CSV cell) is treated as an empty review.

    Returns
    -------
    str
        The cleaned text, or ``""`` when ``text`` is not a string.
    """
    if not isinstance(text, str):
        # Missing cells arrive as NaN/None; .lower() would raise AttributeError
        # and abort the whole .apply() pass.
        return ""
    text = text.lower()                        # lowercase
    text = re.sub(r'<.*?>', '', text)          # remove HTML tags
    text = re.sub(r'[^a-z\s]', '', text)       # drop punctuation/numbers
    return text.strip()

# 4) Feature engineering on the review text
df['cleaned_text'] = df['text'].map(clean_text)                    # normalized text
df['review_length'] = df['cleaned_text'].map(str.split).map(len)   # whitespace-separated word count
df['sentiment'] = df['cleaned_text'].map(
    lambda review: TextBlob(review).sentiment.polarity             # TextBlob polarity score
)

# 5) Feature matrix (numeric columns only) and binary target
label_codes = {'CG': 1, 'OR': 0}  # CG = fake (1), OR = real (0)
X = df[['review_length', 'rating', 'sentiment']]
y = df['label'].map(label_codes)

# 6) Hold out 20% for testing; stratify on y so both splits keep the label ratio
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# 7) Fit the baseline Logistic Regression on the training split
clf = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# 8) Score the held-out split and print the metrics
y_pred = clf.predict(X_test)
class_names = ["Real (OR)", "Fake (CG)"]  # order follows the 0/1 encoding above
print("*** Classification Report ***")
print(classification_report(y_test, y_pred, target_names=class_names))
print("*** Confusion Matrix ***")
print(confusion_matrix(y_test, y_pred))



*** Classification Report ***
              precision    recall  f1-score   support

   Real (OR)       0.53      0.50      0.51      4044
   Fake (CG)       0.53      0.56      0.55      4043

    accuracy                           0.53      8087
   macro avg       0.53      0.53      0.53      8087
weighted avg       0.53      0.53      0.53      8087

*** Confusion Matrix ***
[[2012 2032]
 [1765 2278]]


In [None]:
# Preview the first rows of df, including the engineered columns
# (cleaned_text, review_length, sentiment) added by the earlier cell
df.head()

Unnamed: 0,category,rating,label,text,cleaned_text,review_length,sentiment
0,Home_and_Kitchen_5,5,CG,"Love this! Well made, sturdy, and very comfor...",love this well made sturdy and very comfortab...,12,0.4425
1,Home_and_Kitchen_5,5,CG,"love it, a great upgrade from the original. I...",love it a great upgrade from the original ive...,16,0.558333
2,Home_and_Kitchen_5,5,CG,This pillow saved my back. I love the look and...,this pillow saved my back i love the look and ...,14,0.25
3,Home_and_Kitchen_5,1,CG,"Missing information on how to use it, but it i...",missing information on how to use it but it is...,17,0.3
4,Home_and_Kitchen_5,5,CG,Very nice set. Good quality. We have had the s...,very nice set good quality we have had the set...,18,0.74


In [2]:
# TF-IDF
# Captures word frequency importance
# Limitations: Doesn't capture word order or context

import pandas as pd
import re
from textblob import TextBlob
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import Pipeline                                   # Combine preprocessign and modeling
from sklearn.compose import ColumnTransformer                           # Different preprocessign for different columns
from sklearn.feature_extraction.text import TfidfVectorizer             # TF-IDF for text vectorization
from sklearn.preprocessing import StandardScaler                        # Normalize numerical features
from sklearn.ensemble import RandomForestClassifier                     # Classifier
from sklearn.metrics import classification_report, confusion_matrix     # EValuate metrics

# 1-2) Load the raw reviews and rename the 'text_' column to 'text' in one
#      chained expression (rename returns a new frame instead of mutating)
df = pd.read_csv("Data/fake reviews dataset.csv").rename(columns={'text_': 'text'})

# 3) Clean text and compute EDA features
def clean_text(text):
    """Return a lowercase, letters-and-whitespace-only version of a review.

    The text is lowercased, spans shaped like HTML tags (``<...>``) are
    removed, every character other than ``a-z`` and whitespace is dropped,
    and surrounding whitespace is trimmed.

    Parameters
    ----------
    text : str or any
        Raw review text. Anything that is not a ``str`` (for example ``None``
        or the float NaN pandas uses for an empty CSV cell) counts as an
        empty review.

    Returns
    -------
    str
        The cleaned text; ``""`` for non-string input.
    """
    if not isinstance(text, str):
        # Guard: NaN/None has no .lower(); without this one missing cell
        # would raise AttributeError inside .apply().
        return ""
    text = text.lower()
    text = re.sub(r'<.*?>', '', text)    # strip HTML
    text = re.sub(r'[^a-z\s]', '', text) # keep only letters/spaces
    return text.strip()

# Derive the cleaned text and the two hand-crafted numeric features
df['cleaned_text'] = df['text'].map(clean_text)
df['review_length'] = df['cleaned_text'].map(str.split).map(len)   # word count
df['sentiment'] = df['cleaned_text'].map(
    lambda review: TextBlob(review).sentiment.polarity             # polarity score
)

# 4) Features (cleaned text + numeric columns) and binary labels
numeric_features = ['review_length', 'rating', 'sentiment']
X = df[['cleaned_text'] + numeric_features]
y = df['label'].map({'CG': 1, 'OR': 0})  # 1=fake, 0=real

# 5) 80/20 train/test split, stratified on the label
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# 6) Per-column preprocessing:
#    - 'cleaned_text' -> TF-IDF vectors over unigrams and bigrams, capped at 5000 terms
#    - numeric columns -> standardized with StandardScaler
text_vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))
preprocessor = ColumnTransformer([
    ("tfidf", text_vectorizer, 'cleaned_text'),
    ("scale", StandardScaler(), numeric_features),
])

# 7) Chain preprocessing and a 100-tree Random Forest into one estimator
forest = RandomForestClassifier(n_estimators=100, random_state=42)
pipeline = Pipeline(steps=[
    ('preprocessing', preprocessor),
    ('clf', forest),
])

# 8) 5-fold cross-validated F1 on the training split only
cv_scores = cross_val_score(pipeline, X_train, y_train, cv=5, scoring='f1')
print("5-fold F1 scores:", cv_scores)
print("Mean F1 score  :", cv_scores.mean())

# 9) Fit on the full training split, then predict the held-out test split
y_pred = pipeline.fit(X_train, y_train).predict(X_test)

class_names = ["Real (OR)", "Fake (CG)"]  # order follows the 0/1 label encoding
print("\n*** Test Set Classification Report ***")
print(classification_report(y_test, y_pred, target_names=class_names))
print("*** Test Set Confusion Matrix ***")
print(confusion_matrix(y_test, y_pred))


5-fold F1 scores: [0.90189219 0.88817891 0.88692863 0.8925144  0.89624443]
Mean F1 score  : 0.893151712171402

*** Test Set Classification Report ***
              precision    recall  f1-score   support

   Real (OR)       0.88      0.94      0.91      4044
   Fake (CG)       0.93      0.87      0.90      4043

    accuracy                           0.90      8087
   macro avg       0.90      0.90      0.90      8087
weighted avg       0.90      0.90      0.90      8087

*** Test Set Confusion Matrix ***
[[3788  256]
 [ 528 3515]]


- F1 scores across the 5 folds are all around 0.89-0.90, showing stable performance

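- A mean F1 of 0.89 means that, for the fake (CG) class, the harmonic mean of precision and recall averages about 0.89 across the cross-validation folds on the training data. F1 is not the share of labels classified correctly (that is accuracy), although with classes this balanced the two numbers end up close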

- Test-Set Performance: 
- Accuracy = 0.90
