In [11]:
import os
import pandas as pd
import string
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import contractions
from sklearn.feature_extraction.text import TfidfVectorizer
import joblib
import numpy as np
import re
from bs4 import BeautifulSoup
from autocorrect import Speller
from langdetect import detect, DetectorFactory
# Fetch the NLTK resources that word_tokenize, the stop-word list and
# WordNetLemmatizer rely on. 'punkt_tab' is the tokenizer data that newer NLTK
# releases look up; 'punkt' is kept so older releases keep working.
for _nltk_resource in ('punkt', 'punkt_tab', 'wordnet', 'stopwords', 'omw-1.4'):
    nltk.download(_nltk_resource, quiet=True)

# --- Configuration -----------------------------------------------------------
# CSV file read with pd.read_csv as the raw input.
DATA_PATH = "data/fakeReviewData.csv"
# Column holding the review text that gets cleaned and vectorised.
TEXT_COLUMN = "text"
# Column cast to float and stored as 'rating' in the output frame.
RATING_COLUMN = "rating"
# Destination CSV for the original columns plus one column per TF-IDF term.
OUTPUT_PATH = "output/FakeReviewDataPreprocessed.csv"
# Passed to TfidfVectorizer(max_features=...).
MAX_FEATURES = 5000
# Directory that receives the joblib artefacts below.
MODEL_DIR = "models"
# joblib dump of the fitted TfidfVectorizer.
VECTORIZER_PATH = os.path.join(MODEL_DIR, "tfidf_vectorizer.pkl")
# joblib dump of vectorizer.get_feature_names_out().
FEATURE_NAMES_PATH = os.path.join(MODEL_DIR, "tfidf_feature_names.pkl")

In [12]:
# Filled on the first preprocess_text call and reused afterwards, so the
# stop-word list and the lemmatizer are not rebuilt for every row.
_PREPROCESS_RESOURCES = {}


def _get_preprocess_resources():
    """Return the shared (stop-word set, lemmatizer) pair, building it on first use."""
    if not _PREPROCESS_RESOURCES:
        # Build both before storing either, so a failed corpus lookup cannot
        # leave a half-filled cache behind.
        stop_words = frozenset(stopwords.words('english'))
        lemmatizer = WordNetLemmatizer()
        _PREPROCESS_RESOURCES.update(stop_words=stop_words, lemmatizer=lemmatizer)
    return _PREPROCESS_RESOURCES['stop_words'], _PREPROCESS_RESOURCES['lemmatizer']


def preprocess_text(text):
    """Normalise one review into a space-joined string of lemmatised tokens.

    Steps, in order: strip HTML markup, expand contractions, remove URLs,
    e-mail addresses and hashtags, lower-case, delete ASCII punctuation and
    digits, tokenise, drop NLTK English stop words, lemmatise each token with
    WordNetLemmatizer's default settings.

    Args:
        text: Raw review text. Anything that is not a non-empty ``str``
            (``None``, ``NaN``, numbers, ...) is treated as missing.

    Returns:
        str: The cleaned tokens joined by single spaces, or ``""`` when the
        input is missing.
    """
    # NaN is truthy, so a plain `if not text` would hand it to the HTML parser.
    if not isinstance(text, str) or not text:
        return ""

    soup = BeautifulSoup(text, 'html.parser')
    text = soup.get_text(separator=' ')

    # Expand contractions before punctuation is stripped, while the
    # apostrophes are still present.
    text = contractions.fix(text)

    text = re.sub(r'http\S+|www\S+|https\S+|\S+@\S+|\#\S+', '', text, flags=re.MULTILINE)

    text = text.lower()
    # Characters are deleted, not replaced by spaces: "well-made" -> "wellmade".
    text = text.translate(str.maketrans('', '', string.punctuation + string.digits))

    stop_words, lemmatizer = _get_preprocess_resources()
    tokens = [
        lemmatizer.lemmatize(word)
        for word in word_tokenize(text)
        if word not in stop_words
    ]

    return ' '.join(tokens)

In [13]:
# Fail fast with an explicit message when the input file is missing.
if not os.path.exists(DATA_PATH):
    raise FileNotFoundError(f"File not found: {DATA_PATH}")

data = pd.read_csv(DATA_PATH)

# Check every column the following cell reads, so a schema problem surfaces
# here rather than after the text has been preprocessed and vectorised.
for required_column in (TEXT_COLUMN, RATING_COLUMN):
    if required_column not in data.columns:
        raise KeyError(f"Column '{required_column}' not found in dataset.")

In [14]:
def _ensure_parent_dir(path):
    """Create the directory part of `path` if it has one."""
    parent = os.path.dirname(path)
    # os.makedirs("") raises, which is what a bare filename would produce.
    if parent:
        os.makedirs(parent, exist_ok=True)


# Work on a fresh frame so `data` is only replaced once the whole feature
# table has been built; a failure part-way leaves the loaded frame untouched.
reviews = data.reset_index(drop=True)
reviews[TEXT_COLUMN] = reviews[TEXT_COLUMN].fillna("").apply(preprocess_text)

# Cast the rating while the frame still holds only the source columns: once
# the TF-IDF term columns are appended, a term spelled like the rating column
# would make `frame[RATING_COLUMN]` select more than one column.
reviews['rating'] = reviews[RATING_COLUMN].astype(float)

vectorizer = TfidfVectorizer(max_features=MAX_FEATURES, stop_words='english')
tfidf_matrix = vectorizer.fit_transform(reviews[TEXT_COLUMN])
feature_names = vectorizer.get_feature_names_out()
tfidf_features = pd.DataFrame(tfidf_matrix.toarray(), columns=feature_names)

# Both frames carry a fresh 0..n-1 index, so rows line up positionally.
# NOTE(review): term columns are not prefixed, so a term equal to an existing
# column name produces duplicate labels in the saved CSV - confirm downstream
# readers cope with that.
data = pd.concat([reviews, tfidf_features], axis=1)

_ensure_parent_dir(OUTPUT_PATH)
data.to_csv(OUTPUT_PATH, index=False)

_ensure_parent_dir(VECTORIZER_PATH)
joblib.dump(vectorizer, VECTORIZER_PATH)
_ensure_parent_dir(FEATURE_NAMES_PATH)
joblib.dump(feature_names, FEATURE_NAMES_PATH)

print(f"Processed data saved to {OUTPUT_PATH}")
print(f"Tfidf vectorizer saved to {VECTORIZER_PATH}")
print(f"Tfidf feature names saved to {FEATURE_NAMES_PATH}")

  soup = BeautifulSoup(text, 'html.parser')


Processed data saved to output/FakeReviewDataPreprocessed.csv
Tfidf vectorizer saved to models\tfidf_vectorizer.pkl
Tfidf feature names saved to models\tfidf_feature_names.pkl
