In [None]:
import os
import re
import string

import contractions
import joblib
import nltk
import pandas as pd
from bs4 import BeautifulSoup
from langdetect import DetectorFactory
from langdetect import detect
from langdetect.lang_detect_exception import LangDetectException
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize, sent_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm import tqdm

# NLTK resources used by the text-cleaning steps in this notebook.
# 'punkt_tab' is the tokenizer data that word_tokenize loads on newer NLTK
# releases (3.9+); 'punkt' is kept for older installations.
for _resource in (
    'punkt',
    'punkt_tab',
    'wordnet',
    'stopwords',
    'averaged_perceptron_tagger',
    'omw-1.4',
):
    nltk.download(_resource, quiet=True)

# langdetect is randomized internally; pinning the seed makes the English
# filter keep the same rows on every run. Must be set before the first detect().
DetectorFactory.seed = 0

# Input data and the columns this notebook reads from it.
DATA_PATH = "data/fakeReviewData.csv"
TEXT_COLUMN = "text"
RATING_COLUMN = "rating"

# Output locations for the processed table and the fitted TF-IDF artifacts.
OUTPUT_PATH = "output/FakeReviewDataPreprocessed.csv"
MODEL_DIR = "models"
VECTORIZER_PATH = os.path.join(MODEL_DIR, "tfidf_vectorizer.pkl")
FEATURE_NAMES_PATH = os.path.join(MODEL_DIR, "tfidf_feature_names.pkl")

In [2]:
def is_english(text):
    """Report whether langdetect classifies ``text`` as English.

    Args:
        text: The text to classify.

    Returns:
        bool: True when ``detect`` returns the language code ``"en"``.
        False for non-string, empty or whitespace-only input, and whenever
        langdetect raises ``LangDetectException``.
    """
    # Guard explicitly: a non-string (None, NaN, numbers) would otherwise reach
    # detect() and fail with an error this function does not catch.
    if not isinstance(text, str) or not text.strip():
        return False
    try:
        return detect(text) == "en"
    except LangDetectException:
        return False

# Load the raw reviews.
data = pd.read_csv(DATA_PATH)

# fillna must come before astype(str): casting first turns missing values into
# the literal string "nan", which a later fillna("") can no longer replace.
review_texts = data[TEXT_COLUMN].fillna("").astype(str)
english_mask = [is_english(text) for text in tqdm(review_texts)]

# Keep only the English rows (original index labels are preserved). Indexing
# with the mask directly avoids the temporary "is_english" column, and .copy()
# yields an independent frame so later column assignments do not write into a
# slice of the raw table.
data = data.loc[english_mask].copy()

100%|██████████| 40432/40432 [01:44<00:00, 385.54it/s]


In [3]:
# Compiled once: matches URLs, e-mail addresses and hashtags.
_NOISE_PATTERN = re.compile(r'http\S+|www\S+|https\S+|\S+@\S+|\#\S+', flags=re.MULTILINE)
# Translation table that deletes every ASCII punctuation character and digit.
_PUNCT_DIGIT_TABLE = str.maketrans('', '', string.punctuation + string.digits)
# Filled on first use so the stop-word corpus is read once, not once per review.
_NLP_RESOURCES = {}


def _get_nlp_resources():
    """Return the (stop-word set, lemmatizer) pair, building it on first use."""
    if not _NLP_RESOURCES:
        _NLP_RESOURCES["stop_words"] = frozenset(stopwords.words('english'))
        _NLP_RESOURCES["lemmatizer"] = WordNetLemmatizer()
    return _NLP_RESOURCES["stop_words"], _NLP_RESOURCES["lemmatizer"]


def preprocess_text(text):
    """Normalize one review into a space-joined string of lemmatized tokens.

    Steps, in order: strip HTML markup, expand contractions, remove URLs /
    e-mail addresses / hashtags, lowercase, delete punctuation and digits,
    tokenize, drop English stop words, lemmatize each remaining token.

    Args:
        text: Raw review text.

    Returns:
        str: The cleaned tokens joined by single spaces. An empty string is
        returned for non-string, empty or whitespace-only input.
    """
    # NaN is truthy, so a plain `if not text` lets it through to the HTML
    # parser; reject every non-string up front instead.
    if not isinstance(text, str) or not text.strip():
        return ""

    # Drop HTML tags, keeping a space where tags separated pieces of text.
    text = BeautifulSoup(text, 'html.parser').get_text(separator=' ')

    # Expand contractions before punctuation is deleted, while the
    # apostrophes are still present.
    text = contractions.fix(text)

    text = _NOISE_PATTERN.sub('', text)

    text = text.lower().translate(_PUNCT_DIGIT_TABLE)

    stop_words, lemmatizer = _get_nlp_resources()
    tokens = [
        lemmatizer.lemmatize(word)
        for word in word_tokenize(text)
        if word not in stop_words
    ]

    return ' '.join(tokens)

In [4]:
# Clean every review. assign() builds a new frame rather than writing into the
# filtered frame from the language-filter cell, and the rating cast is done
# here, while RATING_COLUMN still names exactly one column (after the concat
# below a learned TF-IDF term could carry the same label).
cleaned_texts = [preprocess_text(text) for text in tqdm(data[TEXT_COLUMN].fillna(""))]
data = data.assign(**{
    TEXT_COLUMN: cleaned_texts,
    RATING_COLUMN: data[RATING_COLUMN].astype(float),
})

# TF-IDF over unigrams and bigrams, capped at 5000 features; terms appearing in
# fewer than 5 documents or in more than 85% of documents are discarded.
vectorizer = TfidfVectorizer(
    max_features=5000,
    max_df=0.85,
    min_df=5,
    ngram_range=(1,2),
    stop_words='english',
    strip_accents='unicode',
    sublinear_tf=True,
    use_idf=True,
    smooth_idf=True
)
tfidf_matrix = vectorizer.fit_transform(data[TEXT_COLUMN])
feature_names = vectorizer.get_feature_names_out()

# Vocabulary terms become column names, so a term can duplicate the label of an
# existing column; surface that instead of letting it pass silently.
colliding_names = sorted(set(feature_names) & set(data.columns))
if colliding_names:
    print(f"Warning: TF-IDF terms share a name with existing columns: {colliding_names}")

# NOTE: toarray() materializes the full dense matrix (one row per review, one
# column per feature), which can be memory-hungry on large datasets.
tfidf_features = pd.DataFrame(tfidf_matrix.toarray(), columns=feature_names)

# Align both frames on a fresh positional index before joining side by side.
data = pd.concat([data.reset_index(drop=True), tfidf_features.reset_index(drop=True)], axis=1)

# dirname() is "" for a bare filename, which os.makedirs rejects.
output_dir = os.path.dirname(OUTPUT_PATH)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)
data.to_csv(OUTPUT_PATH, index=False)

# Persist the fitted vectorizer and its vocabulary for reuse at inference time.
os.makedirs(MODEL_DIR, exist_ok=True)
joblib.dump(vectorizer, VECTORIZER_PATH)
joblib.dump(feature_names, FEATURE_NAMES_PATH)

print(f"Processed data saved to {OUTPUT_PATH}")
print(f"TF-IDF vectorizer saved to {VECTORIZER_PATH}")
print(f"TF-IDF feature names saved to {FEATURE_NAMES_PATH}")

  soup = BeautifulSoup(text, 'html.parser')
100%|██████████| 40368/40368 [00:32<00:00, 1261.21it/s]


Processed data saved to output/FakeReviewDataPreprocessed.csv
TF-IDF vectorizer saved to models\tfidf_vectorizer.pkl
TF-IDF feature names saved to models\tfidf_feature_names.pkl
