In [1]:
import os
import pandas as pd
import string
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import contractions
from sklearn.feature_extraction.text import TfidfVectorizer

# Fetch the NLTK resources used later by word_tokenize, WordNetLemmatizer and
# stopwords. quiet=True suppresses the download log output.
nltk.download('punkt', quiet=True)
# Recent NLTK releases (3.8.2 / 3.9 onward) make word_tokenize load the
# 'punkt_tab' resource instead of the pickled 'punkt' models; without it a
# fresh environment fails with LookupError at tokenisation time.
nltk.download('punkt_tab', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('stopwords', quiet=True)
nltk.download('omw-1.4', quiet=True)

True

In [2]:
# --- Configuration: everything a reader may want to tune lives here. ---
DATA_PATH = "data/fakeReviewData.csv"                  # input CSV of reviews
TEXT_COLUMN = "text"                                   # column holding the review body
OUTPUT_PATH = "output/FakeReviewDataPreprocessed.csv"  # where the processed CSV is written
MAX_FEATURES = 5000                                    # cap on the TF-IDF vocabulary size

# --- Load: fail fast with a clear message if the input file is absent. ---
if os.path.exists(DATA_PATH):
    data = pd.read_csv(DATA_PATH)
else:
    raise FileNotFoundError(f"File not found: {DATA_PATH}")

# --- Validate: every later cell reads TEXT_COLUMN, so check it up front. ---
text_column_present = TEXT_COLUMN in data.columns
if not text_column_present:
    raise KeyError(f"Column '{TEXT_COLUMN}' not found in dataset.")
data

Unnamed: 0,category,rating,label,text
0,Home_and_Kitchen_5,5.0,CG,"Love this! Well made, sturdy, and very comfor..."
1,Home_and_Kitchen_5,5.0,CG,"love it, a great upgrade from the original. I..."
2,Home_and_Kitchen_5,5.0,CG,This pillow saved my back. I love the look and...
3,Home_and_Kitchen_5,1.0,CG,"Missing information on how to use it, but it i..."
4,Home_and_Kitchen_5,5.0,CG,Very nice set. Good quality. We have had the s...
...,...,...,...,...
40427,Clothing_Shoes_and_Jewelry_5,4.0,OR,I had read some reviews saying that this bra r...
40428,Clothing_Shoes_and_Jewelry_5,5.0,CG,I wasn't sure exactly what it would be. It is ...
40429,Clothing_Shoes_and_Jewelry_5,2.0,OR,"You can wear the hood by itself, wear it with ..."
40430,Clothing_Shoes_and_Jewelry_5,1.0,CG,I liked nothing about this dress. The only rea...


In [3]:
# Translation table that deletes ASCII punctuation and digits; built once
# instead of on every call.
_STRIP_TABLE = str.maketrans('', '', string.punctuation + string.digits)

# Lazily populated cache of the NLTK resources shared by every call.
_TEXT_RESOURCES = {}


def _text_resources():
    """Return the (stop-word set, lemmatizer) pair, building it on first use."""
    if not _TEXT_RESOURCES:
        _TEXT_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        _TEXT_RESOURCES['lemmatizer'] = WordNetLemmatizer()
    return _TEXT_RESOURCES['stop_words'], _TEXT_RESOURCES['lemmatizer']


def preprocess_text(text):
    """Normalise one review into a space-separated string of lemmatised tokens.

    Steps, in order: expand contractions, lowercase, delete ASCII punctuation
    and digits, tokenise, drop English stop words, lemmatise each remaining
    token (WordNetLemmatizer's default part of speech).

    Args:
        text: The raw review. Falsy values (``None``, ``""``) and float NaN
            are treated as missing; any other non-string is converted with
            ``str()`` before processing.

    Returns:
        The cleaned text as a single string, or ``""`` for missing input.
    """
    # NaN is truthy, so `not text` alone would let it reach contractions.fix();
    # NaN is the only float that compares unequal to itself.
    if not text or (isinstance(text, float) and text != text):
        return ""
    if not isinstance(text, str):
        text = str(text)

    text = contractions.fix(text).lower()
    # Characters are deleted rather than replaced by a space, so a hyphenated
    # word such as "well-made" becomes the single token "wellmade".
    text = text.translate(_STRIP_TABLE)

    stop_words, lemmatizer = _text_resources()
    tokens = [
        lemmatizer.lemmatize(word)
        for word in word_tokenize(text)
        if word not in stop_words
    ]

    return ' '.join(tokens)

# Treat missing reviews as empty strings, then clean every review and write
# the result back into the same column.
reviews_filled = data[TEXT_COLUMN].fillna("")
data[TEXT_COLUMN] = reviews_filled.apply(preprocess_text)

In [4]:
# Prefix for the generated feature columns. Vocabulary terms are ordinary
# words, so a term such as "text", "label" or "rating" would otherwise
# duplicate the name of an original column after the concat below.
TFIDF_PREFIX = "tfidf_"

# Drop feature columns left over from a previous run of this cell so that
# re-executing it does not stack a second set of TF-IDF columns.
is_base_column = [not str(col).startswith(TFIDF_PREFIX) for col in data.columns]
data = data.loc[:, is_base_column].reset_index(drop=True)

vectorizer = TfidfVectorizer(max_features=MAX_FEATURES, stop_words='english')
tfidf_matrix = vectorizer.fit_transform(data[TEXT_COLUMN])

# toarray() materialises the sparse matrix as a dense (rows x features) array;
# with MAX_FEATURES columns per row this is the memory-heavy step of the notebook.
tfidf_features = pd.DataFrame(
    tfidf_matrix.toarray(),
    columns=[f"{TFIDF_PREFIX}{term}" for term in vectorizer.get_feature_names_out()],
)

# Both frames carry a fresh 0..n-1 index, so the column-wise concat lines up
# row by row.
data = pd.concat([data, tfidf_features], axis=1)

In [5]:
# Create the destination folder only when OUTPUT_PATH actually has one:
# os.path.dirname() returns "" for a bare filename and os.makedirs("") raises
# FileNotFoundError.
output_dir = os.path.dirname(OUTPUT_PATH)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

# index=False keeps the DataFrame's row index out of the CSV.
data.to_csv(OUTPUT_PATH, index=False)

print(f"Processed data saved to {OUTPUT_PATH}")

Processed data saved to output/FakeReviewDataPreprocessed.csv
