In [1]:
import os
import pandas as pd
import joblib
import sys
import string
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import contractions
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import re
from bs4 import BeautifulSoup
from autocorrect import Speller
from langdetect import detect, DetectorFactory

# Get the absolute path of the directory containing the current notebook
current_dir = os.getcwd()

# Add checkpoint 3 for scraped data
checkpoint3_dir = os.path.abspath(os.path.join(current_dir, '..', 'checkpoint 3'))
sys.path.append(checkpoint3_dir)


MODEL_DIR = os.path.join("..","checkpoint 2","models")
MODEL_PATH = os.path.join(MODEL_DIR, "logistic_regression_model.pkl")

SCRAPED_REVIEWS_PATH = os.path.join(checkpoint3_dir, "scraped_reviews.csv")
# Define the path for feature names.
FEATURE_NAMES_PATH = os.path.join("..","checkpoint 1","models","tfidf_feature_names.pkl")

nltk.download('punkt', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('stopwords', quiet=True)
nltk.download('omw-1.4', quiet=True)
DetectorFactory.seed = 0
spell = Speller(lang='en')

In [2]:
def preprocess_text(text):
    if not text:
        return ""

    soup = BeautifulSoup(text, 'html.parser')
    text = soup.get_text(separator=' ')

    text = contractions.fix(text)

    text = re.sub(r'http\S+|www\S+|https\S+|\S+@\S+|\#\S+', '', text, flags=re.MULTILINE)
   
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation + string.digits))
    
    tokens = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]

    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]

    tokens = [spell(word) for word in tokens]

    tokens = [word.encode('ascii', 'ignore').decode('ascii') for word in tokens]

    tokens = [word for word in tokens if len(word) > 2]
    return ' '.join(tokens)

def preprocess_new_text(text, rating):
    """Preprocesses new text and rating using a trained vectorizer."""
    
    if not os.path.exists(FEATURE_NAMES_PATH):
        raise FileNotFoundError(f"Feature Names not found: {FEATURE_NAMES_PATH}. Please train vectorizer first")
    
    if not os.path.exists(os.path.join("..","checkpoint 1","models","tfidf_vectorizer.pkl")):
       raise FileNotFoundError(f"Vectorizer not found: {os.path.join('..','checkpoint 1','models','tfidf_vectorizer.pkl')}. Please train vectorizer first")
    
    vectorizer = joblib.load(os.path.join("..","checkpoint 1","models","tfidf_vectorizer.pkl"))
    feature_names = joblib.load(FEATURE_NAMES_PATH)

    try:
        if detect(text) != 'en':
            return pd.DataFrame(columns=feature_names.tolist() + ['rating'])
    except:
         return pd.DataFrame(columns=feature_names.tolist() + ['rating'])

    preprocessed_text = preprocess_text(text)
    tfidf_matrix = vectorizer.transform([preprocessed_text])
    tfidf_features = pd.DataFrame(tfidf_matrix.toarray(), columns=vectorizer.get_feature_names_out())
    
    tfidf_features['rating'] = float(rating)
    
    tfidf_features = tfidf_features.reindex(columns = feature_names.tolist() + ['rating'], fill_value=0)
    return tfidf_features

In [3]:
def load_model():
   if not os.path.exists(MODEL_PATH):
      raise FileNotFoundError(f"Model file not found: {MODEL_PATH}. Please train model first.")
   return joblib.load(MODEL_PATH)
model = load_model()
scraped_data = pd.read_csv(SCRAPED_REVIEWS_PATH)

In [4]:
def predict_fake_review(row):
    """
    Predicts whether a given text review is fake or not, using preprocessed text and rating.

    Args:
       row(pd.Series): Row from the scraped data.
    Returns:
        int: Predicted label (0 for not fake, 1 for fake)
    """
    try:
        text = row['text']
        rating = row['rating']
        preprocessed_features = preprocess_new_text(text, rating)
        prediction = model.predict(preprocessed_features)[0]
        return prediction
    except Exception as e:
        print(f"Error: {e}")
        return None

In [5]:
scraped_data['predicted_label'] = scraped_data.apply(predict_fake_review, axis=1)

print("Predictions applied.")

scraped_data

  soup = BeautifulSoup(text, 'html.parser')
  soup = BeautifulSoup(text, 'html.parser')


Error: object of type 'float' has no len()
Predictions applied.


Unnamed: 0,text,rating,predicted_label
0,The Apple iPhone XR is an excellent device wit...,5.0,1.0
1,"Battery good durability, all functions good, l...",5.0,1.0
2,The reconditioned iPhone XR was in excellent c...,5.0,1.0
3,I have had this product for a few years and it...,4.0,1.0
4,My fiancée ordered this and has been using it ...,5.0,1.0
...,...,...,...
95,"Nice phone for a good price, works like new",4.0,1.0
96,Bueno,4.0,1.0
97,"this phone works great for it for my brother, ...",4.0,1.0
98,I got it for a present and it works great!,4.0,1.0


In [8]:
scraped_data['predicted_label'].isnull().sum()  

1

In [6]:
scraped_data['predicted_label'].value_counts()

predicted_label
1.0    92
0.0     7
Name: count, dtype: int64