# In [None]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import gradio as gr

# Download necessary NLTK data
nltk.download('stopwords')
nltk.download('punkt')
# NLTK 3.9+ loads the tokenizer models used by word_tokenize from 'punkt_tab'
# rather than 'punkt'; fetch both so tokenization works on old and new releases.
nltk.download('punkt_tab')

# Load the data
df = pd.read_csv('product_reviews.csv')

# Feature engineering
def extract_features(df: pd.DataFrame) -> pd.DataFrame:
    """Add length and sentiment columns to a frame of reviews.

    Args:
        df: DataFrame with a 'review_body' column holding the review text.

    Returns:
        The same DataFrame object (modified in place) with two added columns:
        'review_length', the character count of each review, and
        'sentiment_score', the VADER compound polarity score of each review.
    """
    # Build the analyzer once per call; creating a new one for every row
    # repeats its setup work for each review.
    analyzer = SentimentIntensityAnalyzer()
    df['review_length'] = df['review_body'].str.len()
    df['sentiment_score'] = df['review_body'].apply(
        lambda text: analyzer.polarity_scores(text)['compound']
    )
    return df

# Preprocess text
# Stop-word sets keyed by language, filled lazily so the NLTK corpus is read
# only after it has been downloaded and only once per process.
_STOPWORD_CACHE = {}


def _english_stopwords():
    """Return the English stop-word set, loading it from NLTK on first use."""
    if 'english' not in _STOPWORD_CACHE:
        _STOPWORD_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOPWORD_CACHE['english']


def preprocess_text(text: str) -> str:
    """Lowercase, tokenize and strip English stop words from a review.

    Args:
        text: Raw review text.

    Returns:
        The remaining tokens joined by single spaces.
    """
    stop_words = _english_stopwords()
    tokens = nltk.word_tokenize(text.lower())
    # Membership is tested against a cached set instead of re-reading the
    # stop-word list for every token.
    return ' '.join(word for word in tokens if word not in stop_words)

# Create features and target
# extract_features adds 'review_length' and 'sentiment_score' columns; only
# 'review_body' is fed to the model below.
X = extract_features(df)
# Heuristic label: a review is treated as spam when it is unverified and has
# received neither helpful votes nor any votes at all.
y = (X['verified_purchase'] == 'N') & (X['helpful_votes'] == 0) & (X['total_votes'] == 0)

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create and train the model
# The custom preprocessor takes over the vectorizer's preprocessing stage.
# The forest is seeded like the split so repeated runs fit the same model.
model = Pipeline([
    ('tfidf', TfidfVectorizer(preprocessor=preprocess_text)),
    ('clf', RandomForestClassifier(random_state=42)),
])

model.fit(X_train['review_body'], y_train)

# Gradio interface
def predict_spam(review_text):
    """Classify a single review as spam or not spam.

    Args:
        review_text: Raw review text entered by the user. ``None`` is treated
            as an empty review.

    Returns:
        "Spam" if the fitted pipeline predicts the positive class, otherwise
        "Not Spam".
    """
    text = "" if review_text is None else str(review_text)
    # The pipeline consumes raw text only, so no extra feature extraction is
    # needed before predicting.
    prediction = model.predict([text])[0]
    return "Spam" if prediction else "Not Spam"

# Expose the classifier through a plain text-in / text-out Gradio UI.
iface = gr.Interface(fn=predict_spam, inputs="text", outputs="text",
                     title="Spam Review Detector")

# Launch the interface.
iface.launch()
