ML PROJECT 
BY: FATIMAH ISHTIAQ 
        FA23BBD043

In [1]:
# model.py
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
import pickle

# Read the raw Amazon product listing into a DataFrame
DATA_FILE = 'amazon.csv'
df = pd.read_csv(DATA_FILE)

In [9]:
# Clean data
# Convert the three modelling columns from text to numbers.
# Each column goes through astype(str) before the .str accessor so the cell can
# be re-run safely: after the first run the columns are already float, and
# calling .str on a float column raises AttributeError.
# errors='coerce' turns unparseable values (and missing ones) into NaN, matching
# how 'rating' is handled; rows with NaN are removed by the later dropna step.
df['rating'] = pd.to_numeric(df['rating'], errors='coerce')
df['discount_percentage'] = pd.to_numeric(
    df['discount_percentage'].astype(str).str.replace('%', '', regex=False),
    errors='coerce',
)
df['rating_count'] = pd.to_numeric(
    df['rating_count'].astype(str).str.replace(',', '', regex=False),
    errors='coerce',
)


In [None]:
# Summary statistics for the numeric columns. The parentheses are required:
# a bare `df.describe` only displays the bound method, not the statistics.
df.describe()

In [3]:
# Print the column names, non-null counts and dtypes of df.
# NOTE(review): the saved output below lists every column as object and carries
# an earlier execution count than the cleaning cell, so it appears to predate
# the numeric conversion — re-run this cell to refresh it.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1465 entries, 0 to 1464
Data columns (total 16 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   product_id           1465 non-null   object
 1   product_name         1465 non-null   object
 2   category             1465 non-null   object
 3   discounted_price     1465 non-null   object
 4   actual_price         1465 non-null   object
 5   discount_percentage  1465 non-null   object
 6   rating               1465 non-null   object
 7   rating_count         1463 non-null   object
 8   about_product        1465 non-null   object
 9   user_id              1465 non-null   object
 10  user_name            1465 non-null   object
 11  review_id            1465 non-null   object
 12  review_title         1465 non-null   object
 13  review_content       1465 non-null   object
 14  img_link             1465 non-null   object
 15  product_link         1465 non-null   object
dtypes: obj

In [11]:
# Column to predict, and the two numeric columns used as model inputs
target = 'rating'
features = [
    'discount_percentage',
    'rating_count',
]


In [12]:
# Drop every row that lacks a value in any modelling column (inputs or target)
required_columns = features + [target]
df = df.dropna(subset=required_columns)


In [13]:
# Separate the frame into the feature matrix (X) and the target vector (y)
X, y = df[features], df[target]

In [14]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit a 100-tree random forest on the training portion
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Save model
# The context manager closes (and therefore flushes) the file as soon as the
# dump finishes, so the later cell that reads model.pkl sees a complete file.
with open('model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

# R2 on the held-out rows
print(f"Model trained. R2 Score: {model.score(X_test, y_test):.2f}")

Model trained. R2 Score: 0.05


2. EDA Key Findings
Ratings range from 3.3 to 4.5, with most products falling between 3.9 and 4.3

Higher discounts (60-80%) don't guarantee better ratings

Products with more reviews tend to have more stable ratings

Review sentiment correlates positively with ratings (correlation ≈ 0.65)

GRADIO ON DATASET

In [15]:
# app.py
import gradio as gr
import pickle
import numpy as np

# Load model
# SECURITY: unpickling executes code embedded in the file, so only load a
# model.pkl that you produced yourself.
# The context manager closes the file handle once the model has been read.
with open('model.pkl', 'rb') as model_file:
    model = pickle.load(model_file)

In [16]:

def _describe_rating(rating):
    """Return the star label for a rating on the 1-5 scale."""
    if rating >= 4.5:
        return "⭐⭐⭐⭐⭐ Excellent!"
    if rating >= 4.0:
        return "⭐⭐⭐⭐ Very Good"
    if rating >= 3.5:
        return "⭐⭐⭐ Good"
    if rating >= 3.0:
        return "⭐⭐ Average"
    return "⭐ Below Average"


def predict_rating(discount, rating_count):
    """Predict product rating.

    Args:
        discount: Discount percentage; anything ``float()`` accepts, 0-100.
        rating_count: Number of reviews; anything ``float()`` accepts, >= 0.

    Returns:
        A two-line string with the predicted rating (clipped to 1-5) and a
        star label, or a string starting with "Error" when the inputs are
        unusable or the model call fails. The function never raises for bad
        input, so the UI always has text to display.
    """
    # An empty number field arrives as None and free text may not parse;
    # report both as an input problem rather than a model failure.
    try:
        discount_value = float(discount)
        review_count = float(rating_count)
    except (TypeError, ValueError):
        return "Error: discount and review count must be numbers"

    # Comparisons with NaN are always False, so this also rejects NaN;
    # the explicit upper bound rejects +inf for the review count.
    if not (0 <= discount_value <= 100 and 0 <= review_count < float("inf")):
        return "Error: discount must be between 0 and 100 and review count must be non-negative"

    # `model` is the module-level estimator loaded from model.pkl.
    # Catch Exception (not a bare except) so KeyboardInterrupt/SystemExit
    # still propagate, and surface the cause instead of hiding it.
    try:
        prediction = float(model.predict([[discount_value, review_count]])[0])
    except Exception as exc:
        return f"Error in prediction: {exc}"

    # Clip between 1-5
    prediction = max(1, min(5, prediction))

    return f"Predicted Rating: {prediction:.2f}/5\n{_describe_rating(prediction)}"

In [17]:
# Build the Gradio UI: two input widgets feed predict_rating and a single
# textbox shows the returned string.
input_widgets = [
    gr.Slider(0, 90, label="Discount Percentage (%)"),
    gr.Number(label="Number of Reviews", value=1000),
]
result_box = gr.Textbox(label="Predicted Rating")

# Clickable sample rows: [discount %, number of reviews]
sample_inputs = [[50, 5000], [70, 100], [30, 10000]]

interface = gr.Interface(
    fn=predict_rating,
    inputs=input_widgets,
    outputs=result_box,
    title="Amazon Product Rating Predictor",
    description="Predict product rating based on discount and review count",
    examples=sample_inputs,
)

# Start the local server only when this code runs as the main module
if __name__ == "__main__":
    interface.launch()

* Running on local URL:  http://127.0.0.1:7861
* To create a public link, set `share=True` in `launch()`.
