# Exploratory Data Analysis and Model Training

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np
import joblib
import os

## Load the Dataset

In [None]:
# Path of the amazon.csv dataset. The default is the original author's local
# path; set the AMAZON_CSV_PATH environment variable to run the notebook on
# another machine without editing this cell.
DATA_PATH = os.environ.get(
    'AMAZON_CSV_PATH',
    '/Users/coditas/Desktop/Study/Assignments/HG/data/amazon.csv',
)

# Read the whole CSV into the DataFrame used by every later cell.
df = pd.read_csv(DATA_PATH)

## Data Cleaning and Preprocessing

In [None]:
# Drop the literal percent sign so the discount can be cast to float.
# regex=False makes the replacement a plain substring match on every pandas
# version (the default for `regex` changed between releases).
df['discount_percentage'] = (
    df['discount_percentage'].str.replace('%', '', regex=False).astype(float)
)

# Prices carry a rupee sign and thousands separators; remove both, then cast.
for price_col in ('discounted_price', 'actual_price'):
    df[price_col] = (
        df[price_col]
        .str.replace('₹', '', regex=False)
        .str.replace(',', '', regex=False)
        .astype(float)
    )

# Ratings that cannot be parsed as numbers become NaN and are imputed below.
df['rating'] = pd.to_numeric(df['rating'], errors='coerce')

# Remove thousands separators before parsing. Without this, a value such as
# "24,269" is coerced to NaN and silently replaced by the column mean.
# NOTE(review): assumes rating_count uses the same comma formatting as the
# price columns -- confirm against the raw CSV. astype(str) keeps this safe
# if the column was already parsed as numeric.
df['rating_count'] = pd.to_numeric(
    df['rating_count'].astype(str).str.replace(',', '', regex=False),
    errors='coerce',
)

# Fill missing values with the column mean. Assign the result back instead of
# calling fillna(inplace=True) on df[col]: the chained in-place form warns on
# recent pandas and does not update df when Copy-on-Write is enabled.
for numeric_col in ('rating', 'rating_count'):
    df[numeric_col] = df[numeric_col].fillna(df[numeric_col].mean())

## Feature Engineering

In [None]:
# Replace each label column with the integer codes of its pandas categorical
# encoding, so the columns hold numbers instead of strings.
for label_col in ('category', 'product_id'):
    as_categorical = df[label_col].astype('category')
    df[label_col] = as_categorical.cat.codes

## Model Training

In [None]:
# The column the model predicts, and the columns it predicts from.
target = 'discount_percentage'
features = ['category', 'actual_price', 'rating', 'rating_count']

X, y = df[features], df[target]

# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible between runs.
split_options = {'test_size': 0.2, 'random_state': 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# A 100-tree random forest, seeded for reproducibility. fit() returns the
# fitted estimator, so `model` is the trained regressor.
model = RandomForestRegressor(random_state=42, n_estimators=100).fit(
    X_train, y_train
)

## Model Evaluation

In [None]:
# Score the fitted model on the held-out rows.
y_pred = model.predict(X_test)

# Root-mean-squared error, in the same units as the target.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
# Coefficient of determination of the predictions.
r2 = r2_score(y_test, y_pred)

print(f'RMSE: {rmse}')
print(f'R2 Score: {r2}')

## Save the Model

In [None]:
# Directory that receives the serialized artifacts. The default is the
# original author's local path; set the MODEL_OUTPUT_DIR environment variable
# to write somewhere else without editing this cell.
MODEL_DIR = os.environ.get(
    'MODEL_OUTPUT_DIR',
    '/Users/coditas/Desktop/Study/Assignments/HG/src/api/model',
)

# exist_ok=True creates the directory when it is missing and is a no-op when
# it already exists, which avoids the check-then-create race of a separate
# os.path.exists() test.
os.makedirs(MODEL_DIR, exist_ok=True)

# Persist the fitted model together with the feature column list it was
# trained on.
joblib.dump(model, os.path.join(MODEL_DIR, 'discount_predictor.pkl'))
joblib.dump(features, os.path.join(MODEL_DIR, 'model_columns.pkl'))