## Import all Module

In [2]:
#importing all
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler,OneHotEncoder

## Load the Dataset

In [3]:
df = pd.read_csv('final_health_supplement.csv')

In [4]:
df

Unnamed: 0,Product Name,Discount Prices(?),Actual Prices(?),Weight/Type,Discount Percentage(%),Rating,Users
0,Wellcore Micronised Creatine Monohydrate | Lab...,599,749,"122 g, Tropical Tango",20,4.3,251934
1,BIGMUSCLES NUTRITION Premium Gold Whey Protein,1499,2899,"1 kg, Belgian Chocolate",48,4.1,99351
2,NutraEats Protein Plus Supplement Whey Protein...,459,599,"200 g, Chocolate",23,4.3,251934
3,NutraEats Protein Plus Supplement Whey Protein...,489,899,"300 g, Chocolate",45,4.3,251934
4,NutraEats Protein Plus Supplement Whey Protein...,528,799,"250 g, Chocolate",33,4.2,1467
...,...,...,...,...,...,...,...
2779,Vaseline Healthy Bright Sun+Pollution Protecti...,593,499,400 ml,85,4.4,20469
2780,efatop NEXT Lotion 150ml For Intense Moisturizer,569,300,150 ml,55,3.9,17
2781,Rubz Cherry Blossom Body Milk with Cherry Blos...,353,525,1 L,22,4.1,1167
2782,Parachute Advansed Advansed Soft Touch Body Lo...,349,80,800 ml,24,4.6,269


## Creating train set ,test set by using StratifiedShuffleSplit

In [5]:
#crating train set by taking 80% data and creating test set by taking 20% data 
df['income_cat'] = pd.cut(df['Discount Prices(?)'],bins=[0.0,1.5,3.0,4.5,6.0,np.inf],labels=[1,2,3,4,5])
split = StratifiedShuffleSplit(n_splits=1,test_size=0.2,random_state=42)
for train_index,test_index in split.split(df,df['income_cat']):
    strat_train_set = df.loc[train_index].drop('income_cat',axis=1)
    strat_test_set = df.loc[test_index].drop('income_cat',axis=1)


In [6]:
strat_train_set

Unnamed: 0,Product Name,Discount Prices(?),Actual Prices(?),Weight/Type,Discount Percentage(%),Rating,Users
832,Multani Pachmeena Tonik Digestive Syrup | Reli...,360,270,450 ml,20,4.2,3853
1451,Chemist at Play Vitamin C Neck Knee Elbow Brig...,205,500,40 ml,45,4.2,51
491,"BabyOrgano Swarnaprashan - Immunity, Concentra...",263,300,15 ml,8,4.4,565
1469,"The Plant Fix Plix Jamun Skincare Combo, Clean...",943,1694,3 Items in the set,87,4.2,24378
2310,UrbanMooch SPF 50 PA+++Sunscreen Body Lotion |...,562,499,200 ml,9,4.5,122214
...,...,...,...,...,...,...,...
237,N2B 100% Whey Gold Protein Supplement Powder W...,1154,15549,"2 kg, Gourmet Chocolate",52,4.0,442
1836,mCaffeine Summer essential Skincare Kit for Ex...,314,435,1 Items in the set,16,4.3,43936
65,AS-IT-IS Nutrition Pea Protein Isolate-Designe...,586,827,"500 g, Unflavored",57,4.2,858
421,NURAMENT original Cream,280,380,2 x 20 g,37,3.9,13397


In [7]:
strat_test_set

Unnamed: 0,Product Name,Discount Prices(?),Actual Prices(?),Weight/Type,Discount Percentage(%),Rating,Users
878,Baidyanath Neem Tablets- A known Ayurvedic Her...,250,798,"Pack of 2, 60 Tablets, 60 Tablets",30,4.3,351
833,SEARCH WELLNESS Diabetes Care Juice 500ml For ...,940,280,500 ml,20,4.3,9321
1152,GLAMVEDA Glass Skin Rice & Ceramide 3 Step Dai...,569,1199,3 Items in the set,6,4.8,4
1484,voorkoms Acne Scars Cure & Herbal Bath Combo F...,574,1494,100 g,87,4.7,24
684,smile4u 28 Days or 4 Weeks Pill Box,611,290,Multicolor,36,3.9,153
...,...,...,...,...,...,...,...
1390,Everyuth Naturals Set Of 2 Exfoliating Walnut ...,195,598,200 g,10,4.2,17268
1819,Herbdiva British Rose Nourishing Glow Facial K...,318,399,100 ml,40,4.2,6897
337,JOLLY Tulsi 51 Drops Natural Immunity Booster,156,195,24 ml,36,4.2,2617
1693,HealthBest Kidbest Facewash for 3-13 Years Kid...,380,530,500 g,63,4.4,1930


##  We will work on the copy of the training set

In [8]:
health = strat_train_set.copy()

## Let's taking Feature and Labels from Train set

In [9]:
health_feature = strat_train_set['Product Name']
health_labels =  strat_train_set.drop('Product Name',axis=1)

In [10]:
health_feature

832     Multani Pachmeena Tonik Digestive Syrup | Reli...
1451    Chemist at Play Vitamin C Neck Knee Elbow Brig...
491     BabyOrgano Swarnaprashan - Immunity, Concentra...
1469    The Plant Fix Plix Jamun Skincare Combo, Clean...
2310    UrbanMooch SPF 50 PA+++Sunscreen Body Lotion |...
                              ...                        
237     N2B 100% Whey Gold Protein Supplement Powder W...
1836    mCaffeine Summer essential Skincare Kit for Ex...
65      AS-IT-IS Nutrition Pea Protein Isolate-Designe...
421                               NURAMENT original Cream
370     Ayurkosh Guduchi (Giloy) Extract Capsules for ...
Name: Product Name, Length: 2227, dtype: object

In [11]:
health_labels

Unnamed: 0,Discount Prices(?),Actual Prices(?),Weight/Type,Discount Percentage(%),Rating,Users
832,360,270,450 ml,20,4.2,3853
1451,205,500,40 ml,45,4.2,51
491,263,300,15 ml,8,4.4,565
1469,943,1694,3 Items in the set,87,4.2,24378
2310,562,499,200 ml,9,4.5,122214
...,...,...,...,...,...,...
237,1154,15549,"2 kg, Gourmet Chocolate",52,4.0,442
1836,314,435,1 Items in the set,16,4.3,43936
65,586,827,"500 g, Unflavored",57,4.2,858
421,280,380,2 x 20 g,37,3.9,13397


## Enhancing our feature for our ML train

In [12]:
#Enhancing our feature for our ML train
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(max_features=1000)
health_feature_update = tfidf.fit_transform(health_feature)


In [13]:
health_feature_update.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

## Train and Predict The MODEL

In [14]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.multioutput import MultiOutputRegressor

# Drop non-numeric columns like 'Weight/Type' if needed
health_labels = health_labels.drop('Weight/Type', axis=1)

# Train the model
model = MultiOutputRegressor(RandomForestRegressor(random_state=42))
model.fit(health_feature_update, health_labels)


## Predict based on user-entered product name

In [15]:
# Predict based on user-entered product name
user_input = input("Enter the Product Name: ")
X_input = tfidf.transform([user_input])
prediction = model.predict(X_input)[0]

# Show the predicted output
for col, val in zip(health_labels.columns, prediction):
    print(f"{col}: {round(val, 2)}")


Enter the Product Name:  Multani Pachmeena Tonik Digestive Syrup | Reli...


Discount Prices(?): 604.21
Actual Prices(?): 696.77
Discount Percentage(%): 33.64
Rating: 3.99
Users: 1714.35


# Understanding this Upper code step by step
#Predict based on user-entered product name
user_input = input("Enter the Product Name: ")
X_input = tfidf.transform([user_input])
prediction = model.predict(X_input)[0]

#Show the predicted output
for col, val in zip(health_labels.columns, prediction):
    print(f"{col}: {round(val, 2)}")

### understand this part of code
# 1. Get user input
1.user_input = input("Enter the Product Name: ")
>What it does:
This prompts the user to type a product name (like: "ON Whey Protein 2kg Chocolate").
The input is saved as a string in the variable user_input.

2.Convert product name to numerical features using trained TF-IDF
X_input = tfidf.transform([user_input])
>What it does:
You take the product name string and convert it into a TF-IDF vector (i.e., a numerical format).
transform([user_input]) returns a sparse matrix with shape (1, 1000) because you used max_features=1000 in training.
This step ensures the model gets input in the same format it was trained on.

3.Make prediction using the trained model
prediction = model.predict(X_input)[0]
>What it does:
This uses your trained RandomForestRegressor wrapped in a MultiOutputRegressor to predict multiple values (like price, rating, users, etc.) from the input.
.predict() returns a 2D array, shape (1, n_outputs) → so [0] gets the actual prediction as a 1D array.

4.Display the predicted output
for col, val in zip(health_labels.columns, prediction):
    print(f"{col}: {round(val, 2)}")
>What it does:
zip() pairs each predicted value (val) with the corresponding column name (col) from health_labels.columns.
round(val, 2) rounds each number to two decimal places for cleaner output.
Then it prints something like:

5.Output
#Discount Prices(?): 1199.0
#Actual Prices(?): 2299.0
#Discount Percentage(%): 48.0
#Rating: 4.3
#Users: 23421.0







### when we train our ML in that time we converting our feature Categrical Value ->Numerical Value(bcz of our ml only train on numerical value),
## so i again coonvert user input data to numerical for prediction,(if we don't do that it shows error