In [204]:
import pandas as pd
import numpy as np
import joblib

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR

from sklearn.cluster import KMeans

from sklearn.preprocessing import OneHotEncoder, StandardScaler, MultiLabelBinarizer, OrdinalEncoder
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer

from sklearn import set_config
set_config(display="diagram")

In [205]:
# MultiLabel Binarizer for categorical columns
class CustomMultiLabelBinarizer(MultiLabelBinarizer):
    def __init__(self):
        super().__init__()
        self.classes = None
        
    def fit(self, y):
        return super().fit(y)
    
    def transform(self, y):
        return super().transform(y)
    
    def fit_transform(self, X, y):
        return super().fit_transform(X)


In [211]:
data = pd.read_csv("../data/preprocessed.csv", index_col=0)
data.head()

Unnamed: 0,Rating,MRP,Processor,RAM,OS,Storage,Brand,OS_arch,RAM_Type
0,4.2,25490.0,Intel Celeron Dual Core Processor,4 GB,Windows 11,['512 GB SSD'],HP,64,LPDDR4
1,3.8,22490.0,Intel Core i5 Processor,4 GB,Windows 11,['512 GB SSD'],HP,64,LPDDR4X
2,4.1,25685.0,Intel Celeron Dual Core Processor,8 GB,Windows 11,['256 GB SSD'],Lenovo,64,DDR4
3,4.4,51990.0,Intel Core i5 Processor,8 GB,Windows 11,['512 GB SSD'],ASUS,64,DDR4
4,4.2,38990.0,Intel Core i3 Processor,8 GB,Windows 11,['512 GB SSD'],ASUS,64,DDR4


In [212]:
data.Storage = data.Storage.apply(lambda x: tuple(eval(x)))

# Price Estimation

In [151]:
data.drop(["Rating", "Brand"], axis=1, inplace=True)

In [152]:
X, y = data.drop("MRP", axis=1), data.MRP

In [153]:
xtrain, xtest, ytrain, ytest = train_test_split(X, y, test_size=0.2)

In [154]:
xtrain.head()

Unnamed: 0,Processor,RAM,OS,Storage,OS_arch,RAM_Type
411,Intel Celeron Dual Core Processor,4 GB,Windows 11,['512 GB SSD'],64,LPDDR4
108,Intel Core i5 Processor,8 GB,Windows 11,['256 GB SSD'],64,DDR4
408,Intel Core i5 Processor,8 GB,Windows 11,['512 GB SSD'],64,DDR4
357,Intel Core i9 Processor,32 GB,Windows 11,['1 TB SSD'],64,DDR5
628,Intel Core i5 Processor,8 GB,Windows 11,['512 GB SSD'],64,DDR4


In [155]:
cmb = CustomMultiLabelBinarizer()
cmb.fit_transform(data.Storage.values, [])

array([[1, 1, 0, ..., 0, 1, 1],
       [1, 1, 0, ..., 0, 1, 1],
       [1, 1, 0, ..., 0, 1, 1],
       ...,
       [1, 1, 0, ..., 0, 1, 1],
       [1, 1, 0, ..., 0, 1, 1],
       [1, 1, 0, ..., 0, 1, 1]])

In [156]:
list(data.RAM.sort_values()[::-1].unique())

['8 GB', '4 GB', '32 GB', '16 GB']

In [157]:

pipeline = Pipeline([
    ("col_transformer_ohe", ColumnTransformer([
        ("ohe", OneHotEncoder(handle_unknown="ignore"), [0,2,5]),
        ("ord", OrdinalEncoder(categories=[['8 GB', '4 GB', '32 GB', '16 GB'], [32, 64]]), [1,4]),
        ("binarizer", CustomMultiLabelBinarizer(), [3])
    ], remainder="passthrough")),
    ("estimator", LinearRegression())
])
pipeline.fit(xtrain.values, ytrain.values)

In [158]:
pipeline.score(xtest.values, ytest.values)



-1.5901276372680328e+25

In [136]:
sample = [["Core i5", 16, "Windows 11", "512 GB SSD", 64, "DDR4"]]
int(pipeline.predict(sample))



1827751429885667328

## save model with full data

In [159]:
pipeline = Pipeline([
    ("col_transformer_ohe", ColumnTransformer([
        ("ohe", OneHotEncoder(handle_unknown="ignore"), [0,1,2,4,5]),
        ("binarizer", CustomMultiLabelBinarizer(), [3])
    ], remainder="passthrough")),
    ("estimator", LinearRegression())
])

X,y = data.drop("MRP", axis=1), data.MRP
pipeline.fit(X.values,y.values)

In [166]:
pipeline.predict([['AMD Ryzen 5 Hexa Core Processor', '8 GB', 'Windows 10',
       "['512 GB SSD']", 64, 'DDR4']])[0]

58624.0

In [167]:
joblib.dump(pipeline, "../model/model.obj")

['../model/model.obj']

# Laptop Recommendation

In [213]:
data.drop("Brand", axis=1, inplace=True)
data.head()

Unnamed: 0,Rating,MRP,Processor,RAM,OS,Storage,OS_arch,RAM_Type
0,4.2,25490.0,Intel Celeron Dual Core Processor,4 GB,Windows 11,"(512 GB SSD,)",64,LPDDR4
1,3.8,22490.0,Intel Core i5 Processor,4 GB,Windows 11,"(512 GB SSD,)",64,LPDDR4X
2,4.1,25685.0,Intel Celeron Dual Core Processor,8 GB,Windows 11,"(256 GB SSD,)",64,DDR4
3,4.4,51990.0,Intel Core i5 Processor,8 GB,Windows 11,"(512 GB SSD,)",64,DDR4
4,4.2,38990.0,Intel Core i3 Processor,8 GB,Windows 11,"(512 GB SSD,)",64,DDR4


In [214]:
list(enumerate(data.columns))

[(0, 'Rating'),
 (1, 'MRP'),
 (2, 'Processor'),
 (3, 'RAM'),
 (4, 'OS'),
 (5, 'Storage'),
 (6, 'OS_arch'),
 (7, 'RAM_Type')]

In [215]:
pipeline = Pipeline([
    ("preprocessor", ColumnTransformer([
        ("ohe", OneHotEncoder(handle_unknown="ignore"), [2, 4, 7]),
        ("ord", OrdinalEncoder(categories=[['8 GB', '4 GB', '32 GB', '16 GB'], [32, 64]]), [3, 6]),
        ("binarizer", CustomMultiLabelBinarizer(), [5]),
        ("sc", StandardScaler(with_mean=False), [0, 1])
    ], remainder="passthrough")),
    ("recommender", KMeans(n_init="auto"))
])

pipeline.fit(data.values)

In [216]:
data["cluster"] = pipeline.predict(data.values)
data

Unnamed: 0,Rating,MRP,Processor,RAM,OS,Storage,OS_arch,RAM_Type,cluster
0,4.2,25490.0,Intel Celeron Dual Core Processor,4 GB,Windows 11,"(512 GB SSD,)",64,LPDDR4,3
1,3.8,22490.0,Intel Core i5 Processor,4 GB,Windows 11,"(512 GB SSD,)",64,LPDDR4X,6
2,4.1,25685.0,Intel Celeron Dual Core Processor,8 GB,Windows 11,"(256 GB SSD,)",64,DDR4,1
3,4.4,51990.0,Intel Core i5 Processor,8 GB,Windows 11,"(512 GB SSD,)",64,DDR4,5
4,4.2,38990.0,Intel Core i3 Processor,8 GB,Windows 11,"(512 GB SSD,)",64,DDR4,5
...,...,...,...,...,...,...,...,...,...
715,4.2,33639.0,Intel Core i3 Processor,8 GB,Windows 11,"(256 GB SSD,)",64,DDR4,1
716,4.2,46990.0,AMD Ryzen 5 Quad Core Processor,8 GB,Windows 11,"(512 GB SSD,)",64,LPDDR5,5
717,4.6,102490.0,Intel Core i5 Processor,16 GB,Windows 10,"(512 GB SSD,)",64,DDR4,0
718,4.2,174990.0,Intel Core i7 Processor,16 GB,Windows 11,"(512 GB SSD,)",64,DDR5,2


In [243]:
class Recommender:
    def __init__(self, recommender):
        self.recommender = recommender
        
    def predict(self, data):
        return self.recommender.predict(data)
    
    def get_k_recommendations(self, k, cluster):
        recommendations = data[data.cluster == cluster]
        if recommendations.shape[0] > 0:
            recommendations = recommendations.sample(k)
            recommendations.index = range(1, k+1)
        return recommendations.iloc[:,:-1]

recommender = Recommender(pipeline)

In [244]:
joblib.dump(recommender, "../model/recommender.obj")

['../model/recommender.obj']

In [250]:
!ls ../model

model.obj  recommender.obj


In [246]:
data.head()

Unnamed: 0,Rating,MRP,Processor,RAM,OS,Storage,OS_arch,RAM_Type,cluster
0,4.2,25490.0,Intel Celeron Dual Core Processor,4 GB,Windows 11,"(512 GB SSD,)",64,LPDDR4,3
1,3.8,22490.0,Intel Core i5 Processor,4 GB,Windows 11,"(512 GB SSD,)",64,LPDDR4X,6
2,4.1,25685.0,Intel Celeron Dual Core Processor,8 GB,Windows 11,"(256 GB SSD,)",64,DDR4,1
3,4.4,51990.0,Intel Core i5 Processor,8 GB,Windows 11,"(512 GB SSD,)",64,DDR4,5
4,4.2,38990.0,Intel Core i3 Processor,8 GB,Windows 11,"(512 GB SSD,)",64,DDR4,5


In [224]:
data.to_csv("../data/final.csv")

In [247]:
recommender.get_k_recommendations(5, 4)

Unnamed: 0,Rating,MRP,Processor,RAM,OS,Storage,OS_arch,RAM_Type
1,4.2,246490.0,Intel Core i7 Processor,32 GB,Windows 11,"(1 TB SSD,)",64,DDR5
2,4.2,249900.0,Intel Core i5 Processor,16 GB,Windows 11,"(1 TB SSD,)",64,Unified Memory
3,4.2,301818.0,Intel Core i9 Processor,32 GB,Windows 11,"(1 TB SSD,)",64,DDR5
4,4.2,281990.0,Intel Core i9 Processor,32 GB,Windows 11,"(1 TB SSD,)",64,DDR5
5,4.2,349900.0,Intel Core i5 Processor,32 GB,Windows 11,"(1 TB SSD,)",64,Unified Memory
