In [19]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.cluster import KMeans
from sklearn.metrics  import silhouette_score
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings('ignore')

In [20]:
df = pd.read_csv("laptops.csv")
df

Unnamed: 0,brand,laptop_name,display_size,processor_type,graphics_card,disk_space,discount_price,old_price,ratings_5max
0,HP,Notebook 14-df0008nx,14.0,Intel Celeron N4000,Intel HD Graphics 600,64 GB (eMMC),1259.0,1259.0,0 / 5
1,Lenovo,IdeaPad 330S-14IKB,14.0,Intel Core i5-8250U,Intel UHD Graphics 620,1 TB HDD,1849.0,2099.0,3.3 / 5
2,Huawei,MateBook D Volta,14.0,Intel Core i5-8250U,NVIDIA GeForce MX150 (2 GB),256 GB SSD,2999.0,3799.0,0 / 5
3,Dell,Inspiron 15 3567,15.6,Intel Core i3-7020U,Intel HD Graphics 620,1 TB HDD,1849.0,1849.0,0 / 5
4,Asus,VivoBook 15 X510UR,15.6,Intel Core i7-8550U,NVIDIA GeForce 930MX (2 GB),1 TB HDD,2499.0,3149.0,0 / 5
...,...,...,...,...,...,...,...,...,...
200,Lenovo,IdeaPad 320-15IKBRN,15.6,Intel Core i5-8250U,Intel GMA HD,1 TB HDD,2099.0,2099.0,3.8 / 5
201,Huawei,MateBook D,15.6,Intel Core i7-8550U,NVIDIA GeForce MX150 (2 GB),128 GB SSD/1 TB HDD,3299.0,3299.0,4.0 / 5
202,Apple,MacBook Pro (Retina + Touch Bar),15.4,Intel Core i7 6 Core,Radeon Pro 555X GDDR5 (4 GB),256 GB SSD,10199.0,10199.0,0 / 5
203,Apple,MacBook Pro (Retina + Touch Bar),15.4,Intel Core i7 6 Core,Radeon Pro 560X GDDR5 (4 GB),512 GB SSD,11899.0,11899.0,4.4 / 5


## Data Preprocessing

In [21]:
df.isnull().sum()

brand             0
laptop_name       1
display_size      0
processor_type    0
graphics_card     0
disk_space        0
discount_price    0
old_price         0
ratings_5max      0
dtype: int64

Dropping the null values in the dataset as there is only one null value in whole.

In [22]:
df.dropna(inplace=True)

Copying the values in the new dataset, so that the original is not changed

In [23]:
df2 = df.copy(deep=True)

Getting the absolute ratings of the laptops

In [24]:
df2['ratings_5max'] = df["ratings_5max"].apply(lambda x: float(x.split()[0]))
df2

Unnamed: 0,brand,laptop_name,display_size,processor_type,graphics_card,disk_space,discount_price,old_price,ratings_5max
0,HP,Notebook 14-df0008nx,14.0,Intel Celeron N4000,Intel HD Graphics 600,64 GB (eMMC),1259.0,1259.0,0.0
1,Lenovo,IdeaPad 330S-14IKB,14.0,Intel Core i5-8250U,Intel UHD Graphics 620,1 TB HDD,1849.0,2099.0,3.3
2,Huawei,MateBook D Volta,14.0,Intel Core i5-8250U,NVIDIA GeForce MX150 (2 GB),256 GB SSD,2999.0,3799.0,0.0
3,Dell,Inspiron 15 3567,15.6,Intel Core i3-7020U,Intel HD Graphics 620,1 TB HDD,1849.0,1849.0,0.0
4,Asus,VivoBook 15 X510UR,15.6,Intel Core i7-8550U,NVIDIA GeForce 930MX (2 GB),1 TB HDD,2499.0,3149.0,0.0
...,...,...,...,...,...,...,...,...,...
200,Lenovo,IdeaPad 320-15IKBRN,15.6,Intel Core i5-8250U,Intel GMA HD,1 TB HDD,2099.0,2099.0,3.8
201,Huawei,MateBook D,15.6,Intel Core i7-8550U,NVIDIA GeForce MX150 (2 GB),128 GB SSD/1 TB HDD,3299.0,3299.0,4.0
202,Apple,MacBook Pro (Retina + Touch Bar),15.4,Intel Core i7 6 Core,Radeon Pro 555X GDDR5 (4 GB),256 GB SSD,10199.0,10199.0,0.0
203,Apple,MacBook Pro (Retina + Touch Bar),15.4,Intel Core i7 6 Core,Radeon Pro 560X GDDR5 (4 GB),512 GB SSD,11899.0,11899.0,4.4


Disk space has two segments i.e; size of the disk and its type. Aim to convert whole column in terms of GB's.

In [25]:
df2.disk_space.apply(lambda x: x.split()[-1]).unique()

array(['(eMMC)', 'HDD', 'SSD', 'SSD)', 'Flash)'], dtype=object)

Correcting the typo with SSD and Flash.

In [26]:
typeDict = {'(eMMC)': 'eMMC', 'HDD': 'HDD', 'SSD': 'SSD', 'SSD)': 'SSD', 'Flash)': 'Flash'}
df2['disk_type'] = df2['disk_space'].map(lambda x: typeDict[x.split()[-1]])
df2['disk_space'] = df2['disk_space'].apply(lambda x: float(x.split()[0])*1024 if x.split()[1] == 'TB' else float(x.split()[0]))

In [27]:
df2['disk_type'].unique()

array(['eMMC', 'HDD', 'SSD', 'Flash'], dtype=object)

## Replacing the disk type with encoded numbers.

In [28]:
df2['disk_type'] = df2['disk_type'].replace('eMMC',1,regex=True)
df2['disk_type'] = df2['disk_type'].replace('HDD',2,regex=True)
df2['disk_type'] = df2['disk_type'].replace('SSD',3,regex=True)
df2['disk_type'] = df2['disk_type'].replace('Flash',4,regex=True)

## Note

The display size in the dataset is in inches.

Checking the number of unique brands.

In [29]:
df2['brand'].unique()

array(['HP', 'Lenovo', 'Huawei', 'Dell', 'Asus', 'Apple', 'Acer',
       'Microsoft', 'MSI'], dtype=object)

Dropping the uneccessary columns for the prediction model.

In [30]:
df2 = df2.drop(['brand', 'laptop_name','processor_type','graphics_card','disk_type'], axis=1)
scaler = StandardScaler()
df_scaled = scaler.fit_transform(df2)

Finding the optimal number of clusters using the silhoutter scores.

In [31]:
silhouette_scores = []
for n_clusters in range(2, 11):
    kmeans = KMeans(n_clusters=n_clusters)
    cluster_labels = kmeans.fit_predict(df_scaled)
    silhouette_scores.append(silhouette_score(df_scaled, cluster_labels))

optimal_n_clusters = silhouette_scores.index(max(silhouette_scores)) + 2
print(f"The optimal number of clusters is {optimal_n_clusters}")

The optimal number of clusters is 9


In [32]:
kmeans = KMeans(n_clusters=10)
cluster_labels = kmeans.fit_predict(df_scaled)

In [None]:
df2['Cluster'] = cluster_labels
df2

Recommending the laptop based on user preferences.<br>
For example : if user prefers's a laptop with a 14 inch screen size, an above 3-star rating and disk space of atleast 32GB.<br><br>
Creating a user preference dataframe.

In [49]:
user_preferences = pd.DataFrame({
    'display_size': [14],
    'disk_space':[32],
    'discount_price':[10000],
    'old_price': [20000],
    'ratings_5max':[3],   
})

In [50]:
df2.head()

Unnamed: 0,display_size,disk_space,discount_price,old_price,ratings_5max,Cluster
0,14.0,64.0,1259.0,1259.0,0.0,4
1,14.0,1024.0,1849.0,2099.0,3.3,7
2,14.0,256.0,2999.0,3799.0,0.0,0
3,15.6,1024.0,1849.0,1849.0,0.0,1
4,15.6,1024.0,2499.0,3149.0,0.0,1


Scaling the user preference dataframe.

In [51]:
user_preferences_scaled = scaler.transform(user_preferences)

In [52]:
cluster = kmeans.predict(user_preferences_scaled)[0]

Recommending the laptops based on user preference.

In [53]:
recommended_laptops = df[df2['Cluster'] == cluster]
recommended_product = pd.DataFrame(recommended_laptops)
recommended_product

Unnamed: 0,brand,laptop_name,display_size,processor_type,graphics_card,disk_space,discount_price,old_price,ratings_5max
14,Apple,MacBook Pro (Retina + Touch Bar),15.4,Intel Core i7 6 Core,Radeon Pro 555X GDDR5 (4 GB),256 GB SSD,9099.0,10199.0,0 / 5
50,Dell,XPS 15 9570,15.6,Intel Core i7-8750H,NVIDIA GeForce GTX 1050 Ti (4 GB),128 GB M.2 SSD/1 TB HDD,6099.0,6499.0,0 / 5
131,Apple,MacBook Pro (Retina + Touch Bar),15.4,Intel Core i7 6 Core,Radeon Pro 555X GDDR5 (4 GB),256 GB SSD,10799.0,10799.0,0 / 5
132,Apple,MacBook Pro (Retina + Touch Bar),15.4,Intel Core i9,Radeon Pro 560X GDDR5 (4 GB),512 GB SSD,12499.0,12499.0,0 / 5
135,MSI,GT83VR 7RF Titan SLI,18.4,Intel Core i7-7820HK,NVIDIA GeForce GTX 1080 (8 GB) SLI,256 GB SSD (Super Raid)/1 TB HDD,9071.0,9071.0,0 / 5
136,Asus,ROG G752VS,17.3,Intel Core i7-7700HQ,NVIDIA GeForce GTX 1070 (8 GB),256 GB SSD/1 TB HDD,8189.0,8189.0,0 / 5
137,Acer,Predator 17 GX791 78ND,17.3,Intel Core i7-6700HQ,NVIDIA GeForce GTX 980M (8 GB),256 GB SSD/1 TB HDD,7507.0,7507.0,0 / 5
138,Apple,MacBook Pro (Retina),15.4,Intel Core i7 Quad Core,AMD Radeon R9-M370X (2 GB),512 GB SSD,6824.0,6824.0,0 / 5
139,Dell,Alienware 15,15.6,Intel Core i7-7820HK,NVIDIA GeForce GTX 1080 (8 GB),256 GB SSD/1 TB HDD,6688.0,6688.0,3.7 / 5
141,Apple,MacBook Pro (Retina + Touch Bar),15.4,Intel Core i7 Quad Core,Radeon Pro 555 GDDR5 (2 GB),256 GB SSD,7679.0,7679.0,4.6 / 5


In [None]:
[['brand', 'laptop_name','processor_type','graphics_card','display_size']]