In [118]:
import pandas as pd
import re
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
from sklearn import cluster, datasets, mixture

from sklearn.metrics import silhouette_score,davies_bouldin_score,calinski_harabasz_score

from sklearn.cluster import KMeans,AgglomerativeClustering,DBSCAN

from sklearn.manifold import TSNE
#encoding
from sklearn.preprocessing import OneHotEncoder,LabelEncoder

In [119]:
# Load the laptop dataset; the path is relative to the notebook's working directory.
df=pd.read_csv("../../data/modelers_data.csv")
# Bare last expression: renders the loaded frame as the cell output.
df

Unnamed: 0,index,brand,Model,Price,Rating,processor_brand,processor_tier,num_cores,num_threads,ram_memory,...,secondary_storage_type,secondary_storage_capacity,gpu_brand,gpu_type,is_touch_screen,display_size,resolution_width,resolution_height,OS,year_of_warranty
0,949,msi,MSI Modern 15 B7M-072IN Laptop (Ryzen 5 7530U/...,43990,53,amd,ryzen 5,6,12,8,...,No secondary storage,0,amd,integrated,False,15.6,1920,1080,windows,1
1,584,msi,MSI Prestige 16 A13UCX-250IN Laptop (13th Gen ...,119990,73,intel,core i7,14,20,16,...,No secondary storage,0,nvidia,dedicated,False,16.0,2560,1600,windows,2
2,937,acer,Acer Aspire 7 A715-75G Gaming Laptop (10th Gen...,45990,59,intel,core i5,4,8,8,...,No secondary storage,0,nvidia,dedicated,False,15.6,1920,1080,windows,1
3,416,msi,MSI CreatorPro Z16 HX B13VKTO-214IN Laptop (13...,449990,89,intel,core i9,24,32,4,...,No secondary storage,0,nvidia,dedicated,False,16.0,2560,1600,windows,2
4,538,acer,Acer Aspire 3 Spin 14 NX.KENSI.002 Laptop (Int...,41490,45,intel,core i3,8,8,8,...,No secondary storage,0,intel,integrated,False,14.0,1920,1200,windows,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
688,114,acer,Acer Aspire Lite AL15 Laptop (AMD Ryzen 5 5500...,37990,60,amd,ryzen 5,6,12,16,...,No secondary storage,0,amd,integrated,False,15.6,1920,1080,windows,1
689,281,hp,HP Victus 15-fb0050AX Gaming Laptop (AMD Ryzen...,59871,71,amd,ryzen 5,6,12,8,...,No secondary storage,0,nvidia,dedicated,False,15.6,1920,1080,windows,1
690,872,dell,Dell Vostro 15 3510 2023 Laptop (11th Gen Core...,47990,63,intel,core i5,4,8,8,...,No secondary storage,0,intel,integrated,False,15.6,1920,1080,windows,1
691,447,hp,HP 240 G9 821J4PA Laptop (12th Gen Core i5/ 8G...,74999,64,intel,core i5,10,12,8,...,No secondary storage,0,intel,integrated,False,14.0,1366,768,windows,1


## Plan for encoding
### Columns to encode:
- brand
- columns that will be created from the Model column
  - We decided not to encode the exact OS version: it is too detailed, has too many distinct values, and does not seem to carry much useful information.
  - We decided not to encode the processor generation: extracting it was too problematic and we only had partial information about it.
  - We focus only on encoding the exact model name here.
- processor_brand
- processor_tier
- primary_storage_type
- secondary_storage_type
- gpu_brand
- gpu_type
- is_touch_screen (binary encoding)
- OS

In [120]:

def extract_info_from_Model_column(df):
    """Derive the ``exact_model`` feature from the free-text ``Model`` column.

    ``exact_model`` is the second space-separated word of the text that
    precedes the first ``'('`` in ``Model`` (the first word is treated as the
    brand and skipped), e.g. ``'MSI Modern 15 ... (Ryzen 5 ...)'`` -> ``'Modern'``.
    Names without a real series word yield whatever token comes second
    (e.g. ``'HP 240 G9 ...'`` -> ``'240'``).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a string column ``Model``. It is not modified.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with ``exact_model`` appended and the ``index`` column
        removed when present.
    """
    df2 = df.copy()
    # Everything before the first '(' is the product name; the parenthesised
    # part holds the spec summary, which is not used here.
    name_part = df2['Model'].str.split('(').str[0]
    # Drop the leading brand word, then keep the first remaining word.
    # The slice/join/split chain (rather than .str[1]) makes a one-word name
    # produce '' instead of NaN.
    without_brand = name_part.str.split(' ').str[1:].str.join(' ')
    df2['exact_model'] = without_brand.str.split(' ').str[0]
    # 'index' is a leftover row id, not a feature; tolerate frames that lack it.
    return df2.drop(columns=['index'], errors='ignore')

# Build the working frame: adds `exact_model` and removes `index`.
# The function works on a copy, so the raw `df` stays as loaded.
df2=extract_info_from_Model_column(df)
df2


Unnamed: 0,brand,Model,Price,Rating,processor_brand,processor_tier,num_cores,num_threads,ram_memory,primary_storage_type,...,secondary_storage_capacity,gpu_brand,gpu_type,is_touch_screen,display_size,resolution_width,resolution_height,OS,year_of_warranty,exact_model
0,msi,MSI Modern 15 B7M-072IN Laptop (Ryzen 5 7530U/...,43990,53,amd,ryzen 5,6,12,8,SSD,...,0,amd,integrated,False,15.6,1920,1080,windows,1,Modern
1,msi,MSI Prestige 16 A13UCX-250IN Laptop (13th Gen ...,119990,73,intel,core i7,14,20,16,SSD,...,0,nvidia,dedicated,False,16.0,2560,1600,windows,2,Prestige
2,acer,Acer Aspire 7 A715-75G Gaming Laptop (10th Gen...,45990,59,intel,core i5,4,8,8,SSD,...,0,nvidia,dedicated,False,15.6,1920,1080,windows,1,Aspire
3,msi,MSI CreatorPro Z16 HX B13VKTO-214IN Laptop (13...,449990,89,intel,core i9,24,32,4,SSD,...,0,nvidia,dedicated,False,16.0,2560,1600,windows,2,CreatorPro
4,acer,Acer Aspire 3 Spin 14 NX.KENSI.002 Laptop (Int...,41490,45,intel,core i3,8,8,8,SSD,...,0,intel,integrated,False,14.0,1920,1200,windows,1,Aspire
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
688,acer,Acer Aspire Lite AL15 Laptop (AMD Ryzen 5 5500...,37990,60,amd,ryzen 5,6,12,16,SSD,...,0,amd,integrated,False,15.6,1920,1080,windows,1,Aspire
689,hp,HP Victus 15-fb0050AX Gaming Laptop (AMD Ryzen...,59871,71,amd,ryzen 5,6,12,8,SSD,...,0,nvidia,dedicated,False,15.6,1920,1080,windows,1,Victus
690,dell,Dell Vostro 15 3510 2023 Laptop (11th Gen Core...,47990,63,intel,core i5,4,8,8,SSD,...,0,intel,integrated,False,15.6,1920,1080,windows,1,Vostro
691,hp,HP 240 G9 821J4PA Laptop (12th Gen Core i5/ 8G...,74999,64,intel,core i5,10,12,8,SSD,...,0,intel,integrated,False,14.0,1366,768,windows,1,240


In [121]:
# Count how many rows each extracted model name covers.
df2['exact_model'].value_counts()

exact_model
Vivobook             82
IdeaPad              67
Inspiron             42
Aspire               33
Victus               30
                     ..
15s-eq2223AU          1
15s-fq5007TU          1
15s-fr4000TU          1
NU14A1                1
Victus15-fb0147AX     1
Name: count, Length: 130, dtype: int64

In [122]:
def encode_exact_model(df):
    """One-hot encode ``exact_model``, pooling rare names into ``'other'``.

    Model names that occur 3 times or fewer are replaced by the single
    category ``'other'`` before encoding. One 0/1 integer column is appended
    per remaining category, named after the raw category value and ordered
    by sorted category name. The ``Model`` and ``exact_model`` columns are
    removed from the result.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``Model`` and ``exact_model``. It is not modified.

    Returns
    -------
    pandas.DataFrame
        New frame with the same row index as ``df``.

    Notes
    -----
    Indicator columns are not prefixed, so ``'other'`` can collide with an
    ``'other'`` category produced when another column is encoded the same way.
    """
    # Per-row frequency of that row's own model name.
    frequency = df['exact_model'].map(df['exact_model'].value_counts())
    # Build the pooled labels in a new Series so the caller's column is left
    # untouched.
    pooled = pd.Series(
        np.where(frequency > 3, df['exact_model'], 'other'),
        index=df.index,
        name='exact_model',
    )
    # get_dummies keeps df's row index, so the concat below lines up row for
    # row even when df does not carry a default 0..n-1 index.
    indicators = pd.get_dummies(pooled, dtype=int)
    return pd.concat([df.drop(columns=['Model', 'exact_model']), indicators], axis=1)

# Replace `Model`/`exact_model` with one-hot model-name columns (rare names pooled into `other`).
df2=encode_exact_model(df2)
df2

Unnamed: 0,brand,Price,Rating,processor_brand,processor_tier,num_cores,num_threads,ram_memory,primary_storage_type,primary_storage_capacity,...,Thinkpad,V14,V15,Victus,VivoBook,Vivobook,Vostro,Yoga,Zenbook,other
0,msi,43990,53,amd,ryzen 5,6,12,8,SSD,512,...,0,0,0,0,0,0,0,0,0,0
1,msi,119990,73,intel,core i7,14,20,16,SSD,1024,...,0,0,0,0,0,0,0,0,0,1
2,acer,45990,59,intel,core i5,4,8,8,SSD,512,...,0,0,0,0,0,0,0,0,0,0
3,msi,449990,89,intel,core i9,24,32,4,SSD,2048,...,0,0,0,0,0,0,0,0,0,1
4,acer,41490,45,intel,core i3,8,8,8,SSD,512,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
688,acer,37990,60,amd,ryzen 5,6,12,16,SSD,512,...,0,0,0,0,0,0,0,0,0,0
689,hp,59871,71,amd,ryzen 5,6,12,8,SSD,512,...,0,0,0,1,0,0,0,0,0,0
690,dell,47990,63,intel,core i5,4,8,8,SSD,512,...,0,0,0,0,0,0,1,0,0,0
691,hp,74999,64,intel,core i5,10,12,8,SSD,512,...,0,0,0,0,0,0,0,0,0,1


In [123]:
# NOTE(review): this cell raises KeyError. `encode_exact_model` drops the
# `exact_model` column from the frame it returns, so df2 no longer has it here.
# Remove this cell, or run the check before the encoding step.
df2['exact_model'].value_counts()

KeyError: 'exact_model'

In [124]:
def encode_brand(df):
    """One-hot encode the ``brand`` column.

    Appends one 0/1 integer column per distinct brand (named after the raw
    brand value, in sorted order) and removes ``brand``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``brand``. It is not modified.

    Returns
    -------
    pandas.DataFrame
        New frame with the same row index as ``df``.

    Notes
    -----
    Indicator columns are not prefixed, so a brand name can collide with an
    identically named category from another encoded column. Rows whose brand
    is missing get 0 in every indicator column.
    """
    # get_dummies keeps df's row index, so the concat lines up row for row
    # even when df does not carry a default 0..n-1 index.
    indicators = pd.get_dummies(df['brand'], dtype=int)
    return pd.concat([df.drop(columns=['brand']), indicators], axis=1)

# Replace `brand` with one indicator column per brand.
df2=encode_brand(df2)
df2

Unnamed: 0,Price,Rating,processor_brand,processor_tier,num_cores,num_threads,ram_memory,primary_storage_type,primary_storage_capacity,secondary_storage_type,...,microsoft,msi,primebook,realme,samsung,tecno,ultimus,walker,wings,zebronics
0,43990,53,amd,ryzen 5,6,12,8,SSD,512,No secondary storage,...,0,1,0,0,0,0,0,0,0,0
1,119990,73,intel,core i7,14,20,16,SSD,1024,No secondary storage,...,0,1,0,0,0,0,0,0,0,0
2,45990,59,intel,core i5,4,8,8,SSD,512,No secondary storage,...,0,0,0,0,0,0,0,0,0,0
3,449990,89,intel,core i9,24,32,4,SSD,2048,No secondary storage,...,0,1,0,0,0,0,0,0,0,0
4,41490,45,intel,core i3,8,8,8,SSD,512,No secondary storage,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
688,37990,60,amd,ryzen 5,6,12,16,SSD,512,No secondary storage,...,0,0,0,0,0,0,0,0,0,0
689,59871,71,amd,ryzen 5,6,12,8,SSD,512,No secondary storage,...,0,0,0,0,0,0,0,0,0,0
690,47990,63,intel,core i5,4,8,8,SSD,512,No secondary storage,...,0,0,0,0,0,0,0,0,0,0
691,74999,64,intel,core i5,10,12,8,SSD,512,No secondary storage,...,0,0,0,0,0,0,0,0,0,0


In [125]:
def encode_processor_brand(df):
    """One-hot encode the ``processor_brand`` column.

    Appends one 0/1 integer column per distinct processor brand (named after
    the raw value, in sorted order) and removes ``processor_brand``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``processor_brand``. It is not modified.

    Returns
    -------
    pandas.DataFrame
        New frame with the same row index as ``df``.

    Notes
    -----
    Indicator columns are not prefixed, so a value such as ``'apple'`` or
    ``'other'`` yields a column label that may already exist in ``df`` from
    an earlier encoding step; both columns are kept under the same label.
    Rows whose value is missing get 0 in every indicator column.
    """
    # get_dummies keeps df's row index, so the concat lines up row for row
    # even when df does not carry a default 0..n-1 index.
    indicators = pd.get_dummies(df['processor_brand'], dtype=int)
    return pd.concat([df.drop(columns=['processor_brand']), indicators], axis=1)

# Replace `processor_brand` with one indicator column per processor brand.
df2=encode_processor_brand(df2)
df2

Unnamed: 0,Price,Rating,processor_tier,num_cores,num_threads,ram_memory,primary_storage_type,primary_storage_capacity,secondary_storage_type,secondary_storage_capacity,...,samsung,tecno,ultimus,walker,wings,zebronics,amd,apple,intel,other
0,43990,53,ryzen 5,6,12,8,SSD,512,No secondary storage,0,...,0,0,0,0,0,0,1,0,0,0
1,119990,73,core i7,14,20,16,SSD,1024,No secondary storage,0,...,0,0,0,0,0,0,0,0,1,0
2,45990,59,core i5,4,8,8,SSD,512,No secondary storage,0,...,0,0,0,0,0,0,0,0,1,0
3,449990,89,core i9,24,32,4,SSD,2048,No secondary storage,0,...,0,0,0,0,0,0,0,0,1,0
4,41490,45,core i3,8,8,8,SSD,512,No secondary storage,0,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
688,37990,60,ryzen 5,6,12,16,SSD,512,No secondary storage,0,...,0,0,0,0,0,0,1,0,0,0
689,59871,71,ryzen 5,6,12,8,SSD,512,No secondary storage,0,...,0,0,0,0,0,0,1,0,0,0
690,47990,63,core i5,4,8,8,SSD,512,No secondary storage,0,...,0,0,0,0,0,0,0,0,1,0
691,74999,64,core i5,10,12,8,SSD,512,No secondary storage,0,...,0,0,0,0,0,0,0,0,1,0


In [126]:
# Inspect the raw tier labels; this reads the original frame `df`, not the working frame `df2`.
df['processor_tier'].value_counts()

processor_tier
core i5         231
core i7         113
ryzen 5         100
core i3          95
ryzen 7          59
celeron          25
core i9          23
ryzen 3          21
ryzen 9           8
other             7
m3                6
m2                3
core ultra 7      1
pentium           1
Name: count, dtype: int64

In [127]:
def encode_processor_tier(df):
    """Reduce ``processor_tier`` labels to an integer tier number.

    The first digit found in each label becomes the tier
    (``'core i5'`` -> 5, ``'ryzen 7'`` -> 7, ``'m3'`` -> 3); labels without a
    digit (``'celeron'``, ``'pentium'``, ``'other'``) and missing values
    become 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a string column ``processor_tier``. It is not modified.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` whose ``processor_tier`` column has an integer dtype.

    Notes
    -----
    Only a single digit is captured, and labels from different families that
    share a digit (e.g. ``'core i3'``, ``'ryzen 3'``, ``'m3'``) map to the
    same number.
    """
    encoded = df.copy()
    # expand=False returns a Series rather than a one-column DataFrame.
    first_digit = encoded['processor_tier'].str.extract(r'(\d)', expand=False)
    # Convert the digit strings to numbers before filling the gaps, so the
    # column ends up integer-typed instead of mixing '5'-style strings with 0.
    encoded['processor_tier'] = pd.to_numeric(first_digit).fillna(0).astype(int)
    return encoded

# Replace the textual processor tier with the digit extracted from it (0 when the label has no digit).
df2=encode_processor_tier(df2)
df2

Unnamed: 0,Price,Rating,processor_tier,num_cores,num_threads,ram_memory,primary_storage_type,primary_storage_capacity,secondary_storage_type,secondary_storage_capacity,...,samsung,tecno,ultimus,walker,wings,zebronics,amd,apple,intel,other
0,43990,53,5,6,12,8,SSD,512,No secondary storage,0,...,0,0,0,0,0,0,1,0,0,0
1,119990,73,7,14,20,16,SSD,1024,No secondary storage,0,...,0,0,0,0,0,0,0,0,1,0
2,45990,59,5,4,8,8,SSD,512,No secondary storage,0,...,0,0,0,0,0,0,0,0,1,0
3,449990,89,9,24,32,4,SSD,2048,No secondary storage,0,...,0,0,0,0,0,0,0,0,1,0
4,41490,45,3,8,8,8,SSD,512,No secondary storage,0,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
688,37990,60,5,6,12,16,SSD,512,No secondary storage,0,...,0,0,0,0,0,0,1,0,0,0
689,59871,71,5,6,12,8,SSD,512,No secondary storage,0,...,0,0,0,0,0,0,1,0,0,0
690,47990,63,5,4,8,8,SSD,512,No secondary storage,0,...,0,0,0,0,0,0,0,0,1,0
691,74999,64,5,10,12,8,SSD,512,No secondary storage,0,...,0,0,0,0,0,0,0,0,1,0


In [128]:
def encode_primary_storage_type(df):
    """Binary-encode ``primary_storage_type`` as ``primary_storage_is_SSD``.

    The value is 1 where the storage type equals ``'SSD'`` and 0 for every
    other value (including missing ones). The column keeps its position and
    is renamed to ``primary_storage_is_SSD``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``primary_storage_type``. It is not modified.

    Returns
    -------
    pandas.DataFrame
        New frame with the encoded, renamed column.
    """
    is_ssd = np.where(df['primary_storage_type'] == 'SSD', 1, 0)
    # assign/rename return new frames, so the caller's frame keeps its
    # original column and the function can be re-applied to that frame.
    return (
        df.assign(primary_storage_type=is_ssd)
          .rename(columns={'primary_storage_type': 'primary_storage_is_SSD'})
    )

# Turn `primary_storage_type` into the 0/1 column `primary_storage_is_SSD`.
df2=encode_primary_storage_type(df2)
df2

Unnamed: 0,Price,Rating,processor_tier,num_cores,num_threads,ram_memory,primary_storage_is_SSD,primary_storage_capacity,secondary_storage_type,secondary_storage_capacity,...,samsung,tecno,ultimus,walker,wings,zebronics,amd,apple,intel,other
0,43990,53,5,6,12,8,1,512,No secondary storage,0,...,0,0,0,0,0,0,1,0,0,0
1,119990,73,7,14,20,16,1,1024,No secondary storage,0,...,0,0,0,0,0,0,0,0,1,0
2,45990,59,5,4,8,8,1,512,No secondary storage,0,...,0,0,0,0,0,0,0,0,1,0
3,449990,89,9,24,32,4,1,2048,No secondary storage,0,...,0,0,0,0,0,0,0,0,1,0
4,41490,45,3,8,8,8,1,512,No secondary storage,0,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
688,37990,60,5,6,12,16,1,512,No secondary storage,0,...,0,0,0,0,0,0,1,0,0,0
689,59871,71,5,6,12,8,1,512,No secondary storage,0,...,0,0,0,0,0,0,1,0,0,0
690,47990,63,5,4,8,8,1,512,No secondary storage,0,...,0,0,0,0,0,0,0,0,1,0
691,74999,64,5,10,12,8,1,512,No secondary storage,0,...,0,0,0,0,0,0,0,0,1,0


In [129]:
# Check which secondary storage types occur before deciding how to encode the column.
df2['secondary_storage_type'].value_counts()

secondary_storage_type
No secondary storage    683
SSD                      10
Name: count, dtype: int64

In [130]:
def encode_secondary_storage_type(df):
    """Remove ``secondary_storage_type`` instead of encoding it.

    The column adds little information: whenever a secondary drive is present
    it is an SSD, and the secondary storage capacity column already records
    whether one exists.

    Returns a new frame without the column; raises ``KeyError`` if the column
    is absent.
    """
    redundant = 'secondary_storage_type'
    return df.drop(columns=[redundant])

# Drop `secondary_storage_type`; the secondary storage capacity column is kept.
df2=encode_secondary_storage_type(df2)
df2


Unnamed: 0,Price,Rating,processor_tier,num_cores,num_threads,ram_memory,primary_storage_is_SSD,primary_storage_capacity,secondary_storage_capacity,gpu_brand,...,samsung,tecno,ultimus,walker,wings,zebronics,amd,apple,intel,other
0,43990,53,5,6,12,8,1,512,0,amd,...,0,0,0,0,0,0,1,0,0,0
1,119990,73,7,14,20,16,1,1024,0,nvidia,...,0,0,0,0,0,0,0,0,1,0
2,45990,59,5,4,8,8,1,512,0,nvidia,...,0,0,0,0,0,0,0,0,1,0
3,449990,89,9,24,32,4,1,2048,0,nvidia,...,0,0,0,0,0,0,0,0,1,0
4,41490,45,3,8,8,8,1,512,0,intel,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
688,37990,60,5,6,12,16,1,512,0,amd,...,0,0,0,0,0,0,1,0,0,0
689,59871,71,5,6,12,8,1,512,0,nvidia,...,0,0,0,0,0,0,1,0,0,0
690,47990,63,5,4,8,8,1,512,0,intel,...,0,0,0,0,0,0,0,0,1,0
691,74999,64,5,10,12,8,1,512,0,intel,...,0,0,0,0,0,0,0,0,1,0


In [131]:
# Check the distinct GPU brands and their frequencies before encoding.
df2['gpu_brand'].value_counts()

gpu_brand
intel     315
nvidia    247
amd       120
apple       9
arm         2
Name: count, dtype: int64

In [132]:
def encode_gpu_brand(df):
    """One-hot encode the ``gpu_brand`` column.

    Appends one 0/1 integer column per distinct GPU brand (named after the
    raw value, in sorted order) and removes ``gpu_brand``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``gpu_brand``. It is not modified.

    Returns
    -------
    pandas.DataFrame
        New frame with the same row index as ``df``.

    Notes
    -----
    Indicator columns are not prefixed. GPU brands such as ``'amd'``,
    ``'apple'`` or ``'intel'`` therefore produce labels that may already
    exist in ``df`` (e.g. from an encoded processor brand); both columns are
    kept under the same label. Rows whose value is missing get 0 in every
    indicator column.
    """
    # get_dummies keeps df's row index, so the concat lines up row for row
    # even when df does not carry a default 0..n-1 index.
    indicators = pd.get_dummies(df['gpu_brand'], dtype=int)
    return pd.concat([df.drop(columns=['gpu_brand']), indicators], axis=1)

# Replace `gpu_brand` with one indicator column per GPU brand.
# NOTE(review): the new `amd`/`apple`/`intel` columns share their labels with the
# processor-brand indicators added earlier, so df2 now has duplicate column labels.
df2=encode_gpu_brand(df2)
df2

Unnamed: 0,Price,Rating,processor_tier,num_cores,num_threads,ram_memory,primary_storage_is_SSD,primary_storage_capacity,secondary_storage_capacity,gpu_type,...,zebronics,amd,apple,intel,other,amd.1,apple.1,arm,intel.1,nvidia
0,43990,53,5,6,12,8,1,512,0,integrated,...,0,1,0,0,0,1,0,0,0,0
1,119990,73,7,14,20,16,1,1024,0,dedicated,...,0,0,0,1,0,0,0,0,0,1
2,45990,59,5,4,8,8,1,512,0,dedicated,...,0,0,0,1,0,0,0,0,0,1
3,449990,89,9,24,32,4,1,2048,0,dedicated,...,0,0,0,1,0,0,0,0,0,1
4,41490,45,3,8,8,8,1,512,0,integrated,...,0,0,0,1,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
688,37990,60,5,6,12,16,1,512,0,integrated,...,0,1,0,0,0,1,0,0,0,0
689,59871,71,5,6,12,8,1,512,0,dedicated,...,0,1,0,0,0,0,0,0,0,1
690,47990,63,5,4,8,8,1,512,0,integrated,...,0,0,0,1,0,0,0,0,1,0
691,74999,64,5,10,12,8,1,512,0,integrated,...,0,0,0,1,0,0,0,0,1,0


In [133]:
# Check the distinct GPU types and their frequencies before encoding.
df2['gpu_type'].value_counts()

gpu_type
integrated    426
dedicated     258
apple           9
Name: count, dtype: int64

In [134]:
def encode_gpu_type(df):
    """One-hot encode the ``gpu_type`` column.

    Appends one 0/1 integer column per distinct GPU type (named after the raw
    value, in sorted order) and removes ``gpu_type``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``gpu_type``. It is not modified.

    Returns
    -------
    pandas.DataFrame
        New frame with the same row index as ``df``.

    Notes
    -----
    Indicator columns are not prefixed, so a type such as ``'apple'`` yields
    a label that may already exist in ``df`` from an earlier encoding step;
    both columns are kept under the same label. Rows whose value is missing
    get 0 in every indicator column.
    """
    # get_dummies keeps df's row index, so the concat lines up row for row
    # even when df does not carry a default 0..n-1 index.
    indicators = pd.get_dummies(df['gpu_type'], dtype=int)
    return pd.concat([df.drop(columns=['gpu_type']), indicators], axis=1)

# Replace `gpu_type` with one indicator column per GPU type.
df2=encode_gpu_type(df2)
df2

Unnamed: 0,Price,Rating,processor_tier,num_cores,num_threads,ram_memory,primary_storage_is_SSD,primary_storage_capacity,secondary_storage_capacity,is_touch_screen,...,intel,other,amd,apple,arm,intel.1,nvidia,apple.1,dedicated,integrated
0,43990,53,5,6,12,8,1,512,0,False,...,0,0,1,0,0,0,0,0,0,1
1,119990,73,7,14,20,16,1,1024,0,False,...,1,0,0,0,0,0,1,0,1,0
2,45990,59,5,4,8,8,1,512,0,False,...,1,0,0,0,0,0,1,0,1,0
3,449990,89,9,24,32,4,1,2048,0,False,...,1,0,0,0,0,0,1,0,1,0
4,41490,45,3,8,8,8,1,512,0,False,...,1,0,0,0,0,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
688,37990,60,5,6,12,16,1,512,0,False,...,0,0,1,0,0,0,0,0,0,1
689,59871,71,5,6,12,8,1,512,0,False,...,0,0,0,0,0,0,1,0,1,0
690,47990,63,5,4,8,8,1,512,0,False,...,1,0,0,0,0,1,0,0,0,1
691,74999,64,5,10,12,8,1,512,0,False,...,1,0,0,0,0,1,0,0,0,1


In [135]:
def encode_is_touch_screen(df):
    """Convert the boolean ``is_touch_screen`` column to 0/1 integers.

    Rows equal to ``True`` become 1; every other value (including missing
    ones) becomes 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``is_touch_screen``. It is not modified.

    Returns
    -------
    pandas.DataFrame
        New frame with ``is_touch_screen`` replaced in its original position.
    """
    flags = np.where(df['is_touch_screen'] == True, 1, 0)
    # assign returns a new frame, leaving the caller's boolean column intact.
    return df.assign(is_touch_screen=flags)

# Convert `is_touch_screen` from True/False to 1/0.
df2=encode_is_touch_screen(df2)
df2

Unnamed: 0,Price,Rating,processor_tier,num_cores,num_threads,ram_memory,primary_storage_is_SSD,primary_storage_capacity,secondary_storage_capacity,is_touch_screen,...,intel,other,amd,apple,arm,intel.1,nvidia,apple.1,dedicated,integrated
0,43990,53,5,6,12,8,1,512,0,0,...,0,0,1,0,0,0,0,0,0,1
1,119990,73,7,14,20,16,1,1024,0,0,...,1,0,0,0,0,0,1,0,1,0
2,45990,59,5,4,8,8,1,512,0,0,...,1,0,0,0,0,0,1,0,1,0
3,449990,89,9,24,32,4,1,2048,0,0,...,1,0,0,0,0,0,1,0,1,0
4,41490,45,3,8,8,8,1,512,0,0,...,1,0,0,0,0,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
688,37990,60,5,6,12,16,1,512,0,0,...,0,0,1,0,0,0,0,0,0,1
689,59871,71,5,6,12,8,1,512,0,0,...,0,0,0,0,0,0,1,0,1,0
690,47990,63,5,4,8,8,1,512,0,0,...,1,0,0,0,0,1,0,0,0,1
691,74999,64,5,10,12,8,1,512,0,0,...,1,0,0,0,0,1,0,0,0,1


In [136]:
# Check the distinct operating systems and their frequencies before encoding.
df2['OS'].value_counts()

OS
windows    647
dos         22
chrome      14
mac          9
other        1
Name: count, dtype: int64

In [137]:
def encode_OS(df):
    """One-hot encode the ``OS`` column.

    Appends one 0/1 integer column per distinct operating system (named after
    the raw value, in sorted order) and removes ``OS``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``OS``. It is not modified.

    Returns
    -------
    pandas.DataFrame
        New frame with the same row index as ``df``.

    Notes
    -----
    Indicator columns are not prefixed, so a value such as ``'other'`` yields
    a label that may already exist in ``df`` from an earlier encoding step;
    both columns are kept under the same label. Rows whose value is missing
    get 0 in every indicator column.
    """
    # get_dummies keeps df's row index, so the concat lines up row for row
    # even when df does not carry a default 0..n-1 index.
    indicators = pd.get_dummies(df['OS'], dtype=int)
    return pd.concat([df.drop(columns=['OS']), indicators], axis=1)

# Replace `OS` with one indicator column per operating system.
df2=encode_OS(df2)
df2


Unnamed: 0,Price,Rating,processor_tier,num_cores,num_threads,ram_memory,primary_storage_is_SSD,primary_storage_capacity,secondary_storage_capacity,is_touch_screen,...,intel,nvidia,apple,dedicated,integrated,chrome,dos,mac,other,windows
0,43990,53,5,6,12,8,1,512,0,0,...,0,0,0,0,1,0,0,0,0,1
1,119990,73,7,14,20,16,1,1024,0,0,...,0,1,0,1,0,0,0,0,0,1
2,45990,59,5,4,8,8,1,512,0,0,...,0,1,0,1,0,0,0,0,0,1
3,449990,89,9,24,32,4,1,2048,0,0,...,0,1,0,1,0,0,0,0,0,1
4,41490,45,3,8,8,8,1,512,0,0,...,1,0,0,0,1,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
688,37990,60,5,6,12,16,1,512,0,0,...,0,0,0,0,1,0,0,0,0,1
689,59871,71,5,6,12,8,1,512,0,0,...,0,1,0,1,0,0,0,0,0,1
690,47990,63,5,4,8,8,1,512,0,0,...,1,0,0,0,1,0,0,0,0,1
691,74999,64,5,10,12,8,1,512,0,0,...,1,0,0,0,1,0,0,0,0,1


In [138]:
def convert_price_from_rupees_to_dollars(df, rate=0.013):
    """Convert the ``Price`` column from Indian rupees to US dollars.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a numeric ``Price`` column in rupees. It is not modified.
    rate : float, optional
        Dollars per rupee. Defaults to 0.013, the fixed rate this notebook
        has always used.

    Returns
    -------
    pandas.DataFrame
        New frame whose ``Price`` is ``df['Price'] * rate``.
    """
    # assign returns a new frame, so calling this on the same input frame
    # more than once cannot compound the conversion.
    return df.assign(Price=df['Price'] * rate)

# Convert `Price` from rupees to dollars.
# NOTE(review): df2 is overwritten with the converted frame, so re-running this
# cell without re-running the cells above converts the prices a second time.
df2=convert_price_from_rupees_to_dollars(df2)
df2

Unnamed: 0,Price,Rating,processor_tier,num_cores,num_threads,ram_memory,primary_storage_is_SSD,primary_storage_capacity,secondary_storage_capacity,is_touch_screen,...,intel,nvidia,apple,dedicated,integrated,chrome,dos,mac,other,windows
0,571.870,53,5,6,12,8,1,512,0,0,...,0,0,0,0,1,0,0,0,0,1
1,1559.870,73,7,14,20,16,1,1024,0,0,...,0,1,0,1,0,0,0,0,0,1
2,597.870,59,5,4,8,8,1,512,0,0,...,0,1,0,1,0,0,0,0,0,1
3,5849.870,89,9,24,32,4,1,2048,0,0,...,0,1,0,1,0,0,0,0,0,1
4,539.370,45,3,8,8,8,1,512,0,0,...,1,0,0,0,1,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
688,493.870,60,5,6,12,16,1,512,0,0,...,0,0,0,0,1,0,0,0,0,1
689,778.323,71,5,6,12,8,1,512,0,0,...,0,1,0,1,0,0,0,0,0,1
690,623.870,63,5,4,8,8,1,512,0,0,...,1,0,0,0,1,0,0,0,0,1
691,974.987,64,5,10,12,8,1,512,0,0,...,1,0,0,0,1,0,0,0,0,1
