**Importing Libraries**  
We start by importing the necessary libraries, including scikit-learn for preprocessing tasks and Pandas for data manipulation.

In [1]:
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import KNNImputer
import pandas as pd
import numpy as np
import re


# Dataset Description

We load the dataset 'smartphones.csv' and display the first few rows, shape, description, and information about the dataset. This helps us understand the data's structure and identify any missing values.

In [2]:
df = pd.read_csv('smartphones.csv')
df.head()


Unnamed: 0,Name,Brand,Price,Color,SIM Type,Hybrid Sim Slot,Display Size,Resolution,Resolution Type,Display Type,...,Width,Height,Depth,Weight,Quick Charging,Processor Core,Primary Clock Speed,Audio Jack,RAM,Expandable Storage
0,"APPLE iPhone 11 (Black, 128 GB)",APPLE Mobiles,"₹48,900",Black,Dual Sim,No,15.49 cm (6.1 inch),1792 x 828 Pixels,Liquid Retina HD Display,Liquid Retina HD,...,75.7 mm,150.9 mm,8.3 mm,194 g,,,,,,
1,"APPLE iPhone 11 (Black, 64 GB)",APPLE Mobiles,"₹43,900",Black,Dual Sim,No,15.49 cm (6.1 inch),1792 x 828 Pixels,Liquid Retina HD Display,,...,75.7 mm,150.9 mm,8.3 mm,194 g,,,,,,
2,"APPLE iPhone 11 (White, 128 GB)",APPLE Mobiles,"₹48,900",White,Dual Sim,No,15.49 cm (6.1 inch),1792 x 828 Pixels,Liquid Retina HD Display,Liquid Retina HD,...,75.7 mm,150.9 mm,8.3 mm,194 g,,,,,,
3,"APPLE iPhone 13 (Midnight, 128 GB)",APPLE Mobiles,"₹69,900",Midnight,Dual Sim,No,15.49 cm (6.1 inch),2532 x 1170 Pixels,Super Retina XDR Display,Super Retina XDR Display,...,71.5 mm,146.7 mm,7.65 mm,173 g,Yes,Hexa Core,,,,
4,"APPLE iPhone 13 (Green, 128 GB)",APPLE Mobiles,"₹69,900",Green,Dual Sim,No,15.49 cm (6.1 inch),2532 x 1170 Pixels,Super Retina XDR Display,Super Retina XDR Display,...,71.5 mm,146.7 mm,7.65 mm,173 g,Yes,Hexa Core,,,,


In [3]:
df.shape


(3296, 27)

In [4]:
df.describe()


Unnamed: 0,Name,Brand,Price,Color,SIM Type,Hybrid Sim Slot,Display Size,Resolution,Resolution Type,Display Type,...,Width,Height,Depth,Weight,Quick Charging,Processor Core,Primary Clock Speed,Audio Jack,RAM,Expandable Storage
count,3296,3284,3296,3296,3296,3208,3296,3296,2470,2273,...,2564,2568,2562,2535,1068,2972,2890,1674,2864,1431
unique,2914,13,442,768,4,2,72,286,17,197,...,203,276,146,131,2,6,60,14,12,11
top,"OnePlus 10R 5G (Sierra Black, 256 GB) (12 GB ...",SAMSUNG Mobiles,"₹17,999",Black,Dual Sim,No,16.51 cm (6.5 inch),2400 x 1080 Pixels,Full HD+,Full HD+ AMOLED Display,...,71.5 mm,146.7 mm,8.3 mm,188 g,Yes,Octa Core,2.4 GHz,3.5mm,8 GB,1 TB
freq,6,563,100,166,3057,2251,367,633,1217,215,...,105,84,125,103,1048,2553,478,1089,899,557


In [5]:
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3296 entries, 0 to 3295
Data columns (total 27 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   Name                 3296 non-null   object
 1   Brand                3284 non-null   object
 2   Price                3296 non-null   object
 3   Color                3296 non-null   object
 4   SIM Type             3296 non-null   object
 5   Hybrid Sim Slot      3208 non-null   object
 6   Display Size         3296 non-null   object
 7   Resolution           3296 non-null   object
 8   Resolution Type      2470 non-null   object
 9   Display Type         2273 non-null   object
 10  Processor Type       2550 non-null   object
 11  Internal Storage     3296 non-null   object
 12  Primary Camera       3293 non-null   object
 13  Secondary Camera     2645 non-null   object
 14  Network Type         3295 non-null   object
 15  Bluetooth Version    2440 non-null   object
 16  Batter

In [6]:
df.isna().sum().sum()


14371

In [7]:
df.isna().sum()


Name                      0
Brand                    12
Price                     0
Color                     0
SIM Type                  0
Hybrid Sim Slot          88
Display Size              0
Resolution                0
Resolution Type         826
Display Type           1023
Processor Type          746
Internal Storage          0
Primary Camera            3
Secondary Camera        651
Network Type              1
Bluetooth Version       856
Battery Capacity        333
Width                   732
Height                  728
Depth                   734
Weight                  761
Quick Charging         2228
Processor Core          324
Primary Clock Speed     406
Audio Jack             1622
RAM                     432
Expandable Storage     1865
dtype: int64

# Data Preprocessing  

We begin the data preprocessing phase. We perform various transformations to make the data more suitable for machine learning. We start by converting strings in object columns to lowercase for consistency.

In [8]:
object_cols = df.select_dtypes('object')
df[object_cols.columns] = object_cols.apply(lambda col: col.str.lower())


### Brand Preprocessing

- We address missing values in the 'Brand' column by extracting the brand from the 'Name' column where it's missing.
- We encode the 'Brand' column using Label Encoding to convert brand names into numerical categories.

In [9]:
df['Brand'].isna().sum()


12

In [10]:
df['Brand'].value_counts()


Brand
samsung mobiles     563
apple mobiles       432
realme mobiles      392
vivo mobiles        308
redmi mobiles       277
oppo mobiles        275
mi mobiles          264
oneplus mobiles     215
iqoo mobiles        159
poco mobiles        137
asus mobiles        135
motorola mobiles    113
google mobiles       14
Name: count, dtype: int64

In [11]:
df['Brand'] = df['Brand'].str.replace('mobiles', '').str.strip()
df['Brand'].value_counts()


Brand
samsung     563
apple       432
realme      392
vivo        308
redmi       277
oppo        275
mi          264
oneplus     215
iqoo        159
poco        137
asus        135
motorola    113
google       14
Name: count, dtype: int64

In [12]:
df[['Name', 'Brand']][df['Brand'].isna()]


Unnamed: 0,Name,Brand
1117,"nothing phone (2) (white, 512 gb) (12 gb ram)",
1118,"nothing phone (2) (dark grey, 128 gb) (8 gb ram)",
1119,"nothing phone (1) (white, 256 gb) (8 gb ram)",
1120,"nothing phone (1) (black, 256 gb) (8 gb ram)",
1121,"nothing phone (1) (black, 128 gb) (8 gb ram)",
1122,"nothing phone (2) (dark grey, 256 gb) (12 gb ...",
1123,"nothing phone (2) (white, 256 gb) (12 gb ram)",
1124,"nothing phone (2) (dark grey, 512 gb) (12 gb ...",
1125,"nothing phone (1) (black, 256 gb) (12 gb ram)",
1126,"nothing phone (1) (white, 256 gb) (12 gb ram)",


In [13]:
def get_brand(name):
    return name.split(' ')[0]

new_brand = df['Name'].apply(get_brand)
df['Brand'].fillna(new_brand, inplace=True)
df.drop(columns=['Name'], inplace=True)
df['Brand'].isna().sum()


0

In [14]:
le = LabelEncoder()
df['Brand'] = le.fit_transform(df['Brand'])
df['Brand'] = df['Brand'].astype('category')
df['Brand'].value_counts()


Brand
12    563
0     432
10    392
13    308
11    279
8     275
4     264
7     215
3     159
9     137
1     135
5     113
2      14
6      10
Name: count, dtype: int64

### Price Preprocessing

We clean the 'Price' column by removing currency symbols and commas and converting it to an integer.

In [15]:
def clean_price(price_str):
    return price_str.replace('₹', '').replace(',', '')

df['Price'] = df['Price'].apply(clean_price).astype(int)


### Color Preprocessing

- We clean the 'Color' column by categorizing similar colors and handling values that occur less frequently.
- We encode the 'Color' column using Label Encoding.

In [16]:
df['Color'].value_counts()


Color
black              166
gold               109
blue                82
silver              81
white               69
                  ... 
silk white           1
?stardust brown      1
bronze               1
stardust silver      1
racing black         1
Name: count, Length: 734, dtype: int64

In [17]:
def clean_var(choices, df, var):
    for choice in choices:
        idx = df[var][df[var].str.contains(choice)].index
        df.loc[idx, var] = choice

colors = ['black', 'blue', 'green', 'white', 'gold', 'silver', 'red', 'grey', 'gray', 'purple', 'night', 'yellow', 'orange', 'graphite', 'sky', 'mint', 'voilet', 'diamond', 'starlight', 'coral', 'sunset', 'cream', 'copper', 'brown', 'pink']
clean_var(colors, df, 'Color')
df['Color'].value_counts()


Color
black             817
blue              658
green             261
white             244
gold              204
                 ... 
bronze              1
viva magneta        1
satin maroon        1
radiant mist        1
marble odyssey      1
Name: count, Length: 131, dtype: int64

In [18]:
color_groups = {
    'blue': ['sky', 'night'],
    'red': ['coral', 'pink', 'sunset'],
    'green': ['mint'],
    'orange': ['copper', 'brown'],
    'silver': ['diamond', 'starlight'],
    'gray': ['grey', 'graphite'],
    'purple': ['voilet', 'lavender']
}

for color_group, colors in color_groups.items():
    for color in colors:
        df['Color'].replace(color, color_group, inplace=True)


In [19]:
extra_colors = df['Color'].value_counts()[df['Color'].value_counts() < 30].index
for color in extra_colors:
    idx = df['Color'][df['Color'].str.contains(color)].index
    df.loc[idx, 'Color'] = 'others'
df['Color'].value_counts()


Color
black     817
blue      726
green     282
others    279
white     244
silver    220
gold      204
gray      178
red       160
purple     96
orange     49
yellow     41
Name: count, dtype: int64

In [20]:
le = LabelEncoder()
df['Color'] = le.fit_transform(df['Color'])
df['Color'] = df['Color'].astype('category')
df['Color'].value_counts()


Color
0     817
1     726
4     282
6     279
10    244
9     220
2     204
3     178
8     160
7      96
5      49
11     41
Name: count, dtype: int64

### SIM Type Preprocessing

We encode the 'SIM Type' column using Label Encoding.

In [21]:
le = LabelEncoder()
df['SIM Type'] = le.fit_transform(df['SIM Type'])
df['SIM Type'] = df['SIM Type'].astype('category')
df['SIM Type'].value_counts()


SIM Type
0    3057
3     140
1      94
2       5
Name: count, dtype: int64

### Hybrid Sim Slot Preprocessing

- We handle missing values in the 'Hybrid Sim Slot' column based on the 'SIM Type.'
- We encode the 'Hybrid Sim Slot' column using Label Encoding.

In [22]:
df['Hybrid Sim Slot'].isna().sum()


88

In [23]:
df['Hybrid Sim Slot'].value_counts()


Hybrid Sim Slot
no     2251
yes     957
Name: count, dtype: int64

In [24]:
no_hsim = df['Hybrid Sim Slot'][df['Hybrid Sim Slot'].isna()][df['SIM Type'] == 'single sim'].index
df.loc[no_hsim, 'Hybrid Sim Slot'] = 'no'


In [25]:
df['Hybrid Sim Slot'] = df['Hybrid Sim Slot'].apply(lambda x: 1 if x == 'yes' else (0 if x == 'no' else np.nan))
df['Hybrid Sim Slot'].value_counts()


Hybrid Sim Slot
0.0    2251
1.0     957
Name: count, dtype: int64

In [26]:
imputer = KNNImputer()
imputed_data = imputer.fit_transform(df[['Hybrid Sim Slot']])
imputed_data = pd.Series([l[0] for l in list(imputed_data)], name='Hybrid Sim Slot')
df['Hybrid Sim Slot'] = imputed_data.astype(int).astype('category')
df['Hybrid Sim Slot'].isna().sum()


0

### Display Size Preprocessing

We extract and rename the 'Display Size' column to 'Display Size (cm).'

In [27]:
df['Display Size'] = df['Display Size'].str.extract(r'(\d+\.*\d*)\s*cm').astype(float)
df.rename(columns={'Display Size': 'Display Size (cm)'}, inplace=True)


### Display & Resolution Type Preprocessing

- We clean and categorize the 'Display Type' and 'Resolution Type' columns.
- We handle missing values in both columns based on the other.
- We encode both columns using Label Encoding.

#### Display Type

In [28]:
df['Display Type'].value_counts()


Display Type
full hd+ amoled display                   215
super retina xdr display                  197
full hd+ display                          118
full hd+ super amoled display             103
super amoled                               95
                                         ... 
ltps ips with tol                           1
pls tft lcd                                 1
optic amoled                                1
dynamic amoled 2x - infinity-o display      1
oled fhd+display                            1
Name: count, Length: 189, dtype: int64

In [29]:
df['Display Type'] = df['Display Type'].str.replace('display', '').str.strip()
df['Display Type'].fillna('unknown', inplace=True)
choices = ['retina', 'amoled', ' oled', 'oled ', 'lcd', 'ips', 'hd', 'tft']
clean_var(choices, df, 'Display Type')


In [30]:
disp_groups = {
    'oled': [' oled', 'oled ']
}

for disp_group, disps in disp_groups.items():
    for disp in disps:
        df['Display Type'].replace(disp, disp_group, inplace=True)


In [31]:
extra_disps = df['Display Type'].value_counts()[df['Display Type'].value_counts() < 40].index
for disp in extra_disps:
    idx = df['Display Type'][df['Display Type'].str.contains(disp)].index
    df.loc[idx, 'Display Type'] = 'others'
df['Display Type'].value_counts()


Display Type
unknown    1023
amoled      804
lcd         633
hd          235
retina      229
ips         196
oled         95
tft          49
others       32
Name: count, dtype: int64

In [32]:
le = LabelEncoder()
df['Display Type'] = le.fit_transform(df['Display Type'])
df['Display Type'].value_counts()


Display Type
8    1023
0     804
3     633
1     235
6     229
2     196
4      95
7      49
5      32
Name: count, dtype: int64

#### Resolution Type

In [33]:
df['Resolution Type'].value_counts()


Resolution Type
full hd+                            1217
hd+                                  444
super retina xdr display             221
full hd+ amoled display              135
retina hd display                    125
full hd                              107
hd                                    63
quad hd+                              44
full hd+ super amoled display         36
retina display                        23
super retina hd display               21
liquid retina hd display              12
quad hd                                9
full hd+ e3 super amoled display       7
qxga+                                  3
wvga                                   2
wqhd                                   1
Name: count, dtype: int64

In [34]:
df['Resolution Type'] = df['Resolution Type'].str.replace('display', '').str.strip()
df['Resolution Type'].fillna('unknown', inplace=True)
choices = ['retina', 'amoled', 'hd']
clean_var(choices, df, 'Resolution Type')


In [35]:
res_groups = {
    'amoled': ['qxga+', 'wvga']
}

for res_group, ress in res_groups.items():
    for res in ress:
        df['Resolution Type'].replace(res, res_group, inplace=True)
df['Resolution Type'].value_counts()


Resolution Type
hd         1885
unknown     826
retina      402
amoled      183
Name: count, dtype: int64

In [36]:
le = LabelEncoder()
df['Resolution Type'] = le.fit_transform(df['Resolution Type'])
df['Resolution Type'].value_counts()


Resolution Type
1    1885
3     826
2     402
0     183
Name: count, dtype: int64

#### Handling NULL Values

In [37]:
df.loc[df['Display Type'][df['Display Type'] == 8].index, 'Display Type'] = np.nan

df.loc[df['Resolution Type'][df['Resolution Type'] == 3].index, 'Resolution Type'] = np.nan


In [38]:
df['Display Type'].isna().sum()


1023

In [39]:
res_to_disp = df['Display Type'].isna().index
df['Display Type'].fillna(df['Resolution Type'][res_to_disp], inplace=True)
df['Display Type'].isna().sum()


725

In [40]:
df['Resolution Type'].isna().sum()


826

In [41]:
disp_to_res = df['Resolution Type'].isna().index
df['Resolution Type'].fillna(df['Display Type'][disp_to_res], inplace=True)
df['Resolution Type'].isna().sum()


725

In [42]:
imputer = KNNImputer()
imputed_data = imputer.fit_transform(df[['Display Type', 'Resolution Type']])
imputed_data = pd.DataFrame(imputed_data, columns=['Display Type', 'Resolution Type'])
df['Display Type'] = imputed_data['Display Type'].astype(int).astype('category')
df['Resolution Type'] = imputed_data['Resolution Type'].astype(int).astype('category')
df['Display Type'].isna().sum()


0

In [43]:
df['Resolution Type'].isna().sum()


0

### Resolution Prepocessing

We extract pixel width and height from the 'Resolution' column.

In [44]:
px_w = df['Resolution'].apply(lambda x: re.findall('[0-9]+', x)[0]).astype(int)
px_w.name = 'Pixel Width'
px_h = df['Resolution'].apply(lambda x: re.findall('[0-9]+', x)[1]).astype(int)
px_h.name = 'Pixel Height'
df = pd.concat([df, px_w, px_h], axis=1)
df.drop(columns=['Resolution'], inplace=True)


### Processor Type, Core & Primary Clock Speed Preprocessing

- We clean and categorize the 'Processor Type' column.
- We handle missing values in 'Processor Type.'
- We encode the 'Processor Type' column using Label Encoding.
- We clean the 'Processor Core' column.
- We handle missing values in 'Processor Core.'
- We encode the 'Processor Core' column using Label Encoding.
- We convert 'Primary Clock Speed' from GHz to MHz, handle missing values, and impute them.

#### Processor Type

In [45]:
df['Processor Core'].isna().sum()


324

In [46]:
df['Processor Type'].value_counts()


Processor Type
a15 bionic chip                                                                                                                                                                        71
mediatek helio p35                                                                                                                                                                     67
qualcomm snapdragon 680                                                                                                                                                                60
a14 bionic chip with next generation neural engine                                                                                                                                     59
a12 bionic chip                                                                                                                                                                        47
                                                       

In [47]:
df['Processor Type'].fillna('unknown', inplace=True)

choices = ['qualcomm', 'snapdragon', 'mediatek', 'dimensity', 'helio', 'apple', 'bionic chip', 'chip', 'exynos', 'unisoc', 'intel', 'tensor', 'octa core']
clean_var(choices, df, 'Processor Type')


In [48]:
proc_groups = {
    'qualcomm snapdragon': ['qualcomm', 'snapdragon'],
    'samsung exynos': ['exynos'],
    'mediatek dimensity': ['dimensity'],
    'mediatek helio': ['helio'],
    'google tensor': ['tensor'],
    'apple chip': ['apple', 'bionic chip', 'chip'],
}

for proc_group, procs in proc_groups.items():
    for proc in procs:
        df['Processor Type'].replace(proc, proc_group, inplace=True)


In [49]:
df['Processor Type'] = df['Processor Type'].str.replace('(', '').str.replace(')', '')
extra_procs = df['Processor Type'].value_counts()[df['Processor Type'].value_counts() < 10].index
for proc in extra_procs:
    idx = df['Processor Type'][df['Processor Type'].str.contains(proc)].index
    df.loc[idx, 'Processor Type'] = 'others'
df['Processor Type'].value_counts()


Processor Type
qualcomm snapdragon    1045
unknown                 746
mediatek                661
apple chip              432
samsung exynos          190
mediatek dimensity       73
mediatek helio           34
others                   29
unisoc                   26
octa core                25
intel                    24
google tensor            11
Name: count, dtype: int64

In [50]:
le = LabelEncoder()
df['Processor Type'] = le.fit_transform(df['Processor Type'])
df['Processor Type'] = df['Processor Type'].astype('category')
df['Processor Type'].value_counts()


Processor Type
8     1045
11     746
3      661
0      432
9      190
4       73
5       34
7       29
10      26
6       25
2       24
1       11
Name: count, dtype: int64

In [51]:
df.loc[df['Processor Type'][df['Processor Type'] == 11].index, 'Processor Type'] = np.nan
imputer = KNNImputer()
imputed_data = imputer.fit_transform(df[['Processor Type']])
imputed_data = pd.Series([l[0] for l in list(imputed_data)], name='Processor Type')
df['Processor Type'] = imputed_data.astype(int).astype('category')
df['Processor Type'].isna().sum()


0

#### Processor Core

In [52]:
df['Processor Core'].isna().sum()


324

In [53]:
df['Processor Core'].value_counts()


Processor Core
octa core      2553
dual core       193
hexa core       151
quad core        65
deca core         8
single core       2
Name: count, dtype: int64

In [54]:
le = LabelEncoder()
df['Processor Core'] = le.fit_transform(df['Processor Core'])
df['Processor Core'] = df['Processor Core'].astype('category')
df['Processor Core'].value_counts()


Processor Core
3    2553
6     324
1     193
2     151
4      65
0       8
5       2
Name: count, dtype: int64

In [55]:
df.loc[df['Processor Core'][df['Processor Core'] == 6].index, 'Processor Core'] = np.nan
df['Processor Core'].value_counts()
imputer = KNNImputer()
imputed_data = imputer.fit_transform(df[['Processor Core']])
imputed_data = pd.Series([l[0] for l in list(imputed_data)], name='Processor Core')
df['Processor Core'] = imputed_data.astype(int).astype('category')
df['Processor Core'].isna().sum()


0

#### Primary Clock Speed

In [56]:
df['Primary Clock Speed'].isna().sum()


406

In [57]:
df['Primary Clock Speed'].value_counts()


Primary Clock Speed
2.4 ghz      478
2 ghz        477
2.2 ghz      426
2.3 ghz      361
1.8 ghz      141
3.2 ghz      111
2.5 ghz       70
3 ghz         69
2.6 ghz       68
2.05 ghz      64
2.8 ghz       62
2.84 ghz      58
3.36 ghz      44
1.6 ghz       35
1.84 ghz      32
2.99 ghz      30
1.4 ghz       29
1.5 ghz       25
2.73 ghz      25
1.95 ghz      24
1.7 ghz       23
2.96 ghz      23
2.1 ghz       22
2.85 ghz      20
2.2 mhz       17
3.1 ghz       13
2.9 ghz       12
3.18 ghz      12
3.05 ghz      11
2.7 ghz       10
1.82 ghz       8
1.3 ghz        8
2.4 mhz        8
3.19 ghz       7
2.91 ghz       6
2.42 mhz       6
3.2 mhz        5
2.86 ghz       5
1.2 ghz        4
2.3 mhz        4
90 mhz         4
2 mhz          3
2.995 ghz      3
1.25 ghz       3
1600 mhz       3
90 ghz         2
950 mhz        2
2.39 ghz       2
1.9 ghz        2
2.15 ghz       2
900 mhz        2
2.26 ghz       1
2350 mhz       1
3.05 mhz       1
2.8 mhz        1
3.19 mhz       1
2.649 ghz      1
3.09 ghz   

In [58]:
def ghz_to_mhz(value):
    if 'ghz' in value:
        value = float(value.replace(' ghz', '')) * 1000
        return f'{value} mhz'
    return value

df['Primary Clock Speed'].fillna('unknown', inplace=True)
df['Primary Clock Speed'] = df['Primary Clock Speed'].apply(ghz_to_mhz).str.replace(' mhz', '').replace('unknown', np.nan).astype(float)


In [59]:
imputer = KNNImputer()
imputed_data = imputer.fit_transform(df[['Primary Clock Speed']])
imputed_data = pd.Series([l[0] for l in list(imputed_data)], name='Primary Clock Speed')
df['Primary Clock Speed'] = imputed_data
df['Primary Clock Speed'].isna().sum()


0

### Primary & Secondary Camera Preprocessing

- We clean and categorize the 'Primary Camera' and 'Secondary Camera' columns.
- We handle missing values and impute them using K-Nearest Neighbors.

#### Primary Camera

In [60]:
df['Primary Camera'].isna().sum()


3

In [61]:
df['Primary Camera'].value_counts()


Primary Camera
50mp rear camera           292
12mp + 12mp                190
13mp rear camera           164
64mp rear camera           153
12mp rear camera           140
                          ... 
12mp + 5mp + 4mp             1
48mp + 5mp + 5mp             1
10mp rear camera             1
48mp + 5mp + 16mp            1
48mp + 13mp + 8mp + 8mp      1
Name: count, Length: 180, dtype: int64

In [62]:
def clean_cam(value):
    if not isinstance(value, float):
        cam = re.findall('[0-9]+', value)
        if len(cam) != 0:
            return int(cam[0])
    return value
df['Primary Camera'] = df['Primary Camera'].apply(clean_cam)


#### Secondary Camera

In [63]:
df['Secondary Camera'].isna().sum()


651

In [64]:
df['Secondary Camera'].value_counts()


Secondary Camera
16mp front camera                                                                    645
8mp front camera                                                                     443
32mp front camera                                                                    276
12mp front camera                                                                    265
5mp front camera                                                                     262
13mp front camera                                                                    212
7mp front camera                                                                     124
20mp front camera                                                                     99
10mp front camera                                                                     45
1.2mp front camera                                                                    43
16mp + 16mp dual front camera                                                         24
25mp

In [65]:
df['Secondary Camera'] = df['Secondary Camera'].apply(clean_cam)


In [66]:
imputer = KNNImputer()
imputed_data = imputer.fit_transform(df[['Primary Camera', 'Secondary Camera']])
imputed_data = pd.DataFrame(imputed_data, columns=['Primary Camera', 'Secondary Camera'])
df['Primary Camera'] = imputed_data['Primary Camera'].astype(int)
df['Secondary Camera'] = imputed_data['Secondary Camera'].astype(int)
df['Primary Camera'].isna().sum()


0

In [67]:
df['Secondary Camera'].isna().sum()


0

### Network Type Preprocessing

- We clean the 'Network Type' column and handle missing values.
- We encode the 'Network Type' column using Label Encoding.

In [68]:
df['Network Type'].isna().sum()


1

In [69]:
df['Network Type'].value_counts()


Network Type
5g, 4g, 3g, 2g              538
4g volte, 4g, 3g, 2g        514
5g                          387
5g, 4g volte, 4g, 3g, 2g    322
4g, 3g, 2g                  311
3g, 4g, 2g                  243
4g volte                    149
3g, 4g volte, 2g            135
3g, 4g volte, 4g, 2g        127
4g                          108
5g, 4g volte                105
2g, 3g, 4g, 5g               41
4g volte, 4g                 33
4g volte, 3g                 32
2g, 3g, 4g                   29
5g, 4g volte, 4g             24
3g                           23
5g, 4g, 3g                   18
3g, 2g                       17
3g, 4g volte, 4g             16
3g, 4g                       13
2g, 3g, 4g volte, 5g         12
2g, 3g, 4g, 4g volte         11
4g volte, 4g, 2g, 3g         11
3g, 4g volte                 10
2g, 3g, 4g, 4g volte, 5g      9
4g volte, 3g, 2g              8
4g, 2g, 3g                    8
4g volte, 5g                  7
4g, 3g                        7
5g, 4g, 4g volte, 3g       

In [70]:
def clean_ntype(value):
    if not isinstance(value, float):
        return max(set(re.findall('[0-9]+', value)))
    return value

df['Network Type'] = df['Network Type'].apply(clean_ntype).astype(float)
df['Network Type'].value_counts()


Network Type
4.0    1774
5.0    1477
3.0      40
2.0       4
Name: count, dtype: int64

In [71]:
df['Network Type'].fillna(df['Network Type'].median(), inplace=True)
df['Network Type'] = df['Network Type'].astype(int).astype('category')
df['Network Type'].isna().sum()


0

### Bluetooth Version Preprocessing

- We clean the 'Bluetooth Version' column and handle missing values.
- We encode the 'Bluetooth Version' column using Label Encoding.

In [72]:
df['Bluetooth Version'].isna().sum()


856

In [73]:
df['Bluetooth Version'].value_counts()


Bluetooth Version
v5.0                                          865
v5.1                                          328
v5.2                                          321
5                                             276
v5.3                                          206
4.2                                           147
4                                             119
v4.2                                           79
4.1                                            32
5.1                                            28
5.2                                             7
2.1, 4.0, 5.0                                   7
v4.1                                            5
v5.0, v4.0, v2.1 + edr                          4
5.2, a2dp, le, aptx hd, aptx adaptive           3
2.1                                             3
5.3                                             3
v5.3, ble                                       2
'v5.0                                           2
bluetooth v5.1, bluetooth low en

In [74]:
def clean_blue(value):
    if not isinstance(value, float):
        val = re.findall(r'\d+\.\d+|\d+', value)
        if len(val) != 0:
            return float(max(val))
df['Bluetooth Version'] = df['Bluetooth Version'].apply(clean_blue)


In [75]:
df['Bluetooth Version'].fillna(df['Bluetooth Version'].median(), inplace=True)
df['Bluetooth Version'].isna().sum()


0

### Width, Height, Depth and Weight Preprocessing

- We clean and transform the 'Width,' 'Height,' 'Depth,' and 'Weight' columns.
- We handle missing values using K-Nearest Neighbors imputation.

#### Width

In [76]:
df['Width'].isna().sum()


732

In [77]:
df['Width'].unique()


array(['75.7 mm', '71.5 mm', '67.3 mm', '78.1 mm', '77.6 mm', '64.2 mm',
       '77.4 mm', '77.8 mm', '67.1 mm', '77.9 mm', '71.4 mm', '70.9 mm',
       '67 mm', '58.6 mm', '59.2 mm', '77.2 mm', '77.5 mm', '76 mm',
       '77.38 mm', '73.7 mm', '75.65 mm', '75.44 mm', '84 mm', nan,
       '76.28 mm', '73.98 mm', '76.24 mm', '77 mm', '71.74 mm', '76.7 mm',
       '78.84 mm', '77.25 mm', '68.5 mm', '75.51 mm', '84.3 mm', '78 mm',
       '72.8 mm', '61.44 mm', '76.16 mm', '71.8 mm', '73.2 mm', '72.9 mm',
       '76.6 mm', '68.2 mm', '158 mm', '16.1 mm', '75.2 mm', '7.7 mm',
       '15.9 mm', '16.3 mm', '16.4 mm', '16.2 mm', '21.1 mm', '16 mm',
       '16.5 mm', '75.72 mm', '76.8 mm', '76.4 mm', '75.73 mm', '74.6 mm',
       '76.19 mm', '75.45 mm', '75.41 mm', '75.21 mm', '76.2 mm',
       '75.8 mm', '69.6 mm', '77.07 mm', '75.35 mm', '75.4 mm',
       '76.68 mm', '88.3 mm', '69.2 mm', '77.26 mm', '74.3 mm',
       '71.85 mm', '75.58 mm', '73.6 mm', '74.8 mm', '75.3 mm', '8.35 mm',
       

In [78]:
df['Width'] = df['Width'].str.replace(' mm', '').astype(float)


#### Height

In [79]:
df['Height'].isna().sum()


728

In [80]:
df['Height'].unique()


array(['150.9 mm', '146.7 mm', '138.4 mm', '160.8 mm', '147.5 mm',
       '160.7 mm', '131.5 mm', '157.5 mm', '158 mm', '138.3 mm',
       '158.2 mm', '144 mm', '143.6 mm', '138.1 mm', '158.1 mm',
       '158.4 mm', '123.8 mm', '124.4 mm', '152.5 mm', '156.5 mm',
       '156 mm', '10.9 m', '149 mm', '152.59 mm', '156.4 mm', '149.5 mm',
       '153 mm', '159.1 mm', '164.55 mm', nan, '158.41 mm', '143.7 mm',
       '146.87 mm', '151.4 mm', '159 mm', '154.3 mm', '141.18 mm',
       '151.45 mm', '158.9 mm', '172.83 mm', '148 mm', '157.96 mm',
       '166.9 mm', '171 mm', '170.99 mm', '109 mm', '148.2 mm',
       '124.42 mm', '158.83 mm', '152.2 mm', '155.6 mm', '152 mm',
       '162.9 mm', '145.6 mm', '76.7 mm', '7.4 mm', '164.8 mm', '16.5 mm',
       '7.5 mm', '158.51 mm', '7.9 mm', '7.6 mm', '7.8 mm', '10.3 mm',
       '160.53 mm', '165.38 mm', '163.7 mm', '163.6 mm', '163.65 mm',
       '158.5 mm', '156.48 mm', '159.21 mm', '165.1 mm', '157.9 mm',
       '151 mm', '155.4 mm', '164.9 mm'

In [81]:
df['Height'] = df['Height'].str.replace(' mm', '').str.replace(' m', '').astype(float)


#### Depth

In [82]:
df['Depth'].isna().sum()


734

In [83]:
df['Depth'].unique()


array(['8.3 mm', '7.65 mm', '7.8 mm', '7.3 mm', '7.4 mm', '7.85 mm',
       '7.7 mm', '8.1 mm', '7.1 mm', '6.9 mm', '7.5 mm', '7.6 mm',
       '8.97 mm', '10.9 m', '10.8 mm', '10.55 mm', '152.5 mm', '7.9 mm',
       '7.69 mm', '8.55 mm', '9.67 mm', nan, '10.5 mm', '8.46 mm',
       '7.99 mm', '7.95 mm', '11.95 mm', '9.85 mm', '8.9 mm', '8.5 mm',
       '9.9 mm', '8.85 mm', '10.29 mm', '9.78 mm', '10.34 mm', '11.2 mm',
       '8.7 mm', '9 mm', '0.8 mm', '8.8 mm', '0.9 mm', '9.16 mm',
       '6.2 mm', '6.81 mm', '8.16 mm', '8.34 mm', '8.05 mm', '9.4 mm',
       '9.33 mm', '8.26 mm', '8.4 mm', '7.25 mm', '8.475 mm', '8.47 mm',
       '8.96 mm', '8.35 mm', '75.3 mm', '8.75 mm', '8.59 mm', '8.49 mm',
       '8.29 mm', '7.58 mm', '6.79 mm', '8.39 mm', '7.45 mm', '7.49 mm',
       '9.6 mm', '9.18 mm', '9.15 mm', '8.99 mm', '9.09 mm', '8.25 mm',
       '9.89 mm', '9.13 mm', '6.99 mm', '8.89 mm', '9.19 mm', '8.79 mm',
       '8.98 mm', '9.97 mm', '8.6 mm', '8.2 mm', '8 mm', '7.89 mm',
       '7

In [84]:
df['Depth'] = df['Depth'].str.replace(' mm', '').str.replace(' m', '').astype(float)


#### Weight

In [85]:
df['Weight'].isna().sum()


761

In [86]:
df['Weight'].unique()


array(['194 g', '173 g', '172 g', '144 g', '203 g', '162 g', '206 g',
       '240 g', '140 g', '148 g', '238 g', '208 g', '226 g', '143 g',
       '187 g', '133 g', '188 g', '192 g', '138 g', '177 g', '129 g',
       '202 g', '112 g', '132 g', '174 g', '170 g', '150 g', '155 g',
       '165 g', '195 g', '190 g', '160 g', '175 g', '180 g', '147 g',
       '120 g', nan, '185 g', '169 g', '196 g', '242 g', '145 g', '116 g',
       '200 g', '178 g', '197 g', '193.5 g', '212 g', '454 g', '214.5 g',
       '158 g', '157 g', '205 g', '204 g', '186 g', '216 g', '154 g',
       '181 g', '130 g', '168 g', '209 g', '0.45 kg', '191 g', '173.8 g',
       '146 g', '149 g', '184 g', '171 g', '168.3 g', '198.5 g', '220 g',
       '176 g', '221 g', '210 g', '179 g', '166.8 g', '163 g', '179.5 g',
       '201.2 g', '183 g', '199 g', '153 g', '189 g', '182 g', '193 g',
       '189.6 g', '164 g', '137 g', '161 g', '152 g', '141 g', '215 g',
       '156 g', '201 g', '213 g', '198 g', '225 g', '189.5 g', '1

In [87]:
def kg_to_g(value):
    if 'kg' in value:
        value = float(value.replace(' kg', '')) * 1000
        return f'{value} g'
    return value

df['Weight'].fillna('unknown', inplace=True)
df['Weight'] = df['Weight'].apply(kg_to_g).str.replace(' g', '').str.replace(' gm', '').str.replace('m', '').replace('unknown', np.nan).astype(float)


#### Handling NULL Values

In [88]:
cols = ['Width', 'Height', 'Depth', 'Weight']
imputer = KNNImputer()
imputed_data = imputer.fit_transform(df[cols])
imputed_data = pd.DataFrame(imputed_data, columns=cols)
for col in cols:
    df[col] = imputed_data[col]
    print(f'NULL Values in {col}: {df[col].isna().sum()}')


NULL Values in Width: 0
NULL Values in Height: 0
NULL Values in Depth: 0
NULL Values in Weight: 0


### Quick Charging Preprocessing

- We handle missing values in the 'Quick Charging' column.
- We encode the 'Quick Charging' column using Label Encoding.

In [89]:
df['Quick Charging'].isna().sum()


2228

In [90]:
df['Quick Charging'].value_counts()


Quick Charging
yes    1048
no       20
Name: count, dtype: int64

In [91]:
df['Quick Charging'].fillna('no', inplace=True)
df['Quick Charging'].isna().sum()


0

In [92]:
df['Quick Charging'] = df['Quick Charging'].apply(lambda x: 1 if x == 'yes' else 0).astype('category')
df['Quick Charging'].value_counts()


Quick Charging
0    2248
1    1048
Name: count, dtype: int64

### Audio Jack Preprocessing

- We clean and categorize the 'Audio Jack' column.
- We handle missing values and encode the 'Audio Jack' column using Label Encoding.

In [93]:
df['Audio Jack'].isna().sum()


1622

In [94]:
df['Audio Jack'].value_counts()


Audio Jack
3.5mm            1089
3.5 mm            259
type c            125
yes                47
usb type c         35
type-c             31
usb (type c)       31
3.5                22
usb type-c         16
usb (type-c)       10
no                  7
?3.5 mm             1
3.5 mm stereo       1
Name: count, dtype: int64

In [95]:
df['Audio Jack'].fillna('no', inplace=True)
choices = ['3.5', 'type c', 'type-c']
clean_var(choices, df, 'Audio Jack')
df['Audio Jack'].value_counts()


Audio Jack
no        1629
3.5       1372
type c     191
type-c      57
yes         47
Name: count, dtype: int64

In [96]:
aud_groups = {
    'yes': ['type-c', 'type c', '3.5']
}

for aud_group, auds in aud_groups.items():
    for aud in auds:
        df['Audio Jack'].replace(aud, aud_group, inplace=True)
df['Audio Jack'].value_counts()


Audio Jack
yes    1667
no     1629
Name: count, dtype: int64

In [97]:
df['Audio Jack'] = df['Audio Jack'].apply(lambda x: 1 if x == 'yes' else 0).astype('category')
df['Audio Jack'].value_counts()


Audio Jack
1    1667
0    1629
Name: count, dtype: int64

### Battery Capacity Preprocessing

We clean the 'Battery Capacity' column and handle missing values.

In [98]:
df['Battery Capacity'].isna().sum()


333

In [99]:
df['Battery Capacity'].value_counts()


Battery Capacity
5000 mah    1282
4500 mah     308
6000 mah     232
4000 mah     174
3000 mah      81
            ... 
3120 mah       1
3430 mah       1
3520 mah       1
3095 mah       1
4315 mah       1
Name: count, Length: 92, dtype: int64

In [100]:
df['Battery Capacity'] = df['Battery Capacity'].str.replace(' mah', '').astype(float)
df['Battery Capacity'].fillna(int(df['Battery Capacity'].mean()), inplace=True)
df['Battery Capacity'] = df['Battery Capacity'].astype(int)
df['Battery Capacity'].isna().sum()


0

### Internal Storage & RAM Preprocessing

- We clean and transform the 'Internal Storage' and 'RAM' columns.
- We handle missing values and convert values from TB to GB.

#### Internal Storage

In [101]:
df['Internal Storage'].isna().sum()


0

In [102]:
df['Internal Storage'].value_counts()


Internal Storage
128 gb    1587
64 gb      732
256 gb     557
32 gb      210
512 gb      90
16 gb       72
1 tb        24
8 gb        16
6 gb         7
4 gb         1
Name: count, dtype: int64

In [103]:
def tb_to_gb(value):
    if 'tb' in value:
        value = int(value.replace(' tb', '')) * 1024
        return f'{value} gb'
    return value

df['Internal Storage'] = df['Internal Storage'].apply(tb_to_gb).str.replace(' gb', '').astype(int)


#### RAM Preprocessing

In [104]:
df['RAM'].isna().sum()


432

In [105]:
df['RAM'].value_counts()


RAM
8 gb      899
6 gb      711
4 gb      707
12 gb     266
3 gb      193
2 gb       57
16 gb      12
128 gb      7
1 gb        6
18 gb       2
10 gb       2
1.5 gb      2
Name: count, dtype: int64

In [106]:
df['RAM'] = df['RAM'].str.replace(' gb', '').astype(float)
df['RAM'].fillna(df['RAM'].median(), inplace=True)
df['RAM'] = df['RAM'].astype(int)
df['RAM'].isna().sum()


0

### Expandable Storage Preprocessing

We handle missing values in the 'Expandable Storage' column and impute them based on 'Internal Storage.'

In [107]:
df['Expandable Storage'].isna().sum()


1865

In [108]:
df['Expandable Storage'].value_counts()


Expandable Storage
1 tb      557
256 gb    430
512 gb    261
128 gb     75
2 tb       74
64 gb      25
32 gb       3
400 gb      3
7 gb        1
200 gb      1
12 gb       1
Name: count, dtype: int64

In [109]:
df['Expandable Storage'].fillna('unknown', inplace=True)
df['Expandable Storage'] = df['Expandable Storage'].apply(tb_to_gb).str.replace(' gb', '').replace('unknown', np.nan).astype(float)


In [110]:
df.groupby('Internal Storage')['Expandable Storage'].median()


Internal Storage
4          NaN
6          NaN
8         64.0
16       128.0
32       256.0
64       512.0
128     1024.0
256     1024.0
512      512.0
1024       NaN
Name: Expandable Storage, dtype: float64

In [111]:
mapping = dict(df.groupby('Internal Storage')['Expandable Storage'].median())
mapping


{4: nan,
 6: nan,
 8: 64.0,
 16: 128.0,
 32: 256.0,
 64: 512.0,
 128: 1024.0,
 256: 1024.0,
 512: 512.0,
 1024: nan}

In [112]:
new_storage = df['Internal Storage'].apply(lambda x: mapping[x])
df['Expandable Storage'].fillna(new_storage, inplace=True)
df['Expandable Storage'].isna().sum()


32

In [113]:
df['Expandable Storage'].fillna(df['Expandable Storage'].median(), inplace=True)
df['Expandable Storage'] = df['Expandable Storage'].astype(int)
df['Expandable Storage'].isna().sum()


0

### Final Data

- We check for any remaining missing values.
- We save the preprocessed dataset as 'prep_smartphones.csv' for further analysis and modeling.

In [114]:
df.isna().sum().sum()


0

In [115]:
df.isna().sum()


Brand                  0
Price                  0
Color                  0
SIM Type               0
Hybrid Sim Slot        0
Display Size (cm)      0
Resolution Type        0
Display Type           0
Processor Type         0
Internal Storage       0
Primary Camera         0
Secondary Camera       0
Network Type           0
Bluetooth Version      0
Battery Capacity       0
Width                  0
Height                 0
Depth                  0
Weight                 0
Quick Charging         0
Processor Core         0
Primary Clock Speed    0
Audio Jack             0
RAM                    0
Expandable Storage     0
Pixel Width            0
Pixel Height           0
dtype: int64

In [116]:
len(df.select_dtypes('object').columns)


0

In [117]:
len(df.select_dtypes('number').columns)


16

In [118]:
len(df.select_dtypes('category').columns)


11

In [119]:
df.to_csv('prep_smartphones.csv', index=False)
