# Data Assessing

In [1]:
import numpy as np
import pandas as pd

In [4]:
# pd.set_option('display.max_rows', None)
# pd.set_option('display.max_columns', None)

In [2]:
smartphones = pd.read_csv('smartphones.csv')

In [3]:
smartphones

Unnamed: 0,model_name,price,expert_rating,user_rating,processor,rear_cameras,front_cameras,display,ram_internal_memory,battery,operating_system,additional_features,review,review_link
0,vivo V40,34999,,,Snapdragon 7 Gen 3 Octa core,50+50 MP Rear Camera,50 MP Front Camera,6.78″ (17.22 cm) 120Hz AMOLED Display,8 GB RAM | 128 GB Storage,5500 mAh | 80W Fast Charging,Android v14 OS,5G | Fingerprint Sensor,,
1,OPPO K12x,12999,,,MediaTek Dimensity 6300 Octa core,32+2 MP Rear Camera,8 MP Front Camera,6.67″ (16.94 cm) 120Hz LCD Display,6 GB RAM | 128 GB Storage,5100 mAh | 45W Fast Charging,Android v14 OS,5G | Fingerprint Sensor,,
2,vivo V40 Pro,49999,8.2,,MediaTek Dimensity 9200 Plus Octa core,50+50+50 MP Rear Camera,50 MP Front Camera,6.78″ (17.22 cm) 120Hz AMOLED Display,8 GB RAM | 256 GB Storage,5500 mAh | 80W Fast Charging,Android v14 OS,5G | Fingerprint Sensor,"The Vivo V40 Pro shines with its cameras, perf...",https://www.mysmartprice.com/gear/mobiles/mobi...
3,Motorola Edge 50 Fusion,24879,8.1,4.5 ★\n16.9K Ratings,Snapdragon 7s Gen 2 Octa core,50+13 MP Rear Camera,32 MP Front Camera,6.67″ (16.94 cm) 144Hz P-OLED Display,8 GB RAM | 128 GB Storage,5000 mAh | 68W Fast Charging,Android v14 OS,5G | NFC | Fingerprint Sensor,Moto Edge 50 Fusion has a premium design and s...,https://www.mysmartprice.com/gear/mobiles/mobi...
4,Samsung Galaxy M35 5G,19999,,4.3 ★\n383 Ratings,Samsung Exynos 1380 Octa core,50+8+2 MP Rear Camera,13 MP Front Camera,6.6″ (16.76 cm) 120Hz Super AMOLED Display,6 GB RAM | 128 GB Storage,6000 mAh | 25W Fast Charging,Android v14 OS,5G | NFC | Fingerprint Sensor,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2923,LG K8 2017,8999,,3.7 ★\n80 Ratings,MediaTek,13 MP Rear Camera,5 MP Front Camera,5.0″ (12.7 cm) IPS LCD Display,2 GB RAM | 16 GB Storage,2500 mAh,Android v6.0 OS,,,
2924,Honor 6X 32GB,11999,8.3,4.5 ★\n35.9K Ratings,HiSilicon Kirin Octa core,12+2 MP Rear Camera,8 MP Front Camera,5.5″ (13.97 cm) IPS LCD Display,3 GB RAM | 32 GB Storage,3340 mAh,Android v6.0.1 OS,Fingerprint Sensor,,
2925,Videocon Delite 21,6153,,3.0 ★\n772 Ratings,MediaTek Quad core,8 MP Rear Camera,2 MP Front Camera,5.0″ (12.7 cm) IPS LCD Display,2 GB RAM | 16 GB Storage,3000 mAh,Android v6.0 OS,,,
2926,Karbonn Titanium Vista 4G,5986,,3.8 ★\n195 Ratings,MediaTek Quad core,8 MP Rear Camera,5 MP Front Camera,5.0″ (12.7 cm) IPS LCD Display,1 GB RAM | 8 GB Storage,2300 mAh,Android v6.0 OS,,,


### Quality Issues

1. model_name - some mobiles names contains variant info. `consistency`
2. model_name - some motorola phones only has model name not brand name. `validity`
3. price - has ',' between numbers. `validity`
4. expert_rating - missing values. `completeness`
5. user_rating - missing values. `completeness`
6. processor - missing values. `completeness`
7. processor - some values has only core or processor brand info. `accuracy`
8. rear_camera - multiple cameras all are in same cell, represented using +. `validity`
9. front_camera - missing values. `completeness`
10. display - some values only contains type of display(LCD) `accuracy`
11. ram_internal_memory - missing values. `completeness`
12. ram_internal_memory - some values has only storage info. `accuracy`
13. battery - some values only has battery info not charging. `accuracy`
14. operating_system - missing values. `completeness`
15. additional_features - missing values. `completeness`
16. review and review_link - missing values. `completeness`
17. review - contains incomplete reviews. `validity`

### Tidiness Issues
1. user_rating - can be split into 2 cols user_rating and num_ratings.
2. processor - can be split into processor name, processor brand, cores.
3. rear_cameras and front_cameras - need to do something.
4. display - can be split into size, refresh rate and type of display.
5. ram_internal_memory - can be split into RAM and internal memory.
6. battery - can be split into battery capacity and charging.
7. additional_features - can be split into has_5g, has_nfc, has_fingerprint_sensor
8. review and review_link shouldn't be exist, they are irrelevant for further processing.
9. model_name - another col 'brand_name' can be created.
10. operating_system - split into two cols os_type, os_version

Creating a copy of DataFrame so that it will be easy to revert back to original DF.

In [23]:
df = smartphones.copy()

In [27]:
df.shape

(2928, 14)

Checking Data types

In [29]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2928 entries, 0 to 2927
Data columns (total 14 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   model_name           2928 non-null   object 
 1   price                2928 non-null   object 
 2   expert_rating        1827 non-null   float64
 3   user_rating          2832 non-null   object 
 4   processor            2888 non-null   object 
 5   rear_cameras         2928 non-null   object 
 6   front_cameras        2895 non-null   object 
 7   display              2928 non-null   object 
 8   ram_internal_memory  2926 non-null   object 
 9   battery              2928 non-null   object 
 10  operating_system     2919 non-null   object 
 11  additional_features  2312 non-null   object 
 12  review               1419 non-null   object 
 13  review_link          1419 non-null   object 
dtypes: float64(1), object(13)
memory usage: 320.4+ KB


In [30]:
df.describe()

Unnamed: 0,expert_rating
count,1827.0
mean,7.806951
std,0.600576
min,3.5
25%,7.5
50%,7.8
75%,8.1
max,9.5


Checking duplicate values and removing them

In [31]:
df.duplicated().sum()

672

In [32]:
df[df.duplicated()]

Unnamed: 0,model_name,price,expert_rating,user_rating,processor,rear_cameras,front_cameras,display,ram_internal_memory,battery,operating_system,additional_features,review,review_link
48,vivo V40,34999,,,Snapdragon 7 Gen 3 Octa core,50+50 MP Rear Camera,50 MP Front Camera,6.78″ (17.22 cm) 120Hz AMOLED Display,8 GB RAM | 128 GB Storage,5500 mAh | 80W Fast Charging,Android v14 OS,5G | Fingerprint Sensor,,
49,OPPO K12x,12999,,,MediaTek Dimensity 6300 Octa core,32+2 MP Rear Camera,8 MP Front Camera,6.67″ (16.94 cm) 120Hz LCD Display,6 GB RAM | 128 GB Storage,5100 mAh | 45W Fast Charging,Android v14 OS,5G | Fingerprint Sensor,,
50,vivo V40 Pro,49999,8.2,,MediaTek Dimensity 9200 Plus Octa core,50+50+50 MP Rear Camera,50 MP Front Camera,6.78″ (17.22 cm) 120Hz AMOLED Display,8 GB RAM | 256 GB Storage,5500 mAh | 80W Fast Charging,Android v14 OS,5G | Fingerprint Sensor,"The Vivo V40 Pro shines with its cameras, perf...",https://www.mysmartprice.com/gear/mobiles/mobi...
51,Motorola Edge 50 Fusion,24879,8.1,4.5 ★\n16.9K Ratings,Snapdragon 7s Gen 2 Octa core,50+13 MP Rear Camera,32 MP Front Camera,6.67″ (16.94 cm) 144Hz P-OLED Display,8 GB RAM | 128 GB Storage,5000 mAh | 68W Fast Charging,Android v14 OS,5G | NFC | Fingerprint Sensor,Moto Edge 50 Fusion has a premium design and s...,https://www.mysmartprice.com/gear/mobiles/mobi...
52,Samsung Galaxy M35 5G,19999,,4.3 ★\n383 Ratings,Samsung Exynos 1380 Octa core,50+8+2 MP Rear Camera,13 MP Front Camera,6.6″ (16.76 cm) 120Hz Super AMOLED Display,6 GB RAM | 128 GB Storage,6000 mAh | 25W Fast Charging,Android v14 OS,5G | NFC | Fingerprint Sensor,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2923,LG K8 2017,8999,,3.7 ★\n80 Ratings,MediaTek,13 MP Rear Camera,5 MP Front Camera,5.0″ (12.7 cm) IPS LCD Display,2 GB RAM | 16 GB Storage,2500 mAh,Android v6.0 OS,,,
2924,Honor 6X 32GB,11999,8.3,4.5 ★\n35.9K Ratings,HiSilicon Kirin Octa core,12+2 MP Rear Camera,8 MP Front Camera,5.5″ (13.97 cm) IPS LCD Display,3 GB RAM | 32 GB Storage,3340 mAh,Android v6.0.1 OS,Fingerprint Sensor,,
2925,Videocon Delite 21,6153,,3.0 ★\n772 Ratings,MediaTek Quad core,8 MP Rear Camera,2 MP Front Camera,5.0″ (12.7 cm) IPS LCD Display,2 GB RAM | 16 GB Storage,3000 mAh,Android v6.0 OS,,,
2926,Karbonn Titanium Vista 4G,5986,,3.8 ★\n195 Ratings,MediaTek Quad core,8 MP Rear Camera,5 MP Front Camera,5.0″ (12.7 cm) IPS LCD Display,1 GB RAM | 8 GB Storage,2300 mAh,Android v6.0 OS,,,


In [33]:
# code
df.drop_duplicates(inplace=True)

In [34]:
# test
df.duplicated().sum()

0

In [36]:
df.shape

(2256, 14)

### According to the order of severity, we follow the Data cleaning Order.


1. Quality -> Completeness
2. Tidiness
3. Quality -> Validity
4. Quality -> Accuracy
5. Quality -> Consistency

#### Steps involved in Data cleaning
- Define
- Code
- Test

## Fetching rows having more missing values
since they seems to be outliers, so dropping them or filling them will help in maintaining quality of data

In [37]:
df.isna().sum()

model_name                0
price                     0
expert_rating           858
user_rating              71
processor                37
rear_cameras              0
front_cameras            32
display                   0
ram_internal_memory       2
battery                   0
operating_system          9
additional_features     502
review                 1186
review_link            1186
dtype: int64

In [40]:
df[df[['user_rating', 'processor', 'front_cameras', 'ram_internal_memory', 'operating_system']].isnull().all(axis=1)]

Unnamed: 0,model_name,price,expert_rating,user_rating,processor,rear_cameras,front_cameras,display,ram_internal_memory,battery,operating_system,additional_features,review,review_link
2417,Blackberry Curve 8900,18990,,,,3.2 MP Rear Camera,,LCD Display,,1400 mAh Battery,,,,


In [42]:
df.drop(2417, inplace=True)

In [46]:
df[df[['processor', 'front_cameras', 'operating_system']].isnull().all(axis=1)]

Unnamed: 0,model_name,price,expert_rating,user_rating,processor,rear_cameras,front_cameras,display,ram_internal_memory,battery,operating_system,additional_features,review,review_link
1598,Nokia Asha 311,5049,,4.4 ★\n902 Ratings,,3.2 MP Rear Camera,,3″ (7.62 cm) TFT Display,128 MB RAM | 256 MB Storage,1110 mAh Battery,,,,
2036,Nokia Asha 210,5659,,"3.8 ★\n1,727 Ratings",,2 MP Rear Camera,,2.4″ (6.1 cm) TFT Display,32 MB RAM | 64 MB Storage,1200 mAh Battery,,,,
2416,Blackberry Curve 3G 9300,10200,,2.5 ★\n642 Ratings,,2 MP Rear Camera,,2.4″ (6.1 cm) LCD Display,256 MB Storage,1150 mAh Battery,,,,
2418,Nokia 6700 Slide,11049,,,,5 MP Rear Camera,,2.2″ (5.59 cm) LCD Display,60 MB Storage,860 mAh Battery,,,,
2419,Nokia C3,9999,,3.1 ★\n136 Ratings,,2 MP Rear Camera,,2.4″ (6.1 cm) LCD Display,55 MB Storage,1320 mAh Battery,,,,
2442,Nokia 301,5675,,"1.0 ★\n4,807 Ratings",,3.2 MP Rear Camera,,2.4″ (6.1 cm) TFT Display,64 MB RAM | 256 MB Storage,1200 mAh Battery,,,,
2456,Karbonn A18 Plus,5314,,3.3 ★\n575 Ratings,,5 MP Rear Camera,,5.0″ (12.7 cm) IPS LCD Display,512 MB RAM | 4 GB Storage,2000 mAh Battery,,,,


In [47]:
df.drop([1598, 2036, 2416, 2418, 2419, 2442, 2456], inplace=True)

In [48]:
df[df[['processor', 'front_cameras', 'operating_system']].isnull().all(axis=1)]

Unnamed: 0,model_name,price,expert_rating,user_rating,processor,rear_cameras,front_cameras,display,ram_internal_memory,battery,operating_system,additional_features,review,review_link


In [49]:
df.isna().sum()

model_name                0
price                     0
expert_rating           850
user_rating              69
processor                29
rear_cameras              0
front_cameras            24
display                   0
ram_internal_memory       1
battery                   0
operating_system          1
additional_features     494
review                 1178
review_link            1178
dtype: int64

In [51]:
df[df[['operating_system', 'ram_internal_memory']].isnull().any(axis=1)]

Unnamed: 0,model_name,price,expert_rating,user_rating,processor,rear_cameras,front_cameras,display,ram_internal_memory,battery,operating_system,additional_features,review,review_link
2434,Lava Iris 502,9999,,2.0 ★\n96 Ratings,,5 MP Rear Camera,0.3 MP Front Camera,5″ (12.7 cm) LCD Display,512 MB RAM | 4 GB Storage,2000 mAh Battery,,,,
2447,Micromax Bling 3 A86,6000,,3.5 ★\n32 Ratings,,5 MP Rear Camera,0.3 MP Front Camera,4″ (10.16 cm) TFT Display,,1600 mAh Battery,Android v4.1 OS,,,


In [54]:
df.loc[2434, 'operating_system'] =  "Android v4.1 OS"
df.loc[2434, 'processor'] = 'Dual Core'

In [61]:
df.loc[2447, 'processor'] = 'Dual Core'
df.loc[2447, 'ram_internal_memory'] = '512 MB RAM'

## Let's Start handling issues One by one

expert_rating - missing values. `completeness`

According to me, the better way to fill missing values in expert_rating column is by using knn imputer, but knn imputer requires number columns and since we are just starting, we will fill them later.


user_rating - missing values. `completeness`

we can't predict that how many users will rate a specific mobile, so filling with 0 is a option but on further processing it will create chaos. so as like expert ratings, we will use knn imputer, but for that firstly we have to change it to numeric columns

In [114]:
df['user_rating'].replace('0', np.nan)

0                          0
1                          0
2                          0
3       4.5 ★\n16.9K Ratings
4         4.3 ★\n383 Ratings
                ...         
2875       3.7 ★\n80 Ratings
2876    4.5 ★\n35.9K Ratings
2877      3.0 ★\n772 Ratings
2878      3.8 ★\n195 Ratings
2879    3.9 ★\n1,253 Ratings
Name: user_rating, Length: 2248, dtype: object

processor - missing values. `completeness`

Filling missing values with 'unknown' should be preferred. 

In [67]:
df['processor'].isna().sum()

28

In [77]:
df.loc[[1597, 2037, 2053, 2420, 2421, 2422, 2423, 2424, 2425, 2426], 'processor'] = 'Single Core'
df.loc[[2280, 2406, 2429, 2434, 2444, 2457, 2459, 2463, 2468, 2472, 2489, 2495], 'processor'] = 'Dual Core'
df.loc[[2523, 2538, 2625, 2627, 2666, 2717], 'processor'] = 'Quad Core'

In [78]:
# test
df['processor'].isna().sum()

0

front_camera - missing values. `completeness`

Filling values by using chatgpt.

In [79]:
df['front_cameras'].isna().sum()

24

In [81]:
df[df['front_cameras'].isna()]

Unnamed: 0,model_name,price,expert_rating,user_rating,processor,rear_cameras,front_cameras,display,ram_internal_memory,battery,operating_system,additional_features,review,review_link
1414,Sony Xperia tipo,6299,,"4.8 ★\n2,293 Ratings",Snapdragon,3.2 MP Rear Camera,,3.2″ (8.13 cm) TFT Display,512 MB RAM | 2.9 GB Storage,1500 mAh Battery,Android v4.0.4 OS,,,
1549,Blackberry Bold 9780,27990,,3.6 ★\n59 Ratings,Marvell,5 MP Rear Camera,,2.4″ (6.1 cm) TFT Display,512 MB RAM | 256 MB Storage,1500 mAh Battery,Blackberry v6 OS,,,
1597,Blackberry Pearl 3G 9105,20990,,3.1 ★\n94 Ratings,Single Core,3.2 MP Rear Camera,,2.25″ (5.72 cm) TFT Display,256 MB Storage,1150 mAh Battery,Blackberry v5.0.x OS,,,
1624,Intex Aqua Twist,5590,,3.7 ★\n86 Ratings,MediaTek Quad core,5 MP Rear Camera,,5.0″ (12.7 cm) IPS LCD Display,1 GB RAM | 8 GB Storage,2200 mAh,Android v5.1 OS,,,
2037,Blackberry Curve 8520,10900,,4.0 ★\n498 Ratings,Single Core,2 MP Rear Camera,,2.4″ (6.1 cm) TFT Display,128 MB RAM | 256 MB Storage,1150 mAh Battery,Blackberry v5.0.x OS,,,
2038,Sony Ericsson Xperia X10,35795,,3.9 ★\n32 Ratings,Snapdragon S1,8.1 MP Rear Camera,,4″ (10.16 cm) LCD Display,384 MB RAM | 1 GB Storage,1500 mAh Battery,Android v1.6 OS,,,
2052,Nokia Lumia 800,13299,,4.4 ★\n729 Ratings,Snapdragon S2,8 MP Rear Camera,,3.7″ (9.4 cm) AMOLED Display,512 MB RAM | 16 GB Storage,1450 mAh Battery,Windows Phone v7.5 OS,,,
2053,Blackberry Curve 9320,9799,,"2.5 ★\n2,641 Ratings",Single Core,3.2 MP Rear Camera,,2.44″ (6.2 cm) TFT Display,512 MB RAM | 512 MB Storage,1450 mAh Battery,Blackberry v7.1 OS,,,
2057,Blackberry 9720,15250,7.1,4.0 ★\n385 Ratings,Marvell,5 MP Rear Camera,,2.8″ (7.11 cm) IPS LCD Display,512 MB RAM | 512 MB Storage,1450 mAh Battery,Blackberry v7.1 OS,,,
2128,Nokia 8110 4G,5999,7.0,"1.0 ★\n3,286 Ratings",Snapdragon 205 Dual core,2 MP Rear Camera,,2.45″ (6.22 cm) TFT Display,512 MB RAM | 4 GB Storage,1500 mAh,KAI OS OS,,,


In [89]:
df.loc[[1414, 1549, 1597, 1624, 2037, 2053, 2057, 2128, 2422, 2423, 2425, 2426, 2428, 2436, 2445, 2457], 'front_cameras'] = '0 MP Front Camera'
df.loc[[2038, 2407], 'front_cameras'] = '1.3 MP Front Camera'
df.loc[[2052, 2430, 2431, 2433, 2452], 'front_cameras'] = '0.3 MP Front Camera'
df.loc[[2374], 'front_cameras'] = '20 MP Front Camera'

In [91]:
df.isna().sum()

model_name                0
price                     0
expert_rating           850
user_rating               0
processor                 0
rear_cameras              0
front_cameras             0
display                   0
ram_internal_memory       0
battery                   0
operating_system          0
additional_features     494
review                 1178
review_link            1178
dtype: int64

additional_features - missing values. `completeness`

this column contains that phone has 5g, nfc, or fingerprint sensor. null value means phone doesn't have any of the three features, so we will fill it with "No add features"

In [93]:
df['additional_features'].isna().sum()

494

In [95]:
df.fillna({'additional_features': 'No add features'}, inplace=True)

In [96]:
df['additional_features'].isna().sum()

0

In [97]:
df.isna().sum()

model_name                0
price                     0
expert_rating           850
user_rating               0
processor                 0
rear_cameras              0
front_cameras             0
display                   0
ram_internal_memory       0
battery                   0
operating_system          0
additional_features       0
review                 1178
review_link            1178
dtype: int64

### Handling Tidiness Issues.

Dropping irrelevant columns

In [104]:
df.drop(columns=['review', 'review_link'], inplace=True)

user_rating - can be split into 2 cols user_rating and num_ratings.

missing values can be handled after filling up the missing values, also dtype will be corrected

In [115]:
df['user_rating'].value_counts()

user_rating
0                       69
4.5 ★\n1.3L Ratings      9
4.2 ★\n1.1L Ratings      8
4.2 ★\n1L Ratings        6
4.3 ★\n1.4L Ratings      6
                        ..
4.4 ★\n54K Ratings       1
4.2 ★\n14.9K Ratings     1
4.1 ★\n1,394 Ratings     1
4.3 ★\n2,041 Ratings     1
3.9 ★\n1,253 Ratings     1
Name: count, Length: 1796, dtype: int64

In [120]:
user_ratings = df['user_rating'].str.strip().str.split('\n').str.get(0).str.split(' ').str.get(0)

In [125]:
num_ratings = df['user_rating'].str.strip().str.split('\n').str.get(1).str.split(' ').str.get(0)

In [127]:
df.insert(4, 'avg_user_rating', user_ratings)
df.insert(5, 'num_ratings', num_ratings)

In [128]:
df

Unnamed: 0,model_name,price,expert_rating,user_rating,avg_user_rating,num_ratings,processor,rear_cameras,front_cameras,display,ram_internal_memory,battery,operating_system,additional_features
0,vivo V40,34999,,0,,,Snapdragon 7 Gen 3 Octa core,50+50 MP Rear Camera,50 MP Front Camera,6.78″ (17.22 cm) 120Hz AMOLED Display,8 GB RAM | 128 GB Storage,5500 mAh | 80W Fast Charging,Android v14 OS,5G | Fingerprint Sensor
1,OPPO K12x,12999,,0,,,MediaTek Dimensity 6300 Octa core,32+2 MP Rear Camera,8 MP Front Camera,6.67″ (16.94 cm) 120Hz LCD Display,6 GB RAM | 128 GB Storage,5100 mAh | 45W Fast Charging,Android v14 OS,5G | Fingerprint Sensor
2,vivo V40 Pro,49999,8.2,0,,,MediaTek Dimensity 9200 Plus Octa core,50+50+50 MP Rear Camera,50 MP Front Camera,6.78″ (17.22 cm) 120Hz AMOLED Display,8 GB RAM | 256 GB Storage,5500 mAh | 80W Fast Charging,Android v14 OS,5G | Fingerprint Sensor
3,Motorola Edge 50 Fusion,24879,8.1,4.5 ★\n16.9K Ratings,4.5,16.9K,Snapdragon 7s Gen 2 Octa core,50+13 MP Rear Camera,32 MP Front Camera,6.67″ (16.94 cm) 144Hz P-OLED Display,8 GB RAM | 128 GB Storage,5000 mAh | 68W Fast Charging,Android v14 OS,5G | NFC | Fingerprint Sensor
4,Samsung Galaxy M35 5G,19999,,4.3 ★\n383 Ratings,4.3,383,Samsung Exynos 1380 Octa core,50+8+2 MP Rear Camera,13 MP Front Camera,6.6″ (16.76 cm) 120Hz Super AMOLED Display,6 GB RAM | 128 GB Storage,6000 mAh | 25W Fast Charging,Android v14 OS,5G | NFC | Fingerprint Sensor
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2875,LG K8 2017,8999,,3.7 ★\n80 Ratings,3.7,80,MediaTek,13 MP Rear Camera,5 MP Front Camera,5.0″ (12.7 cm) IPS LCD Display,2 GB RAM | 16 GB Storage,2500 mAh,Android v6.0 OS,No add features
2876,Honor 6X 32GB,11999,8.3,4.5 ★\n35.9K Ratings,4.5,35.9K,HiSilicon Kirin Octa core,12+2 MP Rear Camera,8 MP Front Camera,5.5″ (13.97 cm) IPS LCD Display,3 GB RAM | 32 GB Storage,3340 mAh,Android v6.0.1 OS,Fingerprint Sensor
2877,Videocon Delite 21,6153,,3.0 ★\n772 Ratings,3.0,772,MediaTek Quad core,8 MP Rear Camera,2 MP Front Camera,5.0″ (12.7 cm) IPS LCD Display,2 GB RAM | 16 GB Storage,3000 mAh,Android v6.0 OS,No add features
2878,Karbonn Titanium Vista 4G,5986,,3.8 ★\n195 Ratings,3.8,195,MediaTek Quad core,8 MP Rear Camera,5 MP Front Camera,5.0″ (12.7 cm) IPS LCD Display,1 GB RAM | 8 GB Storage,2300 mAh,Android v6.0 OS,No add features


In [151]:
def unit_converter(num):
    lastDigit = str(num)[-1]
    if lastDigit == 'K':
        return float(num[:-1])*1000
    elif lastDigit == 'L':
        return float(num[:-1])*100000
    else:
        return float(str(num).replace(',', ''))

In [153]:
df['num_ratings'] = df['num_ratings'].apply(unit_converter)

processor - can be split into processor name, processor brand, cores.

In [156]:
df['processor'].value_counts()

processor
Unisoc Octa core                92
MediaTek                        83
MediaTek Quad core              79
MediaTek Tru-Octa Core          70
MediaTek Helio G85 Octa core    51
                                ..
Snapdragon 410 Octa core         1
Apple A4                         1
Snapdragon S2 Single core        1
Snapdragon 400 Dual core         1
Snapdragon 415 Tru-Octa Core     1
Name: count, Length: 233, dtype: int64

In [158]:
processor_brand = df['processor'].str.split(' ').str.get(0)

In [162]:
df.columns

Index(['model_name', 'price', 'expert_rating', 'user_rating',
       'avg_user_rating', 'num_ratings', 'processor', 'rear_cameras',
       'front_cameras', 'display', 'ram_internal_memory', 'battery',
       'operating_system', 'additional_features'],
      dtype='object')

In [163]:
df.insert(7, 'processor_brand', processor_brand)

In [170]:
top_brands = df['processor_brand'].value_counts().head().index.tolist()

df['processor_brand'] = df['processor_brand'].apply(lambda x: x if x in top_brands else 'Other')

In [171]:
df['processor'].value_counts()

processor
Unisoc Octa core                92
MediaTek                        83
MediaTek Quad core              79
MediaTek Tru-Octa Core          70
MediaTek Helio G85 Octa core    51
                                ..
Snapdragon 410 Octa core         1
Apple A4                         1
Snapdragon S2 Single core        1
Snapdragon 400 Dual core         1
Snapdragon 415 Tru-Octa Core     1
Name: count, Length: 233, dtype: int64

In [205]:
import re

def extract_cores(processor):
    match = re.search(r'\b(Single|Dual|Quad|Octa|Tru-Octa|Hexa|Deca|Nona)\s+core\b', processor, re.IGNORECASE)
    return match.group(0) if match else 'Other'

cores = df['processor'].apply(extract_cores)

In [210]:
df.insert(8, 'num_cores', cores)

In [212]:
df['num_cores'].value_counts()

num_cores
Octa core        1473
Tru-Octa Core     248
Quad core         211
Other             175
Hexa Core          74
Dual core          25
Dual Core          13
Single Core        10
Deca Core           6
Quad Core           6
Nona Core           5
Single core         2
Name: count, dtype: int64

In [213]:
core_mapping = {
    'Single Core': 1,
    'Single core': 1,
    'Dual Core': 2,
    'Dual core': 2,
    'Quad Core': 4,
    'Quad core': 4,
    'Hexa Core': 6,
    'Octa Core': 8,
    'Octa core': 8,
    'Tru-Octa Core': 8,
    'Deca Core': 10,
    'Nona Core': 9,
    'Other': np.nan
}

df['num_cores'] = df['num_cores'].replace(core_mapping)

  df['num_cores'] = df['num_cores'].replace(core_mapping)


In [214]:
df['num_cores'].value_counts()

num_cores
8.0     1721
4.0      217
6.0       74
2.0       38
1.0       12
10.0       6
9.0        5
Name: count, dtype: int64

In [216]:
df['num_cores'].isna().sum()

175

rear_cameras and front_cameras - need to do something. like one column can be created as total number of rear cameras, 

In [221]:
df['rear_cameras'].value_counts()

rear_cameras
13 MP Rear Camera             257
8 MP Rear Camera              217
50+2 MP Rear Camera           115
5 MP Rear Camera              107
13+2 MP Rear Camera            84
                             ... 
50+50+50+50 MP Rear Camera      1
108+5 MP Rear Camera            1
48+2+2+2 MP Rear Camera         1
108+5+2 MP Rear Camera          1
21.5 MP Rear Camera             1
Name: count, Length: 236, dtype: int64

In [222]:
def handle_cameras(cameras):
    megapixels = [float(mp) for mp in re.findall(r'\d+\.?\d*', cameras)]
    num_cameras = len(megapixels)
    main_camera = max(megapixels)
    return num_cameras, main_camera

num_rear_cameras, main_rear_cameras = zip(*df['rear_cameras'].map(handle_cameras))

In [225]:
df.columns

Index(['model_name', 'price', 'expert_rating', 'user_rating',
       'avg_user_rating', 'num_ratings', 'processor', 'processor_brand',
       'num_cores', 'rear_cameras', 'front_cameras', 'display',
       'ram_internal_memory', 'battery', 'operating_system',
       'additional_features'],
      dtype='object')

In [226]:
df.insert(10, 'num_rear_cameras', num_rear_cameras)

In [227]:
df.insert(11, 'main_rear_camera', main_rear_cameras)

In [230]:
num_front_cameras, main_front_cameras = zip(*df['front_cameras'].map(handle_cameras))

In [237]:
df.insert(13, 'num_front_cameras', num_front_cameras)

In [238]:
df.insert(14, 'main_front_camera', main_front_cameras)

In [239]:
df

Unnamed: 0,model_name,price,expert_rating,user_rating,avg_user_rating,num_ratings,processor,processor_brand,num_cores,rear_cameras,num_rear_cameras,main_rear_camera,front_cameras,num_front_cameras,main_front_camera,display,ram_internal_memory,battery,operating_system,additional_features
0,vivo V40,34999,,0,,,Snapdragon 7 Gen 3 Octa core,Snapdragon,8.0,50+50 MP Rear Camera,2,50.0,50 MP Front Camera,1,50.0,6.78″ (17.22 cm) 120Hz AMOLED Display,8 GB RAM | 128 GB Storage,5500 mAh | 80W Fast Charging,Android v14 OS,5G | Fingerprint Sensor
1,OPPO K12x,12999,,0,,,MediaTek Dimensity 6300 Octa core,MediaTek,8.0,32+2 MP Rear Camera,2,32.0,8 MP Front Camera,1,8.0,6.67″ (16.94 cm) 120Hz LCD Display,6 GB RAM | 128 GB Storage,5100 mAh | 45W Fast Charging,Android v14 OS,5G | Fingerprint Sensor
2,vivo V40 Pro,49999,8.2,0,,,MediaTek Dimensity 9200 Plus Octa core,MediaTek,8.0,50+50+50 MP Rear Camera,3,50.0,50 MP Front Camera,1,50.0,6.78″ (17.22 cm) 120Hz AMOLED Display,8 GB RAM | 256 GB Storage,5500 mAh | 80W Fast Charging,Android v14 OS,5G | Fingerprint Sensor
3,Motorola Edge 50 Fusion,24879,8.1,4.5 ★\n16.9K Ratings,4.5,16900.0,Snapdragon 7s Gen 2 Octa core,Snapdragon,8.0,50+13 MP Rear Camera,2,50.0,32 MP Front Camera,1,32.0,6.67″ (16.94 cm) 144Hz P-OLED Display,8 GB RAM | 128 GB Storage,5000 mAh | 68W Fast Charging,Android v14 OS,5G | NFC | Fingerprint Sensor
4,Samsung Galaxy M35 5G,19999,,4.3 ★\n383 Ratings,4.3,383.0,Samsung Exynos 1380 Octa core,Samsung,8.0,50+8+2 MP Rear Camera,3,50.0,13 MP Front Camera,1,13.0,6.6″ (16.76 cm) 120Hz Super AMOLED Display,6 GB RAM | 128 GB Storage,6000 mAh | 25W Fast Charging,Android v14 OS,5G | NFC | Fingerprint Sensor
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2875,LG K8 2017,8999,,3.7 ★\n80 Ratings,3.7,80.0,MediaTek,MediaTek,,13 MP Rear Camera,1,13.0,5 MP Front Camera,1,5.0,5.0″ (12.7 cm) IPS LCD Display,2 GB RAM | 16 GB Storage,2500 mAh,Android v6.0 OS,No add features
2876,Honor 6X 32GB,11999,8.3,4.5 ★\n35.9K Ratings,4.5,35900.0,HiSilicon Kirin Octa core,Other,8.0,12+2 MP Rear Camera,2,12.0,8 MP Front Camera,1,8.0,5.5″ (13.97 cm) IPS LCD Display,3 GB RAM | 32 GB Storage,3340 mAh,Android v6.0.1 OS,Fingerprint Sensor
2877,Videocon Delite 21,6153,,3.0 ★\n772 Ratings,3.0,772.0,MediaTek Quad core,MediaTek,4.0,8 MP Rear Camera,1,8.0,2 MP Front Camera,1,2.0,5.0″ (12.7 cm) IPS LCD Display,2 GB RAM | 16 GB Storage,3000 mAh,Android v6.0 OS,No add features
2878,Karbonn Titanium Vista 4G,5986,,3.8 ★\n195 Ratings,3.8,195.0,MediaTek Quad core,MediaTek,4.0,8 MP Rear Camera,1,8.0,5 MP Front Camera,1,5.0,5.0″ (12.7 cm) IPS LCD Display,1 GB RAM | 8 GB Storage,2300 mAh,Android v6.0 OS,No add features


display - can be split into size, refresh rate and type of display.

In [240]:
df['display'].value_counts()

display
5.0″ (12.7 cm) IPS LCD Display                 160
5.5″ (13.97 cm) IPS LCD Display                124
6.67″ (16.94 cm) 120Hz AMOLED Display           77
6.78″ (17.22 cm) 120Hz AMOLED Display           51
6.6″ (16.76 cm) 90Hz IPS LCD Display            40
                                              ... 
5.15″ (13.08 cm) IPS LCD Display                 1
5.8″ (14.73 cm) 60Hz Dynamic AMOLED Display      1
6.9″ (17.53 cm) 144Hz P-OLED Display             1
2.25″ (5.72 cm) TFT Display                      1
6.2″ (15.75 cm) P-OLED Display                   1
Name: count, Length: 370, dtype: int64

In [250]:
display_size = df['display'].str.split(' ').str.get(0).str.replace('″', '')

In [252]:
df.insert(16, 'display_size', display_size)

In [259]:
refresh_rate = df['display'].str.split(' ').str.get(3)

In [260]:
df.insert(17, 'refresh_rate', refresh_rate)

In [290]:
display_type = df['display'].str.split(' ').str.get(-3) + ' ' + df['display'].str.split(' ').str.get(-2)

In [291]:
display_type.value_counts()

display
IPS LCD               1153
Super AMOLED           228
120Hz AMOLED           211
cm) TFT                 85
cm) AMOLED              59
90Hz AMOLED             54
AMOLED 2x               43
Fluid AMOLED            41
120Hz OLED              32
cm) LCD                 31
TFT LCD                 29
PLS LCD                 27
60Hz OLED               26
cm) OLED                22
60Hz AMOLED             21
AMOLED Plus             20
Dynamic AMOLED          18
144Hz AMOLED            15
144Hz P-OLED            14
120Hz LCD               13
Flexible AMOLED         12
90Hz LCD                10
LTPO AMOLED              9
120Hz P-OLED             7
Optic AMOLED             7
90Hz OLED                7
cm) S-LCD                7
cm) P-OLED               5
90Hz TFT                 4
S-LCD 3                  4
LTPS LCD                 4
S-LCD 5                  3
IPS LED                  3
60Hz TFT                 3
S-LCD 2                  3
TFT LTPS                 3
120Hz TFT           

In [292]:
df.insert(18, 'display_type', display_type)

In [293]:
df

Unnamed: 0,model_name,price,expert_rating,user_rating,avg_user_rating,num_ratings,processor,processor_brand,num_cores,rear_cameras,...,num_front_cameras,main_front_camera,display,display_size,refresh_rate,display_type,ram_internal_memory,battery,operating_system,additional_features
0,vivo V40,34999,,0,,,Snapdragon 7 Gen 3 Octa core,Snapdragon,8.0,50+50 MP Rear Camera,...,1,50.0,6.78″ (17.22 cm) 120Hz AMOLED Display,6.78,120Hz,120Hz AMOLED,8 GB RAM | 128 GB Storage,5500 mAh | 80W Fast Charging,Android v14 OS,5G | Fingerprint Sensor
1,OPPO K12x,12999,,0,,,MediaTek Dimensity 6300 Octa core,MediaTek,8.0,32+2 MP Rear Camera,...,1,8.0,6.67″ (16.94 cm) 120Hz LCD Display,6.67,120Hz,120Hz LCD,6 GB RAM | 128 GB Storage,5100 mAh | 45W Fast Charging,Android v14 OS,5G | Fingerprint Sensor
2,vivo V40 Pro,49999,8.2,0,,,MediaTek Dimensity 9200 Plus Octa core,MediaTek,8.0,50+50+50 MP Rear Camera,...,1,50.0,6.78″ (17.22 cm) 120Hz AMOLED Display,6.78,120Hz,120Hz AMOLED,8 GB RAM | 256 GB Storage,5500 mAh | 80W Fast Charging,Android v14 OS,5G | Fingerprint Sensor
3,Motorola Edge 50 Fusion,24879,8.1,4.5 ★\n16.9K Ratings,4.5,16900.0,Snapdragon 7s Gen 2 Octa core,Snapdragon,8.0,50+13 MP Rear Camera,...,1,32.0,6.67″ (16.94 cm) 144Hz P-OLED Display,6.67,144Hz,144Hz P-OLED,8 GB RAM | 128 GB Storage,5000 mAh | 68W Fast Charging,Android v14 OS,5G | NFC | Fingerprint Sensor
4,Samsung Galaxy M35 5G,19999,,4.3 ★\n383 Ratings,4.3,383.0,Samsung Exynos 1380 Octa core,Samsung,8.0,50+8+2 MP Rear Camera,...,1,13.0,6.6″ (16.76 cm) 120Hz Super AMOLED Display,6.6,120Hz,Super AMOLED,6 GB RAM | 128 GB Storage,6000 mAh | 25W Fast Charging,Android v14 OS,5G | NFC | Fingerprint Sensor
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2875,LG K8 2017,8999,,3.7 ★\n80 Ratings,3.7,80.0,MediaTek,MediaTek,,13 MP Rear Camera,...,1,5.0,5.0″ (12.7 cm) IPS LCD Display,5.0,IPS,IPS LCD,2 GB RAM | 16 GB Storage,2500 mAh,Android v6.0 OS,No add features
2876,Honor 6X 32GB,11999,8.3,4.5 ★\n35.9K Ratings,4.5,35900.0,HiSilicon Kirin Octa core,Other,8.0,12+2 MP Rear Camera,...,1,8.0,5.5″ (13.97 cm) IPS LCD Display,5.5,IPS,IPS LCD,3 GB RAM | 32 GB Storage,3340 mAh,Android v6.0.1 OS,Fingerprint Sensor
2877,Videocon Delite 21,6153,,3.0 ★\n772 Ratings,3.0,772.0,MediaTek Quad core,MediaTek,4.0,8 MP Rear Camera,...,1,2.0,5.0″ (12.7 cm) IPS LCD Display,5.0,IPS,IPS LCD,2 GB RAM | 16 GB Storage,3000 mAh,Android v6.0 OS,No add features
2878,Karbonn Titanium Vista 4G,5986,,3.8 ★\n195 Ratings,3.8,195.0,MediaTek Quad core,MediaTek,4.0,8 MP Rear Camera,...,1,5.0,5.0″ (12.7 cm) IPS LCD Display,5.0,IPS,IPS LCD,1 GB RAM | 8 GB Storage,2300 mAh,Android v6.0 OS,No add features


In [295]:
df['display_type'].value_counts().index.tolist()

['IPS LCD',
 'Super AMOLED',
 '120Hz AMOLED',
 'cm) TFT',
 'cm) AMOLED',
 '90Hz AMOLED',
 'AMOLED 2x',
 'Fluid AMOLED',
 '120Hz OLED',
 'cm) LCD',
 'TFT LCD',
 'PLS LCD',
 '60Hz OLED',
 'cm) OLED',
 '60Hz AMOLED',
 'AMOLED Plus',
 'Dynamic AMOLED',
 '144Hz AMOLED',
 '144Hz P-OLED',
 '120Hz LCD',
 'Flexible AMOLED',
 '90Hz LCD',
 'LTPO AMOLED',
 '120Hz P-OLED',
 'Optic AMOLED',
 '90Hz OLED',
 'cm) S-LCD',
 'cm) P-OLED',
 '90Hz TFT',
 'S-LCD 3',
 'LTPS LCD',
 'S-LCD 5',
 'IPS LED',
 '60Hz TFT',
 'S-LCD 2',
 'TFT LTPS',
 '120Hz TFT',
 'Flexi-fluid AMOLED',
 '165Hz P-OLED',
 'Plus LCD',
 '(8.89 cm)',
 'S-LCD 6',
 '(13.97 cm)',
 '90Hz P-OLED',
 '60Hz LCD',
 '144Hz OLED',
 'LTPS AMOLED',
 '60Hz P-OLED']

In [340]:
def display_type_categorize(display_type):
    if 'IPS LCD' in display_type:
        return 'IPS LCD'
    elif 'TFT LCD' in display_type:
        return 'TFT LCD'
    elif 'P-OLED' in display_type:
        return 'P-OLED'
    elif 'AMOLED' in display_type and 'Super AMOLED' not in display_type:
        return 'AMOLED'
    elif 'Super AMOLED' in display_type:
        return 'Super AMOLED'
    elif 'LED' in display_type:
        return 'OLED'
    elif 'LCD' in display_type:
        return 'IPS LCD'
    elif 'TFT' in display_type:
        return 'TFT LCD'
    else:
        return 'IPS LCD'

In [341]:
df['display_type'] = df['display_type'].apply(display_type_categorize)

In [342]:
df['display_type'].value_counts()

display_type
IPS LCD         1260
AMOLED           513
Super AMOLED     228
TFT LCD          126
OLED              91
P-OLED            30
Name: count, dtype: int64

In [343]:
df['display_type'].isna().sum()

0

ram_internal_memory - can be split into RAM and internal memory.

In [345]:
df['ram_internal_memory'].value_counts()

ram_internal_memory
8 GB RAM | 128 GB Storage      316
4 GB RAM | 64 GB Storage       292
6 GB RAM | 128 GB Storage      271
3 GB RAM | 32 GB Storage       172
8 GB RAM | 256 GB Storage      158
                              ... 
128 MB RAM | 50 MB Storage       1
512 MB RAM | 256 MB Storage      1
16 GB RAM | 1 TB Storage         1
384 MB RAM | 1 GB Storage        1
2 GB RAM | 256 GB Storage        1
Name: count, Length: 62, dtype: int64

In [389]:
# can't extract by this way because some mobiles has only storage and no ram value
df['ram_internal_memory'].str.split('|').str.get(0).str.strip().str.split(' ').loc[2421]

['200', 'MB', 'Storage']

In [391]:
ram = df['ram_internal_memory'].str.extract(r'(\d+ MB|\d+ GB) RAM')[0]
storage = df['ram_internal_memory'].str.extract(r'(\d+ MB|\d+ GB|\d+ TB) Storage')[0]

In [396]:
df.insert(20, 'ram', ram)
df.insert(21, 'storage', storage)

In [399]:
df['ram'].value_counts()

ram
8 GB      481
4 GB      465
6 GB      358
3 GB      261
2 GB      232
1 GB      190
12 GB     173
512 MB     46
16 GB      14
5 GB        9
256 MB      5
128 MB      4
768 MB      3
18 GB       2
10 GB       1
384 MB      1
Name: count, dtype: int64

In [400]:
df['ram'].isna().sum()

3

In [408]:
df[df['ram'].isna()]

Unnamed: 0,model_name,price,expert_rating,user_rating,avg_user_rating,num_ratings,processor,processor_brand,num_cores,rear_cameras,...,display,display_size,refresh_rate,display_type,ram_internal_memory,ram,storage,battery,operating_system,additional_features
1597,Blackberry Pearl 3G 9105,20990,,3.1 ★\n94 Ratings,3.1,94.0,Single Core,Other,1.0,3.2 MP Rear Camera,...,2.25″ (5.72 cm) TFT Display,2.25,TFT,TFT LCD,256 MB Storage,,256 MB,1150 mAh Battery,Blackberry v5.0.x OS,No add features
2421,Nokia C6,9655,,2.3 ★\n106 Ratings,2.3,106.0,Single Core,Other,1.0,5 MP Rear Camera,...,3.2″ (8.13 cm) LCD Display,3.2,LCD,IPS LCD,200 MB Storage,,200 MB,1200 mAh Battery,Symbian OS,No add features
2422,Nokia E5,8231,,5.0 ★\n366 Ratings,5.0,366.0,Single Core,Other,1.0,5 MP Rear Camera,...,2.4″ (6.1 cm) LCD Display,2.4,LCD,IPS LCD,256 MB Storage,,256 MB,1200 mAh Battery,Symbian OS,No add features


In [410]:
df.loc[[1597, 2421, 2422], 'ram'] = '256 MB'

In [411]:
df['ram'].isna().sum()

0

In [401]:
df['storage'].value_counts()

storage
128 GB    718
64 GB     410
256 GB    322
32 GB     267
16 GB     215
8 GB      160
4 GB       65
512 GB     57
1 TB       12
1 GB        5
256 MB      5
512 MB      5
9 GB        2
50 MB       1
200 MB      1
40 MB       1
157 MB      1
Name: count, dtype: int64

In [402]:
df['storage'].isna().sum()

1

In [403]:
df[df['storage'].isna()]

Unnamed: 0,model_name,price,expert_rating,user_rating,avg_user_rating,num_ratings,processor,processor_brand,num_cores,rear_cameras,...,display,display_size,refresh_rate,display_type,ram_internal_memory,ram,storage,battery,operating_system,additional_features
2447,Micromax Bling 3 A86,6000,,3.5 ★\n32 Ratings,3.5,32.0,Dual Core,Other,2.0,5 MP Rear Camera,...,4″ (10.16 cm) TFT Display,4,TFT,TFT LCD,512 MB RAM,512 MB,,1600 mAh Battery,Android v4.1 OS,No add features


In [406]:
df.loc[2447, 'storage'] = '512 MB'

In [397]:
def convert_to_gb(storage):
    if 'TB' in storage:
        return float(storage.split()[0]) * 1024
    elif 'GB' in storage:
        return float(storage.split()[0])
    elif 'MB' in storage:
        return float(storage.split()[0]) / 1024
    return 0

In [414]:
df['ram'] = df['ram'].apply(convert_to_gb)
df['storage'] = df['storage'].apply(convert_to_gb)

battery - can be split into battery capacity and charging.

In [419]:
df['battery'].value_counts()

battery
5000 mAh | 18W Fast Charging    145
5000 mAh | 33W Fast Charging    121
5000 mAh | 10W                  119
3000 mAh                         94
4000 mAh                         68
                               ... 
6300 mAh                          1
4350 mAh | 65W Fast Charging      1
3095 mAh | 20W Fast Charging      1
6300 mAh Battery                  1
2610 mAh                          1
Name: count, Length: 406, dtype: int64

In [425]:
battery_capacity = df['battery'].str.split('|').str.get(0).str.strip().str.split(' ').str.get(0)

In [440]:
df.insert(23, 'battery_capacity', battery_capacity)

In [429]:
df['battery_capacity'].value_counts()

battery_capacity
5000    751
4000    138
3000    119
6000    118
4500    104
       ... 
5700      1
2850      1
4492      1
4440      1
2610      1
Name: count, Length: 221, dtype: int64

In [430]:
df['battery_capacity'].isna().sum()

0

In [445]:
charging = df['battery'].str.split('|').str.get(1).str.strip().str.split(' ').str.get(0).str.replace('W', '')

In [448]:
df.insert(24, 'charging', charging)

In [450]:
df['charging'].value_counts()

charging
18      207
33      176
10      156
25      115
15      102
20       77
67       69
30       68
45       61
44       49
80       42
65       42
120      42
66       29
100      23
68       10
50       10
27        7
90        6
150       6
55        6
125       5
22.5      4
70        3
60        2
35        1
180       1
24        1
7.5       1
Name: count, dtype: int64

In [451]:
df['charging'].isna().sum()

927

additional_features - can be split into has_5g, has_nfc, has_fingerprint_sensor

In [453]:
df['additional_features'].value_counts()

additional_features
Fingerprint Sensor               737
No add features                  494
5G | Fingerprint Sensor          340
5G | NFC | Fingerprint Sensor    331
NFC | Fingerprint Sensor         234
NFC                               68
5G | NFC                          44
Name: count, dtype: int64

In [460]:
def add_feature_extract(value):
    has_fingerprint_sensor = 'Fingerprint Sensor' in value
    has_5g = '5G' in value
    has_nfc = 'NFC' in value
    
    return pd.Series([has_fingerprint_sensor, has_5g, has_nfc])

In [461]:
df[['has_fingerprint', 'has_5g', 'has_nfc']] = df['additional_features'].apply(add_feature_extract)

In [463]:
df

Unnamed: 0,model_name,price,expert_rating,user_rating,avg_user_rating,num_ratings,processor,processor_brand,num_cores,rear_cameras,...,ram,storage,battery,battery_capacity,charging,operating_system,additional_features,has_fingerprint,has_5g,has_nfc
0,vivo V40,34999,,0,,,Snapdragon 7 Gen 3 Octa core,Snapdragon,8.0,50+50 MP Rear Camera,...,8.0,128.0,5500 mAh | 80W Fast Charging,5500,80,Android v14 OS,5G | Fingerprint Sensor,True,True,False
1,OPPO K12x,12999,,0,,,MediaTek Dimensity 6300 Octa core,MediaTek,8.0,32+2 MP Rear Camera,...,6.0,128.0,5100 mAh | 45W Fast Charging,5100,45,Android v14 OS,5G | Fingerprint Sensor,True,True,False
2,vivo V40 Pro,49999,8.2,0,,,MediaTek Dimensity 9200 Plus Octa core,MediaTek,8.0,50+50+50 MP Rear Camera,...,8.0,256.0,5500 mAh | 80W Fast Charging,5500,80,Android v14 OS,5G | Fingerprint Sensor,True,True,False
3,Motorola Edge 50 Fusion,24879,8.1,4.5 ★\n16.9K Ratings,4.5,16900.0,Snapdragon 7s Gen 2 Octa core,Snapdragon,8.0,50+13 MP Rear Camera,...,8.0,128.0,5000 mAh | 68W Fast Charging,5000,68,Android v14 OS,5G | NFC | Fingerprint Sensor,True,True,True
4,Samsung Galaxy M35 5G,19999,,4.3 ★\n383 Ratings,4.3,383.0,Samsung Exynos 1380 Octa core,Samsung,8.0,50+8+2 MP Rear Camera,...,6.0,128.0,6000 mAh | 25W Fast Charging,6000,25,Android v14 OS,5G | NFC | Fingerprint Sensor,True,True,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2875,LG K8 2017,8999,,3.7 ★\n80 Ratings,3.7,80.0,MediaTek,MediaTek,,13 MP Rear Camera,...,2.0,16.0,2500 mAh,2500,,Android v6.0 OS,No add features,False,False,False
2876,Honor 6X 32GB,11999,8.3,4.5 ★\n35.9K Ratings,4.5,35900.0,HiSilicon Kirin Octa core,Other,8.0,12+2 MP Rear Camera,...,3.0,32.0,3340 mAh,3340,,Android v6.0.1 OS,Fingerprint Sensor,True,False,False
2877,Videocon Delite 21,6153,,3.0 ★\n772 Ratings,3.0,772.0,MediaTek Quad core,MediaTek,4.0,8 MP Rear Camera,...,2.0,16.0,3000 mAh,3000,,Android v6.0 OS,No add features,False,False,False
2878,Karbonn Titanium Vista 4G,5986,,3.8 ★\n195 Ratings,3.8,195.0,MediaTek Quad core,MediaTek,4.0,8 MP Rear Camera,...,1.0,8.0,2300 mAh,2300,,Android v6.0 OS,No add features,False,False,False


model_name - another col 'brand_name' can be created.

In [509]:
brand_name = df['model_name'].str.split(' ').str.get(0)

In [510]:
brand_name.value_counts()
# there is two issues:
# 1. I Kall is a brand but only I is fetched.
# 2. most Motorola phones has only moto word that have to handled separately.

model_name
Samsung      312
realme       210
vivo         195
Xiaomi       194
OPPO         130
            ... 
Lephone        1
Mobiistar      1
Comio          1
Elephone       1
Nextbit        1
Name: count, Length: 63, dtype: int64

In [511]:
brand_name = brand_name.str.replace('^I$', 'I Kall', regex=True).str.replace('^Moto$', 'Motorola', regex=True)

In [512]:
brand_name[brand_name.str.contains('Moto')].value_counts()

model_name
Motorola    94
Name: count, dtype: int64

In [513]:
df.insert(1, 'brand', brand_name)

In [514]:
df

Unnamed: 0,model_name,brand,price,expert_rating,user_rating,avg_user_rating,num_ratings,processor,processor_brand,num_cores,...,ram,storage,battery,battery_capacity,charging,operating_system,additional_features,has_fingerprint,has_5g,has_nfc
0,vivo V40,vivo,34999,,0,,,Snapdragon 7 Gen 3 Octa core,Snapdragon,8.0,...,8.0,128.0,5500 mAh | 80W Fast Charging,5500,80,Android v14 OS,5G | Fingerprint Sensor,True,True,False
1,OPPO K12x,OPPO,12999,,0,,,MediaTek Dimensity 6300 Octa core,MediaTek,8.0,...,6.0,128.0,5100 mAh | 45W Fast Charging,5100,45,Android v14 OS,5G | Fingerprint Sensor,True,True,False
2,vivo V40 Pro,vivo,49999,8.2,0,,,MediaTek Dimensity 9200 Plus Octa core,MediaTek,8.0,...,8.0,256.0,5500 mAh | 80W Fast Charging,5500,80,Android v14 OS,5G | Fingerprint Sensor,True,True,False
3,Motorola Edge 50 Fusion,Motorola,24879,8.1,4.5 ★\n16.9K Ratings,4.5,16900.0,Snapdragon 7s Gen 2 Octa core,Snapdragon,8.0,...,8.0,128.0,5000 mAh | 68W Fast Charging,5000,68,Android v14 OS,5G | NFC | Fingerprint Sensor,True,True,True
4,Samsung Galaxy M35 5G,Samsung,19999,,4.3 ★\n383 Ratings,4.3,383.0,Samsung Exynos 1380 Octa core,Samsung,8.0,...,6.0,128.0,6000 mAh | 25W Fast Charging,6000,25,Android v14 OS,5G | NFC | Fingerprint Sensor,True,True,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2875,LG K8 2017,LG,8999,,3.7 ★\n80 Ratings,3.7,80.0,MediaTek,MediaTek,,...,2.0,16.0,2500 mAh,2500,,Android v6.0 OS,No add features,False,False,False
2876,Honor 6X 32GB,Honor,11999,8.3,4.5 ★\n35.9K Ratings,4.5,35900.0,HiSilicon Kirin Octa core,Other,8.0,...,3.0,32.0,3340 mAh,3340,,Android v6.0.1 OS,Fingerprint Sensor,True,False,False
2877,Videocon Delite 21,Videocon,6153,,3.0 ★\n772 Ratings,3.0,772.0,MediaTek Quad core,MediaTek,4.0,...,2.0,16.0,3000 mAh,3000,,Android v6.0 OS,No add features,False,False,False
2878,Karbonn Titanium Vista 4G,Karbonn,5986,,3.8 ★\n195 Ratings,3.8,195.0,MediaTek Quad core,MediaTek,4.0,...,1.0,8.0,2300 mAh,2300,,Android v6.0 OS,No add features,False,False,False


operating_system - split into two cols os_type, os_version

In [518]:
df['operating_system'].value_counts()

operating_system
Android v11 OS           294
Android v13 OS           290
Android v12 OS           238
Android v10 OS           211
Android v14 OS           210
                        ... 
Android v4.0.4 OS          1
Blackberry v10.2 OS        1
Blackberry v10.3.1 OS      1
Pragati OS OS              1
Sailfish OS v2.0 OS        1
Name: count, Length: 69, dtype: int64

In [534]:
os_type = df['operating_system'].str.split(' ').str.get(0)

In [537]:
os_type.value_counts()

operating_system
Android       2128
iOS             88
Blackberry       9
Windows          9
Symbian          9
Tizen            2
Pragati          1
KAI              1
Sailfish         1
Name: count, dtype: int64

In [538]:
os_type.value_counts().head(2).index.tolist()

['Android', 'iOS']

In [540]:
top_os = os_type.value_counts().head(2).index.tolist()
os_type = os_type.apply(lambda x: x if x in top_os else 'Other')

In [541]:
os_type.value_counts()

operating_system
Android    2128
iOS          88
Other        32
Name: count, dtype: int64

In [572]:
df.insert(27, 'os_type', os_type)

In [548]:
os_version = df['operating_system'].str.extract(r'(v[\d\.]+)')

In [573]:
df.insert(28, 'os_version', os_version)

In [577]:
df['os_version'].isna().sum()

12

In [578]:
df[df['os_version'].isna()]

Unnamed: 0,model_name,brand,price,expert_rating,user_rating,avg_user_rating,num_ratings,processor,processor_brand,num_cores,...,battery,battery_capacity,charging,operating_system,os_type,os_version,additional_features,has_fingerprint,has_5g,has_nfc
432,Reliance JioPhone Next,Reliance,5599,7.0,"3.7 ★\n1,630 Ratings",3.7,1630.0,Snapdragon 215 Quad core,Snapdragon,4.0,...,3500 mAh,3500,,Pragati OS OS,Other,,No add features,False,False,False
2019,Oukitel WP20 Pro,Oukitel,16499,,4.4 ★\n30 Ratings,4.4,30.0,MediaTek Helio P22 Octa core,MediaTek,8.0,...,6300 mAh,6300,,Android OS,Android,,NFC | Fingerprint Sensor,True,False,True
2128,Nokia 8110 4G,Nokia,5999,7.0,"1.0 ★\n3,286 Ratings",1.0,3286.0,Snapdragon 205 Dual core,Snapdragon,2.0,...,1500 mAh,1500,,KAI OS OS,Other,,No add features,False,False,False
2280,Nokia C5-00 5MP,Nokia,6599,,2.7 ★\n25 Ratings,2.7,25.0,Dual Core,Other,2.0,...,1050 mAh Battery,1050,,Symbian OS,Other,,No add features,False,False,False
2406,Nokia 808 PureView,Nokia,34499,,3.9 ★\n574 Ratings,3.9,574.0,Dual Core,Other,2.0,...,1400 mAh Battery,1400,,Symbian OS,Other,,No add features,False,False,False
2420,Nokia C5,Nokia,6599,,4.0 ★\n247 Ratings,4.0,247.0,Single Core,Other,1.0,...,860 mAh Battery,860,,Symbian OS,Other,,No add features,False,False,False
2421,Nokia C6,Nokia,9655,,2.3 ★\n106 Ratings,2.3,106.0,Single Core,Other,1.0,...,1200 mAh Battery,1200,,Symbian OS,Other,,No add features,False,False,False
2422,Nokia E5,Nokia,8231,,5.0 ★\n366 Ratings,5.0,366.0,Single Core,Other,1.0,...,1200 mAh Battery,1200,,Symbian OS,Other,,No add features,False,False,False
2423,Nokia C5-03,Nokia,7920,,1.0 ★\n137 Ratings,1.0,137.0,Single Core,Other,1.0,...,1000 mAh Battery,1000,,Symbian OS,Other,,No add features,False,False,False
2424,Nokia E6,Nokia,16546,,4.8 ★\n189 Ratings,4.8,189.0,Single Core,Other,1.0,...,1500 mAh Battery,1500,,Symbian OS,Other,,No add features,False,False,False


In [580]:
df.loc[[432, 2019, 2128, 2280, 2406, 2420, 2421, 2422, 2423, 2424, 2425, 2426], 'os_version'] = ['v11', 'v12', 'v2.5.1', 'v9.3', 'v9.3', 'v9.3', 'v9.4', 'v9.3', 'v9.4', 'v9.3', 'v9.3', 'v9.4']

In [583]:
df['os_version'].isna().sum()

0

### Handling validity issues

price - has ',' between numbers. `validity`


In [587]:
df['price'] = df['price'].str.replace(',', '')

In [595]:
df.isna().sum()

model_name               0
brand                    0
price                    0
expert_rating          850
user_rating              0
avg_user_rating         69
num_ratings             69
processor                0
processor_brand          0
num_cores              175
rear_cameras             0
num_rear_cameras         0
main_rear_camera         0
front_cameras            0
num_front_cameras        0
main_front_camera        0
display                  0
display_size             0
refresh_rate             0
display_type             0
ram_internal_memory      0
ram                      0
storage                  0
battery                  0
battery_capacity         0
charging               927
operating_system         0
os_type                  0
os_version               0
additional_features      0
has_fingerprint          0
has_5g                   0
has_nfc                  0
dtype: int64

In [597]:
df[df['num_cores'].isna()]

Unnamed: 0,model_name,brand,price,expert_rating,user_rating,avg_user_rating,num_ratings,processor,processor_brand,num_cores,...,battery,battery_capacity,charging,operating_system,os_type,os_version,additional_features,has_fingerprint,has_5g,has_nfc
915,Blackberry Classic,Blackberry,14999,,5.0 ★\n358 Ratings,5.0,358.0,Snapdragon S4 Plus,Snapdragon,,...,2515 mAh Battery,2515,,Blackberry v10.3.1 OS,Other,v10.3.1,No add features,False,False,False
940,XOLO Q3000,XOLO,15399,,4.3 ★\n833 Ratings,4.3,833.0,MediaTek,MediaTek,,...,4000 mAh Battery,4000,,Android v4.2 OS,Android,v4.2,No add features,False,False,False
1257,Blackberry Q5,Blackberry,20240,,"3.0 ★\n1,107 Ratings",3.0,1107.0,Snapdragon 400,Snapdragon,,...,2180 mAh Battery,2180,,Blackberry v10.2 OS,Other,v10.2,NFC,False,False,True
1410,Sony Ericsson Xperia Neo V,Sony,19299,,4.6 ★\n753 Ratings,4.6,753.0,Snapdragon S2,Snapdragon,,...,1500 mAh Battery,1500,,Android v2.3 OS,Android,v2.3,No add features,False,False,False
1414,Sony Xperia tipo,Sony,6299,,"4.8 ★\n2,293 Ratings",4.8,2293.0,Snapdragon,Snapdragon,,...,1500 mAh Battery,1500,,Android v4.0.4 OS,Android,v4.0.4,No add features,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2804,Panasonic Eluga Arc,Panasonic,8097,,3.5 ★\n335 Ratings,3.5,335.0,Snapdragon 410,Snapdragon,,...,1800 mAh,1800,,Android v5.1 OS,Android,v5.1,Fingerprint Sensor,True,False,False
2842,Lenovo A7700,Lenovo,7400,,"4.0 ★\n1,140 Ratings",4.0,1140.0,MediaTek,MediaTek,,...,2900 mAh,2900,,Android v6.0 OS,Android,v6.0,No add features,False,False,False
2845,Lenovo A6600 Plus,Lenovo,6099,6.8,"3.9 ★\n1,521 Ratings",3.9,1521.0,MediaTek,MediaTek,,...,2300 mAh,2300,,Android v6.0 OS,Android,v6.0,No add features,False,False,False
2857,Panasonic Eluga Tapp,Panasonic,6500,,"1.0 ★\n2,820 Ratings",1.0,2820.0,MediaTek,MediaTek,,...,2800 mAh,2800,,Android v6.0 OS,Android,v6.0,Fingerprint Sensor,True,False,False


In [606]:
num_cores = pd.to_numeric(df['num_cores'], errors='coerce')
num_cores.median()

8.0

In [607]:
num_cores.mode()

0    8.0
Name: num_cores, dtype: float64

On searching on internet, I found that some of them are quad core and some are octa core.

In [599]:
df[df['charging'].isna()]

Unnamed: 0,model_name,brand,price,expert_rating,user_rating,avg_user_rating,num_ratings,processor,processor_brand,num_cores,...,battery,battery_capacity,charging,operating_system,os_type,os_version,additional_features,has_fingerprint,has_5g,has_nfc
261,OPPO A18,OPPO,8999,7.6,"4.3 ★\n1,585 Ratings",4.3,1585.0,MediaTek Helio G85 Octa core,MediaTek,8.0,...,5000 mAh,5000,,Android v13 OS,Android,v13,Fingerprint Sensor,True,False,False
355,I Kall S1,I Kall,9499,,2.6 ★\n48 Ratings,2.6,48.0,Unisoc Octa core,Unisoc,8.0,...,6000 mAh,6000,,Android v13 OS,Android,v13,Fingerprint Sensor,True,False,False
432,Reliance JioPhone Next,Reliance,5599,7.0,"3.7 ★\n1,630 Ratings",3.7,1630.0,Snapdragon 215 Quad core,Snapdragon,4.0,...,3500 mAh,3500,,Pragati OS OS,Other,v11,No add features,False,False,False
435,Tecno Spark 9,Tecno,6699,7.2,"3.9 ★\n7,802 Ratings",3.9,7802.0,MediaTek Helio G37 Octa core,MediaTek,8.0,...,5000 mAh,5000,,Android v12 OS,Android,v12,Fingerprint Sensor,True,False,False
475,OPPO A16,OPPO,10499,7.0,"4.3 ★\n5,055 Ratings",4.3,5055.0,MediaTek Helio G35 Octa core,MediaTek,8.0,...,5000 mAh,5000,,Android v11 OS,Android,v11,Fingerprint Sensor,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2875,LG K8 2017,LG,8999,,3.7 ★\n80 Ratings,3.7,80.0,MediaTek,MediaTek,,...,2500 mAh,2500,,Android v6.0 OS,Android,v6.0,No add features,False,False,False
2876,Honor 6X 32GB,Honor,11999,8.3,4.5 ★\n35.9K Ratings,4.5,35900.0,HiSilicon Kirin Octa core,Other,8.0,...,3340 mAh,3340,,Android v6.0.1 OS,Android,v6.0.1,Fingerprint Sensor,True,False,False
2877,Videocon Delite 21,Videocon,6153,,3.0 ★\n772 Ratings,3.0,772.0,MediaTek Quad core,MediaTek,4.0,...,3000 mAh,3000,,Android v6.0 OS,Android,v6.0,No add features,False,False,False
2878,Karbonn Titanium Vista 4G,Karbonn,5986,,3.8 ★\n195 Ratings,3.8,195.0,MediaTek Quad core,MediaTek,4.0,...,2300 mAh,2300,,Android v6.0 OS,Android,v6.0,No add features,False,False,False


In [604]:
charger_wattage = pd.to_numeric(df['charging'], errors='coerce')
charger_wattage.median()

25.0

In [605]:
charger_wattage.mode()
# lets have them missing

0    18.0
Name: charging, dtype: float64

#### Extract num_high_resolution_cameras
Before dropping cameras data, we can also extract important info that is the number of high_resoulation_cameras, which is generally the number of cameras which has MP higher or equal than the average of MP of all cameras

In [616]:
def handle_high_res_cameras(cameras):
    megapixels = [float(mp) for mp in re.findall(r'\d+\.?\d*', cameras)]
    mean_mp = np.mean(megapixels)
    num_high_resolution_cameras = sum(mp >= mean_mp for mp in megapixels)
    
    return num_high_resolution_cameras

num_high_resolution_cameras = df['rear_cameras'].map(handle_high_res_cameras)

In [620]:
df.insert(13, 'num_high_resolution_cameras', num_high_resolution_cameras)

In [623]:
df.head(10)

Unnamed: 0,model_name,brand,price,expert_rating,user_rating,avg_user_rating,num_ratings,processor,processor_brand,num_cores,rear_cameras,num_rear_cameras,main_rear_camera,num_high_resolution_cameras,front_cameras,num_front_cameras,main_front_camera,display,display_size,refresh_rate,display_type,ram_internal_memory,ram,storage,battery,battery_capacity,charging,operating_system,os_type,os_version,additional_features,has_fingerprint,has_5g,has_nfc
0,vivo V40,vivo,34999,,0,,,Snapdragon 7 Gen 3 Octa core,Snapdragon,8.0,50+50 MP Rear Camera,2,50.0,2,50 MP Front Camera,1,50.0,6.78″ (17.22 cm) 120Hz AMOLED Display,6.78,120Hz,AMOLED,8 GB RAM | 128 GB Storage,8.0,128.0,5500 mAh | 80W Fast Charging,5500,80,Android v14 OS,Android,v14,5G | Fingerprint Sensor,True,True,False
1,OPPO K12x,OPPO,12999,,0,,,MediaTek Dimensity 6300 Octa core,MediaTek,8.0,32+2 MP Rear Camera,2,32.0,1,8 MP Front Camera,1,8.0,6.67″ (16.94 cm) 120Hz LCD Display,6.67,120Hz,IPS LCD,6 GB RAM | 128 GB Storage,6.0,128.0,5100 mAh | 45W Fast Charging,5100,45,Android v14 OS,Android,v14,5G | Fingerprint Sensor,True,True,False
2,vivo V40 Pro,vivo,49999,8.2,0,,,MediaTek Dimensity 9200 Plus Octa core,MediaTek,8.0,50+50+50 MP Rear Camera,3,50.0,3,50 MP Front Camera,1,50.0,6.78″ (17.22 cm) 120Hz AMOLED Display,6.78,120Hz,AMOLED,8 GB RAM | 256 GB Storage,8.0,256.0,5500 mAh | 80W Fast Charging,5500,80,Android v14 OS,Android,v14,5G | Fingerprint Sensor,True,True,False
3,Motorola Edge 50 Fusion,Motorola,24879,8.1,4.5 ★\n16.9K Ratings,4.5,16900.0,Snapdragon 7s Gen 2 Octa core,Snapdragon,8.0,50+13 MP Rear Camera,2,50.0,1,32 MP Front Camera,1,32.0,6.67″ (16.94 cm) 144Hz P-OLED Display,6.67,144Hz,P-OLED,8 GB RAM | 128 GB Storage,8.0,128.0,5000 mAh | 68W Fast Charging,5000,68,Android v14 OS,Android,v14,5G | NFC | Fingerprint Sensor,True,True,True
4,Samsung Galaxy M35 5G,Samsung,19999,,4.3 ★\n383 Ratings,4.3,383.0,Samsung Exynos 1380 Octa core,Samsung,8.0,50+8+2 MP Rear Camera,3,50.0,1,13 MP Front Camera,1,13.0,6.6″ (16.76 cm) 120Hz Super AMOLED Display,6.6,120Hz,Super AMOLED,6 GB RAM | 128 GB Storage,6.0,128.0,6000 mAh | 25W Fast Charging,6000,25,Android v14 OS,Android,v14,5G | NFC | Fingerprint Sensor,True,True,True
5,OnePlus Nord CE 4 5G,OnePlus,24998,8.2,"4.2 ★\n3,455 Ratings",4.2,3455.0,Snapdragon 7 Gen 3 Octa core,Snapdragon,8.0,50+8 MP Rear Camera,2,50.0,1,16 MP Front Camera,1,16.0,6.7″ (17.02 cm) 120Hz AMOLED Display,6.7,120Hz,AMOLED,8 GB RAM | 128 GB Storage,8.0,128.0,5500 mAh | 100W Fast Charging,5500,100,Android v14 OS,Android,v14,5G | Fingerprint Sensor,True,True,False
6,Moto G85,Motorola,18880,,0,,,Snapdragon 6s Gen 3 Octa core,Snapdragon,8.0,50+8 MP Rear Camera,2,50.0,1,32 MP Front Camera,1,32.0,6.67″ (16.94 cm) 120Hz P-OLED Display,6.67,120Hz,P-OLED,8 GB RAM | 128 GB Storage,8.0,128.0,5000 mAh | 33W Fast Charging,5000,33,Android v14 OS,Android,v14,5G | Fingerprint Sensor,True,True,False
7,OnePlus Nord 4,OnePlus,29998,,0,,,Snapdragon 7 Plus Gen 3 Octa core,Snapdragon,8.0,50+8 MP Rear Camera,2,50.0,1,16 MP Front Camera,1,16.0,6.74″ (17.12 cm) 120Hz AMOLED Display,6.74,120Hz,AMOLED,8 GB RAM | 128 GB Storage,8.0,128.0,5500 mAh | 100W Fast Charging,5500,100,Android v14 OS,Android,v14,5G | NFC | Fingerprint Sensor,True,True,True
8,Motorola Edge 50 Pro 5G,Motorola,29495,8.4,4.4 ★\n17.5K Ratings,4.4,17500.0,Snapdragon 7 Gen 3 Octa core,Snapdragon,8.0,50+13+10 MP Rear Camera,3,50.0,1,50 MP Front Camera,1,50.0,6.7″ (17.02 cm) 144Hz P-OLED Display,6.7,144Hz,P-OLED,8 GB RAM | 256 GB Storage,8.0,256.0,4500 mAh | 125W Fast Charging,4500,125,Android v14 OS,Android,v14,5G | NFC | Fingerprint Sensor,True,True,True
9,vivo V30,vivo,27999,8.2,"4.5 ★\n4,803 Ratings",4.5,4803.0,Snapdragon 7 Gen 3 Octa core,Snapdragon,8.0,50+50 MP Rear Camera,2,50.0,2,50 MP Front Camera,1,50.0,6.78″ (17.22 cm) 120Hz AMOLED Display,6.78,120Hz,AMOLED,8 GB RAM | 128 GB Storage,8.0,128.0,5000 mAh | 80W Fast Charging,5000,80,Android v14 OS,Android,v14,5G | Fingerprint Sensor,True,True,False


In [622]:
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

In [624]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2248 entries, 0 to 2879
Data columns (total 34 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   model_name                   2248 non-null   object 
 1   brand                        2248 non-null   object 
 2   price                        2248 non-null   object 
 3   expert_rating                1398 non-null   float64
 4   user_rating                  2248 non-null   object 
 5   avg_user_rating              2179 non-null   object 
 6   num_ratings                  2179 non-null   float64
 7   processor                    2248 non-null   object 
 8   processor_brand              2248 non-null   object 
 9   num_cores                    2073 non-null   float64
 10  rear_cameras                 2248 non-null   object 
 11  num_rear_cameras             2248 non-null   int64  
 12  main_rear_camera             2248 non-null   float64
 13  num_high_resolution_cam

dropping columns which no further required. and saving into csv file for further analysis

In [625]:
df.columns

Index(['model_name', 'brand', 'price', 'expert_rating', 'user_rating',
       'avg_user_rating', 'num_ratings', 'processor', 'processor_brand',
       'num_cores', 'rear_cameras', 'num_rear_cameras', 'main_rear_camera',
       'num_high_resolution_cameras', 'front_cameras', 'num_front_cameras',
       'main_front_camera', 'display', 'display_size', 'refresh_rate',
       'display_type', 'ram_internal_memory', 'ram', 'storage', 'battery',
       'battery_capacity', 'charging', 'operating_system', 'os_type',
       'os_version', 'additional_features', 'has_fingerprint', 'has_5g',
       'has_nfc'],
      dtype='object')

In [627]:
df.to_csv('smartphones_cleaned_V1.csv', index=False)

In [629]:
df.drop(columns=['user_rating', 'rear_cameras', 'front_cameras', 'display', 'ram_internal_memory', 'battery', 'operating_system', 'additional_features'], inplace=True)

In [630]:
df.to_csv('smartphones_cleaned_V2.csv', index=False)