Import the libraries used throughout this notebook for data handling, preprocessing, modelling, and evaluation.

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import *
import sklearn.preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')

## Exploratory Data Analysis

In [3]:
# Load the raw smartphone listing. The path is written as a raw string because
# it contains Windows backslashes ("\S", "\V", "\I", "\M") that are invalid
# escape sequences in a normal string literal and trigger a warning on newer
# Python versions. The resulting path value is unchanged.
# NOTE(review): this is an absolute, machine-specific path — consider a path
# relative to the notebook so the cell runs on other machines.
df = pd.read_csv(r'C:\Scripts\Visual Studio\Intel Internship\MobPrice.csv')
df.head()

Unnamed: 0,Brand,Model,Storage,RAM,Screen Size (inches),Camera (MP),Battery Capacity (mAh),Price ($)
0,Apple,iPhone 13 Pro,128 GB,6 GB,6.1,12 + 12 + 12,3095,999
1,Samsung,Galaxy S21 Ultra,256 GB,12 GB,6.8,108 + 10 + 10 + 12,5000,1199
2,OnePlus,9 Pro,128 GB,8 GB,6.7,48 + 50 + 8 + 2,4500,899
3,Xiaomi,Redmi Note 10 Pro,128 GB,6 GB,6.67,64 + 8 + 5 + 2,5020,279
4,Google,Pixel 6,128 GB,8 GB,6.4,50 + 12.2,4614,799


In [4]:
# Inspect the raw screen-size values: they are strings, and a couple carry
# extra text ('6.8 + 3.9', '7.6 (unfolded)') that must be cleaned before casting.
df['Screen Size (inches)'].unique()

array(['6.1', '6.8', '6.7', '6.67', '6.4', '6.55', '6.78', '6.43', '6.5',
       '6.62', '5.4', '6.2', '6.51', '6.6', '4.7', '6.58', '6.52', '6.44',
       '6.53', '6.56', '6.8 + 3.9', '4.5', '6.39', '5.9', '5.5', '6.81',
       '5.99', '6.82', '6.3', '6.22', '6', '6.35', '6.9', '6.76', '6.49',
       '6.72', '5.7', '6.47', '7.6 (unfolded)', '6.15', '6.57'],
      dtype=object)

In [5]:
# List the column names. 'Storage ' and 'RAM ' include a trailing space, so
# later cells must reference them with that space.
df.columns

Index(['Brand', 'Model', 'Storage ', 'RAM ', 'Screen Size (inches)',
       'Camera (MP)', 'Battery Capacity (mAh)', 'Price ($)'],
      dtype='object')

In [6]:
# Check column dtypes: only 'Battery Capacity (mAh)' is numeric at this point;
# the remaining columns, including 'Price ($)', are object (string) columns.
df.dtypes

Brand                     object
Model                     object
Storage                   object
RAM                       object
Screen Size (inches)      object
Camera (MP)               object
Battery Capacity (mAh)     int64
Price ($)                 object
dtype: object

In [7]:
# Count missing values per column.
df.isnull().sum()

Brand                     0
Model                     0
Storage                   0
RAM                       0
Screen Size (inches)      0
Camera (MP)               0
Battery Capacity (mAh)    0
Price ($)                 0
dtype: int64

## Preprocessing

In [8]:
# Flag foldable phones. Regular entries are plain numbers of at most four
# characters (e.g. '6.67'); the foldable listings carry extra text such as
# '6.8 + 3.9' or '7.6 (unfolded)', so a string length of 5 or more marks them.
# Computed column-wise rather than with per-row chained assignment
# (df['Foldable'][k] = ...), which pandas does not guarantee writes through to df.
screen_text_length = df['Screen Size (inches)'].str.len()
df['Foldable'] = np.where(screen_text_length < 5, 'No', 'Yes')

In [9]:
# Sanity-check how many phones were flagged as foldable.
df['Foldable'].value_counts()

Foldable
No     405
Yes      2
Name: count, dtype: int64

In [10]:
# Price is read as text and may carry a currency symbol and a thousands
# separator. Strip both and convert the whole column at once.
# This replaces a per-row loop that relied on chained assignment
# (df['Price ($)'][k] = ...) and a bare `except:` that would also have hidden
# unrelated errors.
df['Price ($)'] = (
    df['Price ($)']
    .astype(str)
    .str.replace(',', '', regex=False)
    .str.replace('$', '', regex=False)
    .str.strip()
    .astype(int)
)

# Storage and RAM look like '128 GB' / '6 GB': drop the unit and any spaces,
# leaving digit strings that a later cell casts to int. Note the trailing
# space in both column names.
for unit_column in ('Storage ', 'RAM '):
    df[unit_column] = (
        df[unit_column]
        .str.replace('GB', '', regex=False)
        .str.replace(' ', '', regex=False)
    )


In [11]:
# Replace each categorical column with integer codes.
# The same encoder instance is refit for every column, so once this cell
# finishes `le` only holds the mapping of the last column ('Camera (MP)').
le = LabelEncoder()
for column in ('Brand', 'Foldable', 'Camera (MP)'):
    df[column] = le.fit_transform(df[column])

In [12]:
# Show the frame after unit stripping and label encoding.
df

Unnamed: 0,Brand,Model,Storage,RAM,Screen Size (inches),Camera (MP),Battery Capacity (mAh),Price ($),Foldable
0,0,iPhone 13 Pro,128,6,6.1,15,3095,999,0
1,12,Galaxy S21 Ultra,256,12,6.8,0,5000,1199,0
2,9,9 Pro,128,8,6.7,53,4500,899,0
3,15,Redmi Note 10 Pro,128,6,6.67,119,5020,279,0
4,4,Pixel 6,128,8,6.4,87,4614,799,0
...,...,...,...,...,...,...,...,...,...
402,12,Galaxy Note20 5G,128,8,6.7,19,4300,1049,0
403,15,Mi 10 Lite 5G,128,6,6.57,70,4160,349,0
404,0,iPhone 12 Pro Max,128,6,6.7,17,3687,1099,0
405,10,Reno3,128,8,6.4,59,4025,429,0


In [13]:
# Confirm the cleaning and encoding steps introduced no missing values.
df.isnull().sum()

Brand                     0
Model                     0
Storage                   0
RAM                       0
Screen Size (inches)      0
Camera (MP)               0
Battery Capacity (mAh)    0
Price ($)                 0
Foldable                  0
dtype: int64

In [14]:
# Keep only the first three characters of each screen-size string so entries
# such as '6.8 + 3.9' and '7.6 (unfolded)' reduce to a plain number.
# NOTE(review): this truncates rather than rounds — '6.67' becomes '6.6' —
# confirm that loss of precision is intended.
# Done column-wise instead of per-row chained assignment
# (df[col][i] = ...), which pandas does not guarantee writes through to df.
df['Screen Size (inches)'] = df['Screen Size (inches)'].str[:3]

In [15]:
# Verify that every screen size is now a plain numeric string.
df['Screen Size (inches)'].unique()

array(['6.1', '6.8', '6.7', '6.6', '6.4', '6.5', '5.4', '6.2', '4.7',
       '4.5', '6.3', '5.9', '5.5', '6', '6.9', '5.7', '7.6'], dtype=object)

In [16]:
# Show the frame with the truncated screen sizes.
df

Unnamed: 0,Brand,Model,Storage,RAM,Screen Size (inches),Camera (MP),Battery Capacity (mAh),Price ($),Foldable
0,0,iPhone 13 Pro,128,6,6.1,15,3095,999,0
1,12,Galaxy S21 Ultra,256,12,6.8,0,5000,1199,0
2,9,9 Pro,128,8,6.7,53,4500,899,0
3,15,Redmi Note 10 Pro,128,6,6.6,119,5020,279,0
4,4,Pixel 6,128,8,6.4,87,4614,799,0
...,...,...,...,...,...,...,...,...,...
402,12,Galaxy Note20 5G,128,8,6.7,19,4300,1049,0
403,15,Mi 10 Lite 5G,128,6,6.5,70,4160,349,0
404,0,iPhone 12 Pro Max,128,6,6.7,17,3687,1099,0
405,10,Reno3,128,8,6.4,59,4025,429,0


In [17]:
# Storage was reduced to digit strings earlier; cast it to integers.
df['Storage '] = df['Storage '].astype(int)


In [18]:
# Ensure the cleaned prices are stored as an integer column rather than object.
df['Price ($)'] = df['Price ($)'].astype(int)


In [19]:
# Screen size is still a string column; cast it to float.
df['Screen Size (inches)'] = df['Screen Size (inches)'].astype(float)


In [20]:
# RAM was reduced to digit strings earlier; cast it to integers.
df['RAM '] = df['RAM '].astype(int)


In [21]:
# Confirm every column except 'Model' is now numeric.
df.dtypes

Brand                       int32
Model                      object
Storage                     int32
RAM                         int32
Screen Size (inches)      float64
Camera (MP)                 int32
Battery Capacity (mAh)      int64
Price ($)                   int32
Foldable                    int32
dtype: object

In [22]:
# Reorder the columns so the target, 'Price ($)', comes last; downstream cells
# split features from target by position.
cols = [
    'Brand',
    'Model',
    'Storage ',
    'RAM ',
    'Screen Size (inches)',
    'Camera (MP)',
    'Battery Capacity (mAh)',
    'Foldable',
    'Price ($)',
]
df = df.loc[:, cols]

In [23]:
# Show the frame with 'Price ($)' moved to the last column.
df

Unnamed: 0,Brand,Model,Storage,RAM,Screen Size (inches),Camera (MP),Battery Capacity (mAh),Foldable,Price ($)
0,0,iPhone 13 Pro,128,6,6.1,15,3095,0,999
1,12,Galaxy S21 Ultra,256,12,6.8,0,5000,0,1199
2,9,9 Pro,128,8,6.7,53,4500,0,899
3,15,Redmi Note 10 Pro,128,6,6.6,119,5020,0,279
4,4,Pixel 6,128,8,6.4,87,4614,0,799
...,...,...,...,...,...,...,...,...,...
402,12,Galaxy Note20 5G,128,8,6.7,19,4300,0,1049
403,15,Mi 10 Lite 5G,128,6,6.5,70,4160,0,349
404,0,iPhone 12 Pro Max,128,6,6.7,17,3687,0,1099
405,10,Reno3,128,8,6.4,59,4025,0,429


The dataset is now cleaned and ready to be used to build ML models that predict smartphone prices.

## Type 1: MULTIPLE LINEAR REGRESSION

In [24]:
# Features are every column except the last; the last column is the
# regression target.
target_position = df.shape[1] - 1
x = df.iloc[:, :target_position]
y = df.iloc[:, target_position]


In [25]:
# 'Model' holds the phone's name as text, not a numeric feature, so leave it
# out of the feature matrix.
x = x.drop(columns=['Model'])
x


Unnamed: 0,Brand,Storage,RAM,Screen Size (inches),Camera (MP),Battery Capacity (mAh),Foldable
0,0,128,6,6.1,15,3095,0
1,12,256,12,6.8,0,5000,0
2,9,128,8,6.7,53,4500,0
3,15,128,6,6.6,119,5020,0
4,4,128,8,6.4,87,4614,0
...,...,...,...,...,...,...,...
402,12,128,8,6.7,19,4300,0
403,15,128,6,6.5,70,4160,0
404,0,128,6,6.7,17,3687,0
405,10,128,8,6.4,59,4025,0


In [26]:
# Hold out 25% of the rows for testing. A fixed random_state makes the split —
# and therefore the scores reported for this model — reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.25, random_state=10
)

In [27]:
# Re-display the first rows of the cleaned frame for reference.
df.head()

Unnamed: 0,Brand,Model,Storage,RAM,Screen Size (inches),Camera (MP),Battery Capacity (mAh),Foldable,Price ($)
0,0,iPhone 13 Pro,128,6,6.1,15,3095,0,999
1,12,Galaxy S21 Ultra,256,12,6.8,0,5000,0,1199
2,9,9 Pro,128,8,6.7,53,4500,0,899
3,15,Redmi Note 10 Pro,128,6,6.6,119,5020,0,279
4,4,Pixel 6,128,8,6.4,87,4614,0,799


In [28]:
# NOTE(review): disabled scaling experiment. As written, fit_transform() is
# called without any data, so this would raise if simply uncommented.
# scaler = MinMaxScaler()
# df_norm = pd.DataFrame(scaler.fit_transform(), columns=cols)

In [29]:
# Fit a linear regression model on the training split.
# LinearRegression is already imported in the notebook's import cell, so it is
# not re-imported here.
regressor = LinearRegression()
regressor.fit(x_train, y_train)

In [30]:
# Predict prices for the held-out rows.
# NOTE(review): y_pred is not referenced by any later cell shown here; the
# scores below are computed with regressor.score() directly.
y_pred = regressor.predict(x_test)  

A simple multiple linear regression model has been created and the model scores are as follows:

In [31]:
# Compare the model's score() (R^2 for scikit-learn regressors) on the data it
# was fitted on versus the held-out split.
train_r2 = regressor.score(x_train, y_train)
test_r2 = regressor.score(x_test, y_test)
print('Train Score: ', train_r2)
print('Test Score: ', test_r2)

Train Score:  0.7785007177581473
Test Score:  0.7227147863966661


## Type 2: K-NEAREST NEIGHBORS

In [32]:
# Re-split with a fixed seed, this time holding out 20% of the rows.
# This overwrites the train/test variables used by the linear regression above.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=10,
)
x_train.head()

Unnamed: 0,Brand,Storage,RAM,Screen Size (inches),Camera (MP),Battery Capacity (mAh),Foldable
27,10,128,6,6.5,48,5000,0
279,11,64,4,6.5,40,6000,0
105,10,32,3,6.5,35,5000,0
358,15,128,4,6.5,70,6000,0
349,7,64,4,6.4,43,5000,0


In [33]:
from sklearn.neighbors import KNeighborsRegressor

# Fit a 3-nearest-neighbour regressor and predict prices for the test rows.
# NOTE(review): the features are passed in unscaled (the MinMaxScaler cell is
# commented out), so large-valued columns such as battery capacity will weigh
# most heavily in the distance computation — confirm this is intended.
knn = KNeighborsRegressor(leaf_size=25, n_neighbors=3).fit(x_train, y_train)
y_pred_knn = knn.predict(x_test)

In [34]:
# Show the KNN price predictions for the test rows.
y_pred_knn

array([ 205.66666667,  999.        ,  202.33333333,  349.        ,
        292.33333333,  182.33333333,  265.66666667,  932.33333333,
        162.33333333,  482.33333333,  349.        ,  895.66666667,
        622.33333333,  145.66666667,  482.33333333,  365.66666667,
        399.        ,  129.        ,  289.        ,  309.        ,
        365.66666667,  765.66666667,  179.        ,  309.        ,
        132.33333333,  185.66666667,  185.66666667,  152.33333333,
        549.        ,  392.33333333,  265.66666667,  179.        ,
        195.66666667,  129.        ,  349.        ,  559.        ,
        259.        ,  122.33333333,  575.66666667,  202.33333333,
        282.33333333,  532.33333333,  399.        ,  195.66666667,
        349.        ,  179.        ,  199.        ,  172.33333333,
        515.66666667,  309.        ,  145.66666667,  435.66666667,
        799.        ,  282.33333333,  382.33333333,  159.        ,
        622.33333333,  499.        ,  559.        ,  799.     

In [35]:
# R^2 of the KNN predictions against the true test prices, printed as a
# percentage. r2_score comes from the wildcard sklearn.metrics import.
score = r2_score(y_true=y_test, y_pred=y_pred_knn)
print("The R2 score is:", 100 * score)

The R2 score is: 83.36826483312514


In [36]:
import pickle

# Persist the fitted KNN regressor to disk.
# NOTE(review): the file name 'Modek_KNN' looks like a typo for 'Model_KNN';
# left unchanged in case other code loads the model by this exact name.
with open('Modek_KNN', mode='wb') as model_file:
    pickle.dump(knn, model_file)

The models have been implemented with decent R² scores for predicting smartphone prices from the specifications of each smartphone.

### Thank you