## Data Analysis

In [1]:
import numpy as np
import pandas as pd

In [2]:
data = pd.read_csv("fdm_dataset.csv")
data.head()

Unnamed: 0,Phone_ID,Brand,Ratings,RAM,ROM,Mobile_Size,Primary_Cam,Selfi_Cam,Battery_Power,Price($)
0,1,Oppo A17,4.3,4.0,64.0,6.4,64.0,48.0,6500.0,649
1,2,Vivo Y81,3.4,4.0,48.0,5.8,64.0,48.0,5300.0,750
2,3,Redmi A9,4.3,4.0,64.0,6.1,48.0,32.0,4000.0,449
3,4,Samsung Galaxy A50,4.4,4.0,64.0,6.4,48.0,32.0,5500.0,609
4,5,Nokia 415,4.5,2.0,24.0,3.7,16.0,12.0,3000.0,249


In [3]:
data.shape

(1365, 10)

In [4]:
# Check for duplicated values
data.duplicated().sum()
# Remove Duplicate rows
# data = data.drop_duplicates()

0

In [5]:
data.isnull().sum()

Phone_ID           0
Brand              0
Ratings           45
RAM              101
ROM               90
Mobile_Size       94
Primary_Cam      109
Selfi_Cam        101
Battery_Power    117
Price($)           0
dtype: int64

In [6]:
# Details about dataset
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1365 entries, 0 to 1364
Data columns (total 10 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Phone_ID       1365 non-null   int64  
 1   Brand          1365 non-null   object 
 2   Ratings        1320 non-null   float64
 3   RAM            1264 non-null   float64
 4   ROM            1275 non-null   float64
 5   Mobile_Size    1271 non-null   float64
 6   Primary_Cam    1256 non-null   float64
 7   Selfi_Cam      1264 non-null   float64
 8   Battery_Power  1248 non-null   float64
 9   Price($)       1365 non-null   int64  
dtypes: float64(7), int64(2), object(1)
memory usage: 106.8+ KB


In [7]:
# Most related columns to price column
data.corr(numeric_only = True)["Price($)"]

Phone_ID         0.139471
Ratings         -0.026222
RAM              0.529272
ROM              0.613654
Mobile_Size      0.822939
Primary_Cam      0.814120
Selfi_Cam        0.711292
Battery_Power    0.872684
Price($)         1.000000
Name: Price($), dtype: float64

In [8]:
data['Brand'].value_counts()

Brand
  Nokia  415              129
  Oppo  A9                126
  Nokia  216              124
  Redmi A9                122
  Vivo  Y81               119
Oppo  A17                 111
  Vivo  V21               108
Redmi K20                  94
  Samsung  Galaxy A50      90
  Apple iPhone 11 Pro      75
  Samsung  Galaxy A70      75
  Apple iPhone 12          72
  Samsung  Galaxy S20+     64
  Apple iPhone 11          56
Name: count, dtype: int64

In [9]:
# Number of columns
len(data['Brand'].value_counts())

14

In [10]:
data.describe()

Unnamed: 0,Phone_ID,Ratings,RAM,ROM,Mobile_Size,Primary_Cam,Selfi_Cam,Battery_Power,Price($)
count,1365.0,1320.0,1264.0,1275.0,1271.0,1256.0,1264.0,1248.0,1365.0
mean,683.0,4.095985,5.085443,85.697255,5.711015,46.535032,32.591772,5119.711538,650.310623
std,394.18587,0.369362,1.826776,51.193289,1.059151,17.864841,13.90994,1362.969745,235.74502
min,1.0,2.8,2.0,16.0,3.4,12.0,8.0,2500.0,149.0
25%,342.0,3.8,4.0,64.0,5.8,48.0,24.0,4000.0,499.0
50%,683.0,4.1,4.0,64.0,6.1,48.0,32.0,5500.0,719.0
75%,1024.0,4.4,6.0,128.0,6.4,64.0,48.0,6500.0,849.0
max,1365.0,4.8,8.0,256.0,6.5,64.0,48.0,7000.0,999.0


In [11]:
data.groupby('Brand')['RAM'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
Brand,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Apple iPhone 11,52.0,4.538462,0.895775,4.0,4.0,4.0,6.0,6.0
Apple iPhone 11 Pro,64.0,4.53125,0.890314,4.0,4.0,4.0,6.0,6.0
Apple iPhone 12,64.0,5.125,1.0,4.0,4.0,6.0,6.0,6.0
Nokia 216,118.0,3.355932,0.938497,2.0,2.0,4.0,4.0,4.0
Nokia 415,122.0,3.229508,1.074012,2.0,2.0,4.0,4.0,8.0
Oppo A9,118.0,6.779661,1.849627,4.0,4.0,8.0,8.0,8.0
Redmi A9,117.0,5.316239,0.95276,4.0,4.0,6.0,6.0,6.0
Samsung Galaxy A50,84.0,5.857143,2.006873,4.0,4.0,4.0,8.0,8.0
Samsung Galaxy A70,69.0,4.869565,1.661971,4.0,4.0,4.0,4.0,8.0
Samsung Galaxy S20+,57.0,6.035088,2.017468,4.0,4.0,8.0,8.0,8.0


In [12]:
# Fill null values in the 'RAM' column with the mean 'RAM' value for the corresponding 'Brand' group
# data['RAM'] = data['RAM'].fillna(data.groupby('Brand')['RAM'].transform('mean'))
data['RAM'] = data['RAM'].fillna(data['RAM'].mean())

In [13]:
data

Unnamed: 0,Phone_ID,Brand,Ratings,RAM,ROM,Mobile_Size,Primary_Cam,Selfi_Cam,Battery_Power,Price($)
0,1,Oppo A17,4.3,4.0,64.0,6.4,64.0,48.0,6500.0,649
1,2,Vivo Y81,3.4,4.0,48.0,5.8,64.0,48.0,5300.0,750
2,3,Redmi A9,4.3,4.0,64.0,6.1,48.0,32.0,4000.0,449
3,4,Samsung Galaxy A50,4.4,4.0,64.0,6.4,48.0,32.0,5500.0,609
4,5,Nokia 415,4.5,2.0,24.0,3.7,16.0,12.0,3000.0,249
...,...,...,...,...,...,...,...,...,...,...
1360,1361,Apple iPhone 12,3.5,6.0,,6.2,36.0,,,849
1361,1362,Apple iPhone 11,4.3,6.0,256.0,,48.0,24.0,5500.0,999
1362,1363,Nokia 415,4.3,4.0,32.0,3.7,16.0,12.0,3000.0,299
1363,1364,Nokia 216,3.8,4.0,32.0,3.4,12.0,8.0,2500.0,199


In [14]:
data.isnull().sum()

Phone_ID           0
Brand              0
Ratings           45
RAM                0
ROM               90
Mobile_Size       94
Primary_Cam      109
Selfi_Cam        101
Battery_Power    117
Price($)           0
dtype: int64

In [15]:
# Fill null values in the 'Mobile_Size' column with the mean 'Mobile_Size' value for the corresponding 'Brand' group
data['Mobile_Size'] = data['Mobile_Size'].fillna(data['Mobile_Size'].mean())

In [16]:
# Fill null values in the 'Battery_Power' column with the mean 'Battery_Power' value for the corresponding 'Brand' group
data['Battery_Power'] = data['Battery_Power'].fillna(data['Battery_Power'].mean())

In [17]:
# fill null values in the 'ROM' column based on the mean 'ROM' value for each unique combination of 'Brand' and 'ROM'
data['ROM'] = data['ROM'].fillna(data['ROM'].mean())

In [18]:
# Fill null values in the 'Primary_Cam' column with the mean 'Primary_Cam' value for the corresponding 'Brand' group
data['Primary_Cam'] = data['Primary_Cam'].fillna(data['Primary_Cam'].mean())

In [19]:
# Fill null values in the 'Selfi_Cam' column with the mean 'Selfi_Cam' value for the corresponding 'Brand' group
data['Selfi_Cam'] = data['Selfi_Cam'].fillna(data['Selfi_Cam'].mean())

In [20]:
data.isnull().sum()

Phone_ID          0
Brand             0
Ratings          45
RAM               0
ROM               0
Mobile_Size       0
Primary_Cam       0
Selfi_Cam         0
Battery_Power     0
Price($)          0
dtype: int64

In [21]:
# data['Company'] = data['Brand'].apply(lambda x:" ".join(x.split()[0:1]))
data['Company'] = data['Brand'].apply(lambda x:x.split()[0])

In [22]:
data['Company'].value_counts()

Company
Nokia      253
Oppo       237
Samsung    229
Vivo       227
Redmi      216
Apple      203
Name: count, dtype: int64

In [23]:
data.head()

Unnamed: 0,Phone_ID,Brand,Ratings,RAM,ROM,Mobile_Size,Primary_Cam,Selfi_Cam,Battery_Power,Price($),Company
0,1,Oppo A17,4.3,4.0,64.0,6.4,64.0,48.0,6500.0,649,Oppo
1,2,Vivo Y81,3.4,4.0,48.0,5.8,64.0,48.0,5300.0,750,Vivo
2,3,Redmi A9,4.3,4.0,64.0,6.1,48.0,32.0,4000.0,449,Redmi
3,4,Samsung Galaxy A50,4.4,4.0,64.0,6.4,48.0,32.0,5500.0,609,Samsung
4,5,Nokia 415,4.5,2.0,24.0,3.7,16.0,12.0,3000.0,249,Nokia


In [24]:
data = data.drop(columns=['Ratings'])

In [25]:
data = data.drop(columns=['Phone_ID'])

In [26]:
data.head()

Unnamed: 0,Brand,RAM,ROM,Mobile_Size,Primary_Cam,Selfi_Cam,Battery_Power,Price($),Company
0,Oppo A17,4.0,64.0,6.4,64.0,48.0,6500.0,649,Oppo
1,Vivo Y81,4.0,48.0,5.8,64.0,48.0,5300.0,750,Vivo
2,Redmi A9,4.0,64.0,6.1,48.0,32.0,4000.0,449,Redmi
3,Samsung Galaxy A50,4.0,64.0,6.4,48.0,32.0,5500.0,609,Samsung
4,Nokia 415,2.0,24.0,3.7,16.0,12.0,3000.0,249,Nokia


In [27]:
my_data = data.drop(columns=['Brand'])

In [28]:
data.head()

Unnamed: 0,Brand,RAM,ROM,Mobile_Size,Primary_Cam,Selfi_Cam,Battery_Power,Price($),Company
0,Oppo A17,4.0,64.0,6.4,64.0,48.0,6500.0,649,Oppo
1,Vivo Y81,4.0,48.0,5.8,64.0,48.0,5300.0,750,Vivo
2,Redmi A9,4.0,64.0,6.1,48.0,32.0,4000.0,449,Redmi
3,Samsung Galaxy A50,4.0,64.0,6.4,48.0,32.0,5500.0,609,Samsung
4,Nokia 415,2.0,24.0,3.7,16.0,12.0,3000.0,249,Nokia


In [29]:
my_data.head()

Unnamed: 0,RAM,ROM,Mobile_Size,Primary_Cam,Selfi_Cam,Battery_Power,Price($),Company
0,4.0,64.0,6.4,64.0,48.0,6500.0,649,Oppo
1,4.0,48.0,5.8,64.0,48.0,5300.0,750,Vivo
2,4.0,64.0,6.1,48.0,32.0,4000.0,449,Redmi
3,4.0,64.0,6.4,48.0,32.0,5500.0,609,Samsung
4,2.0,24.0,3.7,16.0,12.0,3000.0,249,Nokia


In [30]:
# one hot encoding
my_data = pd.get_dummies(my_data)

In [31]:
my_data.head()

Unnamed: 0,RAM,ROM,Mobile_Size,Primary_Cam,Selfi_Cam,Battery_Power,Price($),Company_Apple,Company_Nokia,Company_Oppo,Company_Redmi,Company_Samsung,Company_Vivo
0,4.0,64.0,6.4,64.0,48.0,6500.0,649,False,False,True,False,False,False
1,4.0,48.0,5.8,64.0,48.0,5300.0,750,False,False,False,False,False,True
2,4.0,64.0,6.1,48.0,32.0,4000.0,449,False,False,False,True,False,False
3,4.0,64.0,6.4,48.0,32.0,5500.0,609,False,False,False,False,True,False
4,2.0,24.0,3.7,16.0,12.0,3000.0,249,False,True,False,False,False,False


In [32]:
my_data.shape

(1365, 13)

## Model Building

In [33]:
# split data set into x and y
X = my_data.drop('Price($)', axis=1)
y = my_data['Price($)']

In [34]:
!pip install sklearn




[notice] A new release of pip is available: 23.2.1 -> 23.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [35]:
# split data sets into train and test
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

In [36]:
x_train.shape, x_test.shape

((1023, 12), (342, 12))

In [37]:
# Create a function to find accuray of a given model (with default hyper parameters)
def model_acc(model):
    model.fit(x_train, y_train)
    acc = model.score(x_test, y_test)
    print(str(model) + '-->' + str(acc))

In [38]:
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
model_acc(lr)

from sklearn.linear_model import Lasso
lasso = Lasso()
model_acc(lasso)

from sklearn.tree import DecisionTreeRegressor
dt = DecisionTreeRegressor()
model_acc(dt)

from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor()
model_acc(rf)

LinearRegression()-->0.926711674247532
Lasso()-->0.9255865544132236
DecisionTreeRegressor()-->0.9889885786420793
RandomForestRegressor()-->0.9925556495242878


In [39]:
# Hyper parameter tunning
from sklearn.model_selection import GridSearchCV
# parameters = {
#     'criterion': ['mse', 'friedman_mse', 'mae'],  # You can adjust these criteria
#     'splitter': ['best', 'random'],  # Splitting strategy
#     'max_depth': [None, 10, 20, 30],  # Maximum depth of the tree
#     'min_samples_split': [2, 5, 10],  # Minimum number of samples required to split an internal node
#     'min_samples_leaf': [1, 2, 4]  # Minimum number of samples required to be at a leaf node
# }
parameters = {
        'criterion':['squared_error','absolute_error','poisson'],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'splitter': ['best', 'random'],
}

# grid_obj = GridSearchCV(estimator = dt, param_grid= parameters, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
grid_obj = GridSearchCV(estimator = dt, param_grid= parameters)

grid_fit = grid_obj.fit(x_train, y_train)
best_params = grid_fit.best_params_
my_model = grid_fit.best_estimator_

print(best_params)
print(my_model)

{'criterion': 'squared_error', 'max_depth': 20, 'min_samples_leaf': 1, 'min_samples_split': 2, 'splitter': 'random'}
DecisionTreeRegressor(max_depth=20, splitter='random')


In [40]:
# get accuracy when we train the model by best model
my_model.score(x_train, y_train)

0.9986292430355115

In [41]:
# save the model using pickle
import pickle
with open("my_predictor.pickle", 'wb') as file:
    pickle.dump(my_model, file)

In [42]:
x_train.columns

Index(['RAM', 'ROM', 'Mobile_Size', 'Primary_Cam', 'Selfi_Cam',
       'Battery_Power', 'Company_Apple', 'Company_Nokia', 'Company_Oppo',
       'Company_Redmi', 'Company_Samsung', 'Company_Vivo'],
      dtype='object')

In [43]:
# get a prediction using my model
my_model.predict([[4, 64, 6.1, 64, 48, 4000, 0, 0, 1, 0, 0, 0]])



array([649.])

In [44]:
my_model.predict([[4, 64, 6.1, 64, 48, 4000, 1, 0, 0, 0, 0, 0]])



array([899.])