## Data Analysis

In [46]:
import numpy as np
import pandas as pd

In [47]:
data = pd.read_csv("fdm_dataset.csv")
data.head()

Unnamed: 0,Brand,Ratings,RAM,ROM,Mobile_Size,Primary_Cam,Selfi_Cam,Battery_Power,Price($)
0,Oppo A17,4.3,4.0,64.0,6.4,64.0,48.0,6500.0,649
1,Vivo Y81,3.4,4.0,48.0,5.8,64.0,48.0,5300.0,750
2,Redmi A9,4.3,4.0,64.0,6.1,48.0,32.0,4000.0,449
3,Samsung Galaxy A50,4.4,4.0,64.0,6.4,48.0,32.0,5500.0,609
4,Nokia 415,4.5,2.0,24.0,3.7,16.0,12.0,3000.0,249


In [48]:
data.shape

(1365, 9)

In [49]:
data.isnull().sum()

Brand              0
Ratings           45
RAM              101
ROM               90
Mobile_Size       94
Primary_Cam      109
Selfi_Cam        101
Battery_Power    117
Price($)           0
dtype: int64

In [50]:
# Remove Duplicate rows
data = data.drop_duplicates()
data.shape

(799, 9)

In [51]:
data.isnull().sum()

Brand              0
Ratings           29
RAM               95
ROM               86
Mobile_Size       86
Primary_Cam      104
Selfi_Cam         95
Battery_Power    104
Price($)           0
dtype: int64

In [52]:
# Details about dataset
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 799 entries, 0 to 1361
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Brand          799 non-null    object 
 1   Ratings        770 non-null    float64
 2   RAM            704 non-null    float64
 3   ROM            713 non-null    float64
 4   Mobile_Size    713 non-null    float64
 5   Primary_Cam    695 non-null    float64
 6   Selfi_Cam      704 non-null    float64
 7   Battery_Power  695 non-null    float64
 8   Price($)       799 non-null    int64  
dtypes: float64(7), int64(1), object(1)
memory usage: 62.4+ KB


In [53]:
# Most related columns to price column
data.corr(numeric_only = True)["Price($)"]

Ratings         -0.033362
RAM              0.500812
ROM              0.597054
Mobile_Size      0.792954
Primary_Cam      0.781875
Selfi_Cam        0.641601
Battery_Power    0.848447
Price($)         1.000000
Name: Price($), dtype: float64

In [54]:
data['Brand'].value_counts()

Brand
  Vivo  V21               72
  Vivo  Y81               68
  Redmi A9                64
  Nokia  415              64
  Oppo  A17               63
Redmi K20                 62
  Oppo  A9                60
  Nokia  216              57
  Samsung  Galaxy A50     53
  Apple iPhone 11 Pro     52
  Samsung  Galaxy A70     51
  Apple iPhone 12         45
  Apple iPhone 11         44
  Samsung  Galaxy S20+    44
Name: count, dtype: int64

In [10]:
# Number of columns
len(data['Brand'].value_counts())

14

In [11]:
data.describe()

Unnamed: 0,Ratings,RAM,ROM,Mobile_Size,Primary_Cam,Selfi_Cam,Battery_Power,Price($)
count,770.0,704.0,713.0,713.0,695.0,704.0,695.0,799.0
mean,4.073247,5.082386,89.402525,5.808135,47.539568,33.255682,5237.985612,675.405507
std,0.387967,1.833473,54.297628,0.973232,16.679236,13.175799,1279.1597,227.329116
min,2.8,2.0,16.0,3.4,12.0,8.0,2500.0,149.0
25%,3.8,4.0,64.0,6.0,48.0,24.0,5000.0,609.0
50%,4.1,4.0,64.0,6.1,48.0,32.0,5500.0,749.0
75%,4.4,6.0,128.0,6.4,64.0,48.0,6500.0,849.0
max,4.8,8.0,256.0,6.5,64.0,48.0,7000.0,999.0


In [12]:
data.groupby('Brand')['RAM'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
Brand,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Apple iPhone 11,40.0,4.65,0.948683,4.0,4.0,4.0,6.0,6.0
Apple iPhone 11 Pro,41.0,4.536585,0.89715,4.0,4.0,4.0,6.0,6.0
Apple iPhone 12,38.0,5.0,1.013423,4.0,4.0,5.0,6.0,6.0
Nokia 216,51.0,3.215686,0.986179,2.0,2.0,4.0,4.0,4.0
Nokia 415,59.0,3.084746,1.193201,2.0,2.0,4.0,4.0,8.0
Oppo A17,52.0,5.307692,1.894662,4.0,4.0,4.0,8.0,8.0
Oppo A9,53.0,6.188679,2.010134,4.0,4.0,8.0,8.0,8.0
Redmi A9,59.0,5.152542,0.99678,4.0,4.0,6.0,6.0,6.0
Samsung Galaxy A50,48.0,6.0,2.021165,4.0,4.0,6.0,8.0,8.0
Samsung Galaxy A70,45.0,4.977778,1.738454,4.0,4.0,4.0,4.0,8.0


In [13]:
# Fill null values in the 'RAM' column with the mean 'RAM' value for the corresponding 'Brand' group
data['RAM'] = data['RAM'].fillna(data.groupby('Brand')['RAM'].transform('mean'))

In [14]:
data

Unnamed: 0,Brand,Ratings,RAM,ROM,Mobile_Size,Primary_Cam,Selfi_Cam,Battery_Power,Price($)
0,Oppo A17,4.3,4.000000,64.0,6.4,64.0,48.0,6500.0,649
1,Vivo Y81,3.4,4.000000,48.0,5.8,64.0,48.0,5300.0,750
2,Redmi A9,4.3,4.000000,64.0,6.1,48.0,32.0,4000.0,449
3,Samsung Galaxy A50,4.4,4.000000,64.0,6.4,48.0,32.0,5500.0,609
4,Nokia 415,4.5,2.000000,24.0,3.7,16.0,12.0,3000.0,249
...,...,...,...,...,...,...,...,...,...
1342,Samsung Galaxy A50,4.6,8.000000,,6.4,48.0,32.0,5500.0,649
1343,Apple iPhone 11 Pro,4.1,4.536585,128.0,6.3,64.0,48.0,6500.0,999
1344,Samsung Galaxy A50,4.6,8.000000,128.0,6.4,48.0,32.0,,649
1360,Apple iPhone 12,3.5,6.000000,,6.2,36.0,,,849


In [15]:
data.isnull().sum()

Brand              0
Ratings           29
RAM                0
ROM               86
Mobile_Size       86
Primary_Cam      104
Selfi_Cam         95
Battery_Power    104
Price($)           0
dtype: int64

In [16]:
# Fill null values in the 'Mobile_Size' column with the mean 'Mobile_Size' value for the corresponding 'Brand' group
data['Mobile_Size'] = data['Mobile_Size'].fillna(data.groupby('Brand')['Mobile_Size'].transform('mean'))

In [17]:
# Fill null values in the 'Battery_Power' column with the mean 'Battery_Power' value for the corresponding 'Brand' group
data['Battery_Power'] = data['Battery_Power'].fillna(data.groupby('Brand')['Battery_Power'].transform('mean'))

In [18]:
# fill null values in the 'ROM' column based on the mean 'ROM' value for each unique combination of 'Brand' and 'ROM'
data['ROM'] = data['ROM'].fillna(data.groupby(['Brand','RAM'])['ROM'].transform('mean'))

In [19]:
# Fill null values in the 'Primary_Cam' column with the mean 'Primary_Cam' value for the corresponding 'Brand' group
data['Primary_Cam'] = data['Primary_Cam'].fillna(data.groupby('Brand')['Primary_Cam'].transform('mean'))

In [20]:
# Fill null values in the 'Selfi_Cam' column with the mean 'Selfi_Cam' value for the corresponding 'Brand' group
data['Selfi_Cam'] = data['Selfi_Cam'].fillna(data.groupby('Brand')['Selfi_Cam'].transform('mean'))

In [21]:
data.isnull().sum()

Brand             0
Ratings          29
RAM               0
ROM               0
Mobile_Size       0
Primary_Cam       0
Selfi_Cam         0
Battery_Power     0
Price($)          0
dtype: int64

In [22]:
data['Company'] = data['Brand'].apply(lambda x:" ".join(x.split()[0:1]))

In [23]:
data['Company'].value_counts()

Company
Samsung    148
Apple      141
Vivo       140
Redmi      126
Oppo       123
Nokia      121
Name: count, dtype: int64

In [24]:
data.head()

Unnamed: 0,Brand,Ratings,RAM,ROM,Mobile_Size,Primary_Cam,Selfi_Cam,Battery_Power,Price($),Company
0,Oppo A17,4.3,4.0,64.0,6.4,64.0,48.0,6500.0,649,Oppo
1,Vivo Y81,3.4,4.0,48.0,5.8,64.0,48.0,5300.0,750,Vivo
2,Redmi A9,4.3,4.0,64.0,6.1,48.0,32.0,4000.0,449,Redmi
3,Samsung Galaxy A50,4.4,4.0,64.0,6.4,48.0,32.0,5500.0,609,Samsung
4,Nokia 415,4.5,2.0,24.0,3.7,16.0,12.0,3000.0,249,Nokia


In [25]:
data = data.drop(columns=['Ratings'])

In [26]:
data.head()

Unnamed: 0,Brand,RAM,ROM,Mobile_Size,Primary_Cam,Selfi_Cam,Battery_Power,Price($),Company
0,Oppo A17,4.0,64.0,6.4,64.0,48.0,6500.0,649,Oppo
1,Vivo Y81,4.0,48.0,5.8,64.0,48.0,5300.0,750,Vivo
2,Redmi A9,4.0,64.0,6.1,48.0,32.0,4000.0,449,Redmi
3,Samsung Galaxy A50,4.0,64.0,6.4,48.0,32.0,5500.0,609,Samsung
4,Nokia 415,2.0,24.0,3.7,16.0,12.0,3000.0,249,Nokia


In [27]:
my_data = data.drop(columns=['Brand'])

In [28]:
data.head()

Unnamed: 0,Brand,RAM,ROM,Mobile_Size,Primary_Cam,Selfi_Cam,Battery_Power,Price($),Company
0,Oppo A17,4.0,64.0,6.4,64.0,48.0,6500.0,649,Oppo
1,Vivo Y81,4.0,48.0,5.8,64.0,48.0,5300.0,750,Vivo
2,Redmi A9,4.0,64.0,6.1,48.0,32.0,4000.0,449,Redmi
3,Samsung Galaxy A50,4.0,64.0,6.4,48.0,32.0,5500.0,609,Samsung
4,Nokia 415,2.0,24.0,3.7,16.0,12.0,3000.0,249,Nokia


In [29]:
my_data.head()

Unnamed: 0,RAM,ROM,Mobile_Size,Primary_Cam,Selfi_Cam,Battery_Power,Price($),Company
0,4.0,64.0,6.4,64.0,48.0,6500.0,649,Oppo
1,4.0,48.0,5.8,64.0,48.0,5300.0,750,Vivo
2,4.0,64.0,6.1,48.0,32.0,4000.0,449,Redmi
3,4.0,64.0,6.4,48.0,32.0,5500.0,609,Samsung
4,2.0,24.0,3.7,16.0,12.0,3000.0,249,Nokia


In [30]:
# one hot encoding
my_data = pd.get_dummies(my_data)

In [31]:
my_data.head()

Unnamed: 0,RAM,ROM,Mobile_Size,Primary_Cam,Selfi_Cam,Battery_Power,Price($),Company_Apple,Company_Nokia,Company_Oppo,Company_Redmi,Company_Samsung,Company_Vivo
0,4.0,64.0,6.4,64.0,48.0,6500.0,649,False,False,True,False,False,False
1,4.0,48.0,5.8,64.0,48.0,5300.0,750,False,False,False,False,False,True
2,4.0,64.0,6.1,48.0,32.0,4000.0,449,False,False,False,True,False,False
3,4.0,64.0,6.4,48.0,32.0,5500.0,609,False,False,False,False,True,False
4,2.0,24.0,3.7,16.0,12.0,3000.0,249,False,True,False,False,False,False


In [32]:
my_data.shape

(799, 13)

## Model Building

In [33]:
# split data set into x and y
x = my_data.drop('Price($)', axis=1)
y = my_data['Price($)']

In [34]:
!pip install sklearn




[notice] A new release of pip is available: 23.2.1 -> 23.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [35]:
# split data sets into train and test
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25)

In [36]:
x_train.shape, x_test.shape

((599, 12), (200, 12))

In [37]:
# Create a function to find accuray of a given model (with default hyper parameters)
def model_acc(model):
    model.fit(x_train, y_train)
    acc = model.score(x_test, y_test)
    print(str(model) + '-->' + str(acc))

In [38]:
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
model_acc(lr)

from sklearn.linear_model import Lasso
lasso = Lasso()
model_acc(lasso)

from sklearn.tree import DecisionTreeRegressor
dt = DecisionTreeRegressor()
model_acc(dt)

from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor()
model_acc(rf)

LinearRegression()-->0.9841569615544035
Lasso()-->0.9854502689868402
DecisionTreeRegressor()-->0.9982337412700366
RandomForestRegressor()-->0.9982700059367957


In [39]:
# Hyper parameter tunning
from sklearn.model_selection import GridSearchCV

# parameters = {
#     'criterion': ['mse', 'friedman_mse', 'mae'],  # You can adjust these criteria
#     'splitter': ['best', 'random'],  # Splitting strategy
#     'max_depth': [None, 10, 20, 30],  # Maximum depth of the tree
#     'min_samples_split': [2, 5, 10],  # Minimum number of samples required to split an internal node
#     'min_samples_leaf': [1, 2, 4]  # Minimum number of samples required to be at a leaf node
# }
parameters = {'criterion':['squared_error','absolute_error','poisson'],
             'max_depth': [None, 10, 20, 30],
             'min_samples_split': [2, 5, 10],
             'min_samples_leaf': [1, 2, 4],
             'splitter': ['best', 'random'],}

# grid_obj = GridSearchCV(estimator = dt, param_grid= parameters, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
grid_obj = GridSearchCV(estimator = dt, param_grid= parameters)

grid_fit = grid_obj.fit(x_train, y_train)
best_params = grid_fit.best_params_
my_model = grid_fit.best_estimator_

print(best_params)
print(my_model)

{'criterion': 'squared_error', 'max_depth': 30, 'min_samples_leaf': 1, 'min_samples_split': 2, 'splitter': 'random'}
DecisionTreeRegressor(max_depth=30, splitter='random')


In [40]:
# get accuracy when we train the model by best model
my_model.score(x_train, y_train)

0.9975319977140725

In [41]:
# save the model using pickle
import pickle
with open("my_predictor.pickle", 'wb') as file:
    pickle.dump(my_model, file)

In [42]:
x_train.columns

Index(['RAM', 'ROM', 'Mobile_Size', 'Primary_Cam', 'Selfi_Cam',
       'Battery_Power', 'Company_Apple', 'Company_Nokia', 'Company_Oppo',
       'Company_Redmi', 'Company_Samsung', 'Company_Vivo'],
      dtype='object')

In [43]:
# get a prediction using my model
my_model.predict([[4, 64, 6.1, 64, 48, 4000, 0, 0, 1, 0, 0, 0]])



array([449.])

In [44]:
my_model.predict([[4, 64, 6.1, 64, 48, 4000, False, False, False, False, True, False]])



array([449.])

In [45]:
my_model.predict([[4, 64, 6.1, 64, 48, 4000, 0, 0, 0, 1, 0, 0]])



array([449.])