In [3]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error

In [2]:
# Load the mobile-phone dataset (path is relative to the notebook's working
# directory) and preview the first two rows.
df = pd.read_csv("mobiles.csv")
df.head(2)

Unnamed: 0,screen_size,ROM,RAM,num_rear_camera,num_front_camera,battery_capacity,ratings,num_of_ratings,sales_price,discount_percent,sales
0,Very Small,64,2,1,1,1800,4.5,38645,32999,0.17,127.52
1,Small,64,4,2,1,2815,4.5,244,57149,0.04,1.39


### Q1.

In [7]:
# Toy series used in the next two cells to compare the standard-deviation
# results of NumPy and pandas.
ser = pd.Series([1, 3, 5, 8, 12])

In [8]:
# NumPy's default is the population standard deviation (divide by n),
# which gives 3.8678... for this series.
np.std(ser)

3.8678159211627436

In [10]:
# pandas' default is the sample standard deviation (divide by n - 1),
# hence the larger 4.3243... for the same data.
ser.std()

4.324349662087931

In [12]:
# Q1 works on the hardware-spec columns plus the sales figure only.
# Take a copy so nothing done to df_q1 can touch the original frame.
q1_columns = ["ROM", "RAM", "num_front_camera", "num_rear_camera", "battery_capacity", "sales"]
df_q1 = df.loc[:, q1_columns].copy()
df_q1.head(2)

Unnamed: 0,ROM,RAM,num_front_camera,num_rear_camera,battery_capacity,sales
0,64,2,1,1,1800,127.52
1,64,4,1,2,2815,1.39


In [14]:
# Upper cutoff for "high sales": mean plus two sample standard deviations.
sales_q1 = df_q1["sales"]
stat_mean, stat_std = sales_q1.mean(), sales_q1.std()
stat_out = stat_mean + 2 * stat_std
stat_out

146.55150129273218

In [15]:
# Keep only the rows whose sales exceed the cutoff, renumbered from zero,
# and report how many there are.
is_high_sales = df_q1["sales"].gt(stat_out)
df_q1_sub = df_q1[is_high_sales].reset_index(drop=True)
df_q1_sub.shape[0]

16

In [16]:
# Composite spec index per phone:
#   ROM / 32 + RAM / 2 + (front + rear camera count) + battery_capacity / 1000
# The terms are summed in the same left-to-right order as the formula above.
storage_score = df_q1_sub["ROM"] / 32 + df_q1_sub["RAM"] / 2
camera_count = df_q1_sub["num_front_camera"] + df_q1_sub["num_rear_camera"]
battery_score = df_q1_sub["battery_capacity"] / 1000
df_q1_sub["idx"] = storage_score + camera_count + battery_score

In [17]:
# Spot-check the new "idx" column on the first two high-sales rows.
df_q1_sub.head(2)

Unnamed: 0,ROM,RAM,num_front_camera,num_rear_camera,battery_capacity,sales,idx
0,128,6,1,2,4000,231.79,14.0
1,128,6,2,4,4500,427.22,17.5


In [18]:
# Q1 answer: mean spec index of the high-sales phones, to two decimals.
round(df_q1_sub["idx"].mean(), 2)

11.01

### Q2.

In [21]:
# Q2 looks only at phones whose rear-camera count is not 1, and only at the
# sales figure plus the numeric columns it will be correlated against.
q2_columns = ["sales", "battery_capacity", "ratings", "num_of_ratings", "sales_price", "discount_percent"]
not_single_rear = df["num_rear_camera"].ne(1)
df_q2 = df.loc[not_single_rear, q2_columns]
df_q2.head(2)

Unnamed: 0,sales,battery_capacity,ratings,num_of_ratings,sales_price,discount_percent
1,1.39,2815,4.5,244,57149,0.04
4,5.15,2815,4.6,745,69149,0.02


In [23]:
# Full pairwise correlation matrix of the Q2 columns, rounded for display.
df_q2.corr().round(2)

Unnamed: 0,sales,battery_capacity,ratings,num_of_ratings,sales_price,discount_percent
sales,1.0,0.03,0.23,0.95,-0.25,0.22
battery_capacity,0.03,1.0,-0.42,0.03,-0.5,0.26
ratings,0.23,-0.42,1.0,0.19,0.15,-0.12
num_of_ratings,0.95,0.03,0.19,1.0,-0.26,0.21
sales_price,-0.25,-0.5,0.15,-0.26,1.0,-0.1
discount_percent,0.22,0.26,-0.12,0.21,-0.1,1.0


In [27]:
# Largest absolute correlation with sales. iloc[1:] skips the first entry,
# which is sales' correlation with itself because "sales" is the first column.
df_q2.corr()["sales"].iloc[1:].round(2).abs().max()

0.95

In [29]:
# Signed correlation of every other column with sales. The first entry is
# dropped by position because it is sales' correlation with itself.
sales_corr = df_q2.corr()["sales"]
sales_corr.iloc[1:]

battery_capacity    0.025680
ratings             0.226075
num_of_ratings      0.949114
sales_price        -0.247760
discount_percent    0.223471
Name: sales, dtype: float64

In [31]:
# Name of the column most strongly correlated with sales.
# Ranking uses the absolute value, so a strong negative correlation can win too;
# a plain idxmax() on the signed values would only find the largest positive one.
# The self-correlation is dropped by label rather than by position, so this does
# not depend on "sales" being the first column.
df_q2.corr()["sales"].drop("sales").abs().idxmax()

'num_of_ratings'

In [36]:
# Reshape the correlation matrix to long form (one row per ordered pair of
# columns) and show every pair except a column paired with itself.
df_q2_corr = df_q2.corr().reset_index().melt(id_vars="index")
off_diagonal = df_q2_corr["index"].ne(df_q2_corr["variable"])
df_q2_corr[off_diagonal]

Unnamed: 0,index,variable,value
1,battery_capacity,sales,0.02568
2,ratings,sales,0.226075
3,num_of_ratings,sales,0.949114
4,sales_price,sales,-0.24776
5,discount_percent,sales,0.223471
6,sales,battery_capacity,0.02568
8,ratings,battery_capacity,-0.424129
9,num_of_ratings,battery_capacity,0.034902
10,sales_price,battery_capacity,-0.503019
11,discount_percent,battery_capacity,0.257373


### Q3.

In [38]:
# Q3 starts from an independent copy of the full dataset so the modelling
# steps below cannot alter df.
df_q3 = df.copy()
df_q3.head(2)

Unnamed: 0,screen_size,ROM,RAM,num_rear_camera,num_front_camera,battery_capacity,ratings,num_of_ratings,sales_price,discount_percent,sales
0,Very Small,64,2,1,1,1800,4.5,38645,32999,0.17,127.52
1,Small,64,4,2,1,2815,4.5,244,57149,0.04,1.39


In [39]:
# Row/column count before encoding, for comparison with the dummy-encoded frame.
df_q3.shape

(430, 11)

In [40]:
# One-hot encode the non-numeric columns. In this dataset that is only
# screen_size, which is replaced by five indicator columns (11 -> 15 columns).
df_q3_dum = pd.get_dummies(df_q3)
df_q3_dum.shape

(430, 15)

In [41]:
# Inspect the encoded frame: the screen_size_* indicators are appended at the end.
df_q3_dum.head(2)

Unnamed: 0,ROM,RAM,num_rear_camera,num_front_camera,battery_capacity,ratings,num_of_ratings,sales_price,discount_percent,sales,screen_size_Large,screen_size_Medium,screen_size_Small,screen_size_Very Large,screen_size_Very Small
0,64,2,1,1,1800,4.5,38645,32999,0.17,127.52,0,0,0,0,1
1,64,4,2,1,2815,4.5,244,57149,0.04,1.39,0,0,1,0,0


In [42]:
# Move the target column "sales" to position 0 and leave the other columns in
# their existing order. Later cells rely on column 0 being the target and
# columns 1: being the features. The row index is renumbered from zero.
feature_cols = [col for col in df_q3_dum.columns if col != "sales"]
df_q3_dum = df_q3_dum[["sales"] + feature_cols].reset_index(drop=True)
df_q3_dum.head(2)

Unnamed: 0,sales,ROM,RAM,num_rear_camera,num_front_camera,battery_capacity,ratings,num_of_ratings,sales_price,discount_percent,screen_size_Large,screen_size_Medium,screen_size_Small,screen_size_Very Large,screen_size_Very Small
0,127.52,64,2,1,1,1800,4.5,38645,32999,0.17,0,0,0,0,1
1,1.39,64,4,2,1,2815,4.5,244,57149,0.04,0,0,1,0,0


In [43]:
# 80/20 train/test split with a fixed seed so the partition is reproducible;
# the cell displays the resulting row counts.
df_train, df_test = train_test_split(
    df_q3_dum,
    train_size=0.8,
    random_state=123,
)
df_train.shape[0], df_test.shape[0]

(344, 86)

In [44]:
# Min-max scale every column using ranges learned from the training rows only,
# then apply that same mapping to both splits.
# The target column (position 0, "sales") is scaled as well, so any error
# computed from these arrays is in scaled units, not raw sales units.
model_nor = MinMaxScaler()
model_nor.fit(df_train)
df_train_nor = model_nor.transform(df_train)
df_test_nor = model_nor.transform(df_test)

In [48]:
# Baseline: k-nearest-neighbours regression with k = 3, scored by RMSE on the
# held-out split. Column 0 of the scaled arrays is the target (sales); the
# remaining columns are the features.
k = 3

X_train_nor, y_train_nor = df_train_nor[:, 1:], df_train_nor[:, 0]
X_test_nor, y_test_nor = df_test_nor[:, 1:], df_test_nor[:, 0]

model_knn = KNeighborsRegressor(n_neighbors=k).fit(X_train_nor, y_train_nor)
pred = model_knn.predict(X_test_nor)

# RMSE is the square root of the mean squared error (in scaled target units).
val_rmse = mean_squared_error(y_test_nor, pred) ** 0.5
val_rmse

0.08186677375964535

In [50]:
# Dry run of the sweep: print each candidate neighbour count, one per line.
candidate_ks = (3, 5, 7, 9, 11)
for k in candidate_ks:
    print(k)

3
5
7
9
11


In [51]:
# Sweep the neighbour count and record the test RMSE for each k, in the same
# order as the candidate list.
# Column 0 of the scaled arrays is the target (sales); the remaining columns are
# the features. The slices do not depend on k, so they are taken once up front.
X_train_nor, y_train_nor = df_train_nor[:, 1:], df_train_nor[:, 0]
X_test_nor, y_test_nor = df_test_nor[:, 1:], df_test_nor[:, 0]

ls_rmse = []
for k in [3, 5, 7, 9, 11]:
    model_knn = KNeighborsRegressor(n_neighbors = k)
    model_knn.fit(X = X_train_nor, y = y_train_nor)
    pred = model_knn.predict(X_test_nor)

    # RMSE is the square root of the mean squared error (in scaled target units).
    val_rmse = mean_squared_error(y_true = y_test_nor, y_pred = pred) ** 0.5
    # Append in place rather than building a new list on every iteration.
    ls_rmse.append(val_rmse)

In [52]:
# Test RMSE for k = 3, 5, 7, 9, 11 in that order. In this run the error grows
# with k, so k = 3 gives the lowest value.
ls_rmse

[0.08186677375964535,
 0.09879109824384892,
 0.107669855645971,
 0.11232111394853059,
 0.1136902366621185]