In [1]:
# Basic Libraries
import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt # we only need pyplot
sb.set() # set the default Seaborn style for graphics

In [2]:
df = pd.read_csv('gsm_clean.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,oem,model,battery_charging,platform_cpu,platform_gpu,body_sim,network_technology,display_type,main_camera_video,...,display_resolution,display_size,sound_3.5mm_jack,platform_os,platform_chipset,misc_price,memory,ram,removability,battery_capacity
0,0,Google,Pixel 4 XL,18.0,8.0,Adreno 640,Mini-SIM,LTE,P-OLED,4K,...,1440 x 3040,6.3,No,Android,Qualcomm SM8150 Snapdragon 855,479.99,64,6,Non-removable,3700
1,1,Google,Pixel 4,18.0,8.0,Adreno 640,Mini-SIM,LTE,P-OLED,4K,...,1080 x 2280,5.7,No,Android,Qualcomm SM8150 Snapdragon 855,524.52,64,6,Non-removable,2800
2,2,Google,Pixel 3a XL,18.0,8.0,Adreno 615,Mini-SIM,LTE,OLED,4K,...,1080 x 2160,6.0,Yes,Android,Qualcomm SDM670 Snapdragon 670,261.6,64,4,Non-removable,3700
3,3,Google,Pixel 3a,18.0,8.0,Adreno 615,Mini-SIM,LTE,OLED,4K,...,1080 x 2220,5.6,Yes,Android,Qualcomm SDM670 Snapdragon 670,212.16,64,4,Non-removable,3000
4,4,Huawei,Enjoy 20 Pro,22.5,8.0,Mali-G57,Nano-SIM card & eSIM,5G,IPS LCD,4K,...,1080 x 2400,6.5,Yes,Android,MediaTek MT6873V Dimensity 800 5G,250.0,128,6,Non-removable,4000


In [3]:
print("Data type : ", type(df))
print("Data dims : ", df.shape)

Data type :  <class 'pandas.core.frame.DataFrame'>
Data dims :  (913, 22)


In [4]:
# Extract Response and Predictors
Y = df['misc_price'].fillna(df['misc_price'].mean()).values # Target for the model
X = df.drop(['misc_price'], axis=1) # Features we use

## Linear Regression Model

In [5]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
import category_encoders as ce
from sklearn.linear_model import LinearRegression
reg = LinearRegression()
# splitting into two sets
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=0)

target_enc = ce.CatBoostEncoder()
target_enc.fit(X_train, Y_train)

#Transform the features, rename columns with _cb suffix, and join to dataframe
train_CBE = target_enc.transform(X_train)
test_CBE = target_enc.transform(X_test)


In [6]:
from sklearn.impute import SimpleImputer

# Imputation
my_imputer = SimpleImputer()
imputed_X_train = pd.DataFrame(my_imputer.fit_transform(train_CBE))
imputed_X_test = pd.DataFrame(my_imputer.transform(test_CBE))


imputed_X_train.columns = train_CBE.columns
imputed_X_test.columns = test_CBE.columns

In [7]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score

lr_model = LinearRegression()
lr_model.fit(imputed_X_train, Y_train)
predictions = lr_model.predict(imputed_X_test)

mae = mean_absolute_error(predictions, Y_test)
r2score = r2_score(Y_test, predictions)

print("Validation MAE for Linear Regression Model: {}".format(mae))
print("Validation Accuracy for Linear Regression Model: {}".format(r2score))
# Explained Variance (R^2)
print("Explained Variance (R^2) \t:", lr_model.score(imputed_X_train, Y_train))
output = pd.DataFrame({'Actual': Y_test, 'Predicted': predictions})
output

Validation MAE for Linear Regression Model: 82.61027225680354
Validation Accuracy for Linear Regression Model: 0.7425778133412574
Explained Variance (R^2) 	: 0.8193580102593165


Unnamed: 0,Actual,Predicted
0,170.00,190.239528
1,320.00,261.428941
2,499.00,389.512856
3,240.00,199.433312
4,200.00,214.829681
...,...,...
224,249.99,195.243925
225,344.98,306.147609
226,690.00,530.650459
227,700.00,457.032219


## Random Forest Tree Model

In [8]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score

# To improve accuracy, create a new Random Forest model and train on the data
rf_model = RandomForestRegressor(random_state=1)
rf_model.fit(imputed_X_train, Y_train)
predictions = rf_model.predict(imputed_X_test)

mae = mean_absolute_error(predictions, Y_test)
r2score = r2_score(Y_test, predictions)

print("Validation MAE for Random Forest Model: {}".format(mae))
print("Validation Accuracy for Random Forest Model: {}".format(r2score))

output = pd.DataFrame({'Actual': Y_test, 'Predicted': predictions})
output.head(20)

Validation MAE for Random Forest Model: 68.80077362445414
Validation Accuracy for Random Forest Model: 0.785480240239157


Unnamed: 0,Actual,Predicted
0,170.0,173.7096
1,320.0,286.272
2,499.0,450.1765
3,240.0,213.27
4,200.0,218.5647
5,198.31,207.5093
6,400.0,377.21836
7,250.0,220.9
8,108.96,111.8712
9,552.79,521.1161


# XGBoost

In [9]:
#conda install py-xgboost (Install xgboost)
#conda install -c conda-forge python-graphviz (Install graphviz)
from xgboost import XGBRegressor
from sklearn import tree
from xgboost import XGBClassifier
from xgboost import plot_tree
xgb_model = XGBRegressor(n_estimators=10000, max_depth = 4,colsample_bytree = 0.2,gamma = 1)
# Add silent=True to avoid printing out updates with each cycle
xgb_model.fit(imputed_X_train, Y_train, verbose=False)
predictions = xgb_model.predict(imputed_X_test)

mae = mean_absolute_error(predictions, Y_test)
r2score = r2_score(Y_test, predictions)

print("Validation MAE for XGBoost Model: {}".format(mae))
print("Validation Accuracy for XGBoost Model: {}".format(r2score))

output = pd.DataFrame({'Actual': Y_test, 'Predicted': predictions})
output.head(20)

Validation MAE for XGBoost Model: 54.0784466035668
Validation Accuracy for XGBoost Model: 0.8657782787614177


Unnamed: 0,Actual,Predicted
0,170.0,177.297394
1,320.0,292.226685
2,499.0,465.926544
3,240.0,201.13797
4,200.0,190.060791
5,198.31,192.109039
6,400.0,504.600403
7,250.0,213.711975
8,108.96,99.542702
9,552.79,549.042847
