# Mobile Price Classification(https://www.kaggle.com/iabhishekofficial/mobile-price-classification)
---

## Setup & Imports

In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set()
%config InlineBackend.figure_format = 'retina'

from src import *

np.set_printoptions(threshold=np.inf)
pd.set_option('display.max_columns', 1000)
pd.set_option('display.max_rows', 1000)

## Data

In [2]:
#load and look at the data
DATASET_PATH = './datasets/mobile_price_classification'

train = pd.read_csv(DATASET_PATH + '/train.csv')
test = pd.read_csv(DATASET_PATH + '/test.csv')

train["type"] = "train"
test["type"] = "test"
df = pd.concat([train, test], axis=0)
df = df.drop(columns="id")

print(df.shape)

df.head()

(3000, 22)


Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,pc,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range,type
0,842,0,2.2,0,1,0,7,0.6,188,2,2,20,756,2549,9,7,19,0,0,1,1.0,train
1,1021,1,0.5,1,0,1,53,0.7,136,3,6,905,1988,2631,17,3,7,1,1,0,2.0,train
2,563,1,0.5,1,2,1,41,0.9,145,5,6,1263,1716,2603,11,2,9,1,1,0,2.0,train
3,615,1,2.5,0,0,0,10,0.8,131,6,9,1216,1786,2769,16,8,11,1,0,0,2.0,train
4,1821,1,1.2,0,13,1,44,0.6,141,2,14,1208,1212,1411,8,2,15,1,1,0,1.0,train


In [3]:
config = {
    "num_col_names": [
        "battery_power", 
        "clock_speed", 
        "int_memory", 
        "m_dep",
        "mobile_wt", 
        "px_height",
        "px_width",
        "ram",
        "sc_h",
        "sc_w",
        "talk_time",
    ],
    "cat_col_names": [
        "blue", 
        "dual_sim", 
        "fc", 
        "four_g", 
        "n_cores", 
        "pc", 
        "three_g",
        "touch_screen",
        "wifi",
    ],
    "target": ["price_range"],
    "n_splits": 5,
    "shuffle": True,
    "SEED": 1234,
}

df[config["num_col_names"]] = df[config["num_col_names"]].astype("float")
df[config["cat_col_names"]] = df[config["cat_col_names"]].astype("category")
df[config["target"]] = df[config["target"]].astype("category")

## Process missing values

In [4]:
df = categorical_imputer(
    df=df, 
    cat_col_names=config["cat_col_names"]
)
#df = drop_missing_data(df)
print(df.shape)

(3000, 22)


  res = method(*args, **kwargs)


## Categorical encoding

In [5]:
df = rarelabel_encoder(
    df=df, 
    cat_col_names=config["cat_col_names"]
)
df = ordinal_encoder(
    df=df, 
    cat_col_names=config["cat_col_names"]
)
print(df.shape)
df.head()

(3000, 22)


  "considered frequent".format(var)
  "considered frequent".format(var)
  "considered frequent".format(var)
  "considered frequent".format(var)
  "considered frequent".format(var)
  "considered frequent".format(var)
  "considered frequent".format(var)


Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,pc,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range,type
0,842.0,0,2.2,0,0,0,7.0,0.6,188.0,0,0,20.0,756.0,2549.0,9.0,7.0,19.0,0,0,0,1.0,train
1,1021.0,1,0.5,1,1,1,53.0,0.7,136.0,1,0,905.0,1988.0,2631.0,17.0,3.0,7.0,1,1,1,2.0,train
2,563.0,1,0.5,1,2,1,41.0,0.9,145.0,2,0,1263.0,1716.0,2603.0,11.0,2.0,9.0,1,1,1,2.0,train
3,615.0,1,2.5,0,1,0,10.0,0.8,131.0,3,0,1216.0,1786.0,2769.0,16.0,8.0,11.0,1,0,1,2.0,train
4,1821.0,1,1.2,0,2,1,44.0,0.6,141.0,0,0,1208.0,1212.0,1411.0,8.0,2.0,15.0,1,1,1,1.0,train


## Feature creation

In [6]:
#df = create_math_transforms(
#)

## Numerical transformer

In [7]:
df = equal_freq_discretiser(
    df=df, 
    num_col_names=config["num_col_names"]
)
df = variable_transformer(
    df=df, 
    num_col_names=config["num_col_names"],
    variable_type="power_transformer"
)
print(df.shape)
df.head()

(3000, 44)


Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,pc,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range,type,battery_power_disc,clock_speed_disc,int_memory_disc,m_dep_disc,mobile_wt_disc,px_height_disc,px_width_disc,ram_disc,sc_h_disc,sc_w_disc,talk_time_disc,battery_power_power_transformer,clock_speed_power_transformer,int_memory_power_transformer,m_dep_power_transformer,mobile_wt_power_transformer,px_height_power_transformer,px_width_power_transformer,ram_power_transformer,sc_h_power_transformer,sc_w_power_transformer,talk_time_power_transformer
0,842.0,0,2.2,0,0,0,7.0,0.6,188.0,0,0,20.0,756.0,2549.0,9.0,7.0,19.0,0,0,0,1.0,train,2,5,0,4,8,0,1,6,2,5,8,29.017236,1.48324,2.645751,0.774597,13.711309,4.472136,27.495454,50.487622,3.0,2.645751,4.358899
1,1021.0,1,0.5,1,1,1,53.0,0.7,136.0,1,0,905.0,1988.0,2631.0,17.0,3.0,7.0,1,1,1,2.0,train,3,0,8,5,4,7,9,6,7,1,2,31.953091,0.707107,7.28011,0.83666,11.661904,30.083218,44.586994,51.293274,4.123106,1.732051,2.645751
2,563.0,1,0.5,1,2,1,41.0,0.9,145.0,2,0,1263.0,1716.0,2603.0,11.0,2.0,9.0,1,1,1,2.0,train,0,0,6,7,5,8,8,6,3,1,3,23.727621,0.707107,6.403124,0.948683,12.041595,35.538711,41.42463,51.019604,3.316625,1.414214,3.0
3,615.0,1,2.5,0,1,0,10.0,0.8,131.0,3,0,1216.0,1786.0,2769.0,16.0,8.0,11.0,1,0,1,2.0,train,0,6,1,6,4,8,8,6,7,5,4,24.799194,1.581139,3.162278,0.894427,11.445523,34.871192,42.261093,52.621288,4.0,2.828427,3.316625
4,1821.0,1,1.2,0,2,1,44.0,0.6,141.0,0,0,1208.0,1212.0,1411.0,8.0,2.0,15.0,1,1,1,1.0,train,8,1,6,4,5,8,4,3,1,1,6,42.673177,1.095445,6.63325,0.774597,11.874342,34.756294,34.81379,37.56328,2.828427,1.414214,3.872983


## Outliers

In [8]:
df = censor_outliers(
    df=df, 
    num_col_names=config["num_col_names"]
)
print(df.shape)
df.head()

(3000, 44)


Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,pc,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range,type,battery_power_disc,clock_speed_disc,int_memory_disc,m_dep_disc,mobile_wt_disc,px_height_disc,px_width_disc,ram_disc,sc_h_disc,sc_w_disc,talk_time_disc,battery_power_power_transformer,clock_speed_power_transformer,int_memory_power_transformer,m_dep_power_transformer,mobile_wt_power_transformer,px_height_power_transformer,px_width_power_transformer,ram_power_transformer,sc_h_power_transformer,sc_w_power_transformer,talk_time_power_transformer
0,842.0,0,2.2,0,0,0,7.0,0.6,188.0,0,0,20.0,756.0,2549.0,9.0,7.0,19.0,0,0,0,1.0,train,2,5,0,4,8,0,1,6,2,5,8,29.017236,1.48324,2.645751,0.774597,13.711309,4.472136,27.495454,50.487622,3.0,2.645751,4.358899
1,1021.0,1,0.5,1,1,1,53.0,0.7,136.0,1,0,905.0,1988.0,2631.0,17.0,3.0,7.0,1,1,1,2.0,train,3,0,8,5,4,7,9,6,7,1,2,31.953091,0.707107,7.28011,0.83666,11.661904,30.083218,44.586994,51.293274,4.123106,1.732051,2.645751
2,563.0,1,0.5,1,2,1,41.0,0.9,145.0,2,0,1263.0,1716.0,2603.0,11.0,2.0,9.0,1,1,1,2.0,train,0,0,6,7,5,8,8,6,3,1,3,23.727621,0.707107,6.403124,0.948683,12.041595,35.538711,41.42463,51.019604,3.316625,1.414214,3.0
3,615.0,1,2.5,0,1,0,10.0,0.8,131.0,3,0,1216.0,1786.0,2769.0,16.0,8.0,11.0,1,0,1,2.0,train,0,6,1,6,4,8,8,6,7,5,4,24.799194,1.581139,3.162278,0.894427,11.445523,34.871192,42.261093,52.621288,4.0,2.828427,3.316625
4,1821.0,1,1.2,0,2,1,44.0,0.6,141.0,0,0,1208.0,1212.0,1411.0,8.0,2.0,15.0,1,1,1,1.0,train,8,1,6,4,5,8,4,3,1,1,6,42.673177,1.095445,6.63325,0.774597,11.874342,34.756294,34.81379,37.56328,2.828427,1.414214,3.872983


## Drop Features

In [9]:
#df = drop_constant_features(df)
print(df.shape)
df.head()

(3000, 44)


Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,pc,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range,type,battery_power_disc,clock_speed_disc,int_memory_disc,m_dep_disc,mobile_wt_disc,px_height_disc,px_width_disc,ram_disc,sc_h_disc,sc_w_disc,talk_time_disc,battery_power_power_transformer,clock_speed_power_transformer,int_memory_power_transformer,m_dep_power_transformer,mobile_wt_power_transformer,px_height_power_transformer,px_width_power_transformer,ram_power_transformer,sc_h_power_transformer,sc_w_power_transformer,talk_time_power_transformer
0,842.0,0,2.2,0,0,0,7.0,0.6,188.0,0,0,20.0,756.0,2549.0,9.0,7.0,19.0,0,0,0,1.0,train,2,5,0,4,8,0,1,6,2,5,8,29.017236,1.48324,2.645751,0.774597,13.711309,4.472136,27.495454,50.487622,3.0,2.645751,4.358899
1,1021.0,1,0.5,1,1,1,53.0,0.7,136.0,1,0,905.0,1988.0,2631.0,17.0,3.0,7.0,1,1,1,2.0,train,3,0,8,5,4,7,9,6,7,1,2,31.953091,0.707107,7.28011,0.83666,11.661904,30.083218,44.586994,51.293274,4.123106,1.732051,2.645751
2,563.0,1,0.5,1,2,1,41.0,0.9,145.0,2,0,1263.0,1716.0,2603.0,11.0,2.0,9.0,1,1,1,2.0,train,0,0,6,7,5,8,8,6,3,1,3,23.727621,0.707107,6.403124,0.948683,12.041595,35.538711,41.42463,51.019604,3.316625,1.414214,3.0
3,615.0,1,2.5,0,1,0,10.0,0.8,131.0,3,0,1216.0,1786.0,2769.0,16.0,8.0,11.0,1,0,1,2.0,train,0,6,1,6,4,8,8,6,7,5,4,24.799194,1.581139,3.162278,0.894427,11.445523,34.871192,42.261093,52.621288,4.0,2.828427,3.316625
4,1821.0,1,1.2,0,2,1,44.0,0.6,141.0,0,0,1208.0,1212.0,1411.0,8.0,2.0,15.0,1,1,1,1.0,train,8,1,6,4,5,8,4,3,1,1,6,42.673177,1.095445,6.63325,0.774597,11.874342,34.756294,34.81379,37.56328,2.828427,1.414214,3.872983


## Data split and target transformation

In [10]:
train, test = df[df["type"]=="train"].drop(columns="type"), df[df["type"]=="test"].drop(columns="type")

train = target_transformer(
    df=train, 
    target=config["target"], 
)
print(train.shape)

train, val, test = data_splitting(
    df=train,
    target=config["target"],
    n_splits=config["n_splits"],
    shuffle=config["shuffle"],
    random_state=config["SEED"]
)
print(train.shape, val.shape, test.shape)

(2000, 43)
(1280, 43) (320, 43) (400, 43)


## Training

In [11]:
import lightgbm as lgm

model = lgm.LGBMClassifier(
    max_depth=4,
    random_state=config["SEED"]
)

trainer = Trainer(
    model=model,
    target=config["target"],
    random_state=config["SEED"]
)

trainer.fit(train, val)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[1]	valid_0's multi_logloss: 1.21259
[2]	valid_0's multi_logloss: 1.07937
[3]	valid_0's multi_logloss: 0.975322
[4]	valid_0's multi_logloss: 0.894108
[5]	valid_0's multi_logloss: 0.824975
[6]	valid_0's multi_logloss: 0.764833
[7]	valid_0's multi_logloss: 0.718629
[8]	valid_0's multi_logloss: 0.677307
[9]	valid_0's multi_logloss: 0.642426
[10]	valid_0's multi_logloss: 0.613323
[11]	valid_0's multi_logloss: 0.587083
[12]	valid_0's multi_logloss: 0.561068
[13]	valid_0's multi_logloss: 0.539122
[14]	valid_0's multi_logloss: 0.517833
[15]	valid_0's multi_logloss: 0.494948
[16]	valid_0's multi_logloss: 0.478798
[17]	valid_0's multi_logloss: 0.465509
[18]	valid_0's multi_logloss: 0.451839
[19]	valid_0's multi_logloss: 0.438643
[20]	valid_0's multi_logloss: 0.425598
[21]	valid_0's multi_logloss: 0.413548
[22]	valid_0's multi_logloss: 0.401813
[23]	valid_0's multi_logloss: 0.39051
[24]	valid_0's multi_logloss: 0.382542
[25]	valid_0's multi_logloss: 0.375419
[26]	valid_0's multi_logloss: 0.36887

## Evaluation

In [12]:
trainer.evaluate(test)

confusion_matrix:
 [[90  5  0  5]
 [ 9 88  3  0]
 [ 0  8 92  0]
 [ 5  0  0 95]]
classification report:
               precision    recall  f1-score   support

           0       0.87      0.90      0.88       100
           1       0.87      0.88      0.88       100
           2       0.97      0.92      0.94       100
           3       0.95      0.95      0.95       100

    accuracy                           0.91       400
   macro avg       0.91      0.91      0.91       400
weighted avg       0.91      0.91      0.91       400

AUC: 0.98875


## Prediction

In [13]:
y_pred = trainer.predict(test.drop(columns=config["target"]))