In [1]:
import time
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
import seaborn as sns
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import GridSearchCV
import sqlite3
import traceback, sys
%matplotlib inline
sns.set(color_codes=True)

In [2]:
def get_data_from_consumption(db_path, table_name):
    """Read every row of ``table_name`` from a SQLite database into a DataFrame.

    Args:
        db_path: Path of the SQLite database file passed to ``sqlite3.connect``.
        table_name: Name of the table to read. It is interpolated directly into
            the SQL text (identifiers cannot be bound as parameters), so it must
            come from a trusted source.

    Returns:
        A ``pandas.DataFrame`` holding the result of ``SELECT * FROM table_name``.

    Raises:
        Exception: Whatever ``sqlite3.connect`` raised is re-raised after the
            diagnostic banner and traceback have been printed to stdout.
            Errors raised by ``pd.read_sql`` propagate unchanged.
    """
    try:
        conn = sqlite3.connect(db_path)
    except Exception:
        print("Error in connecting database!")
        print("-"*60)
        traceback.print_exc(file=sys.stdout)
        print("-"*60)
        # Re-raise: without a connection there is nothing to query, and falling
        # through would only replace the real error with an unbound-name error.
        raise
    print("successfully connected to database")

    try:
        return pd.read_sql('SELECT * FROM {}'.format(table_name), conn)
    finally:
        # Always release the connection, even when the query fails.
        conn.close()

In [3]:
# Load the whole CONSUMPTION table into a DataFrame and preview its first rows.
df = get_data_from_consumption(
    db_path="../../database/data.db",
    table_name="CONSUMPTION",
)
df.head()

successfully connected to database


Unnamed: 0,unique_data,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,...,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range,RawTimeStamp,ConsumptionTimeStamp
0,2549.026455,842,0,2.2,0,1,0,7,0.6,188,...,2549,9,7,19,0,0,1,1,2023-05-29 12:46:43,2023-05-29 14:32:00
1,2631.455231,1021,1,0.5,1,0,1,53,0.7,136,...,2631,17,3,7,1,1,0,2,2023-05-29 12:46:43,2023-05-29 14:32:00
2,2603.736014,563,1,0.5,1,2,1,41,0.9,145,...,2603,11,2,9,1,1,0,2,2023-05-29 12:46:43,2023-05-29 14:32:00
3,2769.680851,615,1,2.5,0,0,0,10,0.8,131,...,2769,16,8,11,1,0,0,2,2023-05-29 12:46:43,2023-05-29 14:32:00
4,1411.9967,1821,1,1.2,0,13,1,44,0.6,141,...,1411,8,2,15,1,1,0,1,2023-05-29 12:46:43,2023-05-29 14:32:00


In [4]:
# Remove unique_data and the two timestamp columns, then preview the result.
df = df.drop(
    columns=[
        'unique_data',
        'RawTimeStamp',
        'ConsumptionTimeStamp',
    ]
)
df.head()

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,...,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
0,842,0,2.2,0,1,0,7,0.6,188,2,...,20,756,2549,9,7,19,0,0,1,1
1,1021,1,0.5,1,0,1,53,0.7,136,3,...,905,1988,2631,17,3,7,1,1,0,2
2,563,1,0.5,1,2,1,41,0.9,145,5,...,1263,1716,2603,11,2,9,1,1,0,2
3,615,1,2.5,0,0,0,10,0.8,131,6,...,1216,1786,2769,16,8,11,1,0,0,2
4,1821,1,1.2,0,13,1,44,0.6,141,2,...,1208,1212,1411,8,2,15,1,1,0,1


In [5]:
# Show summary statistics (count, mean, std, min, quartiles, max) per column.
df.describe()

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,...,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
count,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,...,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0
mean,1238.5185,0.495,1.52225,0.5095,4.3095,0.5215,32.0465,0.50175,140.249,4.5205,...,645.108,1251.5155,2124.213,12.3065,5.767,11.011,0.7615,0.503,0.507,1.5
std,439.418206,0.5001,0.816004,0.500035,4.341444,0.499662,18.145715,0.288416,35.399655,2.287837,...,443.780811,432.199447,1084.732044,4.213245,4.356398,5.463955,0.426273,0.500116,0.500076,1.118314
min,501.0,0.0,0.5,0.0,0.0,0.0,2.0,0.1,80.0,1.0,...,0.0,500.0,256.0,5.0,0.0,2.0,0.0,0.0,0.0,0.0
25%,851.75,0.0,0.7,0.0,1.0,0.0,16.0,0.2,109.0,3.0,...,282.75,874.75,1207.5,9.0,2.0,6.0,1.0,0.0,0.0,0.75
50%,1226.0,0.0,1.5,1.0,3.0,1.0,32.0,0.5,141.0,4.0,...,564.0,1247.0,2146.5,12.0,5.0,11.0,1.0,1.0,1.0,1.5
75%,1615.25,1.0,2.2,1.0,7.0,1.0,48.0,0.8,170.0,7.0,...,947.25,1633.0,3064.5,16.0,9.0,16.0,1.0,1.0,1.0,2.25
max,1998.0,1.0,3.0,1.0,19.0,1.0,64.0,1.0,200.0,8.0,...,1960.0,1998.0,3998.0,19.0,18.0,20.0,1.0,1.0,1.0,3.0


In [6]:
# Print the column names, non-null counts and dtypes of the frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   battery_power  2000 non-null   int64  
 1   blue           2000 non-null   int64  
 2   clock_speed    2000 non-null   float64
 3   dual_sim       2000 non-null   int64  
 4   fc             2000 non-null   int64  
 5   four_g         2000 non-null   int64  
 6   int_memory     2000 non-null   int64  
 7   m_dep          2000 non-null   float64
 8   mobile_wt      2000 non-null   int64  
 9   n_cores        2000 non-null   int64  
 10  pc             2000 non-null   int64  
 11  px_height      2000 non-null   int64  
 12  px_width       2000 non-null   int64  
 13  ram            2000 non-null   int64  
 14  sc_h           2000 non-null   int64  
 15  sc_w           2000 non-null   int64  
 16  talk_time      2000 non-null   int64  
 17  three_g        2000 non-null   int64  
 18  touch_sc

In [7]:
# For every column, record how many distinct values it holds and what they are.
get_unique_values = {}
for column_name in df.columns:
    distinct_values = df[column_name].unique()
    get_unique_values[column_name] = {
        "count": len(distinct_values),
        "items": distinct_values,
    }
print(get_unique_values)

{'battery_power': {'count': 1094, 'items': array([ 842, 1021,  563, ..., 1139, 1467,  858])}, 'blue': {'count': 2, 'items': array([0, 1])}, 'clock_speed': {'count': 26, 'items': array([2.2, 0.5, 2.5, 1.2, 1.7, 0.6, 2.9, 2.8, 2.1, 1. , 0.9, 1.1, 2.6,
       1.4, 1.6, 2.7, 1.3, 2.3, 2. , 1.8, 3. , 1.5, 1.9, 2.4, 0.8, 0.7])}, 'dual_sim': {'count': 2, 'items': array([0, 1])}, 'fc': {'count': 20, 'items': array([ 1,  0,  2, 13,  3,  4,  5,  7, 11, 12, 16,  6, 15,  8,  9, 10, 18,
       17, 14, 19])}, 'four_g': {'count': 2, 'items': array([0, 1])}, 'int_memory': {'count': 63, 'items': array([ 7, 53, 41, 10, 44, 22, 24,  9, 33, 17, 52, 46, 13, 23, 49, 19, 39,
       47, 38,  8, 57, 51, 21,  5, 60, 61,  6, 11, 50, 34, 20, 27, 42, 40,
       64, 14, 63, 43, 16, 48, 12, 55, 36, 30, 45, 29, 58, 25,  3, 54, 15,
       37, 31, 32,  4, 18,  2, 56, 26, 35, 59, 28, 62])}, 'm_dep': {'count': 10, 'items': array([0.6, 0.7, 0.9, 0.8, 0.1, 0.5, 1. , 0.3, 0.4, 0.2])}, 'mobile_wt': {'count': 121, 'items': ar

In [8]:
# Print each column that has fewer than 10 distinct values, with its count.
for column_name, stats in get_unique_values.items():
    if stats['count'] >= 10:
        continue
    print(f"{column_name} ---> {stats['count']}")

blue ---> 2
dual_sim ---> 2
four_g ---> 2
n_cores ---> 8
three_g ---> 2
touch_screen ---> 2
wifi ---> 2
price_range ---> 4


In [9]:
# Treat every column with fewer than 10 distinct values as categorical by
# casting it to the object dtype.
low_cardinality_columns = [
    column_name
    for column_name, stats in get_unique_values.items()
    if stats["count"] < 10
]
for column_name in low_cardinality_columns:
    df[column_name] = df[column_name].astype("object")


In [10]:
# Print the column dtypes again to check the casts to object made above.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   battery_power  2000 non-null   int64  
 1   blue           2000 non-null   object 
 2   clock_speed    2000 non-null   float64
 3   dual_sim       2000 non-null   object 
 4   fc             2000 non-null   int64  
 5   four_g         2000 non-null   object 
 6   int_memory     2000 non-null   int64  
 7   m_dep          2000 non-null   float64
 8   mobile_wt      2000 non-null   int64  
 9   n_cores        2000 non-null   object 
 10  pc             2000 non-null   int64  
 11  px_height      2000 non-null   int64  
 12  px_width       2000 non-null   int64  
 13  ram            2000 non-null   int64  
 14  sc_h           2000 non-null   int64  
 15  sc_w           2000 non-null   int64  
 16  talk_time      2000 non-null   int64  
 17  three_g        2000 non-null   object 
 18  touch_sc

In [11]:
# Display the talk_time column.
df["talk_time"]

0       19
1        7
2        9
3       11
4       15
        ..
1995    19
1996    16
1997     5
1998    19
1999     2
Name: talk_time, Length: 2000, dtype: int64

In [12]:
# Separate the price_range target from the remaining feature columns.
y = df["price_range"]
X = df.drop(columns=["price_range"])

In [13]:
# Convert the target into a float32 NumPy array and display it.
y = np.array(y, dtype="float32")
y

array([1., 2., 2., ..., 3., 0., 3.], dtype=float32)

In [14]:
# Positional indices of the feature columns whose dtype in df is object.
categorical_data = [
    position
    for position, column_name in enumerate(X.columns)
    if df[column_name].dtype == "O"
]
categorical_data

[1, 3, 5, 9, 17, 18, 19]

In [15]:
# One-hot encode the categorical feature columns (selected by position) and
# pass every other column through unchanged.
ct = ColumnTransformer(
    transformers=[
        ('encoder', OneHotEncoder(), categorical_data)
    ],
    remainder='passthrough',
    # Always return a dense array. With the default threshold the transformer
    # returns a SciPy sparse matrix whenever the stacked output is sparse
    # enough, and np.array() would wrap that matrix in a 0-d object array
    # instead of producing the 2-D feature matrix the next cells expect.
    sparse_threshold=0,
)
X = np.array(ct.fit_transform(X))

In [16]:
# Rescale every feature with a min-max scaler fitted on the full feature matrix.
# NOTE(review): the scaler is fitted before the train/test split that follows,
# so its per-feature min/max also reflect rows that end up in the test set —
# confirm this is intended.
scaler = MinMaxScaler()
scaler.fit(X)
X = scaler.transform(X)

In [17]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [18]:
# Shape of the training feature matrix as (rows, columns).
X_train.shape

(1600, 33)

# Modelling

In [19]:
# Baseline: a random forest with default hyperparameters, scored on the
# held-out test set. The seed is fixed so the confusion matrix and report
# below are reproducible when the notebook is re-run from a fresh kernel.
trainedforest = RandomForestClassifier(random_state=42)
trainedforest.fit(X_train, y_train)
predictionforest = trainedforest.predict(X_test)
print(confusion_matrix(y_test,predictionforest))
print(classification_report(y_test,predictionforest))

[[101   4   0   0]
 [  6  76   9   0]
 [  0  11  70  11]
 [  0   0  12 100]]
              precision    recall  f1-score   support

         0.0       0.94      0.96      0.95       105
         1.0       0.84      0.84      0.84        91
         2.0       0.77      0.76      0.77        92
         3.0       0.90      0.89      0.90       112

    accuracy                           0.87       400
   macro avg       0.86      0.86      0.86       400
weighted avg       0.87      0.87      0.87       400



In [20]:
# Hyperparameter grid for the random forest.
#
# * The seed is fixed on the estimator instead of being searched: treating
#   random_state as a hyperparameter only selects on run-to-run noise and
#   multiplies the number of fits.
# * 'log_loss' is omitted because scikit-learn documents it as the same
#   Shannon-information-gain criterion as 'entropy'.
# * min_samples_split is kept as a single value: a node can only be split if
#   both children hold at least min_samples_leaf samples, i.e. it needs at
#   least 2 * min_samples_leaf >= 40 samples here, so thresholds of 10, 20 and
#   30 would all behave identically.
grid = {
    'n_estimators': [100, 200, 300, 400],
    'max_features': ['sqrt', 'log2', None],
    'max_depth': [None, 4, 5, 6, 7, 8, 9, 10],
    'criterion': ['gini', 'entropy'],
    'min_samples_split': [10],
    'min_samples_leaf': [20, 30, 40, 50],
}
# 5-fold cross-validated exhaustive search over the grid, run on 3 workers.
rf_cv = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=grid,
    cv=5,
    n_jobs=3,
)
rf_cv.fit(X_train, y_train)

In [None]:
# Display the estimator selected by the grid search (requires rf_cv.fit to have finished).
rf_cv.best_estimator_

In [None]:
# Train a 400-tree forest with the entropy criterion and a fixed seed, then
# print its confusion matrix and classification report on the test set.
trainedforest = RandomForestClassifier(
    n_estimators=400,
    criterion='entropy',
    random_state=42,
).fit(X_train, y_train)
predictionforest = trainedforest.predict(X_test)
print(
    confusion_matrix(y_test, predictionforest),
    classification_report(y_test, predictionforest),
    sep="\n",
)

[[100   5   0   0]
 [  6  76   9   0]
 [  0   7  78   7]
 [  0   0  11 101]]
              precision    recall  f1-score   support

         0.0       0.94      0.95      0.95       105
         1.0       0.86      0.84      0.85        91
         2.0       0.80      0.85      0.82        92
         3.0       0.94      0.90      0.92       112

    accuracy                           0.89       400
   macro avg       0.88      0.88      0.88       400
weighted avg       0.89      0.89      0.89       400

