In [1]:
import kagglehub
import os
import pandas as pd

import seaborn as sns
import numpy as np

from sklearn.model_selection import GridSearchCV, train_test_split, StratifiedKFold, cross_val_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
# Download latest version
path = kagglehub.dataset_download("iabhishekofficial/mobile-price-classification")

data = pd.read_csv(os.path.join(path, os.listdir(path)[-1]))
data.head()

  from .autonotebook import tqdm as notebook_tqdm




Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,...,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
0,842,0,2.2,0,1,0,7,0.6,188,2,...,20,756,2549,9,7,19,0,0,1,1
1,1021,1,0.5,1,0,1,53,0.7,136,3,...,905,1988,2631,17,3,7,1,1,0,2
2,563,1,0.5,1,2,1,41,0.9,145,5,...,1263,1716,2603,11,2,9,1,1,0,2
3,615,1,2.5,0,0,0,10,0.8,131,6,...,1216,1786,2769,16,8,11,1,0,0,2
4,1821,1,1.2,0,13,1,44,0.6,141,2,...,1208,1212,1411,8,2,15,1,1,0,1


In [6]:
X, y = data.drop('price_range', axis=1), data['price_range']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [18]:
clf_lr = LogisticRegression(max_iter=1000)
params_lr = {'solver': ['liblinear', 'lbfgs'], 'C': [10**(i) for i in range(-4, 3)]}

gridcv = GridSearchCV(clf_lr, param_grid=params_lr, scoring='accuracy', cv=3)    

gridcv.fit(X_train, y_train)

pd.DataFrame(gridcv.cv_results_)[['params', 'mean_test_score', 'std_test_score']].sort_values(by="mean_test_score", ascending=False)

Unnamed: 0,params,mean_test_score,std_test_score
13,"{'C': 100, 'solver': 'lbfgs'}",0.962667,0.009286
11,"{'C': 10, 'solver': 'lbfgs'}",0.948667,0.010499
9,"{'C': 1, 'solver': 'lbfgs'}",0.870667,0.019345
12,"{'C': 100, 'solver': 'liblinear'}",0.836,0.028331
10,"{'C': 10, 'solver': 'liblinear'}",0.814667,0.023795
8,"{'C': 1, 'solver': 'liblinear'}",0.749333,0.028111
7,"{'C': 0.1, 'solver': 'lbfgs'}",0.726667,0.022881
6,"{'C': 0.1, 'solver': 'liblinear'}",0.643333,0.013888
5,"{'C': 0.01, 'solver': 'lbfgs'}",0.622667,0.009978
4,"{'C': 0.01, 'solver': 'liblinear'}",0.537333,0.008994


In [22]:


clf_lr = LogisticRegression(random_state=22, max_iter=2000)
clf_rf = RandomForestClassifier(random_state=22)
clf_svc = SVC(random_state=22)


param_grid_lr = {'solver': ['liblinear', 'lbfgs'], 'C': np.logspace(-4, 2, 9)}

param_grid_rf = [{'n_estimators': [10, 50, 100, 250, 500, 1000], 
                  'min_samples_leaf': [1, 3, 5], 
                  'max_features': ['sqrt', 'log2']}]

param_grid_svc = [{'kernel': ['rbf'], 'C': np.logspace(-4, 4, 9), 'gamma': np.logspace(-4, 0, 4)},
                  {'kernel': ['linear'], 'C': np.logspace(-4, 4, 9)}]


gridcvs = {}

for pgrid, clf, name in zip((param_grid_lr, param_grid_rf, param_grid_svc),
                            (clf_lr, clf_rf, clf_svc),
                            ('LogisticRegression', 'RF', 'SVM')):
    gcv = GridSearchCV(clf, pgrid, cv=3, refit=True)
    gridcvs[name] = gcv



outer_cv = StratifiedKFold(n_splits=3, shuffle=True)
outer_scores = {}

for name, gs in gridcvs.items():
    nested_score = cross_val_score(gs, X_train, y_train, cv=outer_cv)
    outer_scores[name] = nested_score
    print(f'{name}: outer accuracy {100*nested_score.mean():.2f} +/- {100*nested_score.std():.2f}')

LogisticRegression: outer accuracy 96.07 +/- 0.77
RF: outer accuracy 88.00 +/- 1.07
SVM: outer accuracy 95.80 +/- 0.16


In [23]:
from sklearn.metrics import accuracy_score

final_clf = gridcvs['LogisticRegression']
final_clf.fit(X_train, y_train)

print(f'Best Parameters: {final_clf.best_params_}')

train_acc = accuracy_score(y_true=y_train, y_pred=final_clf.predict(X_train))
test_acc = accuracy_score(y_true=y_test, y_pred=final_clf.predict(X_test))

print(f'Training Accuracy: {100*train_acc:.2f}')
print(f'Test Accuracy: {100*test_acc:.2f}')

Best Parameters: {'C': 100.0, 'solver': 'lbfgs'}
Training Accuracy: 98.40
Test Accuracy: 95.40


In [7]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from skopt import BayesSearchCV
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

# Define the models
model_svc = SVC()
model_rf = RandomForestClassifier()

# Define the methods
# GridSearch
param_grid_svc = {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf']}
param_grid_rf = {'n_estimators': [100, 200], 'max_depth': [10, 20]}

# RandomizedSearch
param_dist_rf = {'n_estimators': [100, 200, 300], 'max_depth': [10, 20, 30]}

# BayesSearch
param_grid_bayes_rf = {
    'n_estimators': (100, 300),
    'max_depth': (10, 50)
}

# Initialize the different methods
grid_search = GridSearchCV(estimator=model_rf, param_grid=param_grid_rf, cv=5, scoring='accuracy')
random_search = RandomizedSearchCV(estimator=model_rf, param_distributions=param_dist_rf, n_iter=10, cv=5, scoring='accuracy', random_state=42)
bayes_search = BayesSearchCV(estimator=model_rf, search_spaces=param_grid_bayes_rf, n_iter=10, cv=5, scoring='accuracy', random_state=42)

# Perform the search
grid_search.fit(X_train, y_train)
random_search.fit(X_train, y_train)
bayes_search.fit(X_train, y_train)

# Find the best parameters
print(grid_search.best_params_)
print(random_search.best_params_)
print(bayes_search.best_params_)




{'max_depth': 20, 'n_estimators': 200}
{'n_estimators': 100, 'max_depth': 10}
OrderedDict([('max_depth', 39), ('n_estimators', 288)])


In [14]:
models = {
    'LogisticRegression': LogisticRegression(max_iter=9000),
    'RandomForestClassifier': RandomForestClassifier(),
    'SVC': SVC()
}

# Hyperparameters to be tested for each model
param_grids = {
    'LogisticRegression': {'C': [0.01, 0.1, 1, 10], 'solver': ['liblinear', 'lbfgs']},
    'RandomForestClassifier': {'n_estimators': [50, 100, 200], 'max_depth': [10, 20, 30]},
    'SVC': {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf']}
}

# Dictionary for storing results
results = {}

In [15]:
# Function to compare the 3 hyperparameter search methods
def compare_search_methods(model_name, model, param_grid):
    search_methods = {
    'GridSearchCV': GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring='accuracy'),
    'RandomizedSearchCV': RandomizedSearchCV(estimator=model, param_distributions=param_grid, n_iter=5, cv=5, scoring='accuracy', random_state=42),
    'BayesSearchCV': BayesSearchCV(estimator=model, search_spaces=param_grid, n_iter=5, cv=5, scoring='accuracy', random_state=42)
    }
    
    results[model_name] = {}

    for search_name, search in search_methods.items():
        # Perform hyperparameter search
        search.fit(X_train, y_train)
        
        # Best score and hyperparameters found
        best_params = search.best_params_
        best_score = search.best_score_
        
        # Test on test data
        y_pred = search.predict(X_test)
        test_accuracy = accuracy_score(y_test, y_pred)
        
        # Store results
        results[model_name][search_name] = {
            'best_params': best_params,
            'best_cv_score': best_score,
            'test_accuracy': test_accuracy
        }

In [16]:
# Run comparison for each model
for model_name, model in models.items():
    compare_search_methods(model_name, model, param_grids[model_name])

# Print results
for model_name, model_results in results.items():
    print(f"Model: {model_name}")
    for search_name, search_results in model_results.items():
        print(f"  {search_name}:")
        print(f"    Best Params: {search_results['best_params']}")
        print(f"    Best CV Score: {search_results['best_cv_score']:2f}")
        print(f"    Test Accuracy: {search_results['test_accuracy']:.2f}")
    print("\n")


NameError: name 'accuracy_score' is not defined

In [17]:
from sklearn.model_selection import KFold

# Define models to test
models_cv = {
'clf_lr' : LogisticRegression(random_state=22, solver='liblinear', max_iter=3000),
'clf_rf' : RandomForestClassifier(random_state=22),
'clf_svc' : SVC(random_state=22)
}

# Define the number of folds for KFold
n_splits = 5  # For example, for 5-fold cross-validation
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

In [18]:
# Store results for each model

results = {}

# Loop through each model
for model_name, model in models_cv.items():
    fold_accuracies = []  # Store accuracy results for each fold

    # Perform KFold cross-validation
    for train_index, test_index in kf.split(X):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        
        # Train the model
        model.fit(X_train, y_train)
        
        # Predict on test set
        y_pred = model.predict(X_test)
        
        # Calculate accuracy
        accuracy = accuracy_score(y_test, y_pred)
        fold_accuracies.append(accuracy)

    # Average accuracy results for all folds
    avg_accuracy = sum(fold_accuracies) / len(fold_accuracies)
    results[model_name] = avg_accuracy

NameError: name 'accuracy_score' is not defined

In [19]:
# Print results
for model_name, accuracy in results.items():
    print(f'{model_name}: Average Accuracy = {accuracy:.2f}')
