<a href="https://colab.research.google.com/github/dodozaki/Machine-Learning/blob/main/GridSearch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [9]:
import sys
import urllib.request
import os



%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt

import sklearn
from sklearn.preprocessing import PolynomialFeatures

from sklearn.pipeline import Pipeline

import pandas as pd
import numpy as np
from sklearn import datasets
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [3]:
import urllib.request
data_path = os.path.join("datasets", "")
download_path = "https://raw.githubusercontent.com/AbdelMahm/FSR/master/IDDLO-29-20/Notebooks/datasets/"
os.makedirs(data_path, exist_ok=True)
for filename in ("log_reg_data1.csv", "log_reg_data2.csv"):
    print("Downloading", filename)
    url = download_path + filename
    urllib.request.urlretrieve(url, data_path + filename)

Downloading log_reg_data1.csv
Downloading log_reg_data2.csv


In [6]:
#load data
df = pd.read_csv(data_path + '/log_reg_data1.csv')
df.head()

Unnamed: 0,studentId,score1,score2,admitted
0,1,34.62366,78.024693,0
1,2,30.286711,43.894998,0
2,3,35.847409,72.902198,0
3,4,60.182599,86.308552,1
4,5,79.032736,75.344376,1


In [10]:
#try GridSearch and Randomised Search
x = df.drop(['admitted'],axis=1).values
y = df['admitted'].values

X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=1, stratify=y)

In [12]:
from sklearn.preprocessing import StandardScaler
feature_scaler = StandardScaler()
X_train = feature_scaler.fit_transform(X_train)
X_test = feature_scaler.transform(X_test)

In [13]:
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators=300, random_state=0)


**Grid Search and Random Forest Classifier**

In [14]:
grid_param = {
    'n_estimators': [100, 300, 500, 800, 1000],
    'criterion': ['gini', 'entropy'],
    'bootstrap': [True, False]
}

gd_sr = GridSearchCV(estimator=classifier,
                     param_grid=grid_param,
                     scoring='accuracy',
                     cv=5,
                     n_jobs=-1)
#Ici, nous créons un dictionnaire grid_param avec trois paramètres n_estimators, critère et bootstrap. Les valeurs de paramètre
# que nous voulons essayer sont passées dans la liste.
# Par exemple, nous voulons trouver quelle valeur (sur 100, 300, 500, 800 et 1000) fournit la plus grande précision.
#nous voulons trouver quelle valeur donne les performances les plus élevées pour le paramètre critère : « gini » ou « entropie » 

In [15]:
gd_sr.fit(X_train, y_train)

GridSearchCV(cv=5,
             estimator=RandomForestClassifier(n_estimators=300, random_state=0),
             n_jobs=-1,
             param_grid={'bootstrap': [True, False],
                         'criterion': ['gini', 'entropy'],
                         'n_estimators': [100, 300, 500, 800, 1000]},
             scoring='accuracy')

In [20]:
# Print the training score of the best model

print('the training score of the best model',gd_sr.best_score_)

# Print the model parameters of the best model

print('the model parameters of the best model',gd_sr.best_params_)

# Print the test score of the best model

clfRFC = gd_sr.best_estimator_
print('Test accuracy: %.3f' % clfRFC.score(X_test, y_test))

the training score of the best model 0.9285714285714286
the model parameters of the best model {'bootstrap': True, 'criterion': 'gini', 'n_estimators': 100}
Test accuracy: 0.900


**Le résultat montre que la précision la plus élevée est obtenue lorsque les n_estimateurs valent 100, le bootstrap est True et le critère est "gini".**

**Grid Search and Logistic Regression**

In [23]:
pipelineLR = make_pipeline(StandardScaler(), LogisticRegression(random_state=1, penalty='l2', solver='lbfgs'))
#
# Create the parameter grid
#
param_grid_lr = [{
    'logisticregression__C': [0.001, 0.01, 0.05, 0.1, 0.5, 1.0, 10.0],
}]
#
# Create an instance of GridSearch Cross-validation estimator
#
gsLR = GridSearchCV(estimator=pipelineLR,
                     param_grid = param_grid_lr,
                     scoring='accuracy',
                     cv=10,
                     refit=True,
                     n_jobs=1)


# Train the LogisticRegression Classifier

gsLR = gsLR.fit(X_train, y_train)

# Print the training score of the best model

print('the training score of the best model',gsLR.best_score_)

# Print the model parameters of the best model

print('the model parameters of the best model',gsLR.best_params_)

# Print the test score of the best model

clfLR = gsLR.best_estimator_
print('Test accuracy: %.3f' % clfLR.score(X_test, y_test))

the training score of the best model 0.9
the model parameters of the best model {'logisticregression__C': 0.1}
Test accuracy: 0.867


**Grid Search and Support Vector Classifier (SVC)**

In [25]:
pipelineSVC = make_pipeline(StandardScaler(), SVC(random_state=1))

# Create the parameter grid

param_grid_svc = [{
                    'svc__C': [0.001, 0.01, 0.05, 0.1, 0.5, 1.0, 10.0],
                    'svc__kernel': ['linear']
                  },
                 {
                    'svc__C': [0.001, 0.01, 0.05, 0.1, 0.5, 1.0, 10.0],
                    'svc__gamma': [0.001, 0.01, 0.05, 0.1, 0.5, 1.0, 10.0],
                    'svc__kernel': ['rbf']
                 }]

# Create an instance of GridSearch Cross-validation estimator

gsSVC = GridSearchCV(estimator=pipelineSVC,
                     param_grid = param_grid_svc,
                     scoring='accuracy',
                     cv=10,
                     refit=True,
                     n_jobs=1)

# Train the SVM classifier

gsSVC.fit(X_train, y_train)

# Print the training score of the best model

print('the training score of the best model',gsSVC.best_score_)

# Print the model parameters of the best model

print('the model parameters of the best model',gsSVC.best_params_)

# Print the model score on the test data using GridSearchCV score method

print('Test accuracy: %.3f' % gsSVC.score(X_test, y_test))



the training score of the best model 0.9285714285714286
the model parameters of the best model {'svc__C': 10.0, 'svc__gamma': 0.05, 'svc__kernel': 'rbf'}
Test accuracy: 0.900
