In [1]:
!pip install scikit-learn


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m23.1.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3.11 -m pip install --upgrade pip[0m


In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
import seaborn as sns
from ydata_profiling import ProfileReport


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Récupérer les données
data1 = pd.read_csv("input/earthquake_features.csv")
data2 = pd.read_csv("input/earthquake_y.csv")

# Merge des données
raw_data = pd.merge(data1, data2, on=['building_id'], how='left')[:5000]

# Afficher les 5 premières lignes
raw_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 50000 entries, 0 to 49999
Data columns (total 40 columns):
 #   Column                                  Non-Null Count  Dtype 
---  ------                                  --------------  ----- 
 0   building_id                             50000 non-null  int64 
 1   geo_level_1_id                          50000 non-null  int64 
 2   geo_level_2_id                          50000 non-null  int64 
 3   geo_level_3_id                          50000 non-null  int64 
 4   count_floors_pre_eq                     50000 non-null  int64 
 5   age                                     50000 non-null  int64 
 6   area_percentage                         50000 non-null  int64 
 7   height_percentage                       50000 non-null  int64 
 8   land_surface_condition                  50000 non-null  object
 9   foundation_type                         50000 non-null  object
 10  roof_type                               50000 non-null  object
 11  gr

In [4]:

# Supprimer les colonnes inutiles
cols_to_drop = ['geo_level_1_id', 'geo_level_2_id', 'geo_level_3_id',
                'has_secondary_use', 'has_secondary_use_agriculture', 'has_secondary_use_hotel', 
                'has_secondary_use_rental', 'has_secondary_use_institution', 'has_secondary_use_school', 
                'has_secondary_use_industry', 'has_secondary_use_health_post', 'has_secondary_use_gov_office', 
                'has_secondary_use_use_police', 'has_secondary_use_other', 'legal_ownership_status', 
                'count_families']
data = raw_data.drop(cols_to_drop, axis=1)

# Afficher les 5 premières lignes
data.head(50)

Unnamed: 0,building_id,count_floors_pre_eq,age,area_percentage,height_percentage,land_surface_condition,foundation_type,roof_type,ground_floor_type,other_floor_type,...,has_superstructure_stone_flag,has_superstructure_cement_mortar_stone,has_superstructure_mud_mortar_brick,has_superstructure_cement_mortar_brick,has_superstructure_timber,has_superstructure_bamboo,has_superstructure_rc_non_engineered,has_superstructure_rc_engineered,has_superstructure_other,damage_grade
0,802906,2,30,6,5,t,r,n,f,q,...,0,0,0,0,0,0,0,0,0,3
1,28830,2,10,8,7,o,r,n,x,q,...,0,0,0,0,0,0,0,0,0,2
2,94947,2,10,5,5,t,r,n,f,x,...,0,0,0,0,0,0,0,0,0,3
3,590882,2,10,6,5,t,r,n,f,x,...,0,0,0,0,1,1,0,0,0,2
4,201944,3,30,8,9,t,r,n,f,x,...,0,0,0,0,0,0,0,0,0,3
5,333020,2,10,9,5,t,r,n,f,q,...,0,0,0,0,0,0,0,0,0,2
6,728451,2,25,3,4,n,r,n,x,q,...,0,0,0,0,0,0,0,0,0,3
7,475515,2,0,8,6,t,w,q,v,x,...,0,0,0,1,1,0,0,0,0,1
8,441126,2,15,8,6,t,r,q,f,q,...,0,0,0,0,1,0,0,0,0,2
9,989500,1,0,13,4,t,i,n,v,j,...,0,0,0,1,0,0,0,0,0,1


In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 50000 entries, 0 to 49999
Data columns (total 24 columns):
 #   Column                                  Non-Null Count  Dtype 
---  ------                                  --------------  ----- 
 0   building_id                             50000 non-null  int64 
 1   count_floors_pre_eq                     50000 non-null  int64 
 2   age                                     50000 non-null  int64 
 3   area_percentage                         50000 non-null  int64 
 4   height_percentage                       50000 non-null  int64 
 5   land_surface_condition                  50000 non-null  object
 6   foundation_type                         50000 non-null  object
 7   roof_type                               50000 non-null  object
 8   ground_floor_type                       50000 non-null  object
 9   other_floor_type                        50000 non-null  object
 10  position                                50000 non-null  object
 11  pl

In [6]:
# Détecter valeurs manquantes
print(data.isnull().sum().sort_values(ascending=False))

building_id                               0
count_floors_pre_eq                       0
has_superstructure_other                  0
has_superstructure_rc_engineered          0
has_superstructure_rc_non_engineered      0
has_superstructure_bamboo                 0
has_superstructure_timber                 0
has_superstructure_cement_mortar_brick    0
has_superstructure_mud_mortar_brick       0
has_superstructure_cement_mortar_stone    0
has_superstructure_stone_flag             0
has_superstructure_mud_mortar_stone       0
has_superstructure_adobe_mud              0
plan_configuration                        0
position                                  0
other_floor_type                          0
ground_floor_type                         0
roof_type                                 0
foundation_type                           0
land_surface_condition                    0
height_percentage                         0
area_percentage                           0
age                             

In [7]:
# convertir les colonnes
data_copy = data.copy()
data_copy.to_csv('output/full.csv', encoding='utf-8')

def convert_cols(data, colums, enums, type=float):
    for x in range(len(colums)):
        data[colums[x]] = data[colums[x]].replace(enums[x]).astype(type)
    return data

data_copy = convert_cols(data_copy, ["land_surface_condition",
                                                  "foundation_type",
                                                    "roof_type",
                                                    "ground_floor_type",
                                                    "other_floor_type",
                                                    "position",
                                                    "plan_configuration",
                                                    ], [
    {
        "n": "1",
        "o": "2",
        "t": "3",

    },
    {
        "h": "1",
        "i": "2",
        "r": "3",
        "u": "4",
        "w": "5"
    },
    {
        "n": "1",
        "q": "2",
        "x": "3",

    },
    {
        "f": "1",
        "m": "2",
        "v": "3",
        "x": "4",
        "z": "5"
    },
    {
        "j": "1",
        "q": "2",
        "s": "3",
        "x": "4"
    },
    {
        "j": "1",
        "o": "2",
        "s": "3",
        "t": "4"
    },
    {
        "a": "1",
        "c": "2",
        "d": "3",
        "f": "4",
        "m": "5",
        "n": "6",
        "o": "7",
        "q": "8",
        "s": "9",
        "u": "10",
    }
], type="int64")


data_copy.head(10)

Unnamed: 0,building_id,count_floors_pre_eq,age,area_percentage,height_percentage,land_surface_condition,foundation_type,roof_type,ground_floor_type,other_floor_type,...,has_superstructure_stone_flag,has_superstructure_cement_mortar_stone,has_superstructure_mud_mortar_brick,has_superstructure_cement_mortar_brick,has_superstructure_timber,has_superstructure_bamboo,has_superstructure_rc_non_engineered,has_superstructure_rc_engineered,has_superstructure_other,damage_grade
0,802906,2,30,6,5,3,3,1,1,2,...,0,0,0,0,0,0,0,0,0,3
1,28830,2,10,8,7,2,3,1,4,2,...,0,0,0,0,0,0,0,0,0,2
2,94947,2,10,5,5,3,3,1,1,4,...,0,0,0,0,0,0,0,0,0,3
3,590882,2,10,6,5,3,3,1,1,4,...,0,0,0,0,1,1,0,0,0,2
4,201944,3,30,8,9,3,3,1,1,4,...,0,0,0,0,0,0,0,0,0,3
5,333020,2,10,9,5,3,3,1,1,2,...,0,0,0,0,0,0,0,0,0,2
6,728451,2,25,3,4,1,3,1,4,2,...,0,0,0,0,0,0,0,0,0,3
7,475515,2,0,8,6,3,5,2,3,4,...,0,0,0,1,1,0,0,0,0,1
8,441126,2,15,8,6,3,3,2,1,2,...,0,0,0,0,1,0,0,0,0,2
9,989500,1,0,13,4,3,2,1,3,1,...,0,0,0,1,0,0,0,0,0,1


In [8]:
# Définir y et X
X = data_copy.drop(["damage_grade"], axis=1)
y = data_copy["damage_grade"]

y.describe()


count    50000.000000
mean         2.240360
std          0.612303
min          1.000000
25%          2.000000
50%          2.000000
75%          3.000000
max          3.000000
Name: damage_grade, dtype: float64

In [9]:
# Info sur les données
print(X.shape)
X.info()
X.describe()

(50000, 23)
<class 'pandas.core.frame.DataFrame'>
Int64Index: 50000 entries, 0 to 49999
Data columns (total 23 columns):
 #   Column                                  Non-Null Count  Dtype
---  ------                                  --------------  -----
 0   building_id                             50000 non-null  int64
 1   count_floors_pre_eq                     50000 non-null  int64
 2   age                                     50000 non-null  int64
 3   area_percentage                         50000 non-null  int64
 4   height_percentage                       50000 non-null  int64
 5   land_surface_condition                  50000 non-null  int64
 6   foundation_type                         50000 non-null  int64
 7   roof_type                               50000 non-null  int64
 8   ground_floor_type                       50000 non-null  int64
 9   other_floor_type                        50000 non-null  int64
 10  position                                50000 non-null  int64
 11  pla

Unnamed: 0,building_id,count_floors_pre_eq,age,area_percentage,height_percentage,land_surface_condition,foundation_type,roof_type,ground_floor_type,other_floor_type,...,has_superstructure_mud_mortar_stone,has_superstructure_stone_flag,has_superstructure_cement_mortar_stone,has_superstructure_mud_mortar_brick,has_superstructure_cement_mortar_brick,has_superstructure_timber,has_superstructure_bamboo,has_superstructure_rc_non_engineered,has_superstructure_rc_engineered,has_superstructure_other
count,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,...,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0
mean,525877.6,2.12872,26.4032,8.01296,5.43816,2.69906,3.1195,1.35952,1.48974,2.22658,...,0.76406,0.03424,0.0182,0.0655,0.07582,0.2551,0.08684,0.0413,0.01694,0.01502
std,304224.4,0.727957,72.817382,4.405749,1.942589,0.692578,0.578711,0.595242,1.02483,0.901875,...,0.424589,0.181847,0.133675,0.247409,0.264713,0.435922,0.281603,0.198985,0.129048,0.121633
min,4.0,1.0,0.0,1.0,2.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,260832.8,2.0,10.0,5.0,4.0,3.0,3.0,1.0,1.0,2.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,526556.5,2.0,15.0,7.0,5.0,3.0,3.0,1.0,1.0,2.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,788735.0,2.0,30.0,9.0,6.0,3.0,3.0,2.0,1.0,2.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
max,1052934.0,9.0,995.0,100.0,32.0,3.0,5.0,3.0,5.0,4.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [12]:
# # 0 pour supprimer toutes les lignes contenant des valeurs manquantes
# data_supp_lignes = data_copy.dropna(axis=0)

# print(data_supp_lignes.shape)

# 1 pour supprimer les colonnes contenant des valeurs manquantes

# data_supp_col = data_copy.dropna(axis=1)

# print(data_supp_col.shape)

# Remplacer les valeurs manquantes par la moyenne, le mode ou la médiane
# X = X.fillna(X.mean())


from mlxtend.data import iris_data
from mlxtend.plotting import plot_pca_correlation_graph
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn import preprocessing
import matplotlib.pyplot as plt



In [13]:
# X_norm = X / X.std(axis=0) # Normalizing the feature columns is recommended
# X_tmp = X.values
# feature_names = X_norm.select_dtypes(include='int64').columns


# figure, correlation_matrix = plot_pca_correlation_graph(X_norm, 
#                                                         feature_names,
#                                                         dimensions=(1, 2),
#                                                         figure_axis_size=10)




In [14]:
# pca = PCA(n_components=4)

# principalComponents = pca.fit_transform(X_norm)

# principalDataframe = pd.DataFrame(data = principalComponents, columns = ['PC1', 'PC2','PC3', 'PC4'])
# principalDataframe

In [15]:
# ## Recueil des informations par axe
# percent_variance = np.round(pca.explained_variance_ratio_* 100, decimals =2)
# percent_variance

In [16]:
# columns = ['PC1', 'PC2', 'PC3', 'PC4']
# plt.bar(x= range(1,5), height=percent_variance, tick_label=columns)
# plt.ylabel('Percentate of Variance Explained')
# plt.xlabel('Principal Component')
# plt.title('PCA Scree Plot')
# plt.show()

In [17]:
X.head()
corr = X.corr()

In [18]:
# graphique des correlations
plt.figure(figsize=(20, 20))

plt.title("Table de corrélation des 10 premières variables", fontsize=30)

sns.set(style="darkgrid", font_scale=1.2)
plt.xlabel("",fontsize=20)
plt.ylabel("",fontsize=20)

sns.heatmap(corr,
            cmap='viridis',
            cbar=True,
            #vmax=1.0, vmin=-1.0,
            linewidths=0.3,
            annot=True,
            #annot_kws={"size": 8}, square=True
           );

In [19]:
X.describe()

Unnamed: 0,building_id,count_floors_pre_eq,age,area_percentage,height_percentage,land_surface_condition,foundation_type,roof_type,ground_floor_type,other_floor_type,...,has_superstructure_mud_mortar_stone,has_superstructure_stone_flag,has_superstructure_cement_mortar_stone,has_superstructure_mud_mortar_brick,has_superstructure_cement_mortar_brick,has_superstructure_timber,has_superstructure_bamboo,has_superstructure_rc_non_engineered,has_superstructure_rc_engineered,has_superstructure_other
count,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,...,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0
mean,525877.6,2.12872,26.4032,8.01296,5.43816,2.69906,3.1195,1.35952,1.48974,2.22658,...,0.76406,0.03424,0.0182,0.0655,0.07582,0.2551,0.08684,0.0413,0.01694,0.01502
std,304224.4,0.727957,72.817382,4.405749,1.942589,0.692578,0.578711,0.595242,1.02483,0.901875,...,0.424589,0.181847,0.133675,0.247409,0.264713,0.435922,0.281603,0.198985,0.129048,0.121633
min,4.0,1.0,0.0,1.0,2.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,260832.8,2.0,10.0,5.0,4.0,3.0,3.0,1.0,1.0,2.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,526556.5,2.0,15.0,7.0,5.0,3.0,3.0,1.0,1.0,2.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,788735.0,2.0,30.0,9.0,6.0,3.0,3.0,2.0,1.0,2.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
max,1052934.0,9.0,995.0,100.0,32.0,3.0,5.0,3.0,5.0,4.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


# Split des donnees Train et Test

In [20]:
from sklearn.model_selection import train_test_split

X.shape

(50000, 23)

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=5)

print('X_Train set:', X_train.shape)
print('X_Test set:', X_test.shape)
print('yTest set:', y_test.shape)
print('yTrain set:', y_train.shape)

X_Train set: (40000, 23)
X_Test set: (10000, 23)
yTest set: (10000,)
yTrain set: (40000,)


### Visualisation du Train et Test

In [22]:
plt.figure(figsize=(12, 4))
plt.subplot(121)
plt.scatter(X_train["foundation_type"], X_train["age"], c=y_train)
plt.title('Train set')
plt.subplot(122)
plt.scatter(X_test["foundation_type"], X_test["age"],c=y_test)
plt.title('Test set')

Text(0.5, 1.0, 'Test set')

## 1. KNN classification

In [23]:
from sklearn.neighbors import KNeighborsClassifier

# déclaration de l'intention de knn
knn = KNeighborsClassifier(n_neighbors=48)

# Entrainement du modèle


# Calcul de score

# print('train score:', knn.score(X_train, y_train))
# print('test score:', knn.score(X_test, y_test))

knn.fit(X_train, y_train)

In [24]:
# Calcul de score

print('train score:', knn.score(X_train, y_train))
print('test score:', knn.score(X_test, y_test))

train score: 0.6487
test score: 0.4906


### Matrice de confusion

In [25]:
from sklearn.metrics import f1_score, confusion_matrix, classification_report

y_test_predit = knn.predict(X_test)

confusion_matrix(y_test, y_test_predit)

array([[  64,  670,  205],
       [ 306, 4085, 1261],
       [ 206, 2446,  757]])

In [26]:
y_pd = pd.DataFrame({'modalites':y})
y_pd['modalites'].value_counts()


2    28366
3    16826
1     4808
Name: modalites, dtype: int64

In [27]:
ytrain_pd = pd.DataFrame({'modalites':y_train})
ytrain_pd['modalites'].value_counts()

2    22714
3    13417
1     3869
Name: modalites, dtype: int64

In [28]:
ytest_pd = pd.DataFrame({'modalites':y_test})
ytest_pd['modalites'].value_counts()

2    5652
3    3409
1     939
Name: modalites, dtype: int64

In [29]:
y_test_predit_pd = pd.DataFrame({'modalites':y_test_predit})
y_test_predit_pd['modalites'].value_counts()

2    7201
3    2223
1     576
Name: modalites, dtype: int64

In [30]:
# accuracy : vrai positif + vrai negatif/total

In [31]:
# calcul de la précision, du rappel et du F-score
print(classification_report(y_test, knn.predict(X_test)))

              precision    recall  f1-score   support

           1       0.11      0.07      0.08       939
           2       0.57      0.72      0.64      5652
           3       0.34      0.22      0.27      3409

    accuracy                           0.49     10000
   macro avg       0.34      0.34      0.33     10000
weighted avg       0.45      0.49      0.46     10000



### Cross Validation

In [32]:
from sklearn.model_selection import cross_val_score

#### Validation croisée

In [33]:
cross_val_score(knn, X_train, y_train, cv=5, scoring='accuracy')


array([0.48725 , 0.483625, 0.49    , 0.4925  , 0.494875])

In [34]:
cross_val_score(knn, X_test, y_test, cv=5, scoring='accuracy')

array([0.516 , 0.4895, 0.4815, 0.4875, 0.477 ])

In [35]:
cross_val_score(knn, X_train, y_train, cv=5, scoring='accuracy').mean()

0.48965000000000003

In [36]:
cross_val_score(knn, X_test, y_test, cv=5, scoring='accuracy').mean()

0.49030000000000007

In [37]:
# => Le score moyen sur les 5 échantillons est de ??

## Courbe d'apprentissage

In [38]:
from sklearn.model_selection import learning_curve

In [39]:
N, train_score, val_score = learning_curve(knn, X_train, y_train, train_sizes=np.linspace(0.1, 1, 10), cv=5)

In [40]:
np.linspace(0.1, 1, 10)

array([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1. ])

In [41]:
N

array([ 3200,  6400,  9600, 12800, 16000, 19200, 22400, 25600, 28800,
       32000])

In [42]:
val_score

array([[0.499875, 0.491375, 0.4925  , 0.49325 , 0.496   ],
       [0.486125, 0.489875, 0.489875, 0.4975  , 0.49375 ],
       [0.4855  , 0.4835  , 0.484125, 0.485125, 0.49325 ],
       [0.488875, 0.490375, 0.49225 , 0.487125, 0.501375],
       [0.48325 , 0.49025 , 0.48825 , 0.49375 , 0.495625],
       [0.48525 , 0.4895  , 0.483375, 0.493375, 0.495125],
       [0.479125, 0.49025 , 0.4875  , 0.4915  , 0.497875],
       [0.4865  , 0.492375, 0.487   , 0.49275 , 0.491   ],
       [0.489875, 0.487625, 0.4875  , 0.495   , 0.493625],
       [0.48725 , 0.483625, 0.49    , 0.4925  , 0.494875]])

In [43]:
train_score

array([[0.641875  , 0.66      , 0.66      , 0.66      , 0.66      ],
       [0.641875  , 0.644375  , 0.644375  , 0.644375  , 0.644375  ],
       [0.63739583, 0.64489583, 0.64791667, 0.64791667, 0.64791667],
       [0.64046875, 0.64640625, 0.64757813, 0.64757813, 0.64757813],
       [0.6453125 , 0.64825   , 0.644125  , 0.641875  , 0.641875  ],
       [0.6471875 , 0.64645833, 0.64505208, 0.644375  , 0.644375  ],
       [0.64995536, 0.64553571, 0.64553571, 0.64410714, 0.64410714],
       [0.64859375, 0.64609375, 0.6453125 , 0.64613281, 0.64535156],
       [0.64871528, 0.64586806, 0.64746528, 0.64753472, 0.646875  ],
       [0.6476875 , 0.64953125, 0.64834375, 0.6475625 , 0.64609375]])

In [44]:
plt.plot(N, train_score.mean(axis=1), label='train')
plt.plot(N, val_score.mean(axis=1), label='test')
plt.xlabel('train_sizes')
plt.legend()

<matplotlib.legend.Legend at 0x2b8719f90>

### Determiner le K optimal

In [45]:
val_score = []
for k in range(1, 50):
    score = cross_val_score(KNeighborsClassifier(k), X_train, y_train, cv=5).mean()
    val_score.append(score)

plt.plot(val_score)

# Trouver le K optimal
best_k = np.argmax(val_score) + 1
print("Le K optimal est :", best_k)

Le K optimal est : 48


## 2. Forêt aléatoire classification

In [46]:
from sklearn.ensemble import RandomForestClassifier

In [47]:
rf = RandomForestClassifier(random_state=0)

In [48]:
# Entrainement du modèle
rf.fit(X_train, y_train)

In [49]:
# Calcul de score

print('train score:', rf.score(X_train, y_train))
print('test score:', rf.score(X_test, y_test))

train score: 0.999975
test score: 0.576


In [50]:
y_test_rf_predit = rf.predict(X_test)

confusion_matrix(y_test, y_test_rf_predit)

array([[ 328,  536,   75],
       [ 268, 4137, 1247],
       [  55, 2059, 1295]])

In [51]:
# calcul de la précision, du rappel et du F-score
print(classification_report(y_test, y_test_rf_predit))

              precision    recall  f1-score   support

           1       0.50      0.35      0.41       939
           2       0.61      0.73      0.67      5652
           3       0.49      0.38      0.43      3409

    accuracy                           0.58     10000
   macro avg       0.54      0.49      0.50     10000
weighted avg       0.56      0.58      0.56     10000



In [52]:
N_rf, train_score_rf, val_score_rf = learning_curve(rf, X_train, y_train, 
                                                    train_sizes=np.linspace(0.1, 1, 10),
                                                    cv=5, scoring='accuracy')

In [53]:
plt.plot(N_rf, train_score_rf.mean(axis=1), label='train_rf')
plt.plot(N_rf, val_score_rf.mean(axis=1), label='validation_rf')
plt.xlabel('train_sizes')
plt.legend()

<matplotlib.legend.Legend at 0x2bb100f50>

In [54]:
cross_val_score(rf, X_train, y_train, cv=5, scoring='accuracy')

array([0.567  , 0.56   , 0.57675, 0.568  , 0.5565 ])

In [55]:
cross_val_score(rf, X_train, y_train, cv=5, scoring='accuracy').mean()

0.56565

In [56]:
cross_val_score(rf, X_test, y_test, cv=5, scoring='accuracy').mean()

0.5621

### Quelles sont les variables importantes dans la création de la forêt aléatoire

In [57]:
pd.DataFrame(rf.feature_importances_).plot.bar(figsize=(12, 8))

<AxesSubplot: >

## 3. XGBOOST

In [None]:
import xgboost as xgb

xgbt = xgb.XGBClassifier()
xgbt.fit(X_train, y_train)

In [None]:
print(classification_report(y_test, xgbt.predict(X_test)))
print(confusion_matrix(y_train, xgbt.predict(X_train)))

In [None]:
cross_val_score(xgbt, X_train, y_train, cv=5, scoring='accuracy').mean()
cross_val_score(xgbt, X_test, y_test, cv=5, scoring='accuracy').mean()

# Prédiction

In [65]:
# Création de la fonction
def appli(model, Pregnancies = 2, Glucose = 100, BloodPressure = 60, 
          SkinThickness = 30, Insulin = 0,
                BMI = 25, DiabetesPedigreeFunction = 0.5, Age = 40):
  x = np.array([Pregnancies, Glucose, BloodPressure, SkinThickness, Insulin,
                BMI, DiabetesPedigreeFunction, Age]).reshape(1, 8)
  print(model.predict(x))
  print(model.predict_proba(x))

In [66]:
appli(dt,Pregnancies = 1, Glucose = 120, BloodPressure = 70, 
          SkinThickness = 10, Insulin = 2,
                BMI = 35, DiabetesPedigreeFunction = 0.2, Age = 25)



ValueError: X has 8 features, but DecisionTreeRegressor is expecting 23 features as input.

## Application aux différents modèles

In [None]:
appli(rf)

In [None]:
appli(knn)

In [None]:
appli(linreg)

In [None]:
appli(rfg)

In [None]:
appli(dt)

## Score d'appétence 

In [None]:
rf.predict_proba(X_test)

In [None]:
knn.predict_proba(X_test)

In [None]:
linreg.predict_proba(X_test)

In [None]:
rfg.predict_proba(X_test)

In [None]:
dt.predict_proba(X_test)