# <center>Partie 1 : Base de données, Analyse, Prétraitement et Préparation</center>

## traitement des fichiers csv

> le traitement initial des fichiers est fait dans le fichier 'import_fichiers'

## traitement de données

### importer les bibliothèques de base, les autres seront importées au fur et à mesure

In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

#### import du fichier csv

In [2]:
data = pd.read_csv('data.csv')

#### premières visualisations du jeu de données

In [3]:
data.head()

Unnamed: 0,Q1,Q2,Q3,Q4,Q5,Q6,Q7,Q8,Q9,Q10,Score,Interpretation
0,a,a,a,a,a,1,1,1,1,1,10,B
1,b,b,b,b,b,2,2,2,2,2,0,C
2,c,c,c,c,c,3,3,3,3,3,20,A
3,a,b,c,a,b,1,2,3,1,2,8,C
4,b,c,a,c,a,3,2,3,1,2,11,B


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 225 entries, 0 to 224
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Q1              217 non-null    object
 1   Q2              213 non-null    object
 2   Q3              212 non-null    object
 3   Q4              215 non-null    object
 4   Q5              211 non-null    object
 5   Q6              213 non-null    object
 6   Q7              215 non-null    object
 7   Q8              213 non-null    object
 8   Q9              215 non-null    object
 9   Q10             217 non-null    object
 10  Score           225 non-null    int64 
 11  Interpretation  225 non-null    object
dtypes: int64(1), object(11)
memory usage: 21.2+ KB


In [5]:
df = data.copy()

#### Traitement des données non numériques
> remplacement des valeurs non-souhaitées par des 'Nan'

In [6]:
# remplacer les caractères a,b,c par des 1, 2, 3, les autres sont automatiquement convertis en nan 

df = df.drop('Score', axis=1)

caracteres_rempla_abc = {
    'a':0,
    'b':1,
    'c':2,
    'A':0,
    'B':1,
    'C':2,
    '1':1,
    '2':2,
    '3':3
}
for colonne in range(df.shape[1]):
    
    df.iloc[:,colonne] = df.iloc[:,colonne].map(caracteres_rempla_abc)


    

In [7]:
# vérification du type des données
df.dtypes

Q1                float64
Q2                float64
Q3                float64
Q4                float64
Q5                float64
Q6                float64
Q7                float64
Q8                float64
Q9                float64
Q10               float64
Interpretation      int64
dtype: object

vérification

In [8]:
# visualisation du jeu de données pour vérifier la transformation
df.head()

Unnamed: 0,Q1,Q2,Q3,Q4,Q5,Q6,Q7,Q8,Q9,Q10,Interpretation
0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1
1,1.0,1.0,1.0,1.0,1.0,2.0,2.0,2.0,2.0,2.0,2
2,2.0,2.0,2.0,2.0,2.0,3.0,3.0,3.0,3.0,3.0,0
3,0.0,1.0,2.0,0.0,1.0,1.0,2.0,3.0,1.0,2.0,2
4,1.0,2.0,0.0,2.0,0.0,3.0,2.0,3.0,1.0,2.0,1


##### choix de suppression des NaN, par rapport au remplacement

In [24]:
for i in range(df.shape[1]):
    res = df.iloc[:,i].mode()
    
    df.iloc[:,i]=df.iloc[:,i].fillna(res)
    print(df.iloc[:,i])

0      0.0
1      1.0
2      2.0
3      0.0
4      1.0
      ... 
220    2.0
221    1.0
222    0.0
223    0.0
224    0.0
Name: Q1, Length: 225, dtype: float64
0      0.0
1      1.0
2      2.0
3      1.0
4      2.0
      ... 
220    2.0
221    0.0
222    2.0
223    NaN
224    1.0
Name: Q2, Length: 225, dtype: float64
0      0.0
1      1.0
2      2.0
3      2.0
4      0.0
      ... 
220    NaN
221    2.0
222    1.0
223    NaN
224    2.0
Name: Q3, Length: 225, dtype: float64
0      0.0
1      1.0
2      2.0
3      0.0
4      2.0
      ... 
220    2.0
221    NaN
222    NaN
223    0.0
224    NaN
Name: Q4, Length: 225, dtype: float64
0      0.0
1      1.0
2      2.0
3      1.0
4      0.0
      ... 
220    2.0
221    1.0
222    NaN
223    1.0
224    NaN
Name: Q5, Length: 225, dtype: float64
0      1.0
1      2.0
2      3.0
3      1.0
4      3.0
      ... 
220    NaN
221    NaN
222    NaN
223    NaN
224    NaN
Name: Q6, Length: 225, dtype: float64
0      1.0
1      2.0
2      3.0
3      2.0
4 

In [10]:
df.head()

Unnamed: 0,Q1,Q2,Q3,Q4,Q5,Q6,Q7,Q8,Q9,Q10,Interpretation
0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1
1,1.0,1.0,1.0,1.0,1.0,2.0,2.0,2.0,2.0,2.0,2
2,2.0,2.0,2.0,2.0,2.0,3.0,3.0,3.0,3.0,3.0,0
3,0.0,1.0,2.0,0.0,1.0,1.0,2.0,3.0,1.0,2.0,2
4,1.0,2.0,0.0,2.0,0.0,3.0,2.0,3.0,1.0,2.0,1


In [11]:
df.shape

(225, 11)

In [23]:
df.isna().sum()

Q1                 29
Q2                 29
Q3                 32
Q4                 27
Q5                 34
Q6                 67
Q7                 96
Q8                110
Q9                 89
Q10                78
Interpretation      0
dtype: int64

#### écartement des colonnes 'score' et 'interprétation' pour créer le jeu de données des features 

In [13]:
X = df.drop(columns=['Interpretation'], axis=1)

#### choix  : prédire le score ; l'interprétation est écartée du jeu de données

In [14]:
y = df['Interpretation']

#### séparation du jeu de données en donnée de test et d'entrainement

In [15]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=2)

# <center>KNN from Sklearn</center>

import des bibliothèques

In [16]:
from sklearn.neighbors import KNeighborsClassifier

In [17]:
from sklearn.model_selection import GridSearchCV

#### création des paramètres pour gridSearchCv

In [18]:
# création d'une liste de différentes valeurs de k
k_range = list(range(1, 15))

# mettre ces paramètres dans un dictionnaire à faire passer dans gridSearchCv
param_grid = dict(n_neighbors=k_range)

# ajout d'une ligne avec différentes valeurs de p dans le dictionnaire
param_grid['p'] = [1, 2, 3]

#### création d'un modèle Knn

In [19]:
clf2 = KNeighborsClassifier()

#### recherche des meilleurs paramètres en fonction du dictionnaire passé

In [20]:
grid = GridSearchCV(clf2, param_grid, cv=5, scoring='accuracy', return_train_score=False,verbose=1)

In [21]:
grid_search = grid.fit(X_train, y_train)

Fitting 5 folds for each of 42 candidates, totalling 210 fits


Traceback (most recent call last):
  File "/home/erwan/anaconda3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/erwan/anaconda3/lib/python3.9/site-packages/sklearn/neighbors/_classification.py", line 179, in fit
    return self._fit(X, y)
  File "/home/erwan/anaconda3/lib/python3.9/site-packages/sklearn/neighbors/_base.py", line 363, in _fit
    X, y = self._validate_data(X, y, accept_sparse="csr",
  File "/home/erwan/anaconda3/lib/python3.9/site-packages/sklearn/base.py", line 433, in _validate_data
    X, y = check_X_y(X, y, **check_params)
  File "/home/erwan/anaconda3/lib/python3.9/site-packages/sklearn/utils/validation.py", line 63, in inner_f
    return f(*args, **kwargs)
  File "/home/erwan/anaconda3/lib/python3.9/site-packages/sklearn/utils/validation.py", line 871, in check_X_y
    X = check_array(X, accept_sparse=accept_sparse,
  File "/home/erwan/anaconda3/lib/py

Traceback (most recent call last):
  File "/home/erwan/anaconda3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/erwan/anaconda3/lib/python3.9/site-packages/sklearn/neighbors/_classification.py", line 179, in fit
    return self._fit(X, y)
  File "/home/erwan/anaconda3/lib/python3.9/site-packages/sklearn/neighbors/_base.py", line 363, in _fit
    X, y = self._validate_data(X, y, accept_sparse="csr",
  File "/home/erwan/anaconda3/lib/python3.9/site-packages/sklearn/base.py", line 433, in _validate_data
    X, y = check_X_y(X, y, **check_params)
  File "/home/erwan/anaconda3/lib/python3.9/site-packages/sklearn/utils/validation.py", line 63, in inner_f
    return f(*args, **kwargs)
  File "/home/erwan/anaconda3/lib/python3.9/site-packages/sklearn/utils/validation.py", line 871, in check_X_y
    X = check_array(X, accept_sparse=accept_sparse,
  File "/home/erwan/anaconda3/lib/py

Traceback (most recent call last):
  File "/home/erwan/anaconda3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/erwan/anaconda3/lib/python3.9/site-packages/sklearn/neighbors/_classification.py", line 179, in fit
    return self._fit(X, y)
  File "/home/erwan/anaconda3/lib/python3.9/site-packages/sklearn/neighbors/_base.py", line 363, in _fit
    X, y = self._validate_data(X, y, accept_sparse="csr",
  File "/home/erwan/anaconda3/lib/python3.9/site-packages/sklearn/base.py", line 433, in _validate_data
    X, y = check_X_y(X, y, **check_params)
  File "/home/erwan/anaconda3/lib/python3.9/site-packages/sklearn/utils/validation.py", line 63, in inner_f
    return f(*args, **kwargs)
  File "/home/erwan/anaconda3/lib/python3.9/site-packages/sklearn/utils/validation.py", line 871, in check_X_y
    X = check_array(X, accept_sparse=accept_sparse,
  File "/home/erwan/anaconda3/lib/py

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [None]:
# voir les meilleurs paramètres
print(grid_search.best_params_)

In [None]:
# visualiser l'accuracy 
accuracy = grid_search.best_score_ 
accuracy

#### création du modèle avec les bons paramètres

In [None]:
clf_def = KNeighborsClassifier(n_neighbors=4, p=1)

In [None]:
clf_def.fit(X_test, y_test)

In [None]:
clf_def.score(X_test, y_test)

# exportation du modèle créé

In [None]:
# from joblib import dump

# dump(clf_def, 'regression_model_saved.joblib')

> <h1>la suite se passe en exécutant le fichier run.py !</h1>