<a href="https://colab.research.google.com/github/fgia/pythonmachinelearning/blob/jupyter/OneHotEncoding.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# OneHotEncoding : gestion des catégories dans le machine learning

Comment gérer des caractéristiques qui sont des chaînes de caractères et non des nombres ?

## Principe de base

### Construction du jeu de données

In [1]:
import pandas as pd
# Build a tiny single-column dataset of colour labels (note the repeated 'blue').
colors_df = pd.DataFrame({'color': ['red', 'blue', 'green', 'blue']})
print('Avant le  One Hot Encoding:')
display(colors_df)

Avant le  One Hot Encoding:


Unnamed: 0,color
0,red
1,blue
2,green
3,blue


### Réalisation sans keras

### Réalisation avec scikit-learn

In [2]:
# Encode the label dataset into a one-hot representation.
from sklearn.preprocessing import OneHotEncoder
# `sparse_output=False` makes the encoder return a dense NumPy array.
# (The parameter was called `sparse` before scikit-learn 1.2; the old name
# was removed in 1.4, so the keyword below is required on current releases.)
one_hot_encoder = OneHotEncoder(sparse_output=False)
one_hot_encoder.fit(colors_df)

# `categories_` is a list with one array of labels per input column; only
# 'color' was fitted, so its labels are the first (and only) entry. Passing
# the bare list would be interpreted by pandas as multi-level column labels.
colors_df_encoded = pd.DataFrame(
    data=one_hot_encoder.transform(colors_df),
    columns=one_hot_encoder.categories_[0],
)
print('\nAfter One Hot Encoding:')
display(colors_df_encoded)


After One Hot Encoding:


Unnamed: 0,blue,green,red
0,0.0,0.0,1.0
1,1.0,0.0,0.0
2,0.0,1.0,0.0
3,1.0,0.0,0.0


In [3]:
# Inverse transformation: map the one-hot rows back to the original labels.
# Despite its name, the result is a NumPy object array, not a DataFrame.
colors_df_decoded = one_hot_encoder.inverse_transform(colors_df_encoded)
display(colors_df_decoded)

array([['red'],
       ['blue'],
       ['green'],
       ['blue']], dtype=object)

### Multiple colonnes

In [4]:
# Dataset mixing a categorical column ('color') with a numeric one ('valeur').
multiple_colonnes_df = pd.DataFrame({
    'color': ['red', 'blue', 'green', 'blue'],
    'valeur': [123.2, 1200.0, 1475.2, 1452],
})
print('Multiple colonnes:')
display(multiple_colonnes_df)

Multiple colonnes:


Unnamed: 0,color,valeur
0,red,123.2
1,blue,1200.0
2,green,1475.2
3,blue,1452.0


In [5]:
# Apply one-hot encoding to the categorical column of the multi-column dataset.
import numpy as np
# Dense output is needed here because the result is wrapped in a DataFrame
# below. `sparse_output` replaces the `sparse` keyword, which was renamed in
# scikit-learn 1.2 and removed in 1.4.
one_hot_encoder_multiple = OneHotEncoder(sparse_output=False)
# Turn the single 'color' column into one binary column per category.
# The double brackets keep the selection 2-D, as the encoder expects.
multiple_df_encoded = one_hot_encoder_multiple.fit_transform(multiple_colonnes_df[['color']])
display(one_hot_encoder_multiple.categories_)
print("Structure 4 x3 ")
display(multiple_df_encoded)
# Retrieve the generated column names ('color_blue', 'color_green', ...).
# `get_feature_names_out` replaces `get_feature_names`, which was removed in
# scikit-learn 1.2.
column_name = one_hot_encoder_multiple.get_feature_names_out(['color'])
# Build a DataFrame from the encoded array, labelled with those column names.
multiple_df_encoded_1 = pd.DataFrame(data=multiple_df_encoded, columns=column_name)
display(multiple_df_encoded_1)


[array(['blue', 'green', 'red'], dtype=object)]

Structure 4 x3 


array([[0., 0., 1.],
       [1., 0., 0.],
       [0., 1., 0.],
       [1., 0., 0.]])

Unnamed: 0,color_blue,color_green,color_red
0,0.0,0.0,1.0
1,1.0,0.0,0.0
2,0.0,1.0,0.0
3,1.0,0.0,0.0


In [6]:
# Drop the original 'color' column, which is superseded by its one-hot columns.
# The keyword form is used because passing the axis positionally
# (`drop('color', 1)`) was deprecated in pandas 1.3 and removed in 2.0.
multiples_colonnes_df_simple = multiple_colonnes_df.drop(columns='color')
# Combine the remaining columns with the encoded ones, matching rows on their
# shared index, to obtain the final input DataFrame.
df_final = pd.merge(multiples_colonnes_df_simple, multiple_df_encoded_1, right_index=True, left_index=True)
display(df_final)

Unnamed: 0,valeur,color_blue,color_green,color_red
0,123.2,0.0,0.0,1.0
1,1200.0,1.0,0.0,0.0
2,1475.2,0.0,1.0,0.0
3,1452.0,1.0,0.0,0.0


In [7]:
# Show the column labels of the merged DataFrame.
display(df_final.columns)

Index(['valeur', 'color_blue', 'color_green', 'color_red'], dtype='object')

# OneHotEncoding au niveau d'une phrase

## Découper une phrase

In [8]:
# Toy corpus: four short English sentences, one per list entry.
sentences = [
    'My cat is stupid',
    'The cat is blue',
    'Cat and dogs are friends',
    'Dogs smell',
]

In [9]:
from keras.preprocessing.text import Tokenizer
# Fit a word-level tokenizer on the corpus.
# NOTE(review): `num_words=1000` presumably caps the vocabulary used when
# encoding; confirm the exact semantics against the Keras Tokenizer docs.
# NOTE(review): `keras.preprocessing.text` appears to be deprecated in recent
# Keras releases; verify availability in the target environment.
tokenizer = Tokenizer(num_words=1000)
tokenizer.fit_on_texts(sentences)
# Word -> integer index mapping learned during fitting.
word_index = tokenizer.word_index
# Each sentence becomes the list of integer indices of its words.
sequences_t = tokenizer.texts_to_sequences(sentences)
display(sequences_t, word_index)

[[4, 1, 2, 5], [6, 1, 2, 7], [1, 8, 3, 9, 10], [3, 11]]

{'and': 8,
 'are': 9,
 'blue': 7,
 'cat': 1,
 'dogs': 3,
 'friends': 10,
 'is': 2,
 'my': 4,
 'smell': 11,
 'stupid': 5,
 'the': 6}

In [10]:
# Turn integer sequences into a matrix made only of zeros and ones.

def vectorize_sequence(asequences, dimension=10000):
  """Multi-hot encode a collection of integer index sequences.

  Args:
    asequences: sized iterable of index sequences (e.g. lists of ints);
      every index must be smaller than `dimension`.
    dimension: number of columns of the resulting matrix.

  Returns:
    A float NumPy array of shape (len(asequences), dimension) in which
    entry [i, j] is 1 when index j occurs in sequence i, and 0 otherwise.
    Repeated indices within a sequence still yield a single 1.
  """
  print(asequences)
  n_samples = len(asequences)
  # Start from an all-zero matrix: one row per sequence.
  results = np.zeros(shape=(n_samples, dimension))
  # Switch on, in each row, the columns named by that row's indices.
  for row_idx, token_ids in enumerate(asequences):
    results[row_idx, token_ids] = 1
  return results

# One column per word index plus one: the indices in `word_index` start at 1,
# so column 0 is never set.
vocabulary_width = len(word_index) + 1
array_sequence = vectorize_sequence(sequences_t, vocabulary_width)
print(array_sequence)

[[4, 1, 2, 5], [6, 1, 2, 7], [1, 8, 3, 9, 10], [3, 11]]
[[0. 1. 1. 0. 1. 1. 0. 0. 0. 0. 0. 0.]
 [0. 1. 1. 0. 0. 0. 1. 1. 0. 0. 0. 0.]
 [0. 1. 0. 1. 0. 0. 0. 0. 1. 1. 1. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1.]]


In [11]:
# Cast the encoded matrix to single-precision floats.
x = np.asarray(array_sequence, dtype='float32')
print(x)

[[0. 1. 1. 0. 1. 1. 0. 0. 0. 0. 0. 0.]
 [0. 1. 1. 0. 0. 0. 1. 1. 0. 0. 0. 0.]
 [0. 1. 0. 1. 0. 0. 0. 0. 1. 1. 1. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1.]]
