In [22]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import OneHotEncoder
## Utilisation de keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
import numpy as np

# Segmentation en mots

In [23]:
# Word-level segmentation of a sample sentence.
text = " je veux apprendre le deep learning "
# Splitting without a separator cuts on runs of whitespace, so the leading
# and trailing spaces of `text` do not produce empty tokens.
words = str.split(text)
words

['je', 'veux', 'apprendre', 'le', 'deep', 'learning']

# Segmentation en caractères

In [24]:
# Character-level segmentation: one list entry per character of `text`
# (spaces included). `text` is the value assigned in an earlier cell.
character = [symbol for symbol in text]
print(character)

[' ', 'j', 'e', ' ', 'v', 'e', 'u', 'x', ' ', 'a', 'p', 'p', 'r', 'e', 'n', 'd', 'r', 'e', ' ', 'l', 'e', ' ', 'd', 'e', 'e', 'p', ' ', 'l', 'e', 'a', 'r', 'n', 'i', 'n', 'g', ' ']


# N-grammes de mots

In [25]:
# Word bi-grams: each pair of consecutive words becomes one feature.
text = ["Je veux apprendre le deep learning"]
vectorizer = CountVectorizer(
    analyzer="word",     # the default analyzer, spelled out explicitly
    ngram_range=(2, 2),  # lower bound == upper bound == 2 -> bi-grams only
)
# Learn the vocabulary and build the document/term count matrix in one pass.
X = vectorizer.fit_transform(text)
print("Bi-grams:\n", vectorizer.get_feature_names_out())
# Dense view of the counts: one row per document, one column per bi-gram.
print(X.toarray())

Bi-grams:
 ['apprendre le' 'deep learning' 'je veux' 'le deep' 'veux apprendre']
[[1 1 1 1 1]]


In [26]:
# Word n-grams with ngram_range=(1, 2): the vocabulary holds both single
# words (uni-grams) and pairs of consecutive words (bi-grams).
text = ["Je veux apprendre le deep learning"]
vectorizer = CountVectorizer(ngram_range=(1, 2))
# Learn the vocabulary and build the document/term count matrix in one pass.
X = vectorizer.fit_transform(text)
# The label names every n-gram size actually present in the vocabulary.
print("Uni-grams et bi-grams:\n", vectorizer.get_feature_names_out())
# Dense view of the counts: one row per document, one column per n-gram.
print(X.toarray())

Bi-grams:
 ['apprendre' 'apprendre le' 'deep' 'deep learning' 'je' 'je veux' 'le'
 'le deep' 'learning' 'veux' 'veux apprendre']
[[1 1 1 1 1 1 1 1 1 1 1]]


In [27]:
# Word n-grams with ngram_range=(1, 3): the vocabulary holds single words
# (uni-grams), pairs (bi-grams) and triples (tri-grams) of consecutive words.
text = ["Je veux apprendre le deep learning"]
vectorizer = CountVectorizer(ngram_range=(1, 3))
# Learn the vocabulary and build the document/term count matrix in one pass.
X = vectorizer.fit_transform(text)
# The label names every n-gram size actually present in the vocabulary.
print("Uni-grams, bi-grams et tri-grams:\n", vectorizer.get_feature_names_out())
# Dense view of the counts: one row per document, one column per n-gram.
print(X.toarray())

Bi-grams:
 ['apprendre' 'apprendre le' 'apprendre le deep' 'deep' 'deep learning'
 'je' 'je veux' 'je veux apprendre' 'le' 'le deep' 'le deep learning'
 'learning' 'veux' 'veux apprendre' 'veux apprendre le']
[[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]]


# N-grammes de caractères

In [28]:
# Character bi-grams: the analyzer works on characters instead of words, so
# each feature is a pair of adjacent characters (spaces included).
# `text` is the sentence list assigned in an earlier cell.
vectorizer = CountVectorizer(
    ngram_range=(2, 2),
    analyzer='char',
)
X = vectorizer.fit_transform(text)
print("Bi-grams de caractères:\n", vectorizer.get_feature_names_out())
# Counts above 1 mark pairs that occur several times in the sentence.
print(X.toarray())

Bi-grams de caractères:
 [' a' ' d' ' l' ' v' 'ap' 'ar' 'de' 'dr' 'e ' 'ea' 'ee' 'en' 'ep' 'eu'
 'in' 'je' 'le' 'nd' 'ng' 'ni' 'p ' 'pp' 'pr' 're' 'rn' 'ux' 've' 'x ']
[[1 1 2 1 1 1 1 1 3 1 1 1 1 1 1 1 2 1 1 1 1 1 1 2 1 1 1 1]]


# One hot encoding

## one hot encoding sklearn

In [29]:
# Sample sentence to analyse
text = ["Je veux apprendre le deep learning"]

# Break the single sentence into its words
words = str.split(text[0])
print("Mots:", words)

# One-hot encoder configured to return a dense array rather than a sparse matrix
encoder = OneHotEncoder(sparse_output=False)

# The encoder expects a 2-D input: lay the words out as a single column,
# one row per word.
words_array = np.array(words).reshape(len(words), 1)

# Fit on the words and encode them; each row is the one-hot vector of one
# word, with columns following the categories learned by the encoder.
one_hot_encoded_words = encoder.fit_transform(words_array)
print("One-hot Encoding des Mots:\n", one_hot_encoded_words)

Mots: ['Je', 'veux', 'apprendre', 'le', 'deep', 'learning']
One-hot Encoding des Mots:
 [[1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1.]
 [0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0.]
 [0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0.]]


## one hot encoding tensorflow

### avec des mots

In [30]:
# Sample sentence to analyse
text = ["Je veux apprendre le deep learning"]

# Create the Keras tokenizer and let it learn the vocabulary of `text`
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts=text)

# Replace every word by its integer index in the learned vocabulary
sequences = tokenizer.texts_to_sequences(texts=text)
print("Sequences:", sequences)

Sequences: [[1, 2, 3, 4, 5, 6]]


In [31]:
# One-hot encode the index sequences produced by the tokenizer.
# The word indices start at 1, so the encoded vectors carry one extra
# leading column (index 0) that no word ever sets.
one_hot_encoded_words = to_categorical(sequences)

# Word -> index mapping learned by the tokenizer, shown next to the
# encoding so each column can be read back as a word.
word_index = tokenizer.word_index

print("One-hot Encoding des Mots avec Keras:\n", one_hot_encoded_words)
print("Word Index:", word_index)

One-hot Encoding des Mots avec Keras:
 [[[0. 1. 0. 0. 0. 0. 0.]
  [0. 0. 1. 0. 0. 0. 0.]
  [0. 0. 0. 1. 0. 0. 0.]
  [0. 0. 0. 0. 1. 0. 0.]
  [0. 0. 0. 0. 0. 1. 0.]
  [0. 0. 0. 0. 0. 0. 1.]]]
Word Index: {'je': 1, 'veux': 2, 'apprendre': 3, 'le': 4, 'deep': 5, 'learning': 6}


### avec des caractères (via l'encodeur one-hot scikit-learn défini plus haut)

In [32]:
# Break the sentence into individual characters (spaces included)
characters = [symbol for symbol in text[0]]
print("Caractères:", characters)

# Lay the characters out as a single column (one row per character) and
# one-hot encode them.
# NOTE: `encoder` is the OneHotEncoder instance created in the scikit-learn
# cell above; calling fit_transform here fits it again, this time on characters.
characters_array = np.array(characters).reshape(len(characters), 1)
one_hot_encoded_characters = encoder.fit_transform(characters_array)
print("One-hot Encoding des Caractères:\n", one_hot_encoded_characters)


Caractères: ['J', 'e', ' ', 'v', 'e', 'u', 'x', ' ', 'a', 'p', 'p', 'r', 'e', 'n', 'd', 'r', 'e', ' ', 'l', 'e', ' ', 'd', 'e', 'e', 'p', ' ', 'l', 'e', 'a', 'r', 'n', 'i', 'n', 'g']
One-hot Encoding des Caractères:
 [[0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]
 [1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0