In [4]:
# Import Tokenizer and TextVectorization from TensorFlow/Keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import TextVectorization
from tensorflow.keras.preprocessing.sequence import pad_sequences  

In [2]:
# Fit the Tokenizer on phrases
phrases = ["i love my dog", "i love my cat", "you love my dog!" , "Do you think my dog is amazing?"]
tokenizer = Tokenizer(num_words=100)
tokenizer.fit_on_texts(phrases)

# Display the word index
print("Word Index:", tokenizer.word_index)
print("\nSequences:")
sequences = tokenizer.texts_to_sequences(phrases)
for phrase, seq in zip(phrases, sequences):
    print(f"{phrase} -> {seq}")

Word Index: {'my': 1, 'love': 2, 'dog': 3, 'i': 4, 'you': 5, 'cat': 6, 'do': 7, 'think': 8, 'is': 9, 'amazing': 10}

Sequences:
i love my dog -> [4, 2, 1, 3]
i love my cat -> [4, 2, 1, 6]
you love my dog! -> [5, 2, 1, 3]
Do you think my dog is amazing? -> [7, 5, 8, 1, 3, 9, 10]


In [7]:
padded=pad_sequences(sequences, maxlen=5)
print("Word Index:", tokenizer.word_index)
print("\nPadded Sequences:")
print(padded)

Word Index: {'my': 1, 'love': 2, 'dog': 3, 'i': 4, 'you': 5, 'cat': 6, 'do': 7, 'think': 8, 'is': 9, 'amazing': 10}

Padded Sequences:
[[ 0  4  2  1  3]
 [ 0  4  2  1  6]
 [ 0  5  2  1  3]
 [ 8  1  3  9 10]]


In [9]:
new_phrases = ["my dog loves my computer",]
new_sequences = tokenizer.texts_to_sequences(new_phrases)
print("\nNew Sequences:")
print(new_sequences)


New Sequences:
[[1, 3, 1]]


In [12]:
good_phrases = [
    "J'adore ce produit",
    "Très bon service",
    "Je suis très satisfait",
    "Excellente qualité",
    "Je recommande vivement",
    "C'est parfait",
    "Super expérience",
    "Service client génial",
    "Produit conforme à la description",
    "Très heureux de mon achat",
]

bad_phrases = [
    "Je suis très déçu",
    "Mauvaise qualité",
    "Ça ne marche pas",
    "Service client lamentable",
    "Ne recommande pas",
    "Très mauvaise expérience",
    "Produit cassé",
    "Perte de temps",
    "C'était nul",
    "Arnaque totale",
]

db_sentences = good_phrases + bad_phrases
db_labels = ['bon'] * len(good_phrases) + ['mauvais'] * len(bad_phrases)
#create df from sentences and labels
import pandas as pd
df = pd.DataFrame({'phrase': db_sentences, 'label': db_labels})
print(df.head())


                   phrase label
0      J'adore ce produit   bon
1        Très bon service   bon
2  Je suis très satisfait   bon
3      Excellente qualité   bon
4  Je recommande vivement   bon


In [14]:
# Fit tokenizer on the database phrases
# Add oov token to handle out-of-vocabulary words
tokenizer_db = Tokenizer(num_words=100, oov_token="<OOV>")
tokenizer_db.fit_on_texts(db_sentences)

print("Tokenizer fitted on database phrases")
print(f"Total unique words in database: {len(tokenizer_db.word_index)}")
print(f"OOV token index: {tokenizer_db.word_index.get('<OOV>')}")
print(f"\nWord Index (first 20 words):")
sorted_words = sorted(tokenizer_db.word_index.items(), key=lambda x: x[1])[:20]
for word, idx in sorted_words:
    print(f"  {word}: {idx}")

# Convert all phrases to sequences
db_sequences = tokenizer_db.texts_to_sequences(db_sentences)
db_padded = pad_sequences(db_sequences, maxlen=15)

print(f"\nDatabase sequences shape: {db_padded.shape}")
print(f"First 3 padded sequences:\n{db_padded[:3]}")

# create a df for logistic regression model: vecteur et label
# features will be the padded sequences (flattened) and target is numeric label
import numpy as np
X = db_padded.astype('float32')
y = np.array([1 if label == 'bon' else 0 for label in db_labels])

# Create a DataFrame with features for exploration if you like
feature_cols = [f'tok_{i}' for i in range(X.shape[1])]
import pandas as pd
X_df = pd.DataFrame(X, columns=feature_cols)
df_lr = X_df.copy()
df_lr['label'] = y
print(df_lr.head())

Tokenizer fitted on database phrases
Total unique words in database: 42
OOV token index: 1

Word Index (first 20 words):
  <OOV>: 1
  très: 2
  produit: 3
  service: 4
  je: 5
  suis: 6
  qualité: 7
  recommande: 8
  expérience: 9
  client: 10
  de: 11
  mauvaise: 12
  ne: 13
  pas: 14
  j'adore: 15
  ce: 16
  bon: 17
  satisfait: 18
  excellente: 19
  vivement: 20

Database sequences shape: (20, 15)
First 3 padded sequences:
[[ 0  0  0  0  0  0  0  0  0  0  0  0 15 16  3]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  2 17  4]
 [ 0  0  0  0  0  0  0  0  0  0  0  5  6  2 18]]
   tok_0  tok_1  tok_2  tok_3  tok_4  tok_5  tok_6  tok_7  tok_8  tok_9  \
0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0   
1    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0   
2    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0   
3    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0   
4    0.0    0.0    0.0    0.0    0.0    0.0    0.0

In [15]:
# Classify new phrases using the fitted tokenizer
new_test_phrases = [
    "J'adore ce service, excellent!",
    "C'est horrible, très mauvais",
    "Super produit, je suis satisfait",
    "Arnaque, c'est cassé",
    "Très bon, je recommande",
]


In [18]:
# Train and evaluate logistic regression on padded token sequences
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)
print(f"Train shape: {X_train.shape}, Test shape: {X_test.shape}")

# Fit logistic regression
clf = LogisticRegression(max_iter=500)
clf.fit(X_train, y_train)

# Evaluate
y_pred = clf.predict(X_test)
print("\nClassification report:")
print(classification_report(y_test, y_pred, target_names=['mauvais','bon']))
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion matrix:\n", confusion_matrix(y_test, y_pred))

# Save model for reuse (optional)
import joblib
joblib.dump(clf, 'logreg_token_model.joblib')

# Use the model for predicting the training set for quick sanity check
train_pred = clf.predict(X_train)
print('\nTrain accuracy:', accuracy_score(y_train, train_pred))

Train shape: (16, 15), Test shape: (4, 15)

Classification report:
              precision    recall  f1-score   support

     mauvais       0.67      1.00      0.80         2
         bon       1.00      0.50      0.67         2

    accuracy                           0.75         4
   macro avg       0.83      0.75      0.73         4
weighted avg       0.83      0.75      0.73         4

Accuracy: 0.75
Confusion matrix:
 [[2 0]
 [1 1]]

Train accuracy: 0.625


In [20]:
new_sequences = tokenizer_db.texts_to_sequences(new_test_phrases)
new_padded = pad_sequences(new_sequences, maxlen=15)

preds = clf.predict(new_padded)
probs = clf.predict_proba(new_padded)
label_map = {1: 'bon', 0: 'mauvais'}

print('\nPredictions for new phrases:')
for phrase, seq, padded_seq, pred, prob in zip(new_test_phrases, new_sequences, new_padded, preds, probs):
    print('\nPhrase:', phrase)
    print('Sequence:', seq)
    print('Padded:', padded_seq)
    print('Predicted label:', label_map[pred], f' (prob: {np.round(prob.max(), 3)})')
    print('---')


Predictions for new phrases:

Phrase: J'adore ce service, excellent!
Sequence: [15, 16, 4, 1]
Padded: [ 0  0  0  0  0  0  0  0  0  0  0 15 16  4  1]
Predicted label: bon  (prob: 0.619)
---

Phrase: C'est horrible, très mauvais
Sequence: [21, 1, 2, 1]
Padded: [ 0  0  0  0  0  0  0  0  0  0  0 21  1  2  1]
Predicted label: bon  (prob: 0.699)
---

Phrase: Super produit, je suis satisfait
Sequence: [23, 3, 5, 6, 18]
Padded: [ 0  0  0  0  0  0  0  0  0  0 23  3  5  6 18]
Predicted label: bon  (prob: 1.0)
---

Phrase: Arnaque, c'est cassé
Sequence: [41, 21, 36]
Padded: [ 0  0  0  0  0  0  0  0  0  0  0  0 41 21 36]
Predicted label: mauvais  (prob: 0.893)
---

Phrase: Très bon, je recommande
Sequence: [2, 17, 5, 8]
Padded: [ 0  0  0  0  0  0  0  0  0  0  0  2 17  5  8]
Predicted label: bon  (prob: 0.562)
---
