In [1]:
import os
os.environ['TF_GPU_ALLOCATOR']="cuda_malloc_async"

import pandas as pd
import numpy as np
import tensorflow as tf
import tensorflow_hub as hub
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
from transformers import BertTokenizer, TFBertModel
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import classification_report, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
import nltk


2024-11-05 14:35:00.028767: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1730813700.122664  203716 cuda_dnn.cc:8498] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1730813700.149481  203716 cuda_blas.cc:1410] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-05 14:35:00.375096: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
print(tf.__version__)
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

2.19.0-dev20241104
Num GPUs Available:  1


In [4]:
# Chargement des données
data = pd.read_csv('data/monthly_best_1000_cleaned.csv', sep=";")

data = data.sample(50000)

# Suppression des lignes avec des valeurs manquantes
data.dropna(subset=['Body', 'Tags'], inplace=True)

# Conversion des tags en listes
data['Tags'] = data['Tags'].apply(lambda x: x.split(','))

# Filtrage des tags rares
from collections import Counter
tag_counts = Counter(tag for tags in data['Tags'] for tag in tags)
min_tag_frequency = 50
frequent_tags = {tag for tag, count in tag_counts.items() if count >= min_tag_frequency}
data['Tags'] = data['Tags'].apply(lambda tags: [tag for tag in tags if tag in frequent_tags])
data = data[data['Tags'].map(len) > 0]

# Binarisation des tags
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(data['Tags'])

# Séparation en ensembles d'entraînement et de test
X_train_text, X_test_text, y_train, y_test = train_test_split(
    data['Body'], y, test_size=0.2, random_state=42)


In [5]:
nltk.download('punkt')
from nltk.tokenize import word_tokenize

# Fonction de prétraitement
def preprocess_text(text):
    tokens = word_tokenize(text.lower())
    return tokens

# Application du prétraitement
X_train_tokens = X_train_text.apply(preprocess_text)
X_test_tokens = X_test_text.apply(preprocess_text)


[nltk_data] Downloading package punkt to /home/clement/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Word2Vec

In [6]:
# Entraînement du modèle Word2Vec
w2v_model = Word2Vec(sentences=X_train_tokens, vector_size=300, window=5, min_count=5, workers=4)

# Fonction pour obtenir l'embedding moyen d'un texte
def get_w2v_embedding(tokens):
    vectors = [w2v_model.wv[word] for word in tokens if word in w2v_model.wv]
    if vectors:
        return np.mean(vectors, axis=0)
    else:
        return np.zeros(w2v_model.vector_size)

# Obtention des embeddings pour les ensembles d'entraînement et de test
X_train_w2v = np.array([get_w2v_embedding(tokens) for tokens in X_train_tokens])
X_test_w2v = np.array([get_w2v_embedding(tokens) for tokens in X_test_tokens])


In [10]:
import mlflow
import mlflow.sklearn

# Création du modèle de régression logistique
from sklearn.multiclass import OneVsRestClassifier

# Comme nous avons un problème de classification multiclasse multilabel, nous utilisons OneVsRestClassifier
log_reg_w2v = OneVsRestClassifier(LogisticRegression(max_iter=1000, random_state=42))

mlflow.set_tracking_uri('http://0.0.0.0:5000')
experiment = mlflow.set_experiment("Word2Vec")
print("mlflow tracking uri:", mlflow.tracking.get_tracking_uri())


with  mlflow.start_run(experiment_id=experiment.experiment_id):

  # Entraînement du modèle
  log_reg_w2v.fit(X_train_w2v, y_train)

  # Prédictions
  y_pred_w2v = log_reg_w2v.predict(X_test_w2v)

  # Évaluation
  f1_w2v = f1_score(y_test, y_pred_w2v, average='micro')
  print(f"Score F1 (micro) avec Word2Vec : {f1_w2v:.4f}")
  
  mlflow.log_param("f1 micro", f1_w2v)


2024/11/05 14:59:27 INFO mlflow.tracking.fluent: Experiment with name 'Word2Vec' does not exist. Creating a new experiment.


mlflow tracking uri: http://0.0.0.0:5000


2024/11/05 15:00:46 INFO mlflow.tracking._tracking_service.client: 🏃 View run rambunctious-dog-241 at: http://0.0.0.0:5000/#/experiments/837592451353230536/runs/2afca1f7cb184626a05b410c2610922d.
2024/11/05 15:00:46 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://0.0.0.0:5000/#/experiments/837592451353230536.


Score F1 (micro) avec Word2Vec : 0.4282


## Exemple de prédiction

In [8]:
# Texte d'exemple
example_text = "How can I implement a neural network in Python?"

# Prétraitement
example_tokens = preprocess_text(example_text)

# Obtention de l'embedding
example_embedding = get_w2v_embedding(example_tokens).reshape(1, -1)

# Prédiction des probabilités
y_prob_w2v = log_reg_w2v.predict_proba(example_embedding)[0]

# Obtenir les tags avec les probabilités les plus élevées
top_n = 5  # Nombre de tags à afficher
top_indices = y_prob_w2v.argsort()[-top_n:][::-1]
top_tags = mlb.classes_[top_indices]
top_probs = y_prob_w2v[top_indices]

# Affichage des tags les plus pertinents
print("Texte d'exemple :", example_text)
print("\nTags les plus probables (Word2Vec) :")
for tag, prob in zip(top_tags, top_probs):
    print(f"- {tag}: {prob:.4f}")

Texte d'exemple : How can I implement a neural network in Python?

Tags les plus probables (Word2Vec) :
- python: 0.9999
- tensorflow: 0.9885
- keras: 0.9506
- deep-learning: 0.9247
- kotlin-coroutines: 0.8944


# Universal Sentence Encoder

In [7]:
# Chargement du modèle USE
use_model = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

# Fonction pour obtenir les embeddings USE
def get_use_embeddings(texts):
    return use_model(texts).numpy()

# Obtention des embeddings
X_train_use = get_use_embeddings(X_train_text.tolist())
X_test_use = get_use_embeddings(X_test_text.tolist())


In [8]:
# Création du modèle de régression logistique
log_reg_use = OneVsRestClassifier(LogisticRegression(max_iter=1000, random_state=42))

# Entraînement du modèle
log_reg_use.fit(X_train_use, y_train)

# Prédictions
y_pred_use = log_reg_use.predict(X_test_use)

# Évaluation
f1_use = f1_score(y_test, y_pred_use, average='micro')
print(f"Score F1 (micro) avec USE : {f1_use:.4f}")


Score F1 (micro) avec USE : 0.3527


## Exemple de prédiction

In [9]:
# Texte d'exemple
example_text = "How can I implement a neural network in Python?"

# Obtention de l'embedding
example_embedding_use = get_use_embeddings([example_text])

# Prédiction des probabilités
y_prob_use = log_reg_use.predict_proba(example_embedding_use)[0]

# Obtenir les tags avec les probabilités les plus élevées
top_n = 5  # Nombre de tags à afficher
top_indices = y_prob_use.argsort()[-top_n:][::-1]
top_tags = mlb.classes_[top_indices]
top_probs = y_prob_use[top_indices]

# Affichage des tags les plus pertinents
print("\nTags les plus probables (USE) :")
for tag, prob in zip(top_tags, top_probs):
    print(f"- {tag}: {prob:.4f}")


Tags les plus probables (USE) :
- python: 0.9833
- python-3.x: 0.0762
- c++: 0.0335
- github: 0.0319
- amazon-web-services: 0.0257


# BERT

In [4]:
# Chargement du tokenizer et du modèle BERT
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = TFBertModel.from_pretrained('bert-base-uncased')

# Fonction pour obtenir les embeddings BERT
def get_bert_embeddings(texts):
    inputs = bert_tokenizer(texts, return_tensors='tf', padding=True, truncation=True, max_length=128)
    outputs = bert_model(inputs)
    embeddings = outputs.last_hidden_state[:, 0, :].numpy()
    return embeddings

# Obtention des embeddings
X_train_bert = get_bert_embeddings(X_train_text.to_list())
X_test_bert = get_bert_embeddings(X_test_text.to_list())

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT

In [5]:
# Création du modèle de régression logistique
log_reg_bert = OneVsRestClassifier(LogisticRegression(max_iter=1000, random_state=42))

# Entraînement du modèle
log_reg_bert.fit(X_train_bert, y_train)

# Prédictions
y_pred_bert = log_reg_bert.predict(X_test_bert)

# Évaluation
f1_bert = f1_score(y_test, y_pred_bert, average='micro')
print(f"Score F1 (micro) avec BERT : {f1_bert:.4f}")

Score F1 (micro) avec BERT : 0.4115


## Exemple de prédiction

In [6]:
# Texte d'exemple
example_text = "How can I implement a neural network in Python?"

# Obtention de l'embedding
# Pass the example text as a single-element list
example_embedding_bert = get_bert_embeddings([example_text])

# Prédiction des probabilités
y_prob_bert = log_reg_bert.predict_proba(example_embedding_bert)[0]

# Obtenir les tags avec les probabilités les plus élevées
top_n = 5  # Nombre de tags à afficher
top_indices = y_prob_bert.argsort()[-top_n:][::-1]
top_tags = mlb.classes_[top_indices]
top_probs = y_prob_bert[top_indices]

# Affichage des tags les plus pertinents
print("\nTags les plus probables (BERT) :")
for tag, prob in zip(top_tags, top_probs):
    print(f"- {tag}: {prob:.4f}")


Tags les plus probables (BERT) :
- python: 0.9280
- python-3.x: 0.3438
- c: 0.1322
- docker: 0.0907
- kotlin: 0.0778


# Conclusion

In [13]:
# Affichage des scores F1
print("Scores F1 (micro) :")
print(f"- Word2Vec : {f1_w2v:.4f}")
print(f"- USE      : {f1_use:.4f}")
print(f"- BERT     : {f1_bert:.4f}")

Scores F1 (micro) :
- Word2Vec : 0.1480
- USE      : 0.3527
- BERT     : 0.4115
