In [1]:
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import json
import numpy as np
import random
import re
from collections import Counter
import os
import matplotlib.pyplot as plt
from unidecode import unidecode
from sklearn.metrics import precision_recall_fscore_support
from langdetect import detect, DetectorFactory
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import StratifiedShuffleSplit


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Configuration de l'authentification Google Cloud
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "credentials.json"

In [3]:
# Télécharger les ressources nécessaires de nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\cyrine\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

VISUALISATION DES DONNEES

In [4]:
# Charger les données
with open('final_dataset.json', 'r', encoding='utf-8') as file:
    data = json.load(file)
    

PRETRAITEMENT DU TEXTE

In [6]:
# Chargement du tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')




In [7]:
# Définir la graine pour la reproductibilité pour la detection de langues
DetectorFactory.seed = 0


In [8]:
# Fonction pour détecter la langue
def detect_language(text):
    try:
        return detect(text)
    except:
        return 'en'  # par défaut, l'anglais si la détection échoue
    

In [None]:
import re

def extract_hashtags(captions):
    hashtags = []
    for caption in captions:
        hashtags.extend(re.findall(r'#\w+', caption))
    return ' '.join(hashtags)


In [9]:
# Fonction pour supprimer les stop words en utilisant le tokenizer de BERT
def remove_stopwords(text, language):
    try:
        stop_words = set(stopwords.words(language))
    except:
        stop_words = set(stopwords.words('english'))
    
    # Utiliser le tokenizer de BERT pour diviser le texte en mots
    tokens = tokenizer.basic_tokenizer.tokenize(text)
    
    # Filtrer les stop words
    filtered_tokens = [token for token in tokens if token.lower() not in stop_words]
    
    return filtered_tokens

In [11]:
# Fonction pour nettoyer les tokens en supprimant les préfixes '##'
def clean_tokenized_text(tokenized_text):
    cleaned_tokens = []
    for token in tokenized_text:
        if token.startswith("##"):
            if len(cleaned_tokens) > 0:
                cleaned_tokens[-1] += token[2:]
        else:
            cleaned_tokens.append(token)
    return ' '.join(cleaned_tokens)

In [12]:
# Nouvelle fonction de prétraitement du texte
def preprocess_text(text):
    # Suppression des URLs
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    # Conversion des caractères spéciaux en caractères ASCII
    text = unidecode(text)
    # Normalisation des caractères en minuscules
    text = text.lower()
    # Extraction des hashtags
    hashtags = re.findall(r'#\w+', text)
    # Suppression des hashtags du texte
    text = re.sub(r'#\w+', '', text)
    # Suppression des caractères non alphabétiques et non numériques
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    # Détection de la langue
    language = detect_language(text)
    # Suppression des stop words en utilisant le tokenizer de BERT
    filtered_tokens = remove_stopwords(text, language)
    # Nettoyage des tokens
    cleaned_text = clean_tokenized_text(filtered_tokens)
    return cleaned_text, ' '.join(hashtags)


In [16]:
# Préparation des labels
labels = [user['interests'].split(',') for user in data]
print(labels)

[['Arts and music', 'Travel', 'Politics and social issues'], ['Arts and music', 'Live events'], ['Sports', 'Vehicles', 'Photography'], ['Shopping and fashion', ' Travel'], ['Shopping and fashion', 'Sports'], ['Arts and music', 'Politics and social issues', 'Shopping and Fashion'], ['Family and relationships', 'Pets', 'Travel'], ['Sports', 'Photography', 'Travel'], ['Sports', ' Shopping and fashion'], ['Sports', 'Pets', 'Photography'], ['Travel', 'Photography', 'Sports'], ['Family and relationships', 'Acting'], ['Travel', 'Photography'], ['Shopping and fashion', 'Travel'], ['Vehicles', 'Sports'], ['Arts and music', 'Acting'], ['Sports', ' Vehicles', ' Family and relationships'], ['Arts and music', 'Acting'], ['Arts and music', 'Acting'], ['Arts and music', 'Politics and social issues'], ['Beauty', 'Travel'], ['Shopping and Fashion'], ['Sports', 'Family and relationships', 'Politics and social issues'], ['Arts and music', 'Politics and social issues'], ['Food and drink', 'Travel'], ['Art

In [17]:
def clean_labels(label_list):
    return set(label.strip().lower() for label in label_list)
    

In [18]:
cleaned_labels = [clean_labels(label) for label in labels]
mlb = MultiLabelBinarizer()
binary_labels = mlb.fit_transform(cleaned_labels)
