In [None]:
# Pin the transformers release this notebook was run with so a fresh run resolves
# the same API; %pip installs into the environment of the running kernel.
%pip install transformers==4.34.1

Collecting transformers
  Downloading transformers-4.34.1-py3-none-any.whl (7.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m24.4 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers)
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m54.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m64.3 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Downloading huggingface_hub-0.17.3-py3-none-any.whl (295 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m295.0/295.0 kB[0m [31m30.9 MB/s[0m eta [36m0:00:00[0m
Ins

In [None]:
import pandas as pd

# Read the CSV file into a DataFrame
csv_file_path = "/content/drive/MyDrive/tweet-sentiment-extraction.csv"
df = pd.read_csv(filepath_or_buffer=csv_file_path)

# Show the column names of the DataFrame
print(df.columns)

Index(['textID', 'text', 'selected_text', 'sentiment'], dtype='object')


In [None]:
# Authenticate with the Hugging Face Hub (interactive token prompt); the
# credentials are needed further down, where the model and tokenizer are pushed.
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|
    
    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Token: 
Add token as git credential? (Y/n) y
Token is valid (permission: write).
[1m[31mCannot authenticate through git-credential as no helper is defined on your machine.
You might have to re-authenticate when pushing to the Hugging Face Hub.
Run the following command in your terminal in case you want to set the '

In [None]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
import os
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Load the data from the CSV file. An empty CSV cell is read as NaN, which is
# neither a usable tweet nor a valid sentiment label, so such rows are dropped.
data = pd.read_csv('/content/drive/MyDrive/tweet-sentiment-extraction.csv').dropna(subset=['text', 'sentiment'])

# Extract the texts and the sentiment labels
texts = data['text'].tolist()
sentiments = data['sentiment'].tolist()

# Split the data into training and test sets (fixed random_state => same split every run)
texts_train, texts_test, sentiments_train, sentiments_test = train_test_split(texts, sentiments, test_size=0.2, random_state=42)

# Seed torch as well, so that randomly initialised weights and any shuffling
# driven by torch's global generator are repeatable across runs.
torch.manual_seed(42)

# Load a pre-trained BERT model and its tokenizer
model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=3)  # 3 classes: positive, negative, neutral

# Custom dataset definition
class CustomDataset(Dataset):
    """Torch dataset pairing tokenized texts with integer sentiment labels.

    Args:
        texts: indexable collection of raw texts (one per example).
        sentiments: indexable collection of sentiment names, aligned with
            ``texts``; each must be a key of ``LABEL_MAP``.
        tokenizer: callable invoked as
            ``tokenizer(text, truncation=True, padding='max_length',
            max_length=..., return_tensors='pt')`` and returning a mapping
            with ``'input_ids'`` and ``'attention_mask'`` tensors.
        max_length: length every example is padded/truncated to.
    """

    # Sentiment name -> class index. Defined once at class level instead of
    # being rebuilt on every item fetch.
    LABEL_MAP = {'positive': 0, 'negative': 1, 'neutral': 2}

    def __init__(self, texts, sentiments, tokenizer, max_length=128):
        self.texts = texts
        self.sentiments = sentiments
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of examples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize example ``idx`` and return its tensors.

        Returns:
            dict with ``'input_ids'`` and ``'attention_mask'`` flattened to
            1-D, and ``'labels'`` as a scalar ``torch.long`` tensor.

        Raises:
            KeyError: if the example's sentiment is not in ``LABEL_MAP``.
        """
        raw_text = self.texts[idx]
        # A missing value (None, or the float NaN that marks an empty cell) is
        # treated as an empty text rather than being tokenized as the literal
        # word "None"/"nan". NaN is the only float that is not equal to itself.
        is_missing = raw_text is None or (isinstance(raw_text, float) and raw_text != raw_text)
        text = "" if is_missing else str(raw_text)

        label = self.sentiments[idx]
        try:
            label_id = self.LABEL_MAP[label]
        except KeyError:
            raise KeyError(
                f"Unknown sentiment label {label!r} at index {idx}; expected one of {sorted(self.LABEL_MAP)}"
            ) from None

        encoding = self.tokenizer(text, truncation=True, padding='max_length', max_length=self.max_length, return_tensors='pt')
        return {
            # The tokenizer returns a leading batch axis of size 1; flatten it away.
            'input_ids': encoding['input_ids'].view(-1),
            'attention_mask': encoding['attention_mask'].view(-1),
            'labels': torch.tensor(label_id, dtype=torch.long)
        }

# Build the datasets and the dataloaders
train_dataset = CustomDataset(texts_train, sentiments_train, tokenizer)
test_dataset = CustomDataset(texts_test, sentiments_test, tokenizer)
batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

# Training configuration
num_epochs = 3
lr = 2e-5
# Use PyTorch's own AdamW: the implementation shipped with transformers is
# deprecated. eps and weight_decay are passed explicitly with the values the
# transformers optimizer used by default, so the optimisation itself is unchanged.
optimizer = torch.optim.AdamW(model.parameters(), lr=lr, eps=1e-6, weight_decay=0.0)
# No warmup; the schedule spans one scheduler step per training batch over all epochs.
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=len(train_loader) * num_epochs)

# Training function
def train(model, train_loader, optimizer, scheduler, device):
    """Run one pass over ``train_loader`` and return the mean batch loss.

    The model is switched to training mode. For each batch the tensors under
    ``'input_ids'``, ``'attention_mask'`` and ``'labels'`` are moved to
    ``device``, gradients are cleared, the forward pass is run with labels so
    the output carries a loss, the loss is back-propagated, and the optimizer
    and then the scheduler are stepped.

    Returns:
        float: sum of the per-batch losses divided by ``len(train_loader)``.
    """
    model.train()
    tensor_keys = ('input_ids', 'attention_mask', 'labels')
    running_loss = 0.0
    for batch in train_loader:
        ids, mask, targets = (batch[key].to(device) for key in tensor_keys)

        optimizer.zero_grad()
        batch_loss = model(input_ids=ids, attention_mask=mask, labels=targets).loss
        batch_loss.backward()
        optimizer.step()
        scheduler.step()  # the schedule advances once per batch, not once per epoch
        running_loss += batch_loss.item()
    n_batches = len(train_loader)
    return running_loss / n_batches

# Evaluation function
def evaluate(model, test_loader, device):
    """Compute the accuracy of ``model`` over every batch in ``test_loader``.

    The model is switched to eval mode and run without gradient tracking. For
    each batch the predicted class is the argmax of the logits along dim 1;
    predictions and the batch's ``'labels'`` are collected on the CPU and
    compared with ``accuracy_score``.

    Returns:
        The value returned by ``accuracy_score(labels, predictions)``.
    """
    model.eval()
    gold, predicted = [], []
    with torch.no_grad():
        for batch in test_loader:
            ids = batch['input_ids'].to(device)
            mask = batch['attention_mask'].to(device)
            targets = batch['labels'].to(device)

            logits = model(input_ids=ids, attention_mask=mask).logits
            batch_predictions = logits.argmax(dim=1)
            gold.extend(targets.cpu().numpy())
            predicted.extend(batch_predictions.cpu().numpy())
    return accuracy_score(gold, predicted)

# Train the model
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
for epoch in range(num_epochs):
    loss_train = train(model, train_loader, optimizer, scheduler, device)
    accuracy_test = evaluate(model, test_loader, device)
    print(f"Epoch {epoch + 1}/{num_epochs}: Train Loss {loss_train:.4f}, Test Accuracy {accuracy_test:.4f}")

# Store the class names in the model config so the saved/published model
# reports sentiment names rather than anonymous class indices. This must stay
# in sync with the label mapping used by CustomDataset.
label2id = {'positive': 0, 'negative': 1, 'neutral': 2}
model.config.label2id = label2id
model.config.id2label = {class_id: name for name, class_id in label2id.items()}

# Save the model and tokenizer locally
os.makedirs("tweet_sentiment_model", exist_ok=True)
model.save_pretrained("tweet_sentiment_model")
tokenizer.save_pretrained("tweet_sentiment_model")
# Name of the model on the Hub
model_name_on_hub = "Fatimata/tweet_sentiment_model"

# Push the model to the Hugging Face Model Hub
model.push_to_hub(model_name_on_hub, use_temp_dir=True)  # use_temp_dir=True stages the files in a temporary directory before uploading

# Push the tokenizer
tokenizer.push_to_hub(model_name_on_hub)

# Print a confirmation message
print(f"Modèle {model_name_on_hub} a été poussé avec succès sur Hugging Face Model Hub.")

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3: Train Loss 0.5971, Test Accuracy 0.7912
Epoch 2/3: Train Loss 0.4180, Test Accuracy 0.7937
Epoch 3/3: Train Loss 0.3144, Test Accuracy 0.7939


pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Modèle Fatimata/tweet_sentiment_model a été poussé avec succès sur Hugging Face Model Hub.
