In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
import torch
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm
import gc

from transformers import DistilBertTokenizer, DistilBertForSequenceClassification

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Cargar los datos
print("Cargando datos...")
df = pd.read_csv('./data/raw/climateTwitterData.csv')

# Inicializar y ajustar el vectorizador con todos los textos
print("Ajustando el vectorizador...")
vectorizer = TfidfVectorizer(max_features=5000)
all_texts = df['text'].fillna('')  # Asegurarse de que no hay valores nulos
vectorizer.fit(all_texts)

Cargando datos...


  df = pd.read_csv('./data/raw/climateTwitterData.csv')


Ajustando el vectorizador...


In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Cargar el modelo
loaded_model = DistilBertForSequenceClassification.from_pretrained('./data/bert_model_sentiment_analysis')

# Mover el modelo cargado al dispositivo adecuado
loaded_model = loaded_model.to(device)

print("Modelo cargado correctamente")

tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Función para usar el modelo cargado
def predict_with_loaded_model(texts, batch_size=8):
    loaded_model.eval()
    predictions = []
    
    for i in tqdm(range(0, len(texts), batch_size), desc="Procesando lotes"):
        batch_texts = texts[i:i+batch_size]
        with torch.no_grad():
            inputs = tokenizer(batch_texts, return_tensors="pt", padding=True, truncation=True, max_length=128)
            inputs = {k: v.to(device) for k, v in inputs.items()}
            outputs = loaded_model(**inputs)
            batch_predictions = torch.argmax(outputs.logits, dim=1)
            predictions.extend(batch_predictions.cpu().numpy())
        
        # Liberar memoria
        del inputs, outputs, batch_predictions
        torch.cuda.empty_cache()
        gc.collect()
    
    return np.array(predictions)

Modelo cargado correctamente


In [12]:
# Predecir sentimientos para tweets sin etiquetas
print("Prediciendo sentimientos para tweets sin etiquetas...")
unlabeled_tweets = df[df['sentiment1'].isna()]
unlabeled_texts = unlabeled_tweets['text'].fillna('').tolist()

# Procesar en lotes más pequeños
batch_size = 8  # Puedes ajustar este valor según sea necesario
predicted_sentiments = predict_with_loaded_model(unlabeled_texts, batch_size)

# Actualizar el DataFrame con las predicciones
df.loc[df['sentiment1'].isna(), 'sentiment1'] = predicted_sentiments

print("Predicciones completadas y DataFrame actualizado.")

# Guardar el DataFrame actualizado
df.to_csv('./data/processed/climateTwitterData_sentiments_updated.csv', index=False)
print("DataFrame actualizado guardado en './data/processed/climateTwitterData_sentiments_updated.csv'")

Prediciendo sentimientos para tweets sin etiquetas...


Procesando lotes: 100%|██████████| 5301/5301 [14:49<00:00,  5.96it/s]


Predicciones completadas y DataFrame actualizado.
DataFrame actualizado guardado en './data/processed/climateTwitterData_sentiments_updated.csv'


In [15]:
display(df['sentiment1'].tail())

72400    0
72401    1
72402    1
72403    1
72404    1
Name: sentiment1, dtype: object

### Ahora ajustaremos el modelo para crear el modelo de clasificacion de topicos

In [5]:
print("Cargando datos con tópicos...")
df_topics = pd.read_csv('./data/processed/tweets_with_topics.csv')

# Preparar los datos
X = df_topics['processed_text']
y = df_topics['dominant_topic']

# Dividir los datos en conjuntos de entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Cargar el tokenizador y el modelo BERT pre-entrenado
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertForSequenceClassification.from_pretrained(
    './data/bert_model_sentiment_analysis', 
    num_labels=len(y.unique())
)
model.to(device)


Cargando datos con tópicos...


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [6]:
# Función para preparar los datos para BERT
def prepare_data_for_bert(texts, labels):
    encodings = tokenizer(texts.tolist(), truncation=True, padding=True, max_length=128)
    dataset = TensorDataset(
        torch.tensor(encodings['input_ids']),
        torch.tensor(encodings['attention_mask']),
        torch.tensor(labels.tolist())
    )
    return DataLoader(dataset, batch_size=16, shuffle=True)

# Preparar los datos
train_dataloader = prepare_data_for_bert(X_train, y_train)
test_dataloader = prepare_data_for_bert(X_test, y_test)

In [7]:
# Función de entrenamiento
def train_model(model, train_dataloader, epochs=3):
    optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
    model.train()
    for epoch in range(epochs):
        print(f"Epoch {epoch + 1}/{epochs}")
        for batch in tqdm(train_dataloader):
            optimizer.zero_grad()
            input_ids, attention_mask, labels = [b.to(device) for b in batch]
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()

# Entrenar el modelo
print("Entrenando el modelo para clasificación de tópicos...")
train_model(model, train_dataloader)

Entrenando el modelo para clasificación de tópicos...
Epoch 1/3


100%|██████████| 3621/3621 [07:44<00:00,  7.80it/s]


Epoch 2/3


100%|██████████| 3621/3621 [07:52<00:00,  7.66it/s]


Epoch 3/3


100%|██████████| 3621/3621 [08:12<00:00,  7.36it/s]


In [9]:
# Evaluar el modelo
print("Evaluando el modelo...")
model.eval()
predictions, true_labels = [], []
with torch.no_grad():
    for batch in tqdm(test_dataloader):
        input_ids, attention_mask, labels = [b.to(device) for b in batch]
        outputs = model(input_ids, attention_mask=attention_mask)
        preds = torch.argmax(outputs.logits, dim=1)
        predictions.extend(preds.cpu().tolist())
        true_labels.extend(labels.cpu().tolist())

print(classification_report(true_labels, predictions))

Evaluando el modelo...


100%|██████████| 906/906 [00:37<00:00, 23.89it/s]

              precision    recall  f1-score   support

           0       0.97      0.99      0.98      8817
           1       0.98      0.96      0.97      5664

    accuracy                           0.98     14481
   macro avg       0.98      0.97      0.97     14481
weighted avg       0.98      0.98      0.98     14481






In [10]:
# Guardar el modelo
print("Guardando el modelo de clasificación de tópicos...")
model.save_pretrained('./data/bert_topic_classification_model')

Guardando el modelo de clasificación de tópicos...


In [11]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Usando dispositivo: {device}")

# Cargar el tokenizador y el modelo
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertForSequenceClassification.from_pretrained('./data/bert_topic_classification_model')
model.to(device)
model.eval()

Usando dispositivo: cuda


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [16]:
# Función para clasificar tweets por tópico
def classify_tweet_topic(model, tokenizer, tweet_text, device):
    inputs = tokenizer(tweet_text, return_tensors="pt", truncation=True, padding=True, max_length=128)
    inputs = {k: v.to(device) for k, v in inputs.items()}
    with torch.no_grad():
        outputs = model(**inputs)
        prediction = torch.argmax(outputs.logits, dim=1)
    return "Conceptos generales de cambio climático" if prediction.item() == 0 else "Acciones y movimientos climáticos"


# Tópico 0: "Conceptos generales de cambio climático"
# Tópico 1: "Acciones y movimientos climáticos"
# Lista de tweets de ejemplo
tweets_and_classifications = [
    ("Global temperatures have risen by 1.1°C since pre-industrial times, largely due to human activities.",
     "Conceptos generales de cambio climático"),
    ("Join us for the Global Climate Strike this Friday! We demand immediate action from our leaders. #FridaysForFuture",
     "Acciones y movimientos climáticos"),
    ("Melting glaciers and ice sheets are contributing to rising sea levels worldwide.",
     "Conceptos generales de cambio climático"),
    ("Sign our petition to urge the government to transition to 100% renewable energy by 2030. #ClimateAction",
     "Acciones y movimientos climáticos"),
    ("Deforestation in the Amazon rainforest is accelerating climate change and biodiversity loss.",
     "Conceptos generales de cambio climático"),
    ("Students are organizing a sit-in at city hall to protest inaction on climate policies. #YouthClimateStrike",
     "Acciones y movimientos climáticos"),
    ("The greenhouse effect is the natural process that warms the Earth's surface, but human activities are intensifying it.",
     "Conceptos generales de cambio climático"),
    ("Our community is hosting a climate awareness workshop next week. Learn how you can reduce your carbon footprint!",
     "Acciones y movimientos climáticos"),
    ("Coral reefs are dying due to ocean acidification, a direct result of increased CO2 in the atmosphere.",
     "Conceptos generales de cambio climático"),
    ("We're launching a tree-planting initiative to combat urban heat islands and absorb CO2. Volunteers needed!",
     "Acciones y movimientos climáticos")
]

# Clasificar los tweets de ejemplo
print("Evaluación del modelo:")
correct_predictions = 0
for i, (tweet, expected) in enumerate(tweets_and_classifications, 1):
    predicted = classify_tweet_topic(model, tokenizer, tweet, device)
    is_correct = predicted == expected
    correct_predictions += is_correct
    print(f"Tweet {i}:")
    print(f"Texto: '{tweet[:50]}...'")
    print(f"Predicción: {predicted}")
    print(f"Esperado: {expected}")
    print(f"Correcto: {'Sí' if is_correct else 'No'}\n")

accuracy = correct_predictions / len(tweets_and_classifications) * 100
print(f"Precisión del modelo: {accuracy:.2f}%")


Evaluación del modelo:
Tweet 1:
Texto: 'Global temperatures have risen by 1.1°C since pre-...'
Predicción: Conceptos generales de cambio climático
Esperado: Conceptos generales de cambio climático
Correcto: Sí

Tweet 2:
Texto: 'Join us for the Global Climate Strike this Friday!...'
Predicción: Acciones y movimientos climáticos
Esperado: Acciones y movimientos climáticos
Correcto: Sí

Tweet 3:
Texto: 'Melting glaciers and ice sheets are contributing t...'
Predicción: Conceptos generales de cambio climático
Esperado: Conceptos generales de cambio climático
Correcto: Sí

Tweet 4:
Texto: 'Sign our petition to urge the government to transi...'
Predicción: Conceptos generales de cambio climático
Esperado: Acciones y movimientos climáticos
Correcto: No

Tweet 5:
Texto: 'Deforestation in the Amazon rainforest is accelera...'
Predicción: Conceptos generales de cambio climático
Esperado: Conceptos generales de cambio climático
Correcto: Sí

Tweet 6:
Texto: 'Students are organizing a sit-in at ci