In [None]:
# Label convention: 1 = hate speech, 0 = no hate speech

In [None]:
import zipfile
import os

# Paths of the directory
# The project root is the "proyecto" folder inside the current working directory;
# figures are stored under <root>/images and datasets under <root>/datasets.
PROJECT_ROOT_DIR = os.getcwd()
PROJECT_ROOT_DIR = os.path.join(PROJECT_ROOT_DIR, "proyecto")
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images")
DATA_PATH = os.path.join(PROJECT_ROOT_DIR, "datasets")

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
# Global Matplotlib defaults: label size for the axes and for the x/y ticks
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Where to save the figures (created if missing; no error if it already exists)
os.makedirs(IMAGES_PATH, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    """Save the current Matplotlib figure under IMAGES_PATH.

    Args:
        fig_id: Base name of the output file (without extension).
        tight_layout: When true, `plt.tight_layout()` is applied before saving.
        fig_extension: File extension, also passed to `savefig` as the format.
        resolution: DPI passed to `savefig`.
    """
    file_name = ".".join((fig_id, fig_extension))
    destination = os.path.join(IMAGES_PATH, file_name)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(destination, dpi=resolution, format=fig_extension)

# Where to unzip data
os.makedirs(DATA_PATH, exist_ok=True)

def unzip_data(zipfile_name, directory_to_extract_to = DATA_PATH):
    """Extract every member of a zip archive into a directory.

    Args:
        zipfile_name: Archive path, joined onto PROJECT_ROOT_DIR (an absolute
            path is therefore used as given, per `os.path.join` semantics).
        directory_to_extract_to: Destination directory; defaults to DATA_PATH.
    """
    # 'r' is ZipFile's default mode, so the archive is opened read-only
    with zipfile.ZipFile(os.path.join(PROJECT_ROOT_DIR, zipfile_name)) as archive:
        archive.extractall(directory_to_extract_to)

# Extract the test archive first and then the training archive into the default data directory
for archive_name in ("pan21-author-profiling-test-2021-04-12.zip",
                     "pan21-author-profiling-training-2021-03-14.zip"):
    unzip_data(os.path.join(PROJECT_ROOT_DIR, archive_name))

In [None]:
# Data processing
#
# Walks DATA_PATH/<split>/<language>/<user>.xml, reads every user's tweets and label,
# and builds one DataFrame per split (training / test) that is also written to CSV.

import pandas as pd
import xml.etree.ElementTree as ET

# Extracted archive folder -> position inside L (0 = training set, 1 = test set)
SPLIT_DIRS = {
    'pan21-author-profiling-training-2021-03-14': 0,
    'pan21-author-profiling-test-2021-04-12': 1,
}

# A list of two dictionaries (column name -> list of values) from which the dataframes are created:
# the first component holds the training set and the second one the test set
L = [{'user' : [], 'tweets' : [], 'language' : [], 'class' : []} for _ in SPLIT_DIRS]
paths_list = [DATA_PATH]    # No longer used as a directory stack; kept defined (with its original final value) for any cell that still reads it

# os.listdir returns entries in arbitrary order, so every listing is sorted to make the row order reproducible
for dir1 in sorted(os.listdir(DATA_PATH)):
    if dir1 not in SPLIT_DIRS:    # Skips '.ipynb_checkpoints', the generated CSV files and anything else that is not a split folder
        continue
    set_type = SPLIT_DIRS[dir1]
    split_path = os.path.join(DATA_PATH, dir1)

    for dir2 in sorted(os.listdir(split_path)):
        language_path = os.path.join(split_path, dir2)
        # Hidden entries (e.g. '.ipynb_checkpoints') and stray files are not language folders
        if dir2.startswith('.') or not os.path.isdir(language_path):
            continue
        lng = 'english' if dir2 == "en" else 'spanish'

        for user in sorted(os.listdir(language_path)):
            # Only the per-user XML files are parsed; this skips 'truth.txt', '.ipynb_checkpoints' and any other non-XML entry
            if not user.lower().endswith('.xml'):
                continue
            root = ET.parse(os.path.join(language_path, user)).getroot()

            # The root attributes are a dictionary and the value of the key 'class' is a string with a '0' (no hate speech) or a '1' (yes hate speech)
            cls = int(root.attrib['class'])

            # Two nesting levels below the root; an element without text yields '' instead of None
            # so that later string operations on the tweets do not fail
            tweets = [document.text or '' for documents in root for document in documents]

            L[set_type]['user'].append(user)
            L[set_type]['tweets'].append(tweets)
            L[set_type]['language'].append(lng)
            L[set_type]['class'].append(cls)

training_set = pd.DataFrame(L[0])
test_set = pd.DataFrame(L[1])

# NOTE: the 'tweets' column holds Python lists, which are written to CSV as their string representation
training_set.to_csv(os.path.join(DATA_PATH, "training_set.csv"), index=False)
test_set.to_csv(os.path.join(DATA_PATH, "test_set.csv"), index=False)

In [None]:
training_set.head()

Unnamed: 0,user,tweets,language,class
0,d2e0f4f0244b9b8b3bbd8b1654be5b74.xml,[I’m just being me under construction #HASHTAG...,english,0
1,f30abea44b6c4144c0690f98459428a6.xml,"[i was locked up sending you roses, acabei de ...",english,0
2,d28b60028cf7bcf8a9f145193e261ecf.xml,[This crippled goob saw the “anti-Semitic plat...,english,1
3,78537787441ed0d11da43122f9b0520a.xml,[RT #USER#: EXCLUSIVE: 99% chance COVID-19 was...,english,1
4,30be488aa93e8000aadb952a9cd5143c.xml,"[#USER# Yes, yes she did, RT #USER#: What are ...",english,0


In [None]:
training_set.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   user      400 non-null    object
 1   tweets    400 non-null    object
 2   language  400 non-null    object
 3   class     400 non-null    int64 
dtypes: int64(1), object(3)
memory usage: 12.6+ KB


In [None]:
training_set['tweets']

0      [I’m just being me under construction #HASHTAG...
1      [i was locked up sending you roses, acabei de ...
2      [This crippled goob saw the “anti-Semitic plat...
3      [RT #USER#: EXCLUSIVE: 99% chance COVID-19 was...
4      [#USER# Yes, yes she did, RT #USER#: What are ...
                             ...                        
395    [#USER# De todas formas tu consejo y comentari...
396    [Pero el bar de pueblo donde van a jugar 10 vi...
397    [#USER# menudo repaso tu, #USER# #USER# yo tam...
398    [Auxilio, rasa. Me crucé por la marcha feminis...
399    [Trabajo de hoy 💗 #URL#, Los chavos de cares e...
Name: tweets, Length: 400, dtype: object

In [None]:
training_set['tweets'][0]

['I’m just being me under construction #HASHTAG# on the way #URL#',
 'one person followed me // automatically checked by #URL#',
 'one person followed me // automatically checked by #URL#',
 'one person followed me // automatically checked by #URL#',
 'one person followed me and one person unfollowed me // automatically checked by #URL#',
 'My birthday coming up and shit it’s lit',
 'one person unfollowed me // automatically checked by #URL#',
 'I don’t post on here so fuck it #URL#',
 'one person followed me // automatically checked by #URL#',
 'one person followed me // automatically checked by #URL#',
 'one person followed me // automatically checked by #URL#',
 'one person unfollowed me // automatically checked by #URL#',
 'one person followed me // automatically checked by #URL#',
 'one person followed me // automatically checked by #URL#',
 'one person unfollowed me // automatically checked by #URL#',
 'one person followed me // automatically checked by #URL#',
 'one person follo

In [None]:
import numpy as np
import datetime
import torch
import random
# import matplotlib.pyplot as plt
from torch.utils.tensorboard import SummaryWriter

plt.style.use('fivethirtyeight')

class torch_model(object):
    """Training and evaluation harness around a PyTorch model, loss function and optimizer.

    The instance keeps the data loaders, an optional TensorBoard writer, the
    per-epoch training/validation losses and the total number of epochs run,
    and offers helpers for training, checkpointing, prediction and plotting.
    """

    def __init__(self, model, loss_fn, optimizer):
        """Store the training components and move the model to the default device.

        Args:
            model: The model to train; it is sent to 'cuda' when available, else 'cpu'.
            loss_fn: Callable invoked as `loss_fn(predictions, targets)`.
            optimizer: Optimizer used to update the model's parameters.
        """
        # We start by storing the arguments as attributes
        # to use them later
        self.model = model
        self.loss_fn = loss_fn
        self.optimizer = optimizer
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        # Let's send the model to the specified device right away
        self.model.to(self.device)

        # These attributes are defined here, but since they are
        # not informed at the moment of creation, we keep them None
        self.train_loader = None
        self.val_loader = None
        self.writer = None

        # These attributes are going to be computed internally
        self.losses = []
        self.val_losses = []
        self.total_epochs = 0

        # Creates the train_step function for our model,
        # loss function and optimizer
        # Note: there are NO ARGS there! It makes use of the class
        # attributes directly
        self.train_step_fn = self._make_train_step_fn()
        # Creates the val_step function for our model and loss
        self.val_step_fn = self._make_val_step_fn()

    def to(self, device):
        """Select a different device and move the model to it.

        If moving the model raises RuntimeError, the device falls back to
        'cuda' when available (otherwise 'cpu') and a message is printed.
        """
        try:
            self.device = device
            self.model.to(self.device)
        except RuntimeError:
            self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
            print(f"Couldn't send it to {device}, sending it to {self.device} instead.")
            self.model.to(self.device)

    def set_loaders(self, train_loader, val_loader=None):
        """Register the training loader and, optionally, the validation loader."""
        self.train_loader = train_loader
        self.val_loader = val_loader

    def set_tensorboard(self, name, folder='runs'):
        """Create a SummaryWriter logging to `<folder>/<name>_<timestamp>`."""
        suffix = datetime.datetime.now().strftime('%Y%m%d%H%M%S')
        self.writer = SummaryWriter(f'{folder}/{name}_{suffix}')

    def _make_train_step_fn(self):
        """Build the function that performs one optimisation step on a mini-batch.

        The returned function reads self.model, self.loss_fn and self.optimizer
        at call time and returns the batch loss as a Python float.
        """
        def perform_train_step_fn(x, y):
            # Sets model to TRAIN mode
            self.model.train()

            # Step 1 - Computes our model's predicted output - forward pass
            yhat = self.model(x)
            # Step 2 - Computes the loss
            loss = self.loss_fn(yhat, y)
            # Step 3 - Computes gradients for the parameters involved in the loss
            loss.backward()
            # Step 4 - Updates parameters using the gradients, then clears them for the next step
            self.optimizer.step()
            self.optimizer.zero_grad()

            # Returns the loss
            return loss.item()

        # Returns the function that will be called inside the train loop
        return perform_train_step_fn

    def _make_val_step_fn(self):
        """Build the function that computes the loss of one mini-batch in eval mode."""
        def perform_val_step_fn(x, y):
            # Sets model to EVAL mode
            self.model.eval()

            # Step 1 - Computes our model's predicted output - forward pass
            yhat = self.model(x)
            # Step 2 - Computes the loss
            loss = self.loss_fn(yhat, y)
            # There is no need to compute Steps 3 and 4, since we don't update parameters during evaluation
            return loss.item()

        return perform_val_step_fn

    def _mini_batch(self, validation=False):
        """Run one pass over a loader and return the mean mini-batch loss.

        Args:
            validation: Selects the validation loader and step function when
                true, the training ones otherwise.

        Returns:
            The mean of the per-batch losses, or None when the selected loader
            has not been set.
        """
        if validation:
            data_loader = self.val_loader
            step_fn = self.val_step_fn
        else:
            data_loader = self.train_loader
            step_fn = self.train_step_fn

        if data_loader is None:
            return None

        mini_batch_losses = []
        for x_batch, y_batch in data_loader:
            # Batches are moved to the same device as the model
            x_batch = x_batch.to(self.device)
            y_batch = y_batch.to(self.device)

            mini_batch_loss = step_fn(x_batch, y_batch)
            mini_batch_losses.append(mini_batch_loss)

        loss = np.mean(mini_batch_losses)
        return loss

    def set_seed(self, seed=42):
        """Seed torch, NumPy and `random`, and request deterministic cuDNN behaviour."""
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False
        torch.manual_seed(seed)
        np.random.seed(seed)
        random.seed(seed)
        try:
            self.train_loader.sampler.generator.manual_seed(seed)
        except AttributeError:
            # No loader set yet, or its sampler has no usable generator: nothing else to seed
            pass

    def train(self, n_epochs, seed=42):
        """Train for `n_epochs` epochs, recording training and validation losses.

        Args:
            n_epochs: Number of additional epochs to run.
            seed: Seed passed to `set_seed` before the first epoch.
        """
        # To ensure reproducibility of the training process
        self.set_seed(seed)

        for _ in range(n_epochs):
            # Keeps track of the numbers of epochs
            # by updating the corresponding attribute
            self.total_epochs += 1

            # Performs training using mini-batches
            loss = self._mini_batch(validation=False)
            self.losses.append(loss)

            # VALIDATION
            # no gradients in validation!
            with torch.no_grad():
                # Performs evaluation using mini-batches (None when no validation loader is set)
                val_loss = self._mini_batch(validation=True)
                self.val_losses.append(val_loss)

            # If a SummaryWriter has been set...
            if self.writer:
                scalars = {'training': loss}
                if val_loss is not None:
                    scalars.update({'validation': val_loss})
                # Records both losses for each epoch under the main tag "loss".
                # The step is the 0-based index of the epoch over the whole life of the
                # object, so a later train() call (or training resumed from a checkpoint)
                # continues the curve instead of overwriting steps 0..n-1.
                self.writer.add_scalars(main_tag='loss',
                                        tag_scalar_dict=scalars,
                                        global_step=self.total_epochs - 1)

        if self.writer:
            # Closes the writer
            self.writer.close()

    def save_checkpoint(self, filename):
        """Save epoch count, model/optimizer state and loss histories to `filename`."""
        # Builds dictionary with all elements for resuming training
        checkpoint = {'epoch': self.total_epochs,
                      'model_state_dict': self.model.state_dict(),
                      'optimizer_state_dict': self.optimizer.state_dict(),
                      'loss': self.losses,
                      'val_loss': self.val_losses}

        torch.save(checkpoint, filename)

    def load_checkpoint(self, filename):
        """Restore the state written by `save_checkpoint` and switch to train mode."""
        # map_location lets a checkpoint saved on another device (e.g. a GPU)
        # be loaded onto the device this object is currently using.
        # SECURITY: weights_only=False unpickles arbitrary objects (needed here for
        # the stored loss histories) - only load checkpoint files you trust.
        checkpoint = torch.load(filename, map_location=self.device, weights_only=False)

        # Restore state for model and optimizer
        self.model.load_state_dict(checkpoint['model_state_dict'])
        self.optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

        self.total_epochs = checkpoint['epoch']
        self.losses = checkpoint['loss']
        self.val_losses = checkpoint['val_loss']

        self.model.train() # always use TRAIN for resuming training

    def predict(self, x):
        """Return the model output for `x` as a NumPy array.

        Args:
            x: Input accepted by `torch.as_tensor`; it is converted to float.

        Returns:
            The model's output, detached and copied to CPU, as a NumPy array.
        """
        # Set it to evaluation mode for predictions
        self.model.eval()
        # Takes a NumPy input and makes it a float tensor
        x_tensor = torch.as_tensor(x).float()
        # Send input to device and uses model for prediction;
        # no autograd graph is needed for inference
        with torch.no_grad():
            y_hat_tensor = self.model(x_tensor.to(self.device))
        # Set it back to train mode
        self.model.train()
        # Detaches it, brings it to CPU and back to Numpy
        return y_hat_tensor.detach().cpu().numpy()

    def plot_losses(self):
        """Plot the recorded losses on a log scale and return the figure."""
        fig = plt.figure(figsize=(10, 4))
        plt.plot(self.losses, label='Training Loss', c='b')
        # Epochs run without a validation loader store None; only real values are
        # drawn, so no empty validation curve or legend entry is produced
        val_points = [(epoch, val_loss)
                      for epoch, val_loss in enumerate(self.val_losses)
                      if val_loss is not None]
        if val_points:
            val_epochs, val_values = zip(*val_points)
            plt.plot(val_epochs, val_values, label='Validation Loss', c='r')
        plt.yscale('log')
        plt.xlabel('Epochs')
        plt.ylabel('Loss')
        plt.legend()
        plt.tight_layout()
        return fig

    def add_graph(self):
        """Log the model graph to TensorBoard using one training mini-batch.

        Does nothing unless both a training loader and a writer are set.
        """
        if self.train_loader and self.writer:
            x_sample, y_sample = next(iter(self.train_loader))
            self.writer.add_graph(self.model, x_sample.to(self.device))

    def count_parameters(self):
        """Return the total number of elements across the model's parameters."""
        return sum(p.numel() for p in self.model.parameters())

    def count_trainable_parameters(self):
        """Return the number of parameter elements that require gradients."""
        return sum(p.numel() for p in self.model.parameters() if p.requires_grad)

2025-05-21 10:46:01.535250: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [None]:
import copy
# Scratch experiment on list copy semantics: one element of a nested list is
# appended to a separate nested list.
prueba = [[1,2,3],[4,5,6]]
copia_prueba = [[],[]]
# Appends the object currently stored at prueba[0][0] (the int 1) to the first inner list
copia_prueba[0].append(prueba[0][0])

In [None]:
# Shows both lists, rebinds prueba[0][0] and shows them again: the element that
# was appended to copia_prueba is unaffected by the reassignment.
print(prueba)
print(copia_prueba)
prueba[0][0] = 'xd'
print(prueba)
print(copia_prueba)

[[1, 2, 3], [4, 5, 6]]
[[1], []]
[['xd', 2, 3], [4, 5, 6]]
[[1], []]


In [None]:
# Number of rows in the 'lc_text' column.
# NOTE(review): 'lc_text' is only created by the tokenization cell further down,
# so on a fresh kernel this cell works only after that cell has been run.
len(training_set['lc_text'])

400

In [None]:
#print(tokenizer.tokenize_sents(training_set['lc_text'][0]))
#filtered_sentence = [w for w in word_tokens if not w.lower() in stop_words]

#training_set.head()

import

Unnamed: 0,user,tweets,language,class,lc_text,TBW_tok_text,TBW_tok_cln_text
0,d2e0f4f0244b9b8b3bbd8b1654be5b74.xml,[I’m just being me under construction #HASHTAG...,english,0,[i’m just being me under construction #hashtag...,"[[i’m, just, being, me, under, construction, #...","[[i’m, construction, #, hashtag, #, way, #, ur..."
1,f30abea44b6c4144c0690f98459428a6.xml,"[i was locked up sending you roses, acabei de ...",english,0,"[i was locked up sending you roses, acabei de ...","[[i, was, locked, up, sending, you, roses], [a...","[[locked, sending, roses], [acabei, de, ver, g..."
2,d28b60028cf7bcf8a9f145193e261ecf.xml,[This crippled goob saw the “anti-Semitic plat...,english,1,[this crippled goob saw the “anti-semitic plat...,"[[this, crippled, goob, saw, the, “anti-semiti...","[[crippled, goob, saw, “anti-semitic, platform..."
3,78537787441ed0d11da43122f9b0520a.xml,[RT #USER#: EXCLUSIVE: 99% chance COVID-19 was...,english,1,[rt #user#: exclusive: 99% chance covid-19 was...,"[[rt, #, user, #, :, exclusive, :, 99, %, chan...","[[rt, #, user, #, :, exclusive, :, 99, %, chan..."
4,30be488aa93e8000aadb952a9cd5143c.xml,"[#USER# Yes, yes she did, RT #USER#: What are ...",english,0,"[#user# yes, yes she did, rt #user#: what are ...","[[#, user, #, yes, ,, yes, she, did], [rt, #, ...","[[#, user, #, yes, ,, yes], [rt, #, user, #, :..."


In [None]:
# Tokenize text
#
# Adds to training_set, for every user (row):
#   lc_text                 lower-cased tweets
#   <prefix>_tok_text       tokens of every tweet, one column per tokenizer
#   <prefix>_tok_cln_text   the same tokens with stop words removed
# where <prefix> is TBW (TreebankWordTokenizer), tweet (TweetTokenizer) or word (word_tokenize).

import nltk

from nltk.tokenize import TreebankWordTokenizer, word_tokenize, sent_tokenize, TweetTokenizer
from nltk.corpus import stopwords
nltk.download('stopwords')
# word_tokenize needs the Punkt sentence models; newer NLTK releases look for
# 'punkt_tab' and older ones for 'punkt', so both are requested (a resource
# unknown to the installed version is simply reported as not downloaded)
for punkt_resource in ('punkt', 'punkt_tab'):
    nltk.download(punkt_resource, quiet=True)

TBW_tokenizer = TreebankWordTokenizer()
tweet_tokenizer = TweetTokenizer()

stop_words = set(stopwords.words('english'))
# The 'language' column uses the same names as NLTK's stop-word lists, so each
# user's tweets are filtered with the stop words of that user's language
stop_words_by_language = {
    'english': stop_words,
    'spanish': set(stopwords.words('spanish')),
}

num_users = training_set.shape[0]
num_tweets_eahc_user = [ len(user_tweets) for user_tweets in training_set['tweets'] ]

# Lower-cased copy of every tweet, grouped by user
lc_text = [ [ tweet.lower() for tweet in user_tweets ] for user_tweets in training_set['tweets'] ]
training_set['lc_text'] = lc_text

# Stop-word set to apply to each row (English is the fallback for an unexpected language value)
row_stop_words = [ stop_words_by_language.get(language, stop_words) for language in training_set['language'] ]

# Column prefix -> function that turns one tweet into a list of tokens
tokenize_fns = {
    'TBW': TBW_tokenizer.tokenize,
    'tweet': tweet_tokenizer.tokenize,
    'word': word_tokenize,
}

for prefix, tokenize in tokenize_fns.items():
    tokenized = [ [ tokenize(tweet) for tweet in user_tweets ] for user_tweets in lc_text ]
    training_set[f'{prefix}_tok_text'] = tokenized
    # Each cleaned column is derived from its OWN tokenizer's output
    training_set[f'{prefix}_tok_cln_text'] = [
        [ [ token for token in tweet_tokens if token not in user_stop_words ]
          for tweet_tokens in user_tokens ]
        for user_tokens, user_stop_words in zip(tokenized, row_stop_words)
    ]


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


"\nfor user in training_set['tweets']:\n    user_list = list()\n    for tweet in user:\n        user_list.append(tweet.lower())\n    aux_list.append(user_list)\ntraining_set['lc_text'] = aux_list\ntraining_set['TBW_tok_text'] = copy.deepcopy(training_set['lc_text'])\n"

In [None]:
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F    # the functional API lives in torch.nn.functional (torch.functional is a different module)
from torch.utils.data import DataLoader, TensorDataset, random_split

# nn.Linear acts on the last dimension of its input, so the width of the first
# layer is the size of X's last axis (not X.shape[0], which for a 2-D matrix
# is the number of samples).
# NOTE(review): X is not defined in any visible cell (this cell currently stops
# with NameError) - it has to be built by a feature-extraction cell that runs
# before this one; assumed to be (n_samples, n_features) - TODO confirm.
n_features = X.shape[-1]
HIDDEN_UNITS = 512

# Two hidden ReLU layers followed by a single output logit.
# (Module.add_module requires a name as first argument, so the layers are
# passed to the Sequential constructor instead.)
model = nn.Sequential(
    nn.Linear(n_features, HIDDEN_UNITS),
    nn.ReLU(),
    nn.Linear(HIDDEN_UNITS, HIDDEN_UNITS),
    nn.ReLU(),
    nn.Linear(HIDDEN_UNITS, 1),
)

# BCEWithLogitsLoss applies the sigmoid itself, which is why the model ends in a raw Linear layer
loss_fn = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

NameError: name 'X' is not defined

In [None]:
torch.__version__