# AUTHORS

- **Mathieu Breier**  
  *BSE DSDM 2023-2024*  
  Email: [mathieu.breier@bse.eu](mailto:mathieu.breier@bse.eu)

- **Guillem Mirabent**  
  *BSE DSDM 2023-2024*  
  Email: [guillem.mirabent@bse.eu](mailto:guillem.mirabent@bse.eu)

# INDEX

# LOAD & IMPORTS

In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.metrics import ConfusionMatrixDisplay, RocCurveDisplay, roc_auc_score, roc_curve, auc, confusion_matrix

from utils_preproc import *

import klib as kl 
import plotly.express as px

import seaborn as sns

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

import keras
from keras import ops
from keras import layers

import gensim.downloader as api
from gensim.models import KeyedVectors, Word2Vec
import gensim

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/guillemmirabentrubinat/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
data = pd.read_csv('../Spanish_Parliament_Speeches.csv')
data = kl.clean_column_names(data)
display(data)

Unnamed: 0,text_id,id,title,date,body,term,session,meeting,sitting,agenda,...,party_status,party_orientation,speaker_id,speaker_name,speaker_gender,speaker_birth,is_in_english,key,speech_text,speech_text_preprocessed
0,ParlaMint-ES_2022-06-29-CD220629,ParlaMint-ES_2022-06-29-CD220629.u3,Sesión plenaria núm. 191 (2022-06-29),2022-06-29,Cámara Baja,Legislatura XIV,Sesión plenaria núm. 191,-,2022-06-29,-,...,Opposition,Centro-derecha - Derecha,JaimeDeOlanoVela,"De Olano Vela, Jaime",M,1970,False,ParlaMint-ES_2022-06-29-CD220629.u3,"Gracias, presidenta. Señora Calviño, los dos ú...",gracias presidenta señora calviño dos último d...
1,ParlaMint-ES_2022-06-29-CD220629,ParlaMint-ES_2022-06-29-CD220629.u5,Sesión plenaria núm. 191 (2022-06-29),2022-06-29,Cámara Baja,Legislatura XIV,Sesión plenaria núm. 191,-,2022-06-29,-,...,Coalition,Centro-izquierda,NadiaCalviñoSantamaría,"Calviño Santamaría, Nadia",F,1968,False,ParlaMint-ES_2022-06-29-CD220629.u5,"Gracias, presidenta. Voy a responder a la preg...",gracias presidenta ir responder pregunta tener...
2,ParlaMint-ES_2022-06-29-CD220629,ParlaMint-ES_2022-06-29-CD220629.u7,Sesión plenaria núm. 191 (2022-06-29),2022-06-29,Cámara Baja,Legislatura XIV,Sesión plenaria núm. 191,-,2022-06-29,-,...,Opposition,Centro-derecha - Derecha,JaimeDeOlanoVela,"De Olano Vela, Jaime",M,1970,False,ParlaMint-ES_2022-06-29-CD220629.u7,"Señora Calviño, menos mal que funcionan sus me...",señora calviño menos mal funcionar medida infl...
3,ParlaMint-ES_2022-06-29-CD220629,ParlaMint-ES_2022-06-29-CD220629.u9,Sesión plenaria núm. 191 (2022-06-29),2022-06-29,Cámara Baja,Legislatura XIV,Sesión plenaria núm. 191,-,2022-06-29,-,...,Coalition,Centro-izquierda,NadiaCalviñoSantamaría,"Calviño Santamaría, Nadia",F,1968,False,ParlaMint-ES_2022-06-29-CD220629.u9,"Muchas gracias. Ya es mala suerte, señor Olano...",gracia ser malo suerte señor olano poner usted...
4,ParlaMint-ES_2022-06-29-CD220629,ParlaMint-ES_2022-06-29-CD220629.u12,Sesión plenaria núm. 191 (2022-06-29),2022-06-29,Cámara Baja,Legislatura XIV,Sesión plenaria núm. 191,-,2022-06-29,-,...,Opposition,Derecha - Extrema derecha,IvánEspinosaDeLosMonterosDeSimón,"Espinosa De Los Monteros De Simón, Iván",M,1971,False,ParlaMint-ES_2022-06-29-CD220629.u12,"Gracias, señora presidenta. Señora vicepreside...",gracias señora presidenta señora vicepresident...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32546,ParlaMint-ES_2016-11-15-CD161115,ParlaMint-ES_2016-11-15-CD161115.u120,Sesión plenaria núm. 14 (2016-11-15),2016-11-15,Cámara Baja,Legislatura XII,Sesión plenaria núm. 14,-,2016-11-15,-,...,Opposition,Centro-izquierda - izquierda,GabrielRufiánRomero,"Rufián Romero, Gabriel",M,1982,False,ParlaMint-ES_2016-11-15-CD161115.u120,"Señora presidenta, por alusiones, quería conte...",señora presidentar alusión querer contestar se...
32547,ParlaMint-ES_2016-11-15-CD161115,ParlaMint-ES_2016-11-15-CD161115.u122,Sesión plenaria núm. 14 (2016-11-15),2016-11-15,Cámara Baja,Legislatura XII,Sesión plenaria núm. 14,-,2016-11-15,-,...,Opposition,Centro-izquierda - izquierda,GabrielRufiánRomero,"Rufián Romero, Gabriel",M,1982,False,ParlaMint-ES_2016-11-15-CD161115.u122,Es la falsedad que ha comentado el portavoz de...,ser falsedad haber comentar portavoz grupo soc...
32548,ParlaMint-ES_2016-11-15-CD161115,ParlaMint-ES_2016-11-15-CD161115.u124,Sesión plenaria núm. 14 (2016-11-15),2016-11-15,Cámara Baja,Legislatura XII,Sesión plenaria núm. 14,-,2016-11-15,-,...,Opposition,Centro-izquierda,MiguelÁngelHerediaDíaz,"Heredia Díaz, Miguel Ángel",M,1966,False,ParlaMint-ES_2016-11-15-CD161115.u124,"Señora presidenta, tengo un vídeo donde el señ...",señora presidenta tener vídeo señor rufián cua...
32549,ParlaMint-ES_2016-11-15-CD161115,ParlaMint-ES_2016-11-15-CD161115.u126,Sesión plenaria núm. 14 (2016-11-15),2016-11-15,Cámara Baja,Legislatura XII,Sesión plenaria núm. 14,-,2016-11-15,-,...,Opposition,Centro-izquierda,MiguelÁngelHerediaDíaz,"Heredia Díaz, Miguel Ángel",M,1966,False,ParlaMint-ES_2016-11-15-CD161115.u126,"Evidentemente, no, pero le puedo enseñar el ví...",evidentemente poder enseñar vídeo señora presi...


# DATA PREPARATION

In [3]:
sel_cols = ['date', 'party_status', 'speaker_party', 'speaker_gender', 'speaker_id', 'speaker_name', 'speaker_birth', 'term', 'speech_text_preprocessed']

data = data[sel_cols]

In [4]:
power_definer(data)

In [5]:
data.loc[data['speaker_id'] == 'CarmenRiobolosRegadera', 'speaker_id'] = 'CarmenRiolobosRegadera'

In [6]:
list_of_interest_parties = ['PP', 'PSOE', 'UP', 'Vox', 'ERC-S', 'JxCat-Junts', 'EAJ-PNV', 'EH Bildu', 'CiU', 'ERC-CATSÍ']

data = data[data['speaker_party'].isin(list_of_interest_parties)]

display(data[['speaker_name', 'speaker_party']].head())

Unnamed: 0,speaker_name,speaker_party
0,"De Olano Vela, Jaime",PP
1,"Calviño Santamaría, Nadia",PSOE
2,"De Olano Vela, Jaime",PP
3,"Calviño Santamaría, Nadia",PSOE
4,"Espinosa De Los Monteros De Simón, Iván",Vox


In [7]:
age_dict = {
    'JuanLuisSotoBurillo': 1978,
    'JoséCarlosDuránPeralta': 1987,
    'JoséCarlosDíazRodríguez': 1967,
    'CarmenRiolobosRegadera': 1953,
    'MaríaDelMarArnaizGarcía': 1966,
    'JoséIgnacioWertOrtega': 1950,
    'JoséManuelGarcíaMargalloyMarfil': 1944,
    'JulioVillarrubiaMediavilla': 1957,
    'EmilioBarbónMartínez': 1930,
    'CarmenCastellanoiFernández': 1959
}

# map speaker to age 
data.loc[data['speaker_birth'] == '-', 'speaker_birth'] = data.loc[data['speaker_birth'] == '-', 'speaker_id'].map(age_dict)
print(data['speaker_birth'].unique())

data[data['speaker_birth'] == '-']

['1970' '1968' '1971' '1972' '1967' '1966' '1988' '1959' '1962' '1979'
 '1976' '1957' '1978' '1965' '1977' '1985' '1975' '1949' '1969' 1978
 '1973' '1951' '1980' '1964' '1989' '1958' '1974' '1982' '1996' '1961'
 '1952' 1987 '1954' '1981' '1984' '1956' '1963' '1960' '1990' '1983'
 '1986' '1993' '1947' '1991' 1967 '1992' '1987' '1955' '1953' 1953 1966
 '1948' '1944' '1950' '1937' 1950 '1945' 1944 1957 1930 1959 '1946' '1943']


Unnamed: 0,date,speaker_party,speaker_gender,speaker_id,speaker_name,speaker_birth,term,speech_text_preprocessed,party_in_power


In [8]:
data['speaker_birth'] = data['speaker_birth'].astype(int)
age_getter(data)

In [9]:
print(data.speaker_gender.unique())

display(data[data['speaker_gender'] == 'U'])
data.loc[data.speaker_gender == 'U', 'speaker_gender'] = 'F'
print(data.speaker_gender.unique())

gender_map = {
    'M': 1,
    'F': 0
}

data['speaker_gender'] = data['speaker_gender'].map(gender_map)

['M' 'F' 'U']


Unnamed: 0,date,speaker_party,speaker_gender,speaker_id,speaker_name,term,speech_text_preprocessed,party_in_power,speaker_age
2762,2022-11-23,PP,U,CarmenRiolobosRegadera,"Riolobos Regadera, Carmen",Legislatura XIV,gracias señor presidente señorías señor garzón...,0,69
2909,2022-02-24,PP,U,CarmenRiolobosRegadera,"Riolobos Regadera, Carmen",Legislatura XIV,gracias señora presidenta señor garzón alegrar...,0,69
2910,2022-02-24,PP,U,CarmenRiolobosRegadera,"Riolobos Regadera, Carmen",Legislatura XIV,estarer pendiente consumidor ser importante bu...,0,69
4676,2022-02-01,PP,U,CarmenRiolobosRegadera,"Riolobos Regadera, Carmen",Legislatura XIV,gracias señor presidente sánchez cesar garzón ...,0,69
4692,2022-02-01,PP,U,CarmenRiolobosRegadera,"Riolobos Regadera, Carmen",Legislatura XIV,,0,69
15223,2021-03-11,PP,U,CarmenRiolobosRegadera,"Riolobos Regadera, Carmen",Legislatura XIV,gracia señora presidenta buen día señoría defi...,0,68
17939,2021-12-02,PP,U,CarmenRiolobosRegadera,"Riolobos Regadera, Carmen",Legislatura XIV,gracias señor presidente señorías señor minist...,0,68
20188,2021-11-24,PP,U,CarmenRiolobosRegadera,"Riolobos Regadera, Carmen",Legislatura XIV,gracias señor presidente señorías señor garzón...,0,68
20317,2021-09-29,PP,U,CarmenRiolobosRegadera,"Riolobos Regadera, Carmen",Legislatura XIV,señor garzón irrelevancia gobierno ser directa...,0,68
20319,2021-09-29,PP,U,CarmenRiolobosRegadera,"Riolobos Regadera, Carmen",Legislatura XIV,ideología traer pobreza subida energía haber p...,0,68


['M' 'F']


In [10]:
print(data.term.unique())

# label encode term columns 
print(data.term.unique())

term_map = {
    'Legislatura X': 0,
    'Legislatura XI': 1,
    'Legislatura XII': 2,
    'Legislatura XIII': 3,
    'Legislatura XIV': 4
}

data['term'] = data['term'].map(term_map)

print(data.term.unique())

['Legislatura XIV' 'Legislatura X' 'Legislatura XII' 'Legislatura XIII'
 'Legislatura XI']
['Legislatura XIV' 'Legislatura X' 'Legislatura XII' 'Legislatura XIII'
 'Legislatura XI']
[4 0 2 3 1]


In [11]:
data.dropna(subset=['speech_text_preprocessed'], inplace=True)

In [12]:
data['speech_text_preprocessed_tokenized'] = data['speech_text_preprocessed'].str.split()

In [13]:
# cut off len of speech text > 100 

min_len = 100

print(data.shape)
data = brickwall_limiter(data, 'speech_text_preprocessed_tokenized', min_len)
print(data.shape)

(22489, 10)
(14928, 11)


In [14]:
max_tokens_split = 300

print(data.shape)
data = limit_splitter(data, max_tokens_split)
print(data.shape)

(14928, 11)
(28849, 11)


In [15]:
data = brickwall_limiter(data, 'speech_text_preprocessed_tokenized', min_len)
print(data.shape)

(25560, 11)


In [16]:
display(data)

Unnamed: 0,date,speaker_party,speaker_gender,speaker_id,speaker_name,term,speech_text_preprocessed,party_in_power,speaker_age,speech_text_preprocessed_tokenized,speech_text_preprocessed_len
0,2022-06-29,PP,1,JaimeDeOlanoVela,"De Olano Vela, Jaime",4,señora calviño menos mal funcionar medida infl...,0,52,"[señora, calviño, menos, mal, funcionar, medid...",166
1,2022-06-29,PSOE,0,NadiaCalviñoSantamaría,"Calviño Santamaría, Nadia",4,gracia ser malo suerte señor olano poner usted...,1,54,"[gracia, ser, malo, suerte, señor, olano, pone...",146
2,2022-06-29,Vox,1,IvánEspinosaDeLosMonterosDeSimón,"Espinosa De Los Monteros De Simón, Iván",4,gracias señora presidenta señora vicepresident...,0,51,"[gracias, señora, presidenta, señora, vicepres...",228
3,2022-06-29,PSOE,0,NadiaCalviñoSantamaría,"Calviño Santamaría, Nadia",4,gracia presidenta realidad ser semana precisam...,1,54,"[gracia, presidenta, realidad, ser, semana, pr...",103
4,2022-06-29,PSOE,0,NadiaCalviñoSantamaría,"Calviño Santamaría, Nadia",4,señor espinosa monteros ser evidente tener lug...,1,54,"[señor, espinosa, monteros, ser, evidente, ten...",125
...,...,...,...,...,...,...,...,...,...,...,...
28844,2016-11-15,PSOE,1,MiguelÁngelHerediaDíaz,"Heredia Díaz, Miguel Ángel",2,señora presidenta señoría contar amigo josé an...,0,50,"[señora, presidenta, señoría, contar, amigo, j...",111
28845,2016-11-15,PSOE,1,MiguelÁngelHerediaDíaz,"Heredia Díaz, Miguel Ángel",2,caso dinero público deber ser gastar honestida...,0,50,"[caso, dinero, público, deber, ser, gastar, ho...",212
28846,2016-11-15,PP,1,JaimeEduardoDeOlanoVela,"Olano Vela, Jaime Eduardo De",2,gracias señora presidenta debatimos hoy confor...,1,46,"[gracias, señora, presidenta, debatimos, hoy, ...",300
28847,2016-11-15,PP,1,JaimeEduardoDeOlanoVela,"Olano Vela, Jaime Eduardo De",2,gracias señora presidenta debatimos hoy confor...,1,46,"[web, transparencia, parlamento, catalán, mues...",143


In [34]:
"""
Without taking those giveaways out the model is performing at over 90% ROC AUC scores for the
PSOE-PP test set. This is good for this specific model, but it sacrifices a little bit of 
performance with regards to the Podemos-Vox test set.
"""

giveaways = ['pedro', 'sánchez', 'pablo', 'casado', 'mariano', 'rajoy', 'santiago', 'abascal', 
                'cristóbal', 'montoro', 'psoe', 'socialista', 'podemos', 'maría', 'montero', 'bildu', 
                'partido', 'popular', 'pp', 'mariano', 'rajoy', 'comú', 'podem', 'junts', 'per', 'vox',
                'vasco', 'pnv', 'simón', 'feijóo', 'aznar', 'arrimadas', 'illa', 'iglesias',
                'salvador', 'calviño', 'ábalos', 'robles', 'zapatero', 'marlaska', 'espinosa',
                'señora', 'señor', 'ministra', 'ministro', 'presidente', 'presidenta', 'gobierno', 
                'ser', 'haber', 'hacer', 'francina', 'armengol', 'decir', 'gracias', 'usted',
                'meritxell', 'batet', 'ana', 'pastor', 'patxi', 'lópez', 'jesús', 'posada']


spanish_stop_words = set(stopwords.words('spanish'))

# Combine the NLTK and custom stop words
custom_stopwords = spanish_stop_words.union(giveaways)

words_relevant = ['no', 'sí', 'bien', 'mal', 'me', 'mí']
for word in words_relevant:
    custom_stopwords.discard(word)

"""
If you want to run the model without those stopwords, just uncomment the line below.
"""
# custom_stopwords = spanish_stop_words

'\nIf you want to run the model without those stopwords, just uncomment the line below.\n'

In [38]:
data_test = [
    ['esto', 'es', 'una', 'gran', 'película', 'gracias', 'presidente'],
    ['me', 'gustó', 'mucho', 'esta', 'bien', 'película'],
    ['no', 'me', 'gustó', 'esta', 'mí', 'película']
]

# Function to remove stopwords from a token list
def remove_stopwords(tokens, stopwords_set):
    return [token for token in tokens if token not in stopwords_set]

# Remove stopwords from data
cleaned_data = [remove_stopwords(speech, custom_stopwords) for speech in data_test]

# Function to encode data using Word2Vec
def encode_data(cleaned_data, word2vec_model):
    encoded_data = []
    for speech in cleaned_data:
        speech_vectors = []
        for token in speech:
            if token in word2vec_model:
                speech_vectors.append(word2vec_model[token])
        encoded_data.append(speech_vectors)
    return encoded_data

# Encode the cleaned data using the loaded Word2Vec model
encoded_data = encode_data(cleaned_data, wordvecs)

# Example: Print the cleaned and encoded data
print("Cleaned Data:", cleaned_data)
print("Encoded Data:", encoded_data)

Cleaned Data: [['gran', 'película'], ['me', 'gustó', 'bien', 'película'], ['no', 'me', 'gustó', 'mí', 'película']]
Encoded Data: [[array([-0.15393   , -0.15813   ,  0.16842   , -0.15555   ,  0.21654   ,
       -0.016202  ,  0.073839  ,  0.30721   ,  0.015798  , -0.34165   ,
       -0.11739   ,  0.19655   ,  0.3026    , -0.22567   ,  0.1432    ,
        0.40844   ,  0.017996  , -0.28225   ,  0.13292   , -0.034182  ,
        0.020305  ,  0.1059    ,  0.12092   ,  0.12908   ,  0.12819   ,
       -0.13664   , -0.14189   , -0.17104   , -0.061941  , -0.13605   ,
        0.055521  , -0.2261    , -0.05704   ,  0.040523  ,  0.20444   ,
        0.15366   , -0.13015   ,  0.21872   , -0.27521   , -0.025821  ,
       -0.21685   , -0.064807  , -0.059582  , -0.23524   ,  0.26858   ,
        0.40237   ,  0.23195   , -0.15313   , -0.11227   ,  0.15825   ,
        0.18587   ,  0.0081863 , -0.062114  ,  0.16928   , -0.014986  ,
        0.27527   , -0.050818  , -0.079797  , -0.21316   , -0.03898   ,
     

# TRAIN-TEST SPLIT

In [18]:
podemos_vox = ['UP', 'Vox']
erc_junts = ['ERC-S', 'JxCat-Junts', 'CiU', 'ERC-CATSÍ']
bildu_pnv = ['EAJ-PNV', 'EH Bildu']
psoe_pp = ['PP', 'PSOE']

data_pv = data[data['speaker_party'].isin(podemos_vox)]
data_ej = data[data['speaker_party'].isin(erc_junts)]
data_bp = data[data['speaker_party'].isin(bildu_pnv)]
data = data[data['speaker_party'].isin(psoe_pp)]

In [19]:
data.loc[data['speaker_party'] == 'PSOE', 'speaker_party'] = 0
data.loc[data['speaker_party'] == 'PP', 'speaker_party'] = 1

data_pv.loc[data_pv['speaker_party'] == 'UP', 'speaker_party'] = 0
data_pv.loc[data_pv['speaker_party'] == 'Vox', 'speaker_party'] = 1

data_ej.loc[data_ej['speaker_party'] == 'ERC-S', 'speaker_party'] = 0
data_ej.loc[data_ej['speaker_party'] == 'JxCat-Junts', 'speaker_party'] = 1
data_ej.loc[data_ej['speaker_party'] == 'CiU', 'speaker_party'] = 1
data_ej.loc[data_ej['speaker_party'] == 'ERC-CATSÍ', 'speaker_party'] = 0

data_bp.loc[data_bp['speaker_party'] == 'EH Bildu', 'speaker_party'] = 0
data_bp.loc[data_bp['speaker_party'] == 'EAJ-PNV', 'speaker_party'] = 1

display(data[['speaker_name', 'speaker_party']].head())

Unnamed: 0,speaker_name,speaker_party
0,"De Olano Vela, Jaime",1
1,"Calviño Santamaría, Nadia",0
3,"Calviño Santamaría, Nadia",0
4,"Calviño Santamaría, Nadia",0
5,"Calviño Santamaría, Nadia",0


In [20]:
print(data['speaker_party'].value_counts(), data_pv['speaker_party'].value_counts(), data_ej['speaker_party'].value_counts(), data_bp['speaker_party'].value_counts())

speaker_party
1    7313
0    7169
Name: count, dtype: int64 speaker_party
0    2490
1    2142
Name: count, dtype: int64 speaker_party
0    1687
1    1495
Name: count, dtype: int64 speaker_party
1    2334
0     930
Name: count, dtype: int64


In [21]:
# Unique politicians and their party labels (assuming one politician belongs to only one party)
unique_politicians = data.groupby('speaker_id')['speaker_party'].first().reset_index()

# Splitting unique politicians into train and test to ensure no overlap
train_politicians, test_politicians = train_test_split(unique_politicians, stratify=unique_politicians['speaker_party'], test_size=0.1, random_state=168)

# Selecting rows for the train and test datasets based on the split politicians
train_df = data[data['speaker_id'].isin(train_politicians['speaker_id'])]
test_df = data[data['speaker_id'].isin(test_politicians['speaker_id'])]

In [22]:
print(train_df.shape)
print(test_df.shape)

# get intersection of speaker id in train and test
train_speaker_ids = train_df['speaker_id'].unique()
test_speaker_ids = test_df['speaker_id'].unique()

intersection = list(set(train_speaker_ids) & set(test_speaker_ids))
print(intersection)

(13174, 11)
(1308, 11)
[]


In [23]:
# split is near 50% on the training dataset
print(train_df.speaker_party.value_counts())
print(test_df.speaker_party.value_counts())

speaker_party
1    6599
0    6575
Name: count, dtype: int64
speaker_party
1    714
0    594
Name: count, dtype: int64


In [24]:
X = train_df['speech_text_preprocessed']
y = train_df['speaker_party']
X_test = test_df['speech_text_preprocessed']
y_test = test_df['speaker_party']

X_test_pv = data_pv['speech_text_preprocessed']
y_test_pv = data_pv['speaker_party']

X_test_ej = data_ej['speech_text_preprocessed']
y_test_ej = data_ej['speaker_party']

X_test_bp = data_bp['speech_text_preprocessed']
y_test_bp = data_bp['speaker_party']

In [25]:
display(data.head())

Unnamed: 0,date,speaker_party,speaker_gender,speaker_id,speaker_name,term,speech_text_preprocessed,party_in_power,speaker_age,speech_text_preprocessed_tokenized,speech_text_preprocessed_len
0,2022-06-29,1,1,JaimeDeOlanoVela,"De Olano Vela, Jaime",4,señora calviño menos mal funcionar medida infl...,0,52,"[señora, calviño, menos, mal, funcionar, medid...",166
1,2022-06-29,0,0,NadiaCalviñoSantamaría,"Calviño Santamaría, Nadia",4,gracia ser malo suerte señor olano poner usted...,1,54,"[gracia, ser, malo, suerte, señor, olano, pone...",146
3,2022-06-29,0,0,NadiaCalviñoSantamaría,"Calviño Santamaría, Nadia",4,gracia presidenta realidad ser semana precisam...,1,54,"[gracia, presidenta, realidad, ser, semana, pr...",103
4,2022-06-29,0,0,NadiaCalviñoSantamaría,"Calviño Santamaría, Nadia",4,señor espinosa monteros ser evidente tener lug...,1,54,"[señor, espinosa, monteros, ser, evidente, ten...",125
5,2022-06-29,0,0,NadiaCalviñoSantamaría,"Calviño Santamaría, Nadia",4,gracias presidenta quedar ninguno duda haber p...,1,54,"[gracias, presidenta, quedar, ninguno, duda, h...",178


# TRANSFORMER MODEL

In [26]:
from gensim.models import KeyedVectors
wordvecs_file = '../SBW-vectors-300-min5.bin'
wordvecs1 = KeyedVectors.load_word2vec_format(wordvecs_file, binary=True)

In [27]:
embeddings_file = '../embeddings-l-model.vec'
wordvecs = KeyedVectors.load_word2vec_format(embeddings_file)

In [28]:
class TransformerBlock(layers.Layer):
    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        super().__init__()
        self.att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = keras.Sequential(
            [layers.Dense(ff_dim, activation="relu"), layers.Dense(embed_dim),]
        )
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, inputs):
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output)
        return self.layernorm2(out1 + ffn_output)

class TokenAndPositionEmbedding(layers.Layer):
    def __init__(self, maxlen, vocab_size, embed_dim):
        super().__init__()
        self.token_emb = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, x):
        maxlen = ops.shape(x)[-1]
        positions = ops.arange(start=0, stop=maxlen, step=1)
        positions = self.pos_emb(positions)
        x = self.token_emb(x)
        return x + positions

In [29]:
x_train = keras.utils.pad_sequences(x_train, maxlen=maxlen)
x_val = keras.utils.pad_sequences(x_val, maxlen=maxlen)

NameError: name 'x_train' is not defined