This notebook applies several text-vectorization methods (bag-of-words, TF-IDF, and a document-term matrix reduced with PCA) to look for patterns in the parliamentary corpus.

In [1]:
import gensim
import multiprocessing
import logging
import pandas as pd
# Emit timestamped INFO-level records so library progress messages appear in the cell output.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)
# CPU count as reported by the operating system.
cores = multiprocessing.cpu_count()
# Abort early unless gensim's doc2vec module reports FAST_VERSION > -1
# (presumably -1 signals that the optimized routines are unavailable; see the gensim docs).
assert gensim.models.doc2vec.FAST_VERSION > -1

### BoW

In [2]:
# Load the tokenised plenary-session speeches and discard rows that have no political group.
data = (
    pd.read_csv('../data/final/plenos-xiv-tokens.csv')
    .dropna(subset=['political_group'])
    .reset_index(drop=True)
)
data.head()

Unnamed: 0.1,Unnamed: 0,legislatura,fecha,objeto_iniciativa,numero_expediente,autores,nombre_sesion,orador,enlace_pdf,text,political_group,cleaned_text
0,0,XIV,2020-01-04,Propuesta de candidato a la Presidencia del Go...,080/000001/0000,S.M. El Rey Don Felipe VI,Pleno,"Borràs Castanyer, Laura (GPlu)",https://www.congreso.es:443/public_oficiales/L...,"Presidenta, señorías, empieza el año y volvemo...",GPlu,"['empieza', 'año', 'volvemos', 'debate', 'inve..."
1,1,XIV,2020-01-04,Propuesta de candidato a la Presidencia del Go...,080/000001/0000,S.M. El Rey Don Felipe VI,Pleno,"Baldoví Roda, Joan (GPlu)",https://www.congreso.es:443/public_oficiales/L...,"Moltes gràcies, senyora presidenta. Hay alguno...",GPlu,"['moltes', 'gràcies', 'senyora', 'encanta', 'o..."
2,2,XIV,2020-01-04,Propuesta de candidato a la Presidencia del Go...,080/000001/0000,S.M. El Rey Don Felipe VI,Pleno,"Quevedo Iturbe, Pedro (GPlu)",https://www.congreso.es:443/public_oficiales/L...,"Señora presidenta, señorías, señor candidato a...",GPlu,"['candidato', 'presidencia', 'buenas', 'tardes..."
3,3,XIV,2020-01-04,Propuesta de candidato a la Presidencia del Go...,080/000001/0000,S.M. El Rey Don Felipe VI,Pleno,"Oramas González-Moro, Ana María (GPlu)",https://www.congreso.es:443/public_oficiales/L...,"Gracias, señora presidenta. Señorías, como dij...",GPlu,"['dijo', 'neruda', 'podría', 'escribir', 'vers..."
4,4,XIV,2020-01-04,Propuesta de candidato a la Presidencia del Go...,080/000001/0000,S.M. El Rey Don Felipe VI,Pleno,"Sabanés Nadal, Inés (GPlu)",https://www.congreso.es:443/public_oficiales/L...,"Gracias, presidenta. Señorías, a pesar de la e...",GPlu,"['pesar', 'exageración', 'pesar', 'sobreactuac..."


In [3]:
# Parliamentary groups kept for the analysis.
groups = [
    'GS',
    'GP',
    'GVOX',
    'GCUP-EC-GC',
    'GPlu',
    'GR',
    'GCs',
    'GV (EAJ-PNV)',
    'GEH Bildu',
    'GMx',
]
data = data.loc[data['political_group'].isin(groups)].reset_index(drop=True)

# Keep only the speaker and the token column (renamed to `text`).
# NOTE(review): the `!= 0` filter only removes rows whose value equals 0 — presumably a
# placeholder written upstream for empty speeches; confirm against the preprocessing step.
df = (
    data.loc[:, ['orador', 'cleaned_text']]
    .rename(columns={'cleaned_text': 'text'})
    .loc[lambda frame: frame['text'] != 0]
    .reset_index(drop=True)
)

In [7]:
import ast

# Row-wise [orador, text] pairs, plus the text column on its own as a plain Python list.
corpus = df.to_numpy().tolist()
texts = df.loc[:, 'text'].tolist()

In [8]:
# Each entry of `texts` is the string form of a token list (e.g. "['empieza', 'año', ...]");
# parse every one back into a real Python list.
texts_fixed = list(map(ast.literal_eval, texts))

In [9]:
from gensim import corpora

# Assign an integer id to every distinct token, then encode each speech as (token_id, count) pairs.
dictionary = corpora.Dictionary(documents=texts_fixed)
bows = list(map(dictionary.doc2bow, texts_fixed))
# Peek at the first five entries of document 15.
print(bows[15][:5])


2023-03-06 10:51:11,820 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2023-03-06 10:51:16,051 : INFO : built Dictionary(84633 unique tokens: ['abandonó', 'abogacía', 'abordar', 'abusos', 'acabarlo']...) from 9382 documents (total 3723280 corpus positions)
2023-03-06 10:51:16,051 : INFO : Dictionary lifecycle event {'msg': "built Dictionary(84633 unique tokens: ['abandonó', 'abogacía', 'abordar', 'abusos', 'acabarlo']...) from 9382 documents (total 3723280 corpus positions)", 'datetime': '2023-03-06T10:51:16.051761', 'gensim': '4.1.2', 'python': '3.8.13 (default, Mar 28 2022, 06:16:26) \n[Clang 12.0.0 ]', 'platform': 'macOS-10.16-x86_64-i386-64bit', 'event': 'created'}


[(2, 1), (7, 1), (21, 1), (35, 1), (71, 1)]


In [10]:
# Tabulate the bag-of-words of document 15: token id, raw count and the token string.
doc_bow = bows[15]
df_bow_origin = pd.DataFrame({
    'index': [token_id for token_id, _ in doc_bow],
    'occurrences': [count for _, count in doc_bow],
    'token': [dictionary[token_id] for token_id, _ in doc_bow],
})
# Ten highest counts; the displayed Series index is the row position inside df_bow_origin.
df_bow_origin['occurrences'].sort_values(ascending=False).head(10)


32     11
276     6
113     6
126     5
156     5
60      4
141     4
20      4
200     3
152     3
Name: occurrences, dtype: int64

In [11]:
from gensim.models import TfidfModel

# Fit TF-IDF weights on the full bag-of-words corpus, then display the weighted vector of document 15.
model = TfidfModel(corpus=bows)
model[bows[15]]


2023-03-06 10:51:21,773 : INFO : collecting document frequencies
2023-03-06 10:51:21,778 : INFO : PROGRESS: processing document #0
2023-03-06 10:51:22,695 : INFO : TfidfModel lifecycle event {'msg': 'calculated IDF weights for 9382 documents and 84633 features (2579690 matrix non-zeros)', 'datetime': '2023-03-06T10:51:22.695094', 'gensim': '4.1.2', 'python': '3.8.13 (default, Mar 28 2022, 06:16:26) \n[Clang 12.0.0 ]', 'platform': 'macOS-10.16-x86_64-i386-64bit', 'event': 'initialize'}


[(2, 0.028117579357296033),
 (7, 0.02656629736404507),
 (21, 0.02678049444674892),
 (35, 0.025018759795572044),
 (71, 0.013883160653293605),
 (78, 0.007624565746634823),
 (81, 0.03424920654781887),
 (97, 0.026651450550814533),
 (109, 0.010221237179281773),
 (121, 0.029205044755762303),
 (133, 0.0365397488564303),
 (147, 0.021765729783378714),
 (155, 0.042782894629652304),
 (157, 0.07493309933354857),
 (165, 0.020418288663510534),
 (167, 0.03651060694291918),
 (186, 0.026273604805454587),
 (193, 0.019203790841462308),
 (195, 0.022767996064207912),
 (200, 0.011313066964175814),
 (209, 0.030646429119320237),
 (225, 0.017990489529496827),
 (233, 0.040617853470283),
 (258, 0.03206299708068308),
 (262, 0.020177649217056726),
 (269, 0.048945009748375665),
 (271, 0.020536745105174812),
 (272, 0.0328533888487331),
 (283, 0.027328397339790016),
 (284, 0.05769483406083971),
 (296, 0.022876693794503074),
 (298, 0.03854788525385735),
 (309, 0.06668766539417927),
 (310, 0.015332829001402589),
 (311,

In [None]:
# Tabulate the TF-IDF vector of document 15: token id, score and the token string.
doc_tfidf = model[bows[15]]
df_tfidf = pd.DataFrame({
    'id': [token_id for token_id, _ in doc_tfidf],
    'score': [weight for _, weight in doc_tfidf],
    'token': [dictionary[token_id] for token_id, _ in doc_tfidf],
})
# Ten highest scores; the displayed Series index is the row position inside df_tfidf.
df_tfidf['score'].sort_values(ascending=False).head(10)

113    0.255903
276    0.234674
156    0.225835
351    0.209034
327    0.153856
337    0.153856
152    0.146160
126    0.137023
340    0.122392
282    0.115801
Name: score, dtype: float64

In [10]:
from sklearn.feature_extraction import DictVectorizer
from collections import Counter, OrderedDict

# Count the tokens of each speech and let DictVectorizer turn those counts into a
# document-term matrix (one row per speech, one column per distinct token).
v = DictVectorizer()
X = v.fit_transform(map(Counter, texts_fixed))

In [11]:
# Wrap the document-term matrix in a DataFrame WITHOUT densifying it: the matrix has one
# column per vocabulary term, so a dense copy would need several GB of memory, and the
# `.A` shorthand used previously is no longer available on SciPy sparse matrices in recent
# releases. `feature_names_` lists the feature names in the vectorizer's own column order.
sparse = pd.DataFrame.sparse.from_spmatrix(X, columns=v.feature_names_)


In [12]:
# Display the document-term frame (one row per speech, one column per vocabulary term);
# pandas truncates the rendering to the first/last rows and columns.
sparse


Unnamed: 0,000―,10n,11m,123―,12―,13ª,13―,15m,15m―,15ª,...,“es,“la,“millonarios,“no,“para,“por,“si,“vendo,“yo,‹piolines
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9377,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9378,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9379,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9380,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
from sklearn.decomposition import PCA

# Project every speech (row of the document-term matrix) onto its first two principal components.
# random_state pins the result: for large inputs scikit-learn may select a randomized SVD
# solver, which would otherwise give slightly different coordinates on every run.
dr = PCA(n_components=2, random_state=0)
# PCA is fed a dense array here; `.toarray()` replaces the `.A` shorthand, which recent
# SciPy releases no longer provide on sparse matrices.
Z = pd.DataFrame(dr.fit_transform(X.toarray()))
Z.head()

In [None]:
# Name the two PCA components. `rename` (rather than assigning `Z.columns`) keeps this cell
# re-runnable after the `label` column has been added.
Z = Z.rename(columns={0: 'dim1', 1: 'dim2'})

# Z holds one row per speech, so the label has to be a per-speech value. The vocabulary
# (`sparse.columns`) has one entry per term and cannot be aligned with these rows.
# `df` is the frame the token lists were taken from, so its rows are in the same order.
assert len(Z) == len(df), "PCA rows and speeches are out of sync; re-run the notebook from the top"
Z['label'] = df['orador'].to_numpy()


In [None]:
# Display the two PCA coordinates together with the label column.
Z