In [1]:
import pandas as pd
import numpy as np
import spacy
import re
import pickle
import torch
# import cuml

from scipy.stats import pearsonr
from sklearn.feature_selection import chi2
from tqdm import tqdm
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.model_selection import train_test_split, learning_curve, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, classification_report
from sklearn.metrics import confusion_matrix as cf_matrix

In [2]:
if torch.cuda.is_available():
    print('__CUDNN VERSION:', torch.backends.cudnn.version())
    print('__Number CUDA Devices:', torch.cuda.device_count())
    print('__CUDA Device Name:',torch.cuda.get_device_name(0))
    print('__CUDA Device Total Memory [GB]:',torch.cuda.get_device_properties(0).total_memory/1e9)

__CUDNN VERSION: 8500
__Number CUDA Devices: 3
__CUDA Device Name: Tesla V100-PCIE-32GB
__CUDA Device Total Memory [GB]: 34.089926656


In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [4]:
df = pd.read_csv(r"/maps/hdir/gsw508/Thesis/Data/ParlSpeech_V2/Corp_Congreso_V2.csv")

In [5]:
cmp_df = pd.read_csv("/maps/hdir/gsw508/Thesis/Data/MPDataset_MPDS2022a.csv")

In [6]:
cmp_df['countryname'].unique()

array(['Sweden', 'Norway', 'Denmark', 'Finland', 'Iceland', 'Belgium',
       'Netherlands', 'Luxembourg', 'France', 'Italy', 'Spain', 'Greece',
       'Portugal', 'Germany', 'Austria', 'Switzerland', 'United Kingdom',
       'Northern Ireland', 'Ireland', 'Malta', 'Cyprus', 'United States',
       'Canada', 'Australia', 'New Zealand', 'Japan', 'Israel',
       'Sri Lanka', 'Turkey', 'Albania', 'Armenia', 'Azerbaijan',
       'Belarus', 'Bosnia-Herzegovina', 'Bulgaria', 'Croatia',
       'Czech Republic', 'Estonia', 'Georgia',
       'German Democratic Republic', 'Hungary', 'Latvia', 'Lithuania',
       'North Macedonia', 'Moldova', 'Montenegro', 'Poland', 'Romania',
       'Russia', 'Serbia', 'Slovakia', 'Slovenia', 'Ukraine',
       'South Korea', 'Mexico', 'South Africa'], dtype=object)

In [7]:
cmp_df.loc[(cmp_df['countryname'] == 'Spain'), 'partyabbrev'].value_counts()

PNV/EAJ                   15
ERC                       15
PSOE                      15
PP                        11
CiU                       10
IU                        10
BNG                        6
CC                         6
PAR                        6
PA                         6
EA                         6
EE                         5
C's                        4
AP                         4
EH Bildu                   4
CDS                        4
PCE                        3
UCD                        3
UPN                        2
JxCat                      2
CCa-PNC-NC                 2
PDP                        2
VOX                        2
CHA                        2
CC-PNC                     2
CDC                        1
PRC                        1
PL                         1
Compromís–Podemos–EUPV     1
DL                         1
UP                         1
FAC                        1
UPyD                       1
CUP                        1
Name: partyabb

In [8]:
cmp_df.loc[(cmp_df['countryname'] == 'Spain'), 'partyname'].value_counts()

Spanish Socialist Workers’ Party                     15
Basque Nationalist Party                             15
Catalan Republican Left                              15
People's Party                                       11
United Left                                          10
Convergence and Union                                10
Galician Nationalist Bloc                             6
Basque Solidarity                                     6
Canarian Coalition                                    6
Andalusian Party                                      6
Basque Left                                           5
Aragonese Regionalist Party                           4
We can                                                4
Centre Democrats                                      4
Basque Country Unite                                  4
Popular Alliance                                      4
Citizens - Party of the Citizens                      4
Communist Party of Spain                        

In [9]:
party_df = df.loc[df['party'].isna() == False]

In [10]:
cmp_df.loc[(cmp_df['countryname'] == 'Spain') & (cmp_df['partyabbrev'] == 'UPyD'), 'parfam']

1638    40
Name: parfam, dtype: int64

In [11]:
party_df.loc[party_df['party'] == 'GIP']

Unnamed: 0,date,agenda,speechnumber,speaker,party,party.facts.id,chair,terms,text,parliament,iso3country
181825,2011-12-13,ELECCIÓN DE LA MESA DEL CONGRESO DE LOS DIPUTA...,7,"Llamazares Trigo, Gaspar",GIP,,False,100,"Señor presidente, de acuerdo con el artículo 7...",ES-Congreso,ESP
181898,2011-12-19,DEBATE SOBRE LA INVESTIDURA DEL CANDIDATO A LA...,45,"Lara Moya, Cayo",GIP,,False,3990,"Señor Rajoy, quiero sumarme a las felicitacion...",ES-Congreso,ESP
181900,2011-12-19,DEBATE SOBRE LA INVESTIDURA DEL CANDIDATO A LA...,47,"Coscubiela Conesa, Joan",GIP,,False,1160,"Gracias, señor presidente. Señoras y señores d...",ES-Congreso,ESP
181902,2011-12-19,DEBATE SOBRE LA INVESTIDURA DEL CANDIDATO A LA...,49,"Coscubiela Conesa, Joan",GIP,,False,164,Termino en un segundo. Ha defendido usted prop...,ES-Congreso,ESP
181904,2011-12-19,DEBATE SOBRE LA INVESTIDURA DEL CANDIDATO A LA...,51,"Coscubiela Conesa, Joan",GIP,,False,22,...lleva a Iniciativa per Catalunya Verds a vo...,ES-Congreso,ESP
...,...,...,...,...,...,...,...,...,...,...,...
233820,2015-10-20,PROYECTO DE LEY DE PRESUPUESTOS GENERALES DEL ...,108,"Coscubiela Conesa, Joan",GIP,,False,317,"Gracias, señor presidente. Señorías, qué sabio...",ES-Congreso,ESP
233822,2015-10-20,PROYECTO DE LEY DE PRESUPUESTOS GENERALES DEL ...,110,"Coscubiela Conesa, Joan",GIP,,False,1194,"Gracias, señor presidente. Me temo que le van ...",ES-Congreso,ESP
233962,2015-10-21,"DEL DIPUTADO DON CAYO LARA MOYA, DEL GRUPO PAR...",75,"Lara Moya, Cayo",GIP,,False,18,"Gracias, presidente. Señor ministro, ¿en qué f...",ES-Congreso,ESP
233966,2015-10-21,"DEL DIPUTADO DON CAYO LARA MOYA, DEL GRUPO PAR...",79,"Lara Moya, Cayo",GIP,,False,382,"Señor ministro, hace treinta años que este paí...",ES-Congreso,ESP


In [12]:
party_df['party'].value_counts()

GPP           32420
GPSOE         28302
GMX           13881
GC-CiU         7916
GV EAJ-PNV     4737
GIU            4084
GIP            2289
GER-IU-ICV     1784
GV-PNV         1722
GUPyD          1670
GCUP-EC-EM     1619
GCC            1613
GIU-ICV        1446
GER-ERC        1377
GCs            1197
GER             856
GCC-NC          200
GC-DL            54
Name: party, dtype: int64

In [13]:
# Partyfam 10, 20, 30
left_list = ['GPSOE', 'GIU', 'GIP', 'GCUP-EC-EM']

# Difficult distinction between left and ethnic. Errs towards ethnic if unsure.

In [14]:
party_df['left'] = 0

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  party_df['left'] = 0


In [15]:
party_df.loc[party_df['party'].isin(left_list), 'left'] = 1

In [16]:
party_df['left'].value_counts()

0    70873
1    36294
Name: left, dtype: int64

In [17]:
party_df = party_df.loc[party_df['terms'] > 40]

In [18]:
party_df.to_json('esp_party_df.json')

socio-economic (class); religious (church–state); ethnic (center–periphery); urban–rural; post-materialist; and foreign policy

In [18]:
def preprocess(text):
    'Creating preprocessing pipeline'

    # Lowercasing
    text = text.lower()

    # Removing punctuation
    text = re.sub(r'[()\.\,\?\!\"\'\-\`\:\;\\\%\*\[\]]+','', text)

    return text

In [19]:
nlp = spacy.load("xx_sent_ud_sm")
def token(text):
    doc = nlp(text)
    return [token.text for token in doc]

In [20]:
df_preprocessed = party_df['text'].apply(lambda x: preprocess(x))

In [21]:
df_preprocessed

35        señor presidente comparezco ante ss ss para so...
38        señor presidente señorías subo a la tribuna pa...
40        señor presidente señorías en primer lugar quie...
42        muchas gracias señor presidente señorías inten...
44        señor presidente señorías voy a contestar con ...
                                ...                        
262185    gracias presidente compañeros diputados compañ...
262187    gracias presidente estamos cansados pero nos e...
262189    gracias señora presidenta señoras diputadas y ...
262191    como les decía mi grupo votó favorablemente po...
262193     de entender la aplicación de este artículo 13...
Name: text, Length: 93493, dtype: object

In [22]:
# Vectorizing 
vectorizer = TfidfVectorizer(min_df= 0.0001) # Default N-gram range is (1,1)
esp_features = vectorizer.fit_transform(df_preprocessed)

In [23]:
pickle.dump(vectorizer, open("esp_vectorizer.pickle", "wb"))
pickle.dump(esp_features, open("esp_features.pickle", "wb"))