In [1]:
import pandas as pd
import numpy as np
import spacy
import re
import pickle
import torch
# import cuml

from scipy.stats import pearsonr
from sklearn.feature_selection import chi2
from tqdm import tqdm
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.model_selection import train_test_split, learning_curve, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, classification_report
from sklearn.metrics import confusion_matrix as cf_matrix

In [2]:
# When CUDA is usable, print a short summary of the GPU environment:
# cuDNN version, number of visible devices, and the name / total memory
# (in units of 1e9 bytes) of device 0.
if torch.cuda.is_available():
    gpu_report = (
        ('__CUDNN VERSION:', torch.backends.cudnn.version()),
        ('__Number CUDA Devices:', torch.cuda.device_count()),
        ('__CUDA Device Name:', torch.cuda.get_device_name(0)),
        ('__CUDA Device Total Memory [GB]:', torch.cuda.get_device_properties(0).total_memory / 1e9),
    )
    for label, value in gpu_report:
        print(label, value)

__CUDNN VERSION: 8500
__Number CUDA Devices: 3
__CUDA Device Name: Tesla V100-PCIE-32GB
__CUDA Device Total Memory [GB]: 34.089926656


In [3]:
# Prefer the GPU when torch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [4]:
# Load the speech corpus CSV into `df` (presumably the ParlSpeech V2 corpus,
# judging by the directory name — TODO confirm).
# NOTE(review): absolute, user-specific path; the notebook only runs on a
# machine with exactly this directory layout.
df = pd.read_csv(r"/maps/hdir/gsw508/Thesis/Data/ParlSpeech_V2/Corp_PSP_V2.csv")

In [5]:
# Load the party-level reference table into `cmp_df` (presumably the Manifesto
# Project dataset, version MPDS2022a, judging by the file name — TODO confirm).
# NOTE(review): absolute, user-specific path; not portable across machines.
cmp_df = pd.read_csv("/maps/hdir/gsw508/Thesis/Data/MPDataset_MPDS2022a.csv")

In [6]:
# List the distinct country names present in cmp_df, to find the exact
# spelling needed when filtering by country.
cmp_df['countryname'].unique()

array(['Sweden', 'Norway', 'Denmark', 'Finland', 'Iceland', 'Belgium',
       'Netherlands', 'Luxembourg', 'France', 'Italy', 'Spain', 'Greece',
       'Portugal', 'Germany', 'Austria', 'Switzerland', 'United Kingdom',
       'Northern Ireland', 'Ireland', 'Malta', 'Cyprus', 'United States',
       'Canada', 'Australia', 'New Zealand', 'Japan', 'Israel',
       'Sri Lanka', 'Turkey', 'Albania', 'Armenia', 'Azerbaijan',
       'Belarus', 'Bosnia-Herzegovina', 'Bulgaria', 'Croatia',
       'Czech Republic', 'Estonia', 'Georgia',
       'German Democratic Republic', 'Hungary', 'Latvia', 'Lithuania',
       'North Macedonia', 'Moldova', 'Montenegro', 'Poland', 'Romania',
       'Russia', 'Serbia', 'Slovakia', 'Slovenia', 'Ukraine',
       'South Korea', 'Mexico', 'South Africa'], dtype=object)

In [7]:
# Count how many cmp_df rows each Czech party abbreviation has.
czech_rows = cmp_df['countryname'] == 'Czech Republic'
cmp_df.loc[czech_rows, 'partyabbrev'].value_counts()

ČSSD              8
ODS               8
KDU-ČSL           7
KSČM              7
SPR-RSČ           4
TOP09             3
SZ                3
HSD-SMS           2
ODA               2
ANO               2
DŽJ               2
KDU-ČSL-US-DEU    1
SPD               1
STAN              1
Úsvit             1
VV                1
KSČ               1
US                1
OF                1
KDS               1
LSU               1
ODS-KDS           1
LB                1
KDU               1
ČSL               1
Piráti            1
Name: partyabbrev, dtype: int64

In [8]:
# Keep only the rows of df that have a non-missing 'party' value.
# The explicit .copy() makes party_df an independent frame, so the column
# assignments performed on it later (party_df['left'] = ...) modify it
# directly and do not raise pandas' SettingWithCopyWarning.
party_df = df.loc[df['party'].notna()].copy()

In [9]:
# Look up the `parfam` value(s) recorded in cmp_df for the Czech party
# abbreviated 'HSD-SMS'.
czech_rows = cmp_df['countryname'] == 'Czech Republic'
hsd_sms_rows = cmp_df['partyabbrev'] == 'HSD-SMS'
cmp_df.loc[czech_rows & hsd_sms_rows, 'parfam']

3627    90
3637    90
Name: parfam, dtype: int64

In [10]:
# Number of rows per party label in party_df, to see which labels exist and
# how they are spelled before building the list of left parties.
party_df['party'].value_counts()

ODS                    98520
ČSSD                   98220
KSČM                   35830
KDU-ČSL                32639
other                  13334
TOP 09 a Starostové    10912
ODA                     8169
TOP09                   7999
ANO                     4302
VV                      4194
US                      2821
Nezařazení              2290
SPR-RSČ                 2276
KDS                     2271
Úsvit                   1541
US-DEU                  1208
LB                      1116
ČMUS                     543
LSU                      464
LSNS                     179
SZ                       175
HSD-SMS                   92
Nez.-SZ                   20
ONH                       11
ČMSS                       7
HSDMS                      2
Name: party, dtype: int64

In [11]:
# Party labels treated as "left" when building the binary `left` column.
# Original note: "Partyfam 10, 20, 30" — presumably the `parfam` codes in
# cmp_df for ecological (10), socialist/other left (20) and social-democratic
# (30) parties; TODO confirm against the dataset codebook.
left_list = ['ČSSD', 'KSČM', 'LB', 'SZ']


In [12]:
# Initialise the binary 'left' label to 0 for every row; rows whose party is
# in left_list are switched to 1 in the next cell.
# NOTE(review): party_df is built by filtering df with .loc; if it is not an
# independent copy, this assignment can emit SettingWithCopyWarning.
party_df['left'] = 0

In [13]:
# Flag every row whose party label appears in left_list.
is_left_party = party_df['party'].isin(left_list)
party_df.loc[is_left_party, 'left'] = 1

In [14]:
# Class balance of the 'left' label (1 = party in left_list, 0 = all other
# party labels), computed before any filtering on 'terms'.
party_df['left'].value_counts()

0    193794
1    135341
Name: left, dtype: int64

In [15]:
# Drop rows whose 'terms' value is 40 or less (presumably 'terms' is the
# number of terms in the speech, so this removes very short speeches —
# TODO confirm the column meaning and the choice of threshold).
has_enough_terms = party_df['terms'].gt(40)
party_df = party_df[has_enough_terms]

In [16]:
# Persist the labelled, filtered frame as JSON in the current working directory.
# NOTE(review): this file uses the prefix 'csz_' while the pickles written at
# the end of the notebook use 'czc_' — confirm which spelling is intended.
party_df.to_json('csz_party_df.json')

Political cleavage dimensions to consider: socio-economic (class); religious (church–state); ethnic (center–periphery); urban–rural; post-materialist; and foreign policy.

In [16]:
def preprocess(text):
    """Normalise a raw text string before vectorisation.

    The text is lower-cased, then every run of the characters
    ``( ) . , ? ! " ' - ` : ; \\ % * [ ]`` is deleted (replaced by the empty
    string, not by a space). Whitespace is left untouched.

    Parameters
    ----------
    text : str
        Raw input text.

    Returns
    -------
    str
        The lower-cased text with the listed punctuation removed.
    """
    punctuation_run = r'[()\.\,\?\!\"\'\-\`\:\;\\\%\*\[\]]+'
    # NOTE(review): because matches are deleted rather than replaced by a
    # space, tokens joined only by punctuation are merged ("a-b" -> "ab").
    return re.sub(punctuation_run, '', text.lower())

In [17]:
# Load the spaCy pipeline named "xx_sent_ud_sm" once, at cell level, so every
# call to token() reuses the same pipeline object.
nlp = spacy.load("xx_sent_ud_sm")


def token(text):
    """Return the token strings that the spaCy pipeline ``nlp`` produces for ``text``."""
    return [tok.text for tok in nlp(text)]

In [18]:
# Clean every speech text with preprocess(); the result is a Series of
# strings aligned with party_df's index.
df_preprocessed = party_df['text'].apply(preprocess)

In [19]:
# Inspect the cleaned texts (pandas renders a truncated head/tail preview).
df_preprocessed

0         předseda psp milan uhde vážené paní poslankyně...
2         předseda psp milan uhde vážený pane předsedo v...
4         předseda vlády čr václav klaus  vážené paní po...
6         předseda psp milan uhde prohlášení české národ...
7         místopředseda psp jiří vlach děkuji předsedovi...
                                ...                        
329129    poslanec zbyněk stanjura  já mám procedurální ...
329130    místopředseda psp vojtěch filip  to bude jiná ...
329131    poslanec zbyněk stanjura  děkuji  já to plně r...
329133    poslanec františek laudát  děkuji za slovo  ne...
329134    místopředseda psp vojtěch filip  vážené paní a...
Name: text, Length: 195470, dtype: object

In [20]:
# Build TF-IDF features from the cleaned texts.
# min_df is given as a float, i.e. a minimum document-frequency proportion;
# the n-gram range is left at its default of (1, 1), so only unigrams are used.
vectorizer = TfidfVectorizer(min_df=1e-4)
czc_features = vectorizer.fit_transform(df_preprocessed)

In [21]:
# Persist the fitted vectorizer and the TF-IDF feature matrix.
# The context managers guarantee each file handle is flushed and closed,
# even if pickling raises; the original bare open(...) calls never closed them.
with open("czc_vectorizer.pickle", "wb") as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)
with open("czc_features.pickle", "wb") as features_file:
    pickle.dump(czc_features, features_file)