In [1]:
import pandas as pd
import numpy as np
import spacy
import re
import pickle
import torch
# import cuml

from scipy.stats import pearsonr
from sklearn.feature_selection import chi2
from tqdm import tqdm
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.model_selection import train_test_split, learning_curve, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, classification_report
from sklearn.metrics import confusion_matrix as cf_matrix

In [2]:
if torch.cuda.is_available():
    print('__CUDNN VERSION:', torch.backends.cudnn.version())
    print('__Number CUDA Devices:', torch.cuda.device_count())
    print('__CUDA Device Name:',torch.cuda.get_device_name(0))
    print('__CUDA Device Total Memory [GB]:',torch.cuda.get_device_properties(0).total_memory/1e9)

__CUDNN VERSION: 8500
__Number CUDA Devices: 3
__CUDA Device Name: Tesla V100-PCIE-32GB
__CUDA Device Total Memory [GB]: 34.089926656


In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [4]:
df = pd.read_csv(r"/maps/hdir/gsw508/Thesis/Data/ParlSpeech_V2/Corp_TweedeKamer_V2.csv")

In [5]:
cmp_df = pd.read_csv("/maps/hdir/gsw508/Thesis/Data/MPDataset_MPDS2022a.csv")

In [6]:
cmp_df['countryname'].unique()

array(['Sweden', 'Norway', 'Denmark', 'Finland', 'Iceland', 'Belgium',
       'Netherlands', 'Luxembourg', 'France', 'Italy', 'Spain', 'Greece',
       'Portugal', 'Germany', 'Austria', 'Switzerland', 'United Kingdom',
       'Northern Ireland', 'Ireland', 'Malta', 'Cyprus', 'United States',
       'Canada', 'Australia', 'New Zealand', 'Japan', 'Israel',
       'Sri Lanka', 'Turkey', 'Albania', 'Armenia', 'Azerbaijan',
       'Belarus', 'Bosnia-Herzegovina', 'Bulgaria', 'Croatia',
       'Czech Republic', 'Estonia', 'Georgia',
       'German Democratic Republic', 'Hungary', 'Latvia', 'Lithuania',
       'North Macedonia', 'Moldova', 'Montenegro', 'Poland', 'Romania',
       'Russia', 'Serbia', 'Slovakia', 'Slovenia', 'Ukraine',
       'South Korea', 'Mexico', 'South Africa'], dtype=object)

In [7]:
cmp_df.loc[(cmp_df['countryname'] == 'Netherlands'), 'partyabbrev'].value_counts()

PvdA        22
VVD         21
D’66        16
CDA         13
SGP         12
KVP          9
ARP          9
CHU          9
GL           9
SP           8
PPR          6
CU           6
RPF          5
GPV          4
PvdD         4
PVV          4
CPN          3
DS‘70        3
LN           2
LPF          2
50PLUS       2
AOV          1
Unie 55+     1
PSP          1
PvdV         1
DENK         1
FvD          1
Name: partyabbrev, dtype: int64

In [8]:
cmp_df.loc[(cmp_df['countryname'] == 'Netherlands'), 'partyname'].value_counts()

Labour Party                                22
People’s Party for Freedom and Democracy    21
Democrats‘66                                16
Christian Democratic Appeal                 13
Reformed Political Party                    12
Catholic People’s Party                      9
Anti-Revolutionary Party                     9
Christian Historical Union                   9
Green Left                                   9
Socialist Party                              8
Christian Union                              6
Radical Political Party                      6
Reformatory Political Federation             5
Reformed Political League                    4
Party for the Animals                        4
Party of Freedom                             4
Democratic Socialists‘70                     3
Communist Party of the Netherlands           3
List Pim Fortuyn                             2
50Plus                                       2
Centre Democrats                             2
Livable Nethe

In [9]:
party_df = df.loc[df['party'].isna() == False]

In [10]:
cmp_df.loc[(cmp_df['countryname'] == 'Netherlands') & (cmp_df['partyabbrev'] == 'PvdA'), 'parfam']

981     30
988     30
994     30
999     30
1004    30
1009    30
1014    30
1021    30
1029    30
1037    30
1043    30
1050    30
1060    30
1066    30
1076    30
1088    30
1097    30
1107    30
1116    30
1126    30
1136    30
1147    30
Name: parfam, dtype: int64

In [11]:
party_df['party'].value_counts()

VVD       168276
PvdA      165516
CDA       161799
other     131339
D66       103579
SP         87637
GL         81225
PVV        38495
CU         34177
SGP        23460
PvdD       10242
LPF         6573
RPF         5860
GPV         5279
50PLUS      3221
DENK        3021
FvD         2034
Name: party, dtype: int64

In [12]:
# Partyfam 10, 20, 30
left_list = ['PvdA', 'GL', 'SP', 'DENK',]

# ???

In [13]:
party_df['left'] = 0

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  party_df['left'] = 0


In [14]:
party_df.loc[party_df['party'].isin(left_list), 'left'] = 1

In [15]:
party_df['left'].value_counts()

0    694334
1    337399
Name: left, dtype: int64

In [16]:
party_df = party_df.loc[party_df['terms'] > 40]

In [17]:
party_df.to_json('nld_party_df.json')

socio-economic (class); religious (church–state); ethnic (center–periphery); urban–rural; post-materialist; and foreign policy

In [17]:
def preprocess(text):
    'Creating preprocessing pipeline'

    # Lowercasing
    text = text.lower()

    # Removing punctuation
    text = re.sub(r'[()\.\,\?\!\"\'\-\`\:\;\\\%\*\[\]]+','', text)

    return text

In [18]:
nlp = spacy.load("xx_sent_ud_sm")
def token(text):
    doc = nlp(text)
    return [token.text for token in doc]

In [19]:
df_preprocessed = party_df['text'].apply(lambda x: preprocess(x))

In [20]:
df_preprocessed

0          mijnheer de voorzitter ik vertel de minister w...
1          mijnheer de voorzitter mag ik allereerst de he...
2          mijnheer de voorzitter hoewel ik het antwoord ...
3          mijnheer de voorzitter wat is onrechtvaardig o...
4          voorzitter afgelopen zaterdag stond in nrc han...
                                 ...                        
1143345    voorzitter het was een enerverend ao het is al...
1143346    het zal u ongetwijfeld interesseren dat wij no...
1143348    voorzitter een kleine beginselverklaring een d...
1143349    de volgende motie motie de kamer gehoord de be...
1143358    ik zou graag weten op basis waarvan de staatss...
Name: text, Length: 740015, dtype: object

In [21]:
# Vectorizing 
vectorizer = TfidfVectorizer(min_df= 0.0001) # Default N-gram range is (1,1)
nld_features = vectorizer.fit_transform(df_preprocessed)

In [22]:
pickle.dump(vectorizer, open("nld_vectorizer.pickle", "wb"))
pickle.dump(nld_features, open("nld_features.pickle", "wb"))