In [1]:
import pandas as pd
import numpy as np
import spacy
import re
import pickle
import torch
# import cuml

from scipy.stats import pearsonr
from sklearn.feature_selection import chi2
from tqdm import tqdm
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.model_selection import train_test_split, learning_curve, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, classification_report
from sklearn.metrics import confusion_matrix as cf_matrix

In [2]:
if torch.cuda.is_available():
    print('__CUDNN VERSION:', torch.backends.cudnn.version())
    print('__Number CUDA Devices:', torch.cuda.device_count())
    print('__CUDA Device Name:',torch.cuda.get_device_name(0))
    print('__CUDA Device Total Memory [GB]:',torch.cuda.get_device_properties(0).total_memory/1e9)

__CUDNN VERSION: 8500
__Number CUDA Devices: 3
__CUDA Device Name: Tesla V100-PCIE-32GB
__CUDA Device Total Memory [GB]: 34.089926656


In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [4]:
df = pd.read_csv(r"/maps/hdir/gsw508/Thesis/Data/ParlSpeech_V2/Corp_Nationalrat_V2.csv")

In [5]:
cmp_df = pd.read_csv("/maps/hdir/gsw508/Thesis/Data/MPDataset_MPDS2022a.csv")

In [6]:
cmp_df['countryname'].unique()

array(['Sweden', 'Norway', 'Denmark', 'Finland', 'Iceland', 'Belgium',
       'Netherlands', 'Luxembourg', 'France', 'Italy', 'Spain', 'Greece',
       'Portugal', 'Germany', 'Austria', 'Switzerland', 'United Kingdom',
       'Northern Ireland', 'Ireland', 'Malta', 'Cyprus', 'United States',
       'Canada', 'Australia', 'New Zealand', 'Japan', 'Israel',
       'Sri Lanka', 'Turkey', 'Albania', 'Armenia', 'Azerbaijan',
       'Belarus', 'Bosnia-Herzegovina', 'Bulgaria', 'Croatia',
       'Czech Republic', 'Estonia', 'Georgia',
       'German Democratic Republic', 'Hungary', 'Latvia', 'Lithuania',
       'North Macedonia', 'Moldova', 'Montenegro', 'Poland', 'Romania',
       'Russia', 'Serbia', 'Slovakia', 'Slovenia', 'Ukraine',
       'South Korea', 'Mexico', 'South Africa'], dtype=object)

In [7]:
cmp_df.loc[(cmp_df['countryname'] == 'Austria'), 'partyabbrev'].value_counts()

SPÖ      22
ÖVP      22
FPÖ      19
GRÜNE     9
NEOS      3
VdU       2
GA        2
LIF       2
KPÖ       2
BZÖ       2
TS        1
PILZ      1
Name: partyabbrev, dtype: int64

In [8]:
cmp_df.loc[(cmp_df['countryname'] == 'Austria'), 'partyname'].value_counts()

Austrian Social Democratic Party      22
Austrian People’s Party               22
Austrian Freedom Party                19
The Greens                             9
League of Independents                 2
Green Alternative                      2
Liberal Forum                          2
Austrian Communist Party               2
Alliance for the Future of Austria     2
The New Austria and Liberal Forum      2
Freedom Movement                       1
The New Austria                        1
Team Stronach for Austria              1
Peter Pilz List                        1
Name: partyname, dtype: int64

In [9]:
party_df = df.loc[df['party'].isna() == False]

In [10]:
cmp_df.loc[(cmp_df['countryname'] == 'Austria') & (cmp_df['partyabbrev'] == 'PILZ'), 'parfam']

2070    10
Name: parfam, dtype: int64

In [11]:
party_df.loc[party_df['party'] == 'GIP']

Unnamed: 0,date,agenda,speechnumber,speaker,party,party.facts.id,chair,terms,text,parliament,iso3country


In [12]:
party_df['party'].value_counts()

SPÖ                   24747
ÖVP                   24063
FPÖ                   17308
Grüne                 12455
BZÖ                    4170
LIF                    1942
NEOS                   1866
independent            1555
STRONACH               1322
PILZ                    214
JETZT                    80
Jetzt – Liste PILZ       52
Name: party, dtype: int64

In [13]:
# Partyfam 10, 20, 30
left_list = ['SPÖ', 'Grüne', 'PILZ', 'JETZT', 'Jetzt – Liste PILZ']

# ???

In [14]:
party_df['left'] = 0

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  party_df['left'] = 0


In [15]:
party_df.loc[party_df['party'].isin(left_list), 'left'] = 1

In [16]:
party_df['left'].value_counts()

0    52226
1    37548
Name: left, dtype: int64

In [17]:
party_df = party_df.loc[party_df['terms'] > 40]

In [18]:
party_df.to_json('aut_party_df.json')

socio-economic (class); religious (church–state); ethnic (center–periphery); urban–rural; post-materialist; and foreign policy

In [18]:
def preprocess(text):
    'Creating preprocessing pipeline'

    # Lowercasing
    text = text.lower()

    # Removing punctuation
    text = re.sub(r'[()\.\,\?\!\"\'\-\`\:\;\\\%\*\[\]]+','', text)

    return text

In [19]:
nlp = spacy.load("xx_sent_ud_sm")
def token(text):
    doc = nlp(text)
    return [token.text for token in doc]

In [20]:
df_preprocessed = party_df['text'].apply(lambda x: preprocess(x))

In [21]:
df_preprocessed

4          herr präsident herr vizekanzler herr bundesmi...
6          selbstverständlich herr präsident ich nehme b...
8          herr präsident sehr geehrte damen und herren ...
10         herr präsident herr vizekanzler frau staatsse...
12         sehr geehrter herr präsident meine sehr geehr...
                                ...                        
199470     wenn sie so laut dazwischenschreien kann das ...
199472     machen sie mir einen anderen formulierungsvor...
199474     wenn das die ganz normale berufliche tätigkei...
199476     ich werde mich nicht vom kollegen peter jetzt...
199478     hohes präsidium – herr ausschussvorsitzender ...
Name: text, Length: 86067, dtype: object

In [22]:
# Vectorizing 
vectorizer = TfidfVectorizer(min_df= 0.0001) # Default N-gram range is (1,1)
aut_features = vectorizer.fit_transform(df_preprocessed)

In [23]:
pickle.dump(vectorizer, open("aut_vectorizer.pickle", "wb"))
pickle.dump(aut_features, open("aut_features.pickle", "wb"))