In [1]:
import pandas as pd
import numpy as np
import spacy
import re
import pickle
import torch
# import cuml

from scipy.stats import pearsonr
from sklearn.feature_selection import chi2
from tqdm import tqdm
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.model_selection import train_test_split, learning_curve, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, classification_report
from sklearn.metrics import confusion_matrix as cf_matrix

In [2]:
if torch.cuda.is_available():
    print('__CUDNN VERSION:', torch.backends.cudnn.version())
    print('__Number CUDA Devices:', torch.cuda.device_count())
    print('__CUDA Device Name:',torch.cuda.get_device_name(0))
    print('__CUDA Device Total Memory [GB]:',torch.cuda.get_device_properties(0).total_memory/1e9)

__CUDNN VERSION: 8500
__Number CUDA Devices: 3
__CUDA Device Name: Tesla V100-PCIE-32GB
__CUDA Device Total Memory [GB]: 34.089926656


In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [4]:
df = pd.read_csv(r"/maps/hdir/gsw508/Thesis/Data/ParlSpeech_V2/Corp_Bundestag_V2.csv")

  df = pd.read_csv(r"/maps/hdir/gsw508/Thesis/Data/ParlSpeech_V2/Corp_Bundestag_V2.csv")


In [5]:
cmp_df = pd.read_csv("/maps/hdir/gsw508/Thesis/Data/MPDataset_MPDS2022a.csv")

In [6]:
cmp_df['countryname'].unique()

array(['Sweden', 'Norway', 'Denmark', 'Finland', 'Iceland', 'Belgium',
       'Netherlands', 'Luxembourg', 'France', 'Italy', 'Spain', 'Greece',
       'Portugal', 'Germany', 'Austria', 'Switzerland', 'United Kingdom',
       'Northern Ireland', 'Ireland', 'Malta', 'Cyprus', 'United States',
       'Canada', 'Australia', 'New Zealand', 'Japan', 'Israel',
       'Sri Lanka', 'Turkey', 'Albania', 'Armenia', 'Azerbaijan',
       'Belarus', 'Bosnia-Herzegovina', 'Bulgaria', 'Croatia',
       'Czech Republic', 'Estonia', 'Georgia',
       'German Democratic Republic', 'Hungary', 'Latvia', 'Lithuania',
       'North Macedonia', 'Moldova', 'Montenegro', 'Poland', 'Romania',
       'Russia', 'Serbia', 'Slovakia', 'Slovenia', 'Ukraine',
       'South Korea', 'Mexico', 'South Africa'], dtype=object)

In [7]:
cmp_df.loc[(cmp_df['countryname'] == 'Germany'), 'partyabbrev'].value_counts()

FDP          20
CDU/CSU      20
SPD          20
90/Greens     8
PDS           4
LINKE         4
AfD           3
DP            3
DZ            2
SSW           2
L-PDS         1
KPD           1
Greens/90     1
GB/BHE        1
DRP           1
WAV           1
BP            1
Name: partyabbrev, dtype: int64

In [8]:
cmp_df.loc[(cmp_df['countryname'] == 'Germany'), 'partyname'].value_counts()

Social Democratic Party of Germany                   20
Free Democratic Party                                20
Christian Democratic Union/Christian Social Union    20
Alliance‘90/Greens                                    8
The Left                                              4
Party of Democratic Socialism                         4
Alternative for Germany                               3
German Party                                          3
The Greens                                            2
South Schleswig Voters’ Union                         2
Centre Party                                          2
Refugee Party                                         1
Bavarian Party                                        1
Greens/Alliance‘90                                    1
German Reich Party                                    1
Economic Reconstruction League                        1
The Left. Party of Democratic Socialism               1
Pirates                                         

In [9]:
party_df = df.loc[df['party'].isna() == False]

In [10]:
cmp_df.loc[(cmp_df['countryname'] == 'Germany') & (cmp_df['partyabbrev'] == 'LINKE'), 'parfam']

1968    20
1973    20
1980    20
1986    20
Name: parfam, dtype: int64

In [11]:
party_df['party'].value_counts()

CDU/CSU        97081
SPD            88662
FDP            47701
GRUENE         43131
PDS/LINKE      29494
AfD             1495
independent      642
Name: party, dtype: int64

In [12]:
# Partyfam 10, 20, 30
left_list = ['SPD', 'GRUENE', 'PDS/LINKE']

# ???

In [13]:
party_df['left'] = 0

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  party_df['left'] = 0


In [14]:
party_df.loc[party_df['party'].isin(left_list), 'left'] = 1

In [15]:
party_df['left'].value_counts()

1    161287
0    146919
Name: left, dtype: int64

In [16]:
party_df = party_df.loc[party_df['terms'] > 40]

In [17]:
party_df.to_json('deu_party_df.json')

socio-economic (class); religious (church–state); ethnic (center–periphery); urban–rural; post-materialist; and foreign policy

In [17]:
def preprocess(text):
    'Creating preprocessing pipeline'

    # Lowercasing
    text = text.lower()

    # Removing punctuation
    text = re.sub(r'[()\.\,\?\!\"\'\-\`\:\;\\\%\*\[\]]+','', text)

    return text

In [18]:
nlp = spacy.load("xx_sent_ud_sm")
def token(text):
    doc = nlp(text)
    return [token.text for token in doc]

In [19]:
df_preprocessed = party_df['text'].apply(lambda x: preprocess(x))

In [20]:
df_preprocessed

0         frau präsidentin  meine sehr geehrten damen un...
2         nicht viel besser sieht es im straßenwesen aus...
3         meine damen und herren  wie bereits heute morg...
4         geehrte frau präsidentin  meine damen und herr...
6         frau präsidentin  meine sehr verehrten damen u...
                                ...                        
379533    sehr geehrter herr präsident liebe kollegen un...
379535    sehr geehrter herr präsident meine sehr geehrt...
379537    herr präsident liebe kolleginnen liebe kollege...
379539    sehr geehrter herr präsident liebe kolleginnen...
379543    ich bedanke mich bei den mitarbeitern der verw...
Name: text, Length: 188664, dtype: object

In [21]:
# Vectorizing 
vectorizer = TfidfVectorizer(min_df= 0.0001) # Default N-gram range is (1,1)
deu_features = vectorizer.fit_transform(df_preprocessed)

In [22]:
pickle.dump(vectorizer, open("deu_vectorizer.pickle", "wb"))
pickle.dump(deu_features, open("deu_features.pickle", "wb"))

In [23]:
vectorizer

TfidfVectorizer(min_df=0.0001)