In [1]:
import pandas as pd
import numpy as np
import spacy
import re
import pickle
import torch
# import cuml

from scipy.stats import pearsonr
from sklearn.feature_selection import chi2
from tqdm import tqdm
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.model_selection import train_test_split, learning_curve, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, classification_report
from sklearn.metrics import confusion_matrix as cf_matrix

In [2]:
if torch.cuda.is_available():
    print('__CUDNN VERSION:', torch.backends.cudnn.version())
    print('__Number CUDA Devices:', torch.cuda.device_count())
    print('__CUDA Device Name:',torch.cuda.get_device_name(0))
    print('__CUDA Device Total Memory [GB]:',torch.cuda.get_device_properties(0).total_memory/1e9)

__CUDNN VERSION: 8500
__Number CUDA Devices: 3
__CUDA Device Name: Tesla V100-PCIE-32GB
__CUDA Device Total Memory [GB]: 34.089926656


In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [4]:
df = pd.read_csv(r"/maps/hdir/gsw508/Thesis/Data/ParlSpeech_V2/Corp_Riksdagen_V2.csv")

In [5]:
cmp_df = pd.read_csv("/maps/hdir/gsw508/Thesis/Data/MPDataset_MPDS2022a.csv")

In [6]:
cmp_df['countryname'].unique()

array(['Sweden', 'Norway', 'Denmark', 'Finland', 'Iceland', 'Belgium',
       'Netherlands', 'Luxembourg', 'France', 'Italy', 'Spain', 'Greece',
       'Portugal', 'Germany', 'Austria', 'Switzerland', 'United Kingdom',
       'Northern Ireland', 'Ireland', 'Malta', 'Cyprus', 'United States',
       'Canada', 'Australia', 'New Zealand', 'Japan', 'Israel',
       'Sri Lanka', 'Turkey', 'Albania', 'Armenia', 'Azerbaijan',
       'Belarus', 'Bosnia-Herzegovina', 'Bulgaria', 'Croatia',
       'Czech Republic', 'Estonia', 'Georgia',
       'German Democratic Republic', 'Hungary', 'Latvia', 'Lithuania',
       'North Macedonia', 'Moldova', 'Montenegro', 'Poland', 'Romania',
       'Russia', 'Serbia', 'Slovakia', 'Slovenia', 'Ukraine',
       'South Korea', 'Mexico', 'South Africa'], dtype=object)

In [7]:
cmp_df.loc[(cmp_df['countryname'] == 'Sweden'), 'partyabbrev'].value_counts()

SAP    23
FP     22
CP     19
MSP    15
MP      9
VPK     8
V       8
SKP     7
Kd      6
SD      3
KdS     2
KDS     1
NyD     1
L       1
Name: partyabbrev, dtype: int64

In [8]:
cmp_df.loc[(cmp_df['countryname'] == 'Sweden'), 'partyname'].value_counts()

Social Democratic Labour Party          23
Centre Party                            19
People’s Party                          15
Moderate Coalition Party                15
Green Ecology Party                      9
Right Party                              8
Left Communists Party                    8
Left Party                               8
Communist Party of Sweden                7
Liberal People’s Party                   7
Christian Democrats                      6
Agrarian Party                           4
Sweden Democrats                         3
Christian Democratic Community Party     2
Christian Democratic Coalition           1
New Democracy                            1
Liberals                                 1
Name: partyname, dtype: int64

In [9]:
party_df = df.loc[df['party'].isna() == False]

In [10]:
cmp_df.loc[(cmp_df['countryname'] == 'Sweden') & (cmp_df['partyabbrev'] == 'NyD'), 'parfam']

84    95
Name: parfam, dtype: int64

In [11]:
party_df['party'].value_counts()

S      115764
M       70484
V       34953
FP      34219
C       33871
KD      29490
MP      29447
SD       7278
NYD      2962
L        2361
Name: party, dtype: int64

In [12]:
# Partyfam 10, 20, 30
left_list = ['S', 'V', 'MP']

# ???

In [13]:
party_df['left'] = 0

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  party_df['left'] = 0


In [14]:
party_df.loc[party_df['party'].isin(left_list), 'left'] = 1

In [15]:
party_df['left'].value_counts()

0    180665
1    180164
Name: left, dtype: int64

In [16]:
party_df = party_df.loc[party_df['terms'] > 40]

In [17]:
party_df.to_json('swe_party_df.json')

socio-economic (class); religious (church–state); ethnic (center–periphery); urban–rural; post-materialist; and foreign policy

In [17]:
def preprocess(text):
    'Creating preprocessing pipeline'

    # Lowercasing
    text = text.lower()

    # Removing punctuation
    text = re.sub(r'[()\.\,\?\!\"\'\-\`\:\;\\\%\*\[\]]+','', text)

    return text

In [18]:
nlp = spacy.load("xx_sent_ud_sm")
def token(text):
    doc = nlp(text)
    return [token.text for token in doc]

In [19]:
df_preprocessed = party_df['text'].apply(lambda x: preprocess(x))

In [20]:
df_preprocessed

2         herr talman larsove hagberg har frågat mig om ...
3         herr talman när borlänge kommun olyckligtvis v...
4         herr talman i slutet av mitt svar anger jag at...
5         herr talman det komplicerade i denna fråga sku...
7         herr talman om jag får tolka arbetsmarknadsmin...
                                ...                        
365555    herr talman vi ska komma ihåg att vi har en ar...
365556    fru talman det som började så känslosamt och p...
365557    fru talman jag försökte vara stillsam hövlig o...
365558    fru talman då kom näringsministern i gång orde...
365559    fru talman jag vill inledningsvis tacka per sv...
Name: text, Length: 354030, dtype: object

In [21]:
# Vectorizing 
vectorizer = TfidfVectorizer(min_df= 0.0001) # Default N-gram range is (1,1)
swe_features = vectorizer.fit_transform(df_preprocessed)

In [22]:
pickle.dump(vectorizer, open("swe_vectorizer.pickle", "wb"))
pickle.dump(swe_features, open("swe_features.pickle", "wb"))