In [1]:
import pandas as pd
import numpy as np
import spacy
import re
import pickle
import torch
# import cuml

from scipy.stats import pearsonr
from sklearn.feature_selection import chi2
from tqdm import tqdm
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.model_selection import train_test_split, learning_curve, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, classification_report
from sklearn.metrics import confusion_matrix as cf_matrix

In [2]:
if torch.cuda.is_available():
    print('__CUDNN VERSION:', torch.backends.cudnn.version())
    print('__Number CUDA Devices:', torch.cuda.device_count())
    print('__CUDA Device Name:',torch.cuda.get_device_name(0))
    print('__CUDA Device Total Memory [GB]:',torch.cuda.get_device_properties(0).total_memory/1e9)

__CUDNN VERSION: 8500
__Number CUDA Devices: 3
__CUDA Device Name: Tesla V100-PCIE-32GB
__CUDA Device Total Memory [GB]: 34.089926656


In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [4]:
df = pd.read_csv(r"/maps/hdir/gsw508/Thesis/Data/ParlSpeech_V2/Corp_NZHoR_V2.csv")

In [5]:
df

Unnamed: 0,date,agenda,speechnumber,speaker,party,party.facts.id,chair,terms,text,parliament,iso3country
0,1996-12-13,ELECTION OF SPEAKER,1,MR SPEAKER,,,True,333,"I have to report that, accompanied by members...",NZ-House_of_Representatives,NZL
1,1996-12-13,POINT OF ORDER---BUSINESS COMMITTEE,2,RICHARD PREBBLE,ACT,752.0,False,50,"I raise a point of order, Mr Speaker. I can r...",NZ-House_of_Representatives,NZL
2,1996-12-13,POINT OF ORDER---BUSINESS COMMITTEE,3,MR SPEAKER,,,True,6,I will hear the member.,NZ-House_of_Representatives,NZL
3,1996-12-13,POINT OF ORDER---BUSINESS COMMITTEE,4,RICHARD PREBBLE,ACT,752.0,False,108,"Under Standing Order 76, a Business Committee...",NZ-House_of_Representatives,NZL
4,1996-12-13,POINT OF ORDER---BUSINESS COMMITTEE,5,RICHARD PREBBLE,ACT,752.0,False,115,"I am obliged to the member. Certainly, we wer...",NZ-House_of_Representatives,NZL
...,...,...,...,...,...,...,...,...,...,...,...
925761,2016-11-09,RATES REBATE (RETIREMENT VILLAGE RESIDENTS) A...,278,RON MARK,NZ First,591.0,False,546,I rise with a couple of things on my mind. Fi...,NZ-House_of_Representatives,NZL
925762,2016-11-09,RATES REBATE (RETIREMENT VILLAGE RESIDENTS) A...,279,JOANNE HAYES,National,1824.0,False,294,I rise to take a brief call on the Rates Reba...,NZ-House_of_Representatives,NZL
925763,2016-11-09,RATES REBATE (RETIREMENT VILLAGE RESIDENTS) A...,280,BARRY COATES,Green,1099.0,False,221,I will take a short call in the interests of ...,NZ-House_of_Representatives,NZL
925764,2016-11-09,RATES REBATE (RETIREMENT VILLAGE RESIDENTS) A...,281,MATT DOOCEY,National,1824.0,False,675,It is a pleasure to rise and speak to the Rat...,NZ-House_of_Representatives,NZL


In [6]:
cmp_df = pd.read_csv("/maps/hdir/gsw508/Thesis/Data/MPDataset_MPDS2022a.csv")

In [7]:
cmp_df['countryname'].unique()

array(['Sweden', 'Norway', 'Denmark', 'Finland', 'Iceland', 'Belgium',
       'Netherlands', 'Luxembourg', 'France', 'Italy', 'Spain', 'Greece',
       'Portugal', 'Germany', 'Austria', 'Switzerland', 'United Kingdom',
       'Northern Ireland', 'Ireland', 'Malta', 'Cyprus', 'United States',
       'Canada', 'Australia', 'New Zealand', 'Japan', 'Israel',
       'Sri Lanka', 'Turkey', 'Albania', 'Armenia', 'Azerbaijan',
       'Belarus', 'Bosnia-Herzegovina', 'Bulgaria', 'Croatia',
       'Czech Republic', 'Estonia', 'Georgia',
       'German Democratic Republic', 'Hungary', 'Latvia', 'Lithuania',
       'North Macedonia', 'Moldova', 'Montenegro', 'Poland', 'Romania',
       'Russia', 'Serbia', 'Slovakia', 'Slovenia', 'Ukraine',
       'South Korea', 'Mexico', 'South Africa'], dtype=object)

In [8]:
cmp_df.loc[(cmp_df['countryname'] == 'New Zealand'), 'partyabbrev'].value_counts()

Labour         26
National       26
ACT             9
NZF             8
Greens          8
Alliance        3
Progressive     3
NZDP            2
Mana            1
Name: partyabbrev, dtype: int64

In [9]:
cmp_df.loc[(cmp_df['countryname'] == 'New Zealand'), 'partyname'].value_counts()

New Zealand Labour Party                26
New Zealand National Party              26
Social Credit Political League          11
ACT New Zealand                          9
New Zealand First Party                  8
Green Party of Aotearoa New Zealand      8
Māori Party                              6
United Future New Zealand                4
The Alliance                             3
New Zealand Democratic Party             2
Jim Anderton’s Progressive Coalition     1
Jim Anderton’s Progressive               1
Progressive Party                        1
Mana Party                               1
Name: partyname, dtype: int64

In [10]:
party_df = df.loc[df['party'].isna() == False]

In [11]:
cmp_df.loc[(cmp_df['countryname'] == 'New Zealand') & (cmp_df['partyname'] == 'Mana Party'), 'parfam']

2853    90
Name: parfam, dtype: int64

In [12]:
party_df['party'].value_counts()

National                    329562
Labour                      325685
NZ First                     53848
ACT                          32087
Green                        28119
Alliance                     13059
United Future                 6880
Maori                         5456
Progressive                   2001
United NZ                     1635
Independent                    981
Mauri Pacific                  806
NewLabour                      417
Christian Democrat Party       264
Te Tawharau                    241
Mana                           189
Liberal                        172
Conservative Party             135
Future                          46
Christian Heritage Party        39
Mana Wahine                     12
Name: party, dtype: int64

In [13]:
# Partyfam 10, 20, 30
left_list = ['Labour', 'Greens', 'Alliance', 'NewLabour']

# 

In [14]:
party_df['left'] = 0

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  party_df['left'] = 0


In [15]:
party_df.loc[party_df['party'].isin(left_list), 'left'] = 1

In [16]:
party_df['left'].value_counts()

0    462473
1    339161
Name: left, dtype: int64

In [17]:
party_df = party_df.loc[party_df['terms'] > 40]

In [18]:
party_df.to_json('nzl_party_df.json')

socio-economic (class); religious (church–state); ethnic (center–periphery); urban–rural; post-materialist; and foreign policy

In [18]:
def preprocess(text):
    'Creating preprocessing pipeline'

    # Lowercasing
    text = text.lower()

    # Removing punctuation
    text = re.sub(r'[()\.\,\?\!\"\'\-\`\:\;\\\%\*\[\]]+','', text)

    return text

In [19]:
nlp = spacy.load("xx_sent_ud_sm")
def token(text):
    doc = nlp(text)
    return [token.text for token in doc]

In [20]:
df_preprocessed = party_df['text'].apply(lambda x: preprocess(x))

In [21]:
df_preprocessed

1          i raise a point of order mr speaker i can rai...
3          under standing order 76 a business committee ...
4          i am obliged to the member certainly we were ...
5          it may help the chamber if i indicate to the ...
7          i raise a further point of order mr speaker i...
                                ...                        
925761     i rise with a couple of things on my mind fir...
925762     i rise to take a brief call on the rates reba...
925763     i will take a short call in the interests of ...
925764     it is a pleasure to rise and speak to the rat...
925765     can i say what a pleasure it is to have broug...
Name: text, Length: 444080, dtype: object

In [22]:
# Vectorizing 
vectorizer = TfidfVectorizer(min_df= 0.0001) # Default N-gram range is (1,1)
nzl_features = vectorizer.fit_transform(df_preprocessed)

In [23]:
pickle.dump(vectorizer, open("nzl_vectorizer.pickle", "wb"))
pickle.dump(nzl_features, open("nzl_features.pickle", "wb"))