In [1]:
import pandas as pd
import numpy as np
import spacy
import re
import pickle
import torch
import seaborn as sns
# import cuml

from scipy.stats import pearsonr
from sklearn.feature_selection import chi2
from tqdm import tqdm
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.model_selection import train_test_split, learning_curve, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, classification_report
from sklearn.metrics import confusion_matrix as cf_matrix

In [2]:
if torch.cuda.is_available():
    print('__CUDNN VERSION:', torch.backends.cudnn.version())
    print('__Number CUDA Devices:', torch.cuda.device_count())
    print('__CUDA Device Name:',torch.cuda.get_device_name(0))
    print('__CUDA Device Total Memory [GB]:',torch.cuda.get_device_properties(0).total_memory/1e9)

__CUDNN VERSION: 8500
__Number CUDA Devices: 3
__CUDA Device Name: Tesla V100-PCIE-32GB
__CUDA Device Total Memory [GB]: 34.089926656


In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [4]:
df = pd.read_csv(r"/maps/hdir/gsw508/Thesis/Data/ParlSpeech_V2/Corp_HouseOfCommons_V2.csv")

In [5]:
cmp_df = pd.read_csv("/maps/hdir/gsw508/Thesis/Data/MPDataset_MPDS2022a.csv")

In [6]:
cmp_df['countryname'].unique()

array(['Sweden', 'Norway', 'Denmark', 'Finland', 'Iceland', 'Belgium',
       'Netherlands', 'Luxembourg', 'France', 'Italy', 'Spain', 'Greece',
       'Portugal', 'Germany', 'Austria', 'Switzerland', 'United Kingdom',
       'Northern Ireland', 'Ireland', 'Malta', 'Cyprus', 'United States',
       'Canada', 'Australia', 'New Zealand', 'Japan', 'Israel',
       'Sri Lanka', 'Turkey', 'Albania', 'Armenia', 'Azerbaijan',
       'Belarus', 'Bosnia-Herzegovina', 'Bulgaria', 'Croatia',
       'Czech Republic', 'Estonia', 'Georgia',
       'German Democratic Republic', 'Hungary', 'Latvia', 'Lithuania',
       'North Macedonia', 'Moldova', 'Montenegro', 'Poland', 'Romania',
       'Russia', 'Serbia', 'Slovakia', 'Slovenia', 'Ukraine',
       'South Korea', 'Mexico', 'South Africa'], dtype=object)

In [7]:
cmp_df.loc[(cmp_df['countryname'] == 'United Kingdom'), 'partyabbrev'].value_counts()

Labour           21
Conservatives    21
LibDems           8
SNP               6
DUP               6
SF                5
UUP               4
UKIP              3
GPEW              3
PC                3
SDP               2
SDLP              2
Alliance          1
Name: partyabbrev, dtype: int64

In [8]:
party_df = df.loc[df['party'].isna() == False]

In [9]:
cmp_df.loc[(cmp_df['countryname'] == 'United Kingdom') & (cmp_df['partyabbrev'] == 'SDLP'), 'parfam']

2309    30
2329    30
Name: parfam, dtype: int64

In [10]:
# Partyfam 10, 20, 30
left_list = ['Lab', 'GPEW', 'SDLP', 'SDP'] 

# Sinn Fein does not sit in parliament, but is considered left-wing. As such, a party is missing from the data, kind of.
# Also, Respect could be considered left-wing, but has not been categorised by CMP.

In [11]:
party_df['left'] = 0

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  party_df['left'] = 0


In [12]:
party_df.loc[party_df['party'].isin(left_list), 'left'] = 1

In [13]:
party_df['left'].value_counts()

0    1098808
1     761526
Name: left, dtype: int64

In [14]:
party_df['year'] = pd.DatetimeIndex(party_df['date']).year

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  party_df['year'] = pd.DatetimeIndex(party_df['date']).year


In [None]:
sns.histplot(x = 'year', data = party_df,  binwidth = 1, hue = 'left');

In [14]:
party_df = party_df.loc[party_df['terms'] > 40]

In [16]:
party_df.to_json('eng_party_df.json')

socio-economic (class); religious (church–state); ethnic (center–periphery); urban–rural; post-materialist; and foreign policy

In [15]:
def preprocess(text):
    'Creating preprocessing pipeline'

    # Lowercasing
    text = text.lower()

    # Removing punctuation
    text = re.sub(r'[()\.\,\?\!\"\'\-\`\:\;\\\%\*\[\]]+','', text)

    return text

In [16]:
# !python3 -m spacy download xx_sent_ud_sm

In [17]:
nlp = spacy.load("xx_sent_ud_sm")
def token(text):
    doc = nlp(text)
    return [token.text for token in doc]

In [18]:
df_preprocessed = party_df['text'].apply(lambda x: preprocess(x))

In [19]:
df_preprocessed

2          i beg to move that an humble address be presen...
3          i am delighted to second the motion when i had...
4          i am sure that i speak for the majority of hon...
6          the hon gentleman says  appeal of course she i...
7          first i join the leader of the opposition in w...
                                 ...                        
1956213    may i associate myself with the remarks of the...
1956214    may i mr speakerelect give you heartfelt congr...
1956215    on behalf of the democratic unionist party mr ...
1956216    diolch yn fawr mr darparlefarydd a llongyfarch...
1956217    i join those who have paid tribute to the two ...
Name: text, Length: 1459922, dtype: object

In [20]:
# Vectorizing 
vectorizer = TfidfVectorizer(min_df= 0.0001) # Default N-gram range is (1,1)
eng_features = vectorizer.fit_transform(df_preprocessed)

In [21]:
pickle.dump(vectorizer, open("eng_vectorizer.pickle", "wb"))
pickle.dump(eng_features, open("eng_features.pickle", "wb"))

In [22]:
vectorizer

TfidfVectorizer(min_df=0.0001)

In [23]:
eng_features

<1459922x26026 sparse matrix of type '<class 'numpy.float64'>'
	with 169482699 stored elements in Compressed Sparse Row format>

In [24]:
pd.read_pickle("eng_features.pickle")

<1459922x26026 sparse matrix of type '<class 'numpy.float64'>'
	with 169482699 stored elements in Compressed Sparse Row format>