In [2]:
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
import pandas as pd

In [3]:
# collect every year's speech data and party affiliation data 
for year in range(2015, 2022):
    speech = [] # a list to collect speech data
    party = [] # a list to collect party affiliation data
    file_names = pd.read_csv(str(year)+"_fnames.txt", header=None) # read the file name lists of the year
    for f in file_names[0]:
        if '.txt' in f: # txt file contains the speech data
            text_data = pd.read_csv(str(year)+"/"+f, sep='\t', header=None)
            speech.append(text_data.iloc[:,[1]])
        else: # tsv file contains the party affiliation data
            label_data = pd.read_csv(str(year)+"/"+f, sep='\t')
            party.append(label_data[['Speaker_party']])
            
    if year == 2015:
        speech_party_2015 = pd.concat([speech[0],party[0]],axis=1) 
        for i in range(1,len(speech)):
            sp = pd.concat([speech[i],party[i]],axis=1)
            speech_party_2015 = pd.concat([speech_party_2015, sp], ignore_index=True)
    elif year == 2016:        
        speech_party_2016 = pd.concat([speech[0],party[0]],axis=1)
        for i in range(1,len(speech)):
            sp = pd.concat([speech[i],party[i]],axis=1)
            speech_party_2016 = pd.concat([speech_party_2016, sp], ignore_index=True)
    elif year == 2017:        
        speech_party_2017 = pd.concat([speech[0],party[0]],axis=1)
        for i in range(1,len(speech)):
            sp = pd.concat([speech[i],party[i]],axis=1)
            speech_party_2017 = pd.concat([speech_party_2017, sp], ignore_index=True)
    elif year == 2018:        
        speech_party_2018 = pd.concat([speech[0],party[0]],axis=1)
        for i in range(1,len(speech)):
            sp = pd.concat([speech[i],party[i]],axis=1)
            speech_party_2018 = pd.concat([speech_party_2018, sp], ignore_index=True)
    elif year == 2019:        
        speech_party_2019 = pd.concat([speech[0],party[0]],axis=1)
        for i in range(1,len(speech)):
            sp = pd.concat([speech[i],party[i]],axis=1)
            speech_party_2019 = pd.concat([speech_party_2019, sp], ignore_index=True)
    elif year == 2020:        
        speech_party_2020 = pd.concat([speech[0],party[0]],axis=1)
        for i in range(1,len(speech)):
            sp = pd.concat([speech[i],party[i]],axis=1)
            speech_party_2020 = pd.concat([speech_party_2020, sp], ignore_index=True)
    elif year == 2021:        
        speech_party_2021 = pd.concat([speech[0],party[0]],axis=1)
        for i in range(1,len(speech)):
            sp = pd.concat([speech[i],party[i]],axis=1)
            speech_party_2021 = pd.concat([speech_party_2021, sp], ignore_index=True)

In [4]:
# put all of the year data in one data frame
df_list = [speech_party_2015, speech_party_2016, speech_party_2017, speech_party_2018, speech_party_2019, speech_party_2020, speech_party_2021]
speech_party = pd.concat(df_list, ignore_index=True)

In [8]:
# count the total number of dataset
speech_party['Speaker_party'].count()

550489

In [9]:
# count values
speech_party['Speaker_party'].value_counts()

CON            321842
LAB            133548
LD              30220
SNP             27004
CB              15646
DUP              6382
I                2531
PC               2415
-                2393
BI               1771
GP               1766
CON;I             824
SDLP              813
UUP               710
LAB;NA            573
UKIP              490
CON;NA            225
IL                158
IGC               155
CB;NA             139
A                 129
LI                 96
IGC;LJ95           90
PC;I               88
LD;NA              73
I;LD               70
QMZZ               53
L8TA               49
NA;LAB             44
CON;CB             31
LD;CB              24
I;CON              23
ZKPW               21
64RT               18
SNP;I              18
LAB;I              16
CON;I;LD            9
UKIP;NA             8
NA;LD               7
NA;CB               7
CON;LD              3
UKIP;I              2
CON;0UBS;NA         2
LD;NA;CB            1
I;LAB               1
CON;NA;CB 

In [4]:
# collect the speech and party affiliation data of 2015 with downsized samples of 1000
speech = [] # a list to collect speech data
party = [] # a list to collect party affiliation data
file_names = pd.read_csv("2015_fnames.txt", header=None) # read the file name lists of the year 2015
for f in file_names[0]:
    if '.txt' in f: # txt file contains the speech data
        text_data = pd.read_csv("2015/"+f, sep='\t', header=None)
        speech.append(text_data.iloc[:,[1]])
    else: # tsv file contains the party affiliation data
        label_data = pd.read_csv("2015/"+f, sep='\t')
        party.append(label_data[['Speaker_party']])
        
speech_party = pd.concat([speech[0],party[0]],axis=1) 
for i in range(1,len(speech)):
    sp = pd.concat([speech[i],party[i]],axis=1)
    speech_party = pd.concat([speech_party, sp], ignore_index=True)
# speech_party['Speaker_party'].value_counts()
df_CON = speech_party[speech_party['Speaker_party']=='CON'] # right party
df_LAB = speech_party[speech_party['Speaker_party']=='LAB'] # left party
            
df_CON_downsampled = df_CON.sample(1000) # downsample to 1000 
df_LAB_downsampled = df_LAB.sample(1000) # downsample to 1000
            
df_balanced_2015 = pd.concat([df_CON_downsampled,df_LAB_downsampled]) # downsampled speech data of 2015
df_balanced_2015['Speaker_party'].value_counts()

CON    1000
LAB    1000
Name: Speaker_party, dtype: int64

In [5]:
# collect the speech and party affiliation data of 2016 with downsized samples of 1000
speech = [] # a list to collect speech data
party = [] # a list to collect party affiliation data
file_names = pd.read_csv("2016_fnames.txt", header=None) # read the file name lists of the year 2016
for f in file_names[0]:
    if '.txt' in f: # txt file contains the speech data
        text_data = pd.read_csv("2016/"+f, sep='\t', header=None)
        speech.append(text_data.iloc[:,[1]])
    else: # tsv file contains the party affiliation data
        label_data = pd.read_csv("2016/"+f, sep='\t')
        party.append(label_data[['Speaker_party']])
        
speech_party = pd.concat([speech[0],party[0]],axis=1) 
for i in range(1,len(speech)):
    sp = pd.concat([speech[i],party[i]],axis=1)
    speech_party = pd.concat([speech_party, sp], ignore_index=True)
# speech_party['Speaker_party'].value_counts()
df_CON = speech_party[speech_party['Speaker_party']=='CON'] # right party
df_LAB = speech_party[speech_party['Speaker_party']=='LAB'] # left party
            
df_CON_downsampled = df_CON.sample(1000) # downsample to 1000 
df_LAB_downsampled = df_LAB.sample(1000) # downsample to 1000
            
df_balanced_2016 = pd.concat([df_CON_downsampled,df_LAB_downsampled]) # downsampled speech data of 2016
df_balanced_2016['Speaker_party'].value_counts()

CON    1000
LAB    1000
Name: Speaker_party, dtype: int64

In [6]:
# collect the speech and party affiliation data of 2017 with downsized samples of 1000
speech = [] # a list to collect speech data
party = [] # a list to collect party affiliation data
file_names = pd.read_csv("2017_fnames.txt", header=None) # read the file name lists of the year
for f in file_names[0]:
    if '.txt' in f: # txt file contains the speech data
        text_data = pd.read_csv("2017/"+f, sep='\t', header=None)
        speech.append(text_data.iloc[:,[1]])
    else: # tsv file contains the party affiliation data
        label_data = pd.read_csv("2017/"+f, sep='\t')
        party.append(label_data[['Speaker_party']])
        
speech_party = pd.concat([speech[0],party[0]],axis=1) 
for i in range(1,len(speech)):
    sp = pd.concat([speech[i],party[i]],axis=1)
    speech_party = pd.concat([speech_party, sp], ignore_index=True)
# speech_party['Speaker_party'].value_counts()
df_CON = speech_party[speech_party['Speaker_party']=='CON'] # right party
df_LAB = speech_party[speech_party['Speaker_party']=='LAB'] # left party
            
df_CON_downsampled = df_CON.sample(1000) # downsample to 1000 
df_LAB_downsampled = df_LAB.sample(1000) # downsample to 1000
            
df_balanced_2017 = pd.concat([df_CON_downsampled,df_LAB_downsampled]) # downsampled speech data of 2017
df_balanced_2017['Speaker_party'].value_counts()

CON    1000
LAB    1000
Name: Speaker_party, dtype: int64

In [7]:
# collect the speech and party affiliation data of 2018 with downsized samples of 1000
speech = [] # a list to collect speech data
party = [] # a list to collect party affiliation data
file_names = pd.read_csv("2018_fnames.txt", header=None) # read the file name lists of the year, 2018
for f in file_names[0]:
    if '.txt' in f: # txt file contains the speech data
        text_data = pd.read_csv("2018/"+f, sep='\t', header=None)
        speech.append(text_data.iloc[:,[1]])
    else: # tsv file contains the party affiliation data
        label_data = pd.read_csv("2018/"+f, sep='\t')
        party.append(label_data[['Speaker_party']])
        
speech_party = pd.concat([speech[0],party[0]],axis=1) 
for i in range(1,len(speech)):
    sp = pd.concat([speech[i],party[i]],axis=1)
    speech_party = pd.concat([speech_party, sp], ignore_index=True)
# speech_party['Speaker_party'].value_counts()
df_CON = speech_party[speech_party['Speaker_party']=='CON'] # right party
df_LAB = speech_party[speech_party['Speaker_party']=='LAB'] # left party
            
df_CON_downsampled = df_CON.sample(1000) # downsample to 1000 
df_LAB_downsampled = df_LAB.sample(1000) # downsample to 1000
            
df_balanced_2018 = pd.concat([df_CON_downsampled,df_LAB_downsampled]) # downsampled speech data of 2018
df_balanced_2018['Speaker_party'].value_counts()

CON    1000
LAB    1000
Name: Speaker_party, dtype: int64

In [8]:
# collect the speech and party affiliation data of 2019 with downsized samples of 1000
speech = [] # a list to collect speech data
party = [] # a list to collect party affiliation data
file_names = pd.read_csv("2019_fnames.txt", header=None) # read the file name lists of the year, 2019
for f in file_names[0]:
    if '.txt' in f: # txt file contains the speech data
        text_data = pd.read_csv("2019/"+f, sep='\t', header=None)
        speech.append(text_data.iloc[:,[1]])
    else: # tsv file contains the party affiliation data
        label_data = pd.read_csv("2019/"+f, sep='\t')
        party.append(label_data[['Speaker_party']])
        
speech_party = pd.concat([speech[0],party[0]],axis=1) 
for i in range(1,len(speech)):
    sp = pd.concat([speech[i],party[i]],axis=1)
    speech_party = pd.concat([speech_party, sp], ignore_index=True)
# speech_party['Speaker_party'].value_counts()
df_CON = speech_party[speech_party['Speaker_party']=='CON'] # right party
df_LAB = speech_party[speech_party['Speaker_party']=='LAB'] # left party
            
df_CON_downsampled = df_CON.sample(1000) # downsample to 1000 
df_LAB_downsampled = df_LAB.sample(1000) # downsample to 1000
            
df_balanced_2019 = pd.concat([df_CON_downsampled,df_LAB_downsampled]) # downsampled speech data of 2019
df_balanced_2019['Speaker_party'].value_counts()

CON    1000
LAB    1000
Name: Speaker_party, dtype: int64

In [9]:
# collect the speech and party affiliation data of 2020 with downsized samples of 1000
speech = [] # a list to collect speech data
party = [] # a list to collect party affiliation data
file_names = pd.read_csv("2020_fnames.txt", header=None) # read the file name lists of the year, 2020
for f in file_names[0]:
    if '.txt' in f: # txt file contains the speech data
        text_data = pd.read_csv("2020/"+f, sep='\t', header=None)
        speech.append(text_data.iloc[:,[1]])
    else: # tsv file contains the party affiliation data
        label_data = pd.read_csv("2020/"+f, sep='\t')
        party.append(label_data[['Speaker_party']])
        
speech_party = pd.concat([speech[0],party[0]],axis=1) 
for i in range(1,len(speech)):
    sp = pd.concat([speech[i],party[i]],axis=1)
    speech_party = pd.concat([speech_party, sp], ignore_index=True)
# speech_party['Speaker_party'].value_counts()
df_CON = speech_party[speech_party['Speaker_party']=='CON'] # right party
df_LAB = speech_party[speech_party['Speaker_party']=='LAB'] # left party
            
df_CON_downsampled = df_CON.sample(1000) # downsample to 1000 
df_LAB_downsampled = df_LAB.sample(1000) # downsample to 1000
            
df_balanced_2020 = pd.concat([df_CON_downsampled,df_LAB_downsampled]) # downsampled speech data of 2020
df_balanced_2020['Speaker_party'].value_counts()

CON    1000
LAB    1000
Name: Speaker_party, dtype: int64

In [10]:
# collect the speech and party affiliation data of 2021 with downsized samples of 1000
speech = [] # a list to collect speech data
party = [] # a list to collect party affiliation data
file_names = pd.read_csv("2021_fnames.txt", header=None) # read the file name lists of the year, 2021
for f in file_names[0]:
    if '.txt' in f: # txt file contains the speech data
        text_data = pd.read_csv("2021/"+f, sep='\t', header=None)
        speech.append(text_data.iloc[:,[1]])
    else: # tsv file contains the party affiliation data
        label_data = pd.read_csv("2021/"+f, sep='\t')
        party.append(label_data[['Speaker_party']])
        
speech_party = pd.concat([speech[0],party[0]],axis=1) 
for i in range(1,len(speech)):
    sp = pd.concat([speech[i],party[i]],axis=1)
    speech_party = pd.concat([speech_party, sp], ignore_index=True)
# speech_party['Speaker_party'].value_counts()
df_CON = speech_party[speech_party['Speaker_party']=='CON'] # right party
df_LAB = speech_party[speech_party['Speaker_party']=='LAB'] # left party
            
df_CON_downsampled = df_CON.sample(1000) # downsample to 1000 
df_LAB_downsampled = df_LAB.sample(1000) # downsample to 1000
            
df_balanced_2021 = pd.concat([df_CON_downsampled,df_LAB_downsampled]) # downsampled speech data of 2021
df_balanced_2021['Speaker_party'].value_counts()

CON    1000
LAB    1000
Name: Speaker_party, dtype: int64

In [11]:
# put all of the year data in one data frame
df_list = [df_balanced_2015, df_balanced_2016, df_balanced_2017, df_balanced_2018, df_balanced_2019, df_balanced_2020, df_balanced_2021]
speech_party = pd.concat(df_list, ignore_index=True)
speech_party.rename(columns={1:'Speech'}, inplace=True) # change the name of column which contains speech data as 'Speech'

In [12]:
# label our dataset into 1 and 0
# if it is right party, label would be 1 and if it is left party, label would be 0
speech_party['CON']=speech_party['Speaker_party'].apply(lambda x: 1 if x=='CON' else 0) 
speech_party

Unnamed: 0,Speech,Speaker_party,CON
0,What does the Secretary of State believe the n...,CON,1
1,"Given the importance of value for money, does ...",CON,1
2,"My Lords, with Amendment 33Y, we come to the i...",CON,1
3,I give my hon. Friend an absolute assurance. O...,CON,1
4,I will conclude by saying that some remarkable...,CON,1
...,...,...,...
13995,"As we know, we are now a year into this pandem...",LAB,0
13996,May I first declare that I am a member of Unit...,LAB,0
13997,"My Lords, many experts conclude that without n...",LAB,0
13998,One of my constituents is a long-term in-patie...,LAB,0


In [13]:
# split the dataset using the train_test_split
from sklearn.model_selection import train_test_split

X = speech_party.Speech.values
y = speech_party.CON.values

X_train, X_test, y_train, y_test = train_test_split(X,y,stratify=speech_party['CON'])

In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer

# word vectorization using TF-IDF so that speech data(text data) can be used in SVM classifier
Tfidf_vect = TfidfVectorizer(max_features=5000)
Tfidf_vect.fit(speech_party['Speech'])
X_train_Tfidf = Tfidf_vect.transform(X_train)
X_test_Tfidf = Tfidf_vect.transform(X_test)

In [25]:
from sklearn import svm
from sklearn.metrics import classification_report

# fit the training dataset on the SVM classifier
SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
SVM.fit(X_train_Tfidf,y_train)

# predict the labels on validation dataset
predictions_SVM = SVM.predict(X_test_Tfidf)

# classification report of the current NB classifier
print(classification_report(y_test, predictions_SVM))

              precision    recall  f1-score   support

           0       0.75      0.77      0.76      1750
           1       0.76      0.74      0.75      1750

    accuracy                           0.75      3500
   macro avg       0.75      0.75      0.75      3500
weighted avg       0.75      0.75      0.75      3500

