In [36]:
import os

import matplotlib.pyplot as plt
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer

In [22]:
# Upload the corpus CSV (corpus_prep1.csv) to Azure Blob Storage, generate a
# shared access signature (SAS) URL for it, and expose that URL through the
# CORPUS_CSV_URL environment variable before starting the kernel.
# The URL is deliberately NOT pasted into the notebook: a SAS URL carries a
# `sig=` signature that grants access to the blob, so committing it leaks a credential.
corpus_url = os.environ["CORPUS_CSV_URL"]  # KeyError here means the variable is not set
df = pd.read_csv(corpus_url)
df.head()

Unnamed: 0.1,Unnamed: 0,partyid.ym,textline
0,1,51110_201505,for the common good this is the manifesto of t...
1,2,51110_201706,the green party for a confident and caring bri...
2,3,51210_201505,2015 westminster election manifesto equality n...
3,4,51210_201706,designated special status within the european ...
4,5,51320_196410,the new britain the world wants it and woul...


In [24]:
# Pull the document texts out of the frame as a plain Python list of strings.
corpus = df['textline'].tolist()

# Bag-of-words settings (still to be tuned): unigrams and bigrams, raw counts
# rather than presence flags, document-frequency cut-offs of 0.9 (max) and 0.01 (min).
vectorizer = CountVectorizer(
    ngram_range=(1, 2),
    min_df=0.01,
    max_df=0.9,
    binary=False,
    decode_error='ignore',
)

# Learn the vocabulary and build the document-term count matrix in one pass.
X = vectorizer.fit_transform(corpus)
X.shape

(61, 273895)

In [25]:
# run SVD (i.e., PCA without centering) (hyper-parameters to be optimized)
# TruncatedSVD's default solver is randomized; pin the seed (the KMeans step
# already uses random_state=0) so the components, and therefore the figure
# printed below and the downstream clusters, are repeatable across runs.
svd = TruncatedSVD(n_components=10, random_state=0)
svd.fit(X)
# share of the total variance captured by the 10 retained components
print(svd.explained_variance_ratio_.sum())

0.6231446268749045


In [26]:
# Project the documents onto the fitted SVD components, then cluster them in
# that reduced space with k-means (hyper-parameters to be optimized).
Xt = svd.transform(X)
kmeans = KMeans(n_clusters=3, random_state=0)
kmeans.fit(Xt)

# One row per document: its party/date key alongside the cluster it landed in.
d = dict(partyid_ym=df['partyid.ym'], cluster=list(kmeans.labels_))
dfc = pd.DataFrame(d)
dfc.head()

Unnamed: 0,cluster,partyid_ym
0,0,51110_201505
1,2,51110_201706
2,2,51210_201505
3,2,51210_201706
4,2,51320_196410


In [14]:
# get party names
# The shared access signature (SAS) URL for MPDataset_MPDS2018b.csv is read from
# the PARTIES_CSV_URL environment variable rather than pasted here: the URL
# carries a `sig=` signature that grants access to the blob, so it must not be
# committed with the notebook.
parties_url = os.environ["PARTIES_CSV_URL"]  # KeyError here means the variable is not set

# Keep only the columns needed for the lookup and build the same
# "<party>_<date>" key that the corpus frame uses in its `partyid.ym` column.
# `.assign` returns a new frame (no write onto a slice, so no
# SettingWithCopyWarning) and the string concatenation is vectorized instead of
# a row-by-row `apply`.
parties_df = (
    pd.read_csv(parties_url)[['date', 'party', 'partyname']]
    .assign(partyid_ym=lambda p: p['party'].astype(str) + '_' + p['date'].astype(str))
)
parties_df.head()

Unnamed: 0,date,party,partyname,partyid_ym
0,194409,11220,Communist Party of Sweden,11220_194409
1,194409,11320,Social Democratic Labour Party,11320_194409
2,194409,11420,People’s Party,11420_194409
3,194409,11620,Right Party,11620_194409
4,194409,11810,Agrarian Party,11810_194409


In [34]:
# Sense check: attach the party name to every clustered document by joining
# the two frames on their shared `partyid_ym` key (left join from the clusters).
clusters_by_id = dfc.set_index('partyid_ym')
names_by_id = parties_df.set_index('partyid_ym')[['partyname']]
clusters_by_id.join(names_by_id, lsuffix='l', rsuffix='r')

Unnamed: 0_level_0,cluster,partyname
partyid_ym,Unnamed: 1_level_1,Unnamed: 2_level_1
51110_201505,0,Green Party of England and Wales
51110_201706,2,Green Party of England and Wales
51210_201505,2,We Ourselves
51210_201706,2,We Ourselves
51320_196410,2,Labour Party
51320_196603,2,Labour Party
51320_197006,2,Labour Party
51320_197402,2,Labour Party
51320_197410,2,Labour Party
51320_197905,2,Labour Party
