In [1]:
# import pandas library to read data from files and to create data frames
import pandas as pd
# import plot library
import matplotlib.pyplot as plt

from scipy import cluster

# import the PCA model.
from sklearn.decomposition import PCA

# Import the kmeans clustering model.
from sklearn.cluster import KMeans

# The data was laid out beforehand with a force-directed graph.
# Here is a link to an example: https://bl.ocks.org/mbostock/2675ff61ea5e063ede2b5d63c08020c7
# After the layout, the x and y coordinates of each node were saved to a csv file.
# Nodes here are keywords.

# Load the saved layout (semicolon-separated): one row per keyword,
# holding its id and its x / y position.
locs = pd.read_csv('data_files/afterMappingFinal.csv', sep=';')
# Sanity check of what was loaded: the column names, then (rows, columns).
for summary in (locs.columns, locs.shape):
    print(summary)

Index([u'id', u'x', u'y'], dtype='object')
(72191, 3)


In [3]:
# Plot every keyword position to see visually how the keywords are clustered.
# (The earlier plotting experiments are kept in "PlotLargeData.ipynb".)
#
# The canvas is deliberately huge (100 x 100 inches at 100 dpi), so the saved
# image is very large; open 'plots/nicePlot.png' in a browser to inspect it.

fig = plt.figure(figsize=(100, 100))
ax = fig.add_subplot(111)
# Pick the coordinates by column name as 1-D arrays; one tiny black dot per keyword.
ax.scatter(locs['x'].values, locs['y'].values, c='black', marker='o', s=1)
fig.savefig('plots/nicePlot.png', dpi=100)
# Release the memory held by this very large figure once it is on disk.
plt.close(fig)

In [5]:
# At this point I have to decide how many clusters to create.
# From my trials in other files I realized that the traditional methods
# for choosing the number of clusters will not work in my case because
# the keywords are not perfectly clustered. You can check the plot to see.

# Hence I decided to set the number of clusters intuitively.
# I found that more than 20 keywords cannot all be closely related, so
# I don't want more than 20 keywords in a cluster.
# As a result I decided to create enough clusters so that the average number of
# keywords in a cluster will be around 15.
# We have 72191 keywords.
# For 15 keywords in each cluster we need around 72191 / 15 ~= 4813 clusters.

In [6]:
# Cluster the keyword positions into 4813 groups with k-means.

# Only the coordinate columns (positions 1 and 2) are used as features;
# column 0 is the keyword id.
coords = locs.iloc[:, 1:3]

kmeans_options = dict(
    n_clusters=4813,
    init='random',
    n_init=10,
    max_iter=300,
    tol=1e-04,
    random_state=0,  # fixed seed so the run can be repeated
)
km = KMeans(**kmeans_options)
y_km = km.fit_predict(coords)

# Cluster assignments as stored on the fitted model.
labels = km.labels_

In [14]:
# The computation for the clustering took a few hours.
# Just a note.

In [7]:
# Plot the clustering result: every keyword position, coloured by the
# cluster label it was assigned. The image is written to
# 'plots/largeClustered2.png' (100 x 100 inches at 100 dpi).
fig = plt.figure(figsize=(100, 100))
ax = fig.add_subplot(111)
# Pick the coordinates by column name as 1-D arrays.
ax.scatter(locs['x'].values, locs['y'].values, c=labels)
fig.savefig('plots/largeClustered2.png', dpi=100)
# Release the memory held by this very large figure once it is on disk.
plt.close(fig)

In [8]:
# To judge whether a cluster is meaningful, look at the keywords inside it.
# As a first trial, pull out the rows of `locs` assigned to cluster 13
# and check what the selection looks like.
cluster_id = 13
locsSub = locs[y_km == cluster_id]
for summary in (locsSub.columns, locsSub.shape):
    print(summary)

Index([u'id', u'x', u'y'], dtype='object')
(14, 3)


In [10]:
# The previous result shows 14 keywords in cluster #13, and `locsSub`
# holds their ids. To see the keywords themselves, load the entity list
# (semicolon-separated csv) that carries a name for each id.
entities = pd.read_csv('data_files/entityFinal.csv', sep=';')
# Sanity check of what was loaded: the column names, then (rows, columns).
for summary in (entities.columns, entities.shape):
    print(summary)

Index([u'id', u'name', u'date_added', u'entity_type_id'], dtype='object')
(72888, 4)


In [12]:
# All keywords are loaded now; print the name of each keyword whose id is in
# `locsSub`, one per line, in the row order of `locsSub`.
#
# The id -> name lookup is built once instead of filtering the whole
# `entities` frame for every row. Keeping the first row per id mirrors a
# "take the first match" lookup. An id with no entry in `entities` is
# printed as a missing value (NaN) rather than stopping the loop.
name_by_id = entities.drop_duplicates(subset='id').set_index('id')['name']
for keyword_name in name_by_id.reindex(locsSub['id']):
    # Single-argument print(...) works as a statement and as a function.
    print(keyword_name)

Doug McGahn
Will Laska
Technology Directorate
mass-transit systems
commercial producer
National Association of Truck Stop Operators
Truck Stop Operators
natural gas pipeline industry
Regulatory Policy Officer
federal government authority
North Dakota Petroleum Council
Sunoco Logistics Partners
Eric Danziger
hotel business


In [13]:
# Inspect the keywords of any cluster to judge whether they are related.
# To look at another cluster, change the number passed in the call at the
# bottom of this cell. Keep in mind there are 4813 clusters.

def print_cluster_keywords(cluster_id):
    """Print the name of every keyword assigned to cluster `cluster_id`.

    Reads the notebook-level `locs`, `y_km` and `entities` objects. Names are
    printed one per line in the row order of `locs`; an id with no entry in
    `entities` is printed as a missing value (NaN).

    Returns the rows of `locs` that belong to the cluster.
    """
    members = locs[y_km == cluster_id]
    # Build the id -> name lookup once instead of scanning `entities` per row.
    # Keeping the first row per id mirrors a "take the first match" lookup.
    name_by_id = entities.drop_duplicates(subset='id').set_index('id')['name']
    for keyword_name in name_by_id.reindex(members['id']):
        # Single-argument print(...) works as a statement and as a function.
        print(keyword_name)
    return members


locsSub = print_cluster_keywords(569)

London
Antarctic
The Conversation
the 2013 Boston Marathon
Faye Dunaway
South America
John
Federal Defenders Office in New York
additional counsel
Donner Party
Sympathy for the Devil
Winchester
Director winner
first Treasury secretary
Queensland
coal-mining
Kayla
Nikki Mendicino
Paul
Revere
Artie M. Muller
WOW!
