In [8]:
# import pandas library to read data from files and to create data frames
import pandas as pd

# Load the entity table from its semicolon-delimited CSV file.
entities = pd.read_csv('data_files/entity.csv', delimiter=';')

# Show the column labels, then the (rows, columns) dimensions.
print(entities.columns)
print(entities.shape)

# Preview 8 randomly drawn rows of the table.
entities.sample(n=8)

Index([u'id', u'name', u'date_added', u'entity_type_id'], dtype='object')
(7416, 4)


Unnamed: 0,id,name,date_added,entity_type_id
4449,4450,Time Inc.,2017-04-14 12:17:15,1
2223,2224,Beverly Hallberg,2017-04-14 11:35:50,2
4330,4331,Republican senator,2017-04-14 12:14:13,3
3938,3939,Fort Hood,2017-04-14 12:06:13,13
216,217,McDaniels Knopfler,2017-02-01 14:20:43,2
2293,2294,administration official,2017-04-14 11:37:23,3
6817,6818,Egyptian Muslim Brotherhood,2017-04-14 12:56:23,8
6696,6697,democratically elected president,2017-04-14 12:54:48,3


In [9]:
# Load the entity-article relationship table from its semicolon-delimited CSV file.
entityRel = pd.read_csv('data_files/entityRel.csv', delimiter=';')

# Show the column labels, then the (rows, columns) dimensions.
print(entityRel.columns)
print(entityRel.shape)

# Preview 8 randomly drawn rows of the table.
entityRel.sample(n=8)

Index([u'id', u'relevance', u'date_added', u'article_id', u'entity_id'], dtype='object')
(19377, 5)


Unnamed: 0,id,relevance,date_added,article_id,entity_id
1716,1717,0.2,2017-04-14 11:29:46,112,1876
1965,1966,0.2,2017-04-14 11:31:17,124,1946
4121,4122,0.2,2017-04-14 11:42:43,225,698
8383,8384,0.2,2017-04-14 12:04:28,397,3809
3815,3816,0.8,2017-04-14 11:40:55,209,649
9958,9959,0.2,2017-04-14 12:14:18,476,4338
1370,1371,0.2,2017-04-14 11:28:10,97,457
18278,18279,0.2,2017-04-14 13:03:11,840,1625


In [10]:
# Get rid of the keyword-article relations that have 0 relevance.
# A relevance of 0 means we are not sure at all whether the keyword is
# related to the article or not.
entityRelNotZero = entityRel[entityRel['relevance'] > 0]

# Print the dimensions so the number of surviving relations is visible
# (same check as for the loaded tables), then preview only the first rows
# instead of rendering the whole dataframe.
print(entityRelNotZero.shape)
entityRelNotZero.head(10)

Unnamed: 0,id,relevance,date_added,article_id,entity_id
1,2,0.2,2017-04-14 19:18:28,1,1013
2,3,0.2,2017-04-14 19:18:29,1,1014
3,4,0.2,2017-04-14 19:18:29,1,1015
4,5,0.2,2017-04-14 19:18:29,1,119
5,6,0.2,2017-04-14 19:18:29,1,1016
6,7,0.2,2017-04-14 19:18:29,1,1017
7,8,0.2,2017-04-14 19:18:29,1,1018
8,9,0.2,2017-04-14 19:18:29,1,1019
9,10,0.2,2017-04-14 19:18:30,1,1020
10,11,0.2,2017-04-14 19:18:30,1,1021


In [14]:
# Many of the relations had zero relevance:
# 19377 - 18849 = 528 relations had a relevance of 0

# We have all the relationships between keywords and articles.
# We need to create a 2D map of the keywords from these relations
# If we can achieve creating a map, then we can easily cluster the keywords using K-means.

# I decided to use the d3-force library.
# It simulates physical forces based on the links between points.
# To use the d3-force library I need to create a link table between the keywords.
# Keywords will be the nodes in this table.

# We need to create a json file similar to the following:
# (display and HTML are imported from the public IPython.display module;
# the IPython.core.display path is internal and deprecated for this use.)
from IPython.display import display, HTML
display(HTML('<img src="http://i.imgur.com/BMDxwNW.png">'))




In [42]:
# Let's start creating a json similar to this from our data.
# First we need to create a "source to target" link between keywords.
# As I said before, all the keywords from one article will be linked.
# I will take the first keyword from each article to start with,
# then link the keywords in a recursive order.

# Testing code before creating the links

In [43]:
# Keywords that belong to the same article: keep the rows whose article_id is 1.
# NOTE(review): this filters the unfiltered entityRel rather than
# entityRelNotZero, so zero-relevance relations may still be included here --
# confirm that this is intended.
similarKeywords = entityRel.loc[entityRel['article_id'] == 1]
similarKeywords.sample(n=5)

Unnamed: 0,id,relevance,date_added,article_id,entity_id
11,12,0.2,2017-04-14 19:18:30,1,1022
5,6,0.2,2017-04-14 19:18:29,1,1016
6,7,0.2,2017-04-14 19:18:29,1,1017
9,10,0.2,2017-04-14 19:18:30,1,1020
14,15,0.2,2017-04-14 19:18:30,1,1025


In [20]:
# Then we will create a source and destination table from the similar keywords.
# First check how many keyword rows this article has.
# print() is called as a function so the cell runs on Python 2 and Python 3 alike.
print(similarKeywords.shape[0])

15


In [35]:
# Get the keywords only: a one-column dataframe holding the entity ids.
# The column is selected by its label instead of the deprecated .ix indexer
# with a positional slice, so the cell keeps working on current pandas and
# does not depend on the column order.
keywords = similarKeywords[['entity_id']]
keywords

Unnamed: 0,entity_id
0,30
1,1013
2,1014
3,1015
4,119
5,1016
6,1017
7,1018
8,1019
9,1020


In [52]:
# Create a 2-D array from the one-column dataframe (one row per keyword).
# .values is used instead of DataFrame.as_matrix(), which is deprecated and
# no longer available in current pandas.
keywordsArray = keywords.values
print(keywordsArray)
# print('') emits an empty separator line on both Python 2 and Python 3.
print('')
# Row 3, column 0: the entity id stored in the fourth row.
print(keywordsArray[3][0])
print('')
# Number of rows in the array, i.e. the number of keywords.
print(len(keywordsArray))

[[  30]
 [1013]
 [1014]
 [1015]
 [ 119]
 [1016]
 [1017]
 [1018]
 [1019]
 [1020]
 [1021]
 [1022]
 [1023]
 [1024]
 [1025]]

1015

15


In [41]:
# Check that the array can be walked row by row; every row is a
# one-element array holding a single entity id.
for keywordRow in keywordsArray:
    print(keywordRow)

[30]
[1013]
[1014]
[1015]
[119]
[1016]
[1017]
[1018]
[1019]
[1020]
[1021]
[1022]
[1023]
[1024]
[1025]


In [57]:
# Trying to append rows to an empty dataframe, one link at a time.
# Each appended dict becomes one row with a 'source' and a 'target' entry;
# ignore_index=True renumbers the rows instead of keeping the dict's index.
# NOTE(review): the saved output for this cell shows the ids as floats
# (1013.0, 30.0) and a 'destination' column although the code writes
# 'target' -- the output looks like it comes from an earlier version of the
# cell; re-run to confirm.
# NOTE(review): DataFrame.append is deprecated in newer pandas releases and
# removed in pandas 2.0. The cell is kept unchanged because it is the
# experiment whose result the following cell reacts to.
sourceDestination = pd.DataFrame()
# First link: the source id is read through keywords.iloc, the target through keywordsArray.
sourceDestination = sourceDestination.append({'source': keywords.iloc[0]['entity_id'], 'target': keywordsArray[1][0]}, ignore_index=True)
# Second link: both ids are read from keywordsArray; the source is the same first keyword.
sourceDestination = sourceDestination.append({'source': keywordsArray[0][0], 'target': keywordsArray[2][0]}, ignore_index=True)

sourceDestination

Unnamed: 0,destination,source
0,1013.0,30.0
1,1014.0,30.0


In [60]:
# I didn't like the result from the "append to dataframe" approach,
# so I want to try something different:
# build a list of dictionaries instead of using a dataframe.

# The first keyword of the article is the source of every link.
# Ids are converted to plain Python ints so the records print cleanly and
# can later be written to the json file without numpy types in them.
sourceKeyword = int(keywords.iloc[0]['entity_id'])
keywordsList = [
    {'source': sourceKeyword, 'target': int(keywords.iloc[position]['entity_id'])}
    for position in (1, 2)
]

# print() is called as a function so the cell runs on Python 2 and Python 3 alike.
print(keywordsList)

[{'source': 30, 'target': 1013}, {'source': 30, 'target': 1014}]


In [None]:
# This result looks more like what I want for the json file "nodes and links"
# So I will use dictionaries list to create the links

# Actual code for the links between keywords