In [1]:
# import pandas library to read data from files and to create data frames
import pandas as pd

# Load the entity table from its semicolon-separated CSV file into a dataframe.
entities = pd.read_csv(
    'data_files/entityFinal.csv',
    sep=';',
)

# Show the column labels, followed by the (rows, columns) dimensions.
print(entities.columns)
print(entities.shape)

# Display 8 randomly picked rows as a quick look at the data.
entities.sample(8)

Index([u'id', u'name', u'date_added', u'entity_type_id'], dtype='object')
(72888, 4)


Unnamed: 0,id,name,date_added,entity_type_id
30172,30173,HKS Inc.,2017-04-25 05:50:44,1
13576,13577,Jay McInerney,2017-04-24 23:34:06,2
4365,4366,Jeff Kaufmann,2017-04-14 12:15:17,2
53577,53578,western Syrian Arab Republic,2017-04-25 22:32:36,28
43006,43007,California Republican Party,2017-04-25 10:18:19,8
57497,57498,Veterans’ Affairs Committee,2017-04-26 07:48:03,8
17328,17329,Misha Collins,2017-04-25 01:54:22,2
29022,29023,Issa Arnita,2017-04-25 05:28:03,2


In [2]:
# Load the entity-article relationship table from its semicolon-separated
# CSV file into a dataframe.
entityRel = pd.read_csv(
    'data_files/entityRelFinal.csv',
    sep=';',
)

# Show the column labels, followed by the (rows, columns) dimensions.
print(entityRel.columns)
print(entityRel.shape)

# Display 8 randomly picked rows as a quick look at the data.
entityRel.sample(8)

Index([u'id', u'relevance', u'date_added', u'article_id', u'entity_id'], dtype='object')
(279719, 5)


Unnamed: 0,id,relevance,date_added,article_id,entity_id
165575,165576,0.8,2017-04-25 10:47:13,7343,268
95226,95227,0.2,2017-04-25 04:58:18,4292,2224
188227,188228,0.2,2017-04-25 16:18:08,8288,12315
221428,221429,0.2,2017-04-26 07:37:31,9814,427
85055,85056,0.2,2017-04-25 04:09:53,3862,24124
258857,258858,0.8,2017-04-27 06:49:38,11561,8
180748,180749,0.2,2017-04-25 15:01:22,8018,46888
65932,65933,0.2,2017-04-25 02:31:22,2967,266


In [3]:
# Drop the keyword-article relations that have a relevance of 0.
# A relevance of 0 means we are not sure at all whether that keyword has a
# relationship with the article, so only rows with relevance > 0 are kept.

entityRelNotZero = entityRel.loc[entityRel['relevance'].gt(0)]

# Show the column labels, followed by the (rows, columns) dimensions
# of the filtered dataframe.
print(entityRelNotZero.columns)
print(entityRelNotZero.shape)

Index([u'id', u'relevance', u'date_added', u'article_id', u'entity_id'], dtype='object')
(273058, 5)


In [4]:
# There were many rows with zero relevance
# 279719 - 273058 = 6661 relations had a relevance of 0

# We have all the relationships between keywords and articles.
# We need to create a 2D map of the keywords from these relations
# If we can achieve creating a map, then we can easily cluster the keywords using K-means.

# I decided to use the d3-force library.
# It simulates physical forces acting on points that are connected by links.
# To use the d3-force library I need to create a link table between the keywords.
# Keywords will be the nodes in this table.

# We need to create a json file similar to the one shown in this image.
# `display` and `HTML` are imported from the public `IPython.display` module;
# importing `display` from the internal `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML('<img src="http://i.imgur.com/BMDxwNW.png">'))




In [5]:
# Let's start creating a json similar to this from our data
# First we need to create a "source to target" link between keywords.
# As I said before, all the keywords from one article will be linked.
# I will take the first keyword from each article to start with,
# then link it to each of the other keywords of that article in turn.

# Testing code before creating the links

In [6]:
# Keywords that belong to the same article are selected by filtering the
# relation table on a single article_id (article 1 is used as the example).
similarKeywords = entityRel.loc[entityRel['article_id'] == 1]

# Display 5 randomly picked relations of that article.
similarKeywords.sample(5)

Unnamed: 0,id,relevance,date_added,article_id,entity_id
14,15,0.2,2017-04-14 19:18:30,1,1025
8,9,0.2,2017-04-14 19:18:29,1,1019
7,8,0.2,2017-04-14 19:18:29,1,1018
9,10,0.2,2017-04-14 19:18:30,1,1020
10,11,0.2,2017-04-14 19:18:30,1,1021


In [13]:
# Then we will create a source and destination table from the similar keywords.
# First check how many keyword relations the example article has.
# print() is written as a call so the cell matches the other cells and is
# valid on both Python 2 and Python 3.

print(similarKeywords.shape[0])

15


In [21]:
# I will create a list of dictionaries; this makes it easier to create a json file.
# Let's test it first on a tiny example:
# the "source" is the keyword in the first row of the filtered relations and
# the "target" is the keyword in each of the next two rows.
# NOTE(review): this assumes the first three rows belong to the same article.

# For the "value" I decided to multiply the target and source keywords' relevances.
# "value" represents the relationship strength between two keywords; it is
# scaled by 100 and truncated to an integer.
keywordsList = []
sourceRow = entityRelNotZero.iloc[0]
for targetPosition in (1, 2):
    targetRow = entityRelNotZero.iloc[targetPosition]
    keywordsList.append({
        'source': sourceRow['entity_id'],
        'target': targetRow['entity_id'],
        'value': int(sourceRow['relevance'] * targetRow['relevance'] * 100),
    })

# print() is written as a call so the cell is valid on Python 2 and Python 3.
print(keywordsList)

[{'source': 1013, 'target': 1014, 'value': 4}, {'source': 1013, 'target': 1015, 'value': 4}]


In [22]:
# This result looks more like what I want for the json file "nodes and links"
# So I will use dictionaries list to create the links

# Actual code for the links between keywords

In [23]:
keywordsList = []

# Build the links article by article.
# The relations are grouped once by article_id instead of re-filtering the
# whole dataframe for every article id in a hard-coded range: this visits every
# article present in the data (no id is silently skipped when the data grows or
# ids fall outside a fixed range) and avoids one full scan per article.
# groupby yields the articles in ascending article_id order and keeps the
# original row order inside each article.
for articleId, articleKeywords in entityRelNotZero.groupby('article_id'):
    entityIds = articleKeywords['entity_id'].tolist()
    relevances = articleKeywords['relevance'].tolist()

    # The first keyword of the article is the "source" of every link.
    sourceId = entityIds[0]
    sourceRelevance = relevances[0]

    # Link the first keyword to each of the other keywords of the same article.
    # "value" is the product of both relevances, scaled by 100 and truncated.
    for targetId, targetRelevance in zip(entityIds[1:], relevances[1:]):
        keywordsList.append({
            'source': sourceId,
            'target': targetId,
            'value': int(sourceRelevance * targetRelevance * 100),
        })
            

In [24]:
# The code above ran for around 6 minutes.
# Just a note

In [25]:
# Write all the relationships to a csv file.
# We will use this file later on to create a dataframe.
# The file is opened in a `with` block so it is flushed and closed as soon as
# the last row is written; otherwise buffered rows may still be missing from
# disk when the file is read back.
with open('data_files/keywordsLinksFinal.csv', 'w') as keywordsLinksCSV:
    # Header row, followed by one semicolon-separated row per link.
    keywordsLinksCSV.write('"source";"target";"value"\n')
    for item in keywordsList:
        keywordsLinksCSV.write(str(item['source']) + ';' + str(item['target']) + ';' + str(item['value']) + '\n')

In [26]:
# Load the keyword-to-keyword links back from the semicolon-separated CSV file.
linksBetweenKeywords = pd.read_csv(
    'data_files/keywordsLinksFinal.csv',
    sep=';',
)

# Show the column labels, followed by the (rows, columns) dimensions.
print(linksBetweenKeywords.columns)
print(linksBetweenKeywords.shape)

Index([u'source', u'target', u'value'], dtype='object')
(260716, 3)


In [27]:
# Create the keyword nodes.
# Different articles may include the same keyword, and each keyword must appear
# only once as a node, so the ids are collected into a set.
# The whole entity_id column is converted at once instead of looking up every
# row with .iloc in a Python loop, which is needlessly slow on a large table.
keywordNodes = set(entityRelNotZero['entity_id'].tolist())

In [29]:
# Create the JSON file for nodes and links.
# Each node / link is rendered as one JSON object per line. The objects are
# joined with ',\n' so that no comma follows the last element of an array:
# strict JSON parsers reject trailing commas.
nodeEntries = ['{"id": ' + str(node) + ', "group": ' + str(1) + '}' for node in keywordNodes]
linkEntries = [
    '{"source": ' + str(item['source']) + ', "target": ' + str(item['target']) + ', "value": ' + str(item['value']) + '}'
    for item in keywordsList
]

# The `with` block makes sure the file is flushed and closed after writing.
with open('data_files/keywordsLinksFinal.json', 'w') as keywordsLinksJSON:
    # Add nodes to json file
    keywordsLinksJSON.write('{\n')
    keywordsLinksJSON.write('"nodes":[\n')
    keywordsLinksJSON.write(',\n'.join(nodeEntries) + '\n')
    keywordsLinksJSON.write('],\n')

    # Add links to json file
    keywordsLinksJSON.write('"links": [\n')
    keywordsLinksJSON.write(',\n'.join(linkEntries) + '\n')
    keywordsLinksJSON.write(']\n')
    keywordsLinksJSON.write('}\n')

In [30]:
# We are done here.
# The next step will be mapping the keywords onto a 2-dimensional plane.
# We will use the d3-force graph algorithm.

### Check the file called "2 - After Mapping.ipynb" to follow the next step.