In [1]:
import pandas as pd
import numpy as np
import csv
import re
from ast import literal_eval
import time

In [2]:
# Raise the pandas display limits (rows, columns, column width) for this notebook.
for option_name, option_value in (
    ('display.max_rows', 500),
    ('display.max_columns', 500),
    ('display.max_colwidth', 1000),
):
    pd.set_option(option_name, option_value)

In [3]:
# Both input CSVs sit in the same outputs directory.
WIKI_OUTPUT_DIR = 'F:/wikipedia-data/outputs'
FILE_PATH_CAT = WIKI_OUTPUT_DIR + '/categories.csv'
FILE_PATH_ART = WIKI_OUTPUT_DIR + '/articles_without_links.csv'

### Category data to vertex dataframe input

In [4]:
# id = page name
# type = Category
# depth = null

In [5]:
%%time
cat_origData = pd.read_csv(FILE_PATH_CAT)

Wall time: 3.52 s


In [6]:
%%time
cat_vertex = pd.DataFrame(columns=["id", "wikipedia-id", "type", "depth"])

cat_vertex["id"] = cat_origData['title'].apply(lambda x: str(x)[9:])
cat_vertex["wikipedia-id"] = cat_origData['id'].apply(lambda x: str(x))
cat_vertex["type"] = "Category"

Wall time: 1.63 s


In [7]:
# Write the category vertices to CSV, leaving out the row index.
cat_vertex.to_csv(
    path_or_buf='F:/wikipedia-data/outputs/for-graphframes/cat_vertex.csv',
    index=False,
)

In [8]:
# Show the category vertex table as this cell's output.
cat_vertex

Unnamed: 0,id,wikipedia-id,type,depth
0,Futurama,690070,Category,
1,World War II,690451,Category,
2,Programming languages,690571,Category,
3,Professional wrestling,690578,Category,
4,Algebra,690637,Category,
...,...,...,...,...
1951052,Australian expatriates in Estonia,64148944,Category,
1951053,Buildings and structures in Bhola District,64148963,Category,
1951054,Education in Bhola District,64148965,Category,
1951055,Colleges in the territories of Canada,64149140,Category,


### Article data to vertex dataframe input

In [9]:
# id = page name
# type = Article
# depth = null

In [10]:
%%time
art_origData = pd.read_csv(FILE_PATH_ART)

Wall time: 13.7 s


In [11]:
%%time
art_vertex = pd.DataFrame(columns=["id", "wikipedia-id", "type", "depth"])

art_vertex["id"] = art_origData['title'].apply(lambda x: str(x))
art_vertex["wikipedia-id"] = art_origData['id'].apply(lambda x: str(x))
art_vertex["type"] = "Article"

Wall time: 4.06 s


In [12]:
# Write the article vertices to CSV, leaving out the row index.
art_vertex.to_csv(
    path_or_buf='F:/wikipedia-data/outputs/for-graphframes/art_vertex.csv',
    index=False,
)

In [13]:
# Show the article vertex table as this cell's output.
art_vertex

Unnamed: 0,id,wikipedia-id,type,depth
0,Anarchism,12,Article,
1,Autism,25,Article,
2,Albedo,39,Article,
3,A,290,Article,
4,Alabama,303,Article,
...,...,...,...,...
6092876,Cardiff Australian Football Club,64149108,Article,
6092877,Haapsalu-Noarootsi Wetland Complex,64149136,Article,
6092878,El Caballero Country Club,64149147,Article,
6092879,Maro Markarian,64149155,Article,


### Category data to edge dataframe input

In [14]:
# src = from parentCategories column
# dst = from title column
# intersection = null
# union = null
# similarity = null

In [15]:
# Disabled experiment: the whole cell is a string literal, so none of the code inside runs.
# It adds one row per (parent, category) pair via DataFrame.append and rebinds cat_edge to
# the returned frame each time; the note inside records ~213 s per 10,000 rows.
'''%%time
# Approach 1
# 10000 rows in 213 seconds --> 4M rows in ~23h

start_time = time.time()
cat_edge = pd.DataFrame(columns=["src", "dst", "intersection", "union", "similarity"])

for row in cat_origData.index:
    if (row % 10000) == 0:
        print(str(row) + " : " + str(time.time() - start_time))
    
    dst = str(cat_origData.loc[row, "title"])[9:]
    
    #read parentCategories string as python list, iterate over each item
    for item in literal_eval(cat_origData.loc[row, "parentCategories"]):
        #add new row to cat_edge
        newRow = {"src": str(item) , "dst" : dst}
        cat_edge  = cat_edge.append(newRow, ignore_index = True)'''

'%%time\n# Approach 1\n# 10000 rows in 213 seconds --> 4M rows in ~23h\n\nstart_time = time.time()\ncat_edge = pd.DataFrame(columns=["src", "dst", "intersection", "union", "similarity"])\n\nfor row in cat_origData.index:\n    if (row % 10000) == 0:\n        print(str(row) + " : " + str(time.time() - start_time))\n    \n    dst = str(cat_origData.loc[row, "title"])[9:]\n    \n    #read parentCategories string as python list, iterate over each item\n    for item in literal_eval(cat_origData.loc[row, "parentCategories"]):\n        #add new row to cat_edge\n        newRow = {"src": str(item) , "dst" : dst}\n        cat_edge  = cat_edge.append(newRow, ignore_index = True)'

In [None]:
# Disabled experiment: the whole cell is a string literal, so none of the code inside runs.
# It builds a small src/dst frame per category row and appends it to cat_edge.
# The timings recorded under this cell grow faster than linearly
# (about 715 s at row 100,000 versus about 28,960 s at row 800,000).
'''%%time
# Approach 2
# 800 000 rows in ~8 hours...

start_time = time.time()
cat_edge = pd.DataFrame(columns=["src", "dst"])

for row in cat_origData.index:
    if (row % 100000) == 0:
        print(str(row) + " : " + str(time.time() - start_time))
    
    toAdd = pd.DataFrame(columns=["src", "dst"])
    toAdd["src"] = literal_eval(cat_origData.loc[row, "parentCategories"])
    toAdd["dst"] = str(cat_origData.loc[row, "title"])[9:]
    
    cat_edge  = cat_edge.append(toAdd, ignore_index = True)
    '''

0 : 0.00797724723815918
100000 : 715.0738883018494
200000 : 2312.3534705638885
300000 : 4725.00062084198
400000 : 7987.58896780014
500000 : 12013.982226371765
600000 : 16909.419238328934
700000 : 22604.602380037308
800000 : 28960.55455994606


In [None]:
%%time
# Approach 3


In [None]:
# Disabled experiment: the whole cell is a string literal, so none of the code inside runs.
# Variant of the per-row append that fills only the src column (the dst assignment is
# commented out inside the string); the note inside records ~50 s per 10,000 rows.
'''%%time
# Approach 3
# 10000 rows in 50 seconds

start_time = time.time()
cat_edge = pd.DataFrame(columns=["src", "dst"])

for row in cat_origData.index:
    if (row % 10000) == 0:
        print(str(row) + " : " + str(time.time() - start_time))
    
    toAdd = pd.DataFrame(columns=["src"])
    toAdd["src"] = literal_eval(cat_origData.loc[row, "parentCategories"])
    # toAdd["dst"] = str(cat_origData.loc[row, "title"])[9:]
    
    cat_edge  = cat_edge.append(toAdd, ignore_index = True)
    '''

In [None]:
%%time
cat_edge["intersection"] = np.nan
cat_edge["union"] = np.nan
cat_edge["similarity"] = np.nan

In [None]:
%%time
cat_edge.to_csv('F:/wikipedia-data/outputs/for-graphframes/cat_edge.csv', index = False)

In [None]:
# Show the category edge table as this cell's output.
cat_edge