In [1]:
import pandas as pd
import numpy as np
import csv
import re
from ast import literal_eval
import time

In [2]:
# Raise pandas' display limits so wide or long frames and long text cells are
# not truncated when a cell renders them.
pd.options.display.max_rows = 500
pd.options.display.max_columns = 500
pd.options.display.max_colwidth = 1000

### Import data from CSV files

In [3]:
# Locations of the two input CSV files read below (categories and articles).
# NOTE: these are absolute, machine-specific paths (F: drive) - adjust them
# before running the notebook on another machine.
FILE_PATH_CAT = 'F:/wikipedia-data/outputs/categories.csv'
FILE_PATH_ART = 'F:/wikipedia-data/outputs/articles_without_links.csv'

In [4]:
%%time
# Load the category table; later cells use its 'id', 'title' and 'parentCategories' columns
cat_origData = pd.read_csv(FILE_PATH_CAT)

Wall time: 3.91 s


In [5]:
%%time
# Normalise the raw category table:
#   title            -> Python str for every row (a missing title becomes the text "nan")
#   parentCategories -> the Python object parsed from its literal text
#                       (presumably a list - the column is exploded further down)

cat_origData['title'] = cat_origData['title'].map(str)
cat_origData['parentCategories'] = cat_origData['parentCategories'].map(literal_eval)

Wall time: 23.5 s


In [6]:
%%time
# Load the article table; later cells use its 'id', 'title' and 'parentCategories' columns
art_origData = pd.read_csv(FILE_PATH_ART)

Wall time: 16.1 s


In [7]:
%%time
# Normalise the raw article table:
#   title            -> Python str for every row (a missing title becomes the text "nan")
#   parentCategories -> the Python object parsed from its literal text
#                       (presumably a list - the column is exploded further down)

art_origData['title'] = art_origData['title'].map(str)
art_origData['parentCategories'] = art_origData['parentCategories'].map(literal_eval)

Wall time: 1min 53s


### Create title to wikiID mapping Series

In [8]:
%%time
# Build the page title -> wikipediaID lookup table from both the category and
# the article data (every column except 'parentCategories' is kept).
#
# pd.concat is used instead of DataFrame.append: append was deprecated in
# pandas 1.4 and removed in pandas 2.0, so the old call fails on current pandas.

wikiId_mapping = pd.concat(
    [
        cat_origData.drop(columns=['parentCategories']),
        art_origData.drop(columns=['parentCategories']),
    ],
    ignore_index=True,  # fresh 0..n-1 index, same result as append + reset_index(drop=True)
)
# Titles must be plain strings so the "nan" filter and the lookups below behave uniformly
wikiId_mapping["title"] = wikiId_mapping["title"].map(str)


Wall time: 2.5 s


In [9]:
%%time
# Remove the entries whose title is the text "nan" (~16 000 rows)

toDrop = wikiId_mapping.index[(wikiId_mapping["title"] == "nan").to_numpy()]
wikiId_mapping.drop(index=toDrop, inplace=True)

Wall time: 977 ms


In [10]:
%%time
# Keep exactly one row per title so a title lookup returns a single id
# (4 rows dropped, the last occurrence of each duplicated title is kept)

wikiId_mapping.drop_duplicates(subset=["title"], keep="last", inplace=True)

Wall time: 5.45 s


In [11]:
# Index by title and collapse the remaining single column into a Series,
# so that wikiId_mapping[title] yields the id directly
wikiId_mapping = wikiId_mapping.set_index("title").squeeze()

In [12]:
wikiId_mapping

title
Category:Futurama                       690070
Category:World War II                   690451
Category:Programming languages          690571
Category:Professional wrestling         690578
Category:Algebra                        690637
                                        ...   
Cardiff Australian Football Club      64149108
Haapsalu-Noarootsi Wetland Complex    64149136
El Caballero Country Club             64149147
Maro Markarian                        64149155
Louis Édouard Paul Lieutard           64149158
Name: id, Length: 8027896, dtype: int64

### Function definition

In [13]:
# Define function for mapping wikipediaID

def tryIDmap(x):
    """Look up the wikipediaID stored under key ``x`` in ``wikiId_mapping``.

    Parameters
    ----------
    x : key to look up in the module-level ``wikiId_mapping`` (callers in this
        notebook pass page titles as ``str``).

    Returns
    -------
    The value stored under ``x``, or the sentinel string ``"NO_WIKIPEDIA_ID"``
    when ``x`` is not a key of the mapping.

    Only ``KeyError`` is translated into the sentinel. The previous bare
    ``except:`` also swallowed ``KeyboardInterrupt`` (a long-running ``apply``
    could not be interrupted and silently wrote sentinels instead) and
    ``NameError`` (a missing ``wikiId_mapping`` turned every lookup into the
    sentinel); both now propagate.
    """
    try:
        return wikiId_mapping[x]
    except KeyError:
        return "NO_WIKIPEDIA_ID"

### Category data to vertex dataframe input

In [14]:
# id = page name
# type = Category
# depth = null

In [15]:
%%time
# Assemble the category vertex table from the transformed category data:
# id = page title, wikipedia-id = page id as text, type = "Category";
# 'depth' is never assigned and therefore stays empty.

cat_vertex = pd.DataFrame(columns=["id", "wikipedia-id", "type", "depth"])

cat_vertex["id"] = cat_origData['title'].map(str)
cat_vertex["wikipedia-id"] = cat_origData['id'].map(str)
cat_vertex["type"] = "Category"

Wall time: 1.46 s


In [16]:
# Save the category vertices; index = False keeps the DataFrame row index out of the file
cat_vertex.to_csv('F:/wikipedia-data/outputs/for-graphframes/cat_vertex.csv', index = False)

In [17]:
cat_vertex

Unnamed: 0,id,wikipedia-id,type,depth
0,Category:Futurama,690070,Category,
1,Category:World War II,690451,Category,
2,Category:Programming languages,690571,Category,
3,Category:Professional wrestling,690578,Category,
4,Category:Algebra,690637,Category,
...,...,...,...,...
1951052,Category:Australian expatriates in Estonia,64148944,Category,
1951053,Category:Buildings and structures in Bhola District,64148963,Category,
1951054,Category:Education in Bhola District,64148965,Category,
1951055,Category:Colleges in the territories of Canada,64149140,Category,


### Article data to vertex dataframe input

In [18]:
# id = page name
# type = Article
# depth = null

In [19]:
%%time
# Assemble the article vertex table, mirroring the category vertex table:
# id = page title, wikipedia-id = page id as text, type = "Article";
# 'depth' is never assigned and therefore stays empty.

art_vertex = pd.DataFrame(columns=["id", "wikipedia-id", "type", "depth"])

art_vertex["id"] = art_origData['title'].map(str)
art_vertex["wikipedia-id"] = art_origData['id'].map(str)
art_vertex["type"] = "Article"

Wall time: 4.69 s


In [20]:
# Save the article vertices; index = False keeps the DataFrame row index out of the file
art_vertex.to_csv('F:/wikipedia-data/outputs/for-graphframes/art_vertex.csv', index = False)

In [21]:
art_vertex

Unnamed: 0,id,wikipedia-id,type,depth
0,Anarchism,12,Article,
1,Autism,25,Article,
2,Albedo,39,Article,
3,A,290,Article,
4,Alabama,303,Article,
...,...,...,...,...
6092876,Cardiff Australian Football Club,64149108,Article,
6092877,Haapsalu-Noarootsi Wetland Complex,64149136,Article,
6092878,El Caballero Country Club,64149147,Article,
6092879,Maro Markarian,64149155,Article,


### Category data to edge dataframe input

In [22]:
# src = from parentCategories column
# dst = from title column
# intersection = null
# union = null
# similarity = null

In [23]:
# Explode dataframe
# Make string adjustments
# Add wikipedia ids

In [24]:
%%time
# Explode dataframe: one output row per element of each 'parentCategories' value.
# Note that explode() repeats the original row label for every produced row,
# so the index of catcat_edge is not unique until it is reset.
catcat_edge = cat_origData.explode("parentCategories")

Wall time: 3.4 s


In [25]:
# Switch to edge vocabulary: the page itself is the destination (dst), each of
# its parent categories is a source (src), and the page id belongs to dst
catcat_edge = catcat_edge.rename(
    columns={
        'title': 'dst',
        'parentCategories': 'src',
        'id': 'dst-wikipediaID',
    }
)

In [26]:
%%time
# String adjustments
#
# Cut every parent-category entry at its first "|" (the pipe and everything
# after it on that line are removed - presumably a sort key, TODO confirm),
# then add the "Category:" prefix so 'src' matches the titles used as vertex ids.
#
# The pattern is a raw string: the previous "\|" inside a normal string literal
# is an invalid escape sequence (DeprecationWarning, SyntaxWarning on Python
# 3.12+). One compiled pattern and a single pass replace the former three
# passes over the column; r"\|.*" removes exactly what the two old
# substitutions ("pipe plus following text", then "any leftover pipe") removed.
# Missing values still become the text "Category:nan", which the next cell drops.

_pipe_suffix = re.compile(r"\|.*")
catcat_edge['src'] = catcat_edge['src'].map(lambda x: "Category:" + _pipe_suffix.sub('', str(x)))


Wall time: 13.5 s


In [27]:
%%time
# Drop rows with 'src' = 'Category:nan' i.e. those that have no parent categories
#
# A boolean mask is used instead of drop(<index labels>): after explode() the
# index labels are repeated for every row that came from the same page, so a
# label-based drop would also delete every other row sharing that label.
# .copy() makes the result an independent frame, so the column assignments in
# the following cells do not raise SettingWithCopyWarning.

catcat_edge = catcat_edge.loc[catcat_edge["src"] != "Category:nan"].copy()

Wall time: 1.94 s


In [28]:
%%time
# Look up the wikipediaID of every source category by its title;
# tryIDmap yields "NO_WIKIPEDIA_ID" where the title is unknown

catcat_edge["src-wikipediaID"] = catcat_edge["src"].map(lambda title: tryIDmap(str(title)))

Wall time: 59.9 s


In [29]:
%%time
# Tag the edge kind and add the three metric columns as NaN placeholders
catcat_edge["type"] = "cat2cat"
for _metric in ("intersection", "union", "similarity"):
    catcat_edge[_metric] = np.nan

Wall time: 70.8 ms


In [30]:
%%time
# Put the columns in the final edge-file order and renumber the rows 0..n-1
# (the labels left over from explode() are discarded)

catcat_edge = catcat_edge.loc[
    :,
    ["src", "dst", "src-wikipediaID", "dst-wikipediaID", "type", "intersection", "union", "similarity"],
].reset_index(drop=True)

Wall time: 836 ms


In [31]:
catcat_edge

Unnamed: 0,src,dst,src-wikipediaID,dst-wikipediaID,type,intersection,union,similarity
0,Category:Television series by 20th Century Fox Television,Category:Futurama,6015549,690070,cat2cat,,,
1,Category:Television series created by Matt Groening,Category:Futurama,37480813,690070,cat2cat,,,
2,Category:Wikipedia categories named after American animated television series,Category:Futurama,35315704,690070,cat2cat,,,
3,Category:Wikipedia categories named after mass media franchises,Category:Futurama,55466012,690070,cat2cat,,,
4,Category:20th Century Fox Television franchises,Category:Futurama,60330380,690070,cat2cat,,,
...,...,...,...,...,...,...,...,...
4027786,Category:New Zealand news websites,Category:Stuff,57822700,64149156,cat2cat,,,
4027787,Category:Real estate in New Zealand,Category:Stuff,61657816,64149156,cat2cat,,,
4027788,Category:Companies based in Wellington,Category:Stuff,34064790,64149156,cat2cat,,,
4027789,Category:Newspaper companies of New Zealand,Category:Stuff,24103909,64149156,cat2cat,,,


In [32]:
# wikiId_mapping["Category:Main topic classifications"]

In [33]:
%%time
# Save the category -> category edges; index = False keeps the row index out of the file
catcat_edge.to_csv('F:/wikipedia-data/outputs/for-graphframes/catcat_edge.csv', index = False)

Wall time: 26.4 s


### Article to category data to edge dataframe input

In [34]:
# analogous to above

# Explode dataframe
# Make string adjustments - also strip "#" and everything after it
# Add wikipedia ids

In [35]:
%%time
# Explode dataframe: one output row per element of each 'parentCategories' value.
# Note that explode() repeats the original row label for every produced row,
# so the index of catart_edge is not unique until it is reset.
catart_edge = art_origData.explode("parentCategories")

Wall time: 15.2 s


In [36]:
# Switch to edge vocabulary: the article is the destination (dst), each of its
# parent categories is a source (src), and the article id belongs to dst
catart_edge = catart_edge.rename(
    columns={
        'title': 'dst',
        'parentCategories': 'src',
        'id': 'dst-wikipediaID',
    }
)

In [37]:
%%time
# String adjustments
#
# Cut every parent-category entry at its first "|" or "#" (the delimiter and
# everything after it on that line are removed), and add the "Category:"
# prefix so 'src' matches the titles used as vertex ids.
#
# The pattern is a raw string: the previous "\|" and "\#" inside normal string
# literals are invalid escape sequences (DeprecationWarning, SyntaxWarning on
# Python 3.12+). One compiled pattern and a single pass replace the former
# five passes over the column; r"[|#].*" removes exactly what the old
# pipe-then-hash substitution chain removed.
# Missing values still become the text "Category:nan", which the next cell drops.

_pipe_or_hash_suffix = re.compile(r"[|#].*")
catart_edge['src'] = catart_edge['src'].map(lambda x: "Category:" + _pipe_or_hash_suffix.sub('', str(x)))

Wall time: 2min 43s


In [38]:
%%time
# Drop rows with 'src' = 'Category:nan' i.e. those that have no parent categories
#
# A boolean mask is used instead of drop(<index labels>): after explode() the
# index labels are repeated for every row that came from the same article, so
# a label-based drop would also delete every other row sharing that label.
# .copy() makes the result an independent frame, so the column assignments in
# the following cells do not raise SettingWithCopyWarning.

catart_edge = catart_edge.loc[catart_edge["src"] != "Category:nan"].copy()

Wall time: 10.6 s


In [39]:
%%time
# Look up the wikipediaID of every source category by its title;
# tryIDmap yields "NO_WIKIPEDIA_ID" where the title is unknown

catart_edge["src-wikipediaID"] = catart_edge["src"].map(lambda title: tryIDmap(str(title)))

Wall time: 7min 28s


In [40]:
%%time
# Tag the edge kind and add the three metric columns as NaN placeholders
catart_edge["type"] = "cat2art"
for _metric in ("intersection", "union", "similarity"):
    catart_edge[_metric] = np.nan

Wall time: 501 ms


In [41]:
%%time
# Put the columns in the final edge-file order and renumber the rows 0..n-1
# (the labels left over from explode() are discarded)

catart_edge = catart_edge.loc[
    :,
    ["src", "dst", "src-wikipediaID", "dst-wikipediaID", "type", "intersection", "union", "similarity"],
].reset_index(drop=True)

Wall time: 10.4 s


In [42]:
%%time
catart_edge

Wall time: 0 ns


Unnamed: 0,src,dst,src-wikipediaID,dst-wikipediaID,type,intersection,union,similarity
0,Category:Anarchism,Anarchism,780754,12,cat2art,,,
1,Category:Anti-capitalism,Anarchism,22737025,12,cat2art,,,
2,Category:Anti-fascism,Anarchism,7252754,12,cat2art,,,
3,Category:Economic ideologies,Anarchism,3050197,12,cat2art,,,
4,Category:Far-left politics,Anarchism,15899799,12,cat2art,,,
...,...,...,...,...,...,...,...,...
30275292,Category:Armenian poets,Maro Markarian,2611240,64149155,cat2art,,,
30275293,Category:1915 births,Maro Markarian,915457,64149155,cat2art,,,
30275294,Category:1999 deaths,Maro Markarian,979445,64149155,cat2art,,,
30275295,Category:People from Marneuli,Maro Markarian,31643514,64149155,cat2art,,,


In [43]:
%%time
# Save the category -> article edges; index = False keeps the row index out of the file
catart_edge.to_csv('F:/wikipedia-data/outputs/for-graphframes/catart_edge.csv', index = False)

Wall time: 3min 4s


### TBD - Article to article links data to edge dataframe input

In [44]:
# analogous to above

# Necessary string edits
## Remove all entries containing :
## Some links are to redirects --> need to be connected to article
## Remove all after pipe and #


### Append vertex and edge dataframes to csvs

In [45]:
%%time
# Stack category and article vertices into one table with a fresh 0..n-1 index.
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and
# removed in pandas 2.0.
all_vertex = pd.concat([cat_vertex, art_vertex], ignore_index = True)

Wall time: 1.34 s


In [46]:
all_vertex

Unnamed: 0,id,wikipedia-id,type,depth
0,Category:Futurama,690070,Category,
1,Category:World War II,690451,Category,
2,Category:Programming languages,690571,Category,
3,Category:Professional wrestling,690578,Category,
4,Category:Algebra,690637,Category,
...,...,...,...,...
8043933,Cardiff Australian Football Club,64149108,Article,
8043934,Haapsalu-Noarootsi Wetland Complex,64149136,Article,
8043935,El Caballero Country Club,64149147,Article,
8043936,Maro Markarian,64149155,Article,


In [47]:
%%time
# Stack cat2cat and cat2art edges into one table with a fresh 0..n-1 index.
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and
# removed in pandas 2.0.
all_edge = pd.concat([catcat_edge, catart_edge], ignore_index = True)

Wall time: 7.4 s


In [48]:
all_edge

Unnamed: 0,src,dst,src-wikipediaID,dst-wikipediaID,type,intersection,union,similarity
0,Category:Television series by 20th Century Fox Television,Category:Futurama,6015549,690070,cat2cat,,,
1,Category:Television series created by Matt Groening,Category:Futurama,37480813,690070,cat2cat,,,
2,Category:Wikipedia categories named after American animated television series,Category:Futurama,35315704,690070,cat2cat,,,
3,Category:Wikipedia categories named after mass media franchises,Category:Futurama,55466012,690070,cat2cat,,,
4,Category:20th Century Fox Television franchises,Category:Futurama,60330380,690070,cat2cat,,,
...,...,...,...,...,...,...,...,...
34303083,Category:Armenian poets,Maro Markarian,2611240,64149155,cat2art,,,
34303084,Category:1915 births,Maro Markarian,915457,64149155,cat2art,,,
34303085,Category:1999 deaths,Maro Markarian,979445,64149155,cat2art,,,
34303086,Category:People from Marneuli,Maro Markarian,31643514,64149155,cat2art,,,


In [49]:
%%time
# Save the combined vertex and edge tables; index = False keeps the row index out of both files
all_vertex.to_csv('F:/wikipedia-data/outputs/for-graphframes/all_vertex.csv', index = False)
all_edge.to_csv('F:/wikipedia-data/outputs/for-graphframes/all_edge.csv', index = False)

Wall time: 3min 49s
