In [None]:
import pandas as pd
import numpy as np
import csv
import re
from ast import literal_eval
import time

In [None]:
# Raise pandas' display limits (rows, columns, characters per cell) for this notebook.
for _option_name, _option_value in (
    ('display.max_rows', 500),
    ('display.max_columns', 500),
    ('display.max_colwidth', 1000),
):
    pd.set_option(_option_name, _option_value)

### Import data from CSV files

In [None]:
# Both input CSVs are read from the same outputs directory.
_OUTPUTS_DIR = 'F:/wikipedia-data/outputs'
FILE_PATH_CAT = _OUTPUTS_DIR + '/categories.csv'
FILE_PATH_ART = _OUTPUTS_DIR + '/articles_without_links.csv'

In [None]:
%%time
# Read the category CSV at FILE_PATH_CAT into a DataFrame.
cat_origData = pd.read_csv(FILE_PATH_CAT)

In [None]:
%%time
# Normalise cat_origData after the CSV load:
#   - 'title' is forced to str (a missing title becomes the string "nan"),
#   - 'parentCategories' is parsed from its Python-literal text with literal_eval.

cat_origData['title'] = cat_origData['title'].map(str)
cat_origData['parentCategories'] = cat_origData['parentCategories'].map(literal_eval)

In [None]:
%%time
# Read the article CSV at FILE_PATH_ART into a DataFrame.
art_origData = pd.read_csv(FILE_PATH_ART)

In [None]:
%%time
# Normalise art_origData after the CSV load:
#   - 'title' is forced to str (a missing title becomes the string "nan"),
#   - 'parentCategories' is parsed from its Python-literal text with literal_eval.

art_origData['title'] = art_origData['title'].map(str)
art_origData['parentCategories'] = art_origData['parentCategories'].map(literal_eval)

### Create title to wikiID mapping Series

In [None]:
%%time
# Build the page-title -> wikipediaID lookup table from categories and articles
# (every column except 'parentCategories' is kept).
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and
# removed in pandas 2.0. ignore_index=True yields the fresh 0..n-1 index that
# previously required a separate reset_index(drop=True).

wikiId_mapping = pd.concat(
    [
        cat_origData.drop(columns=['parentCategories']),
        art_origData.drop(columns=['parentCategories']),
    ],
    ignore_index=True,
)
# Titles must be plain str so that missing ones show up as the literal "nan".
wikiId_mapping["title"] = wikiId_mapping["title"].map(str)



In [None]:
%%time
# Remove every row whose title is the literal string "nan"
# (~16 000 rows according to the original note).

toDrop = wikiId_mapping.index[wikiId_mapping["title"] == "nan"]
wikiId_mapping.drop(index=toDrop, inplace=True)

In [None]:
%%time
# Drop duplicate titles (4 rows dropped; the last instance of each title is kept).
# A unique title is what lets a lookup by title return a single value later on.

wikiId_mapping.drop_duplicates(subset = "title", keep = "last", inplace = True)

In [None]:
# Index the table by title, then squeeze(): with a single column left
# (presumably 'id' - confirm against the CSV schema) this yields a Series,
# so wikiId_mapping[title] returns the stored value.
wikiId_mapping = wikiId_mapping.set_index("title").squeeze()

In [None]:
# Display the title-indexed mapping for a visual check.
wikiId_mapping

### Function definition

In [None]:
# Define function for mapping wikipediaID

def tryIDmap(x, mapping=None):
    """Return the value stored under key ``x``, or a sentinel when it is absent.

    Parameters
    ----------
    x : hashable
        Key to look up; the callers in this notebook pass page titles as ``str``.
    mapping : mapping-like, optional
        Any object supporting ``mapping[x]``. When omitted, the notebook-level
        ``wikiId_mapping`` is used, so existing one-argument calls are unchanged.

    Returns
    -------
    object
        ``mapping[x]`` if the key exists, otherwise the string
        ``"NO_WIKIPEDIA_ID"``.
    """
    if mapping is None:
        mapping = wikiId_mapping
    try:
        return mapping[x]
    except KeyError:
        # Only a missing key means "no id". The previous bare `except:` also
        # swallowed KeyboardInterrupt (Ctrl-C during a long .apply) and real
        # errors such as an undefined mapping, silently filling the column
        # with the sentinel.
        return "NO_WIKIPEDIA_ID"

### Category data to vertex dataframe input

In [None]:
# id = page name
# type = Category
# depth = null

In [None]:
%%time
# Category vertices: the page title becomes 'id', the wiki id is stored as a
# string, 'type' is the constant "Category", and 'depth' is left unfilled.

cat_vertex = pd.DataFrame(columns=["id", "wikipedia-id", "type", "depth"])

for _vertex_col, _source_col in (("id", 'title'), ("wikipedia-id", 'id')):
    cat_vertex[_vertex_col] = cat_origData[_source_col].map(str)
cat_vertex["type"] = "Category"

In [None]:
# Write the category vertices to CSV; index = False leaves the row index out of the file.
cat_vertex.to_csv('F:/wikipedia-data/outputs/for-graphframes/cat_vertex.csv', index = False)

In [None]:
# Display the category vertex table for a visual check.
cat_vertex

### Article data to vertex dataframe input

In [None]:
# id = page name
# type = Article
# depth = null

In [None]:
%%time
# Article vertices: the page title becomes 'id', the wiki id is stored as a
# string, 'type' is the constant "Article", and 'depth' is left unfilled.

art_vertex = pd.DataFrame(columns=["id", "wikipedia-id", "type", "depth"])

for _vertex_col, _source_col in (("id", 'title'), ("wikipedia-id", 'id')):
    art_vertex[_vertex_col] = art_origData[_source_col].map(str)
art_vertex["type"] = "Article"

In [None]:
# Write the article vertices to CSV; index = False leaves the row index out of the file.
art_vertex.to_csv('F:/wikipedia-data/outputs/for-graphframes/art_vertex.csv', index = False)

In [None]:
# Display the article vertex table for a visual check.
art_vertex

### Category data to edge dataframe input

In [None]:
# src = from parentCategories column
# dst = from title column
# intersection = null
# union = null
# similarity = null

In [None]:
# Explode dataframe
# Make string adjustments
# Add wikipedia ids

In [None]:
%%time
# Explode dataframe: one output row per element of each list-like
# 'parentCategories' entry; the other columns are repeated on every such row.
catcat_edge = cat_origData.explode("parentCategories")

In [None]:
# Relabel for the edge table: the page itself is the destination (dst),
# each parent category is the source (src), and the page's id goes with dst.
catcat_edge = catcat_edge.rename(
    columns={
        'title': 'dst',
        'parentCategories': 'src',
        'id': 'dst-wikipediaID',
    }
)

In [None]:
%%time
# String adjustments: cut each parent-category entry at its first "|", remove
# any pipe still left, then prefix the result with "Category:".
# The patterns are raw strings: "\|" inside a normal string literal is an
# invalid escape sequence (DeprecationWarning, SyntaxWarning from Python 3.12).
# They are compiled once and applied in a single pass over the column.

_pipe_tail = re.compile(r"(\|)(.+)")   # first pipe plus the rest of that line
_stray_pipe = re.compile(r"(\|)")      # any remaining pipe, e.g. a trailing "|"


def _clean_parent_category(value):
    """Return "Category:" + str(value) with the pipe suffix and stray pipes removed."""
    text = _pipe_tail.sub('', str(value))
    text = _stray_pipe.sub('', text)
    return "Category:" + text


# Missing entries go through str() as well and end up as "Category:nan".
catcat_edge['src'] = catcat_edge['src'].apply(_clean_parent_category)



In [None]:
%%time
# Drop rows with 'src' = 'Category:nan' i.e. those that have no parent categories.
# A boolean mask is used instead of drop() by index label: after explode() the
# index labels repeat for every parent of the same page, so dropping by label
# would also remove any other row sharing a label with a matched row.
# .copy() makes the result an independent frame, so later column assignments
# do not raise chained-assignment warnings.

catcat_edge = catcat_edge.loc[catcat_edge["src"] != "Category:nan"].copy()

In [None]:
%%time
# Look up the wiki id of every source category via tryIDmap; titles it cannot
# resolve receive its sentinel value instead.

catcat_edge["src-wikipediaID"] = catcat_edge["src"].map(lambda src_title: tryIDmap(str(src_title)))

In [None]:
%%time
# Label the rows as category-to-category edges and add the three metric
# columns, all NaN at this stage.
catcat_edge["type"] = "cat2cat"
for _metric_col in ("intersection", "union", "similarity"):
    catcat_edge[_metric_col] = np.nan

In [None]:
%%time
# Put the columns into the edge-file order and renumber the rows from 0.

_edge_columns = ["src", "dst", "src-wikipediaID", "dst-wikipediaID", "type", "intersection", "union", "similarity"]
catcat_edge = catcat_edge[_edge_columns].reset_index(drop=True)

In [None]:
# Display the category-to-category edge table for a visual check.
catcat_edge

In [None]:
# wikiId_mapping["Category:Main topic classifications"]

In [None]:
%%time
# Write the category-to-category edges to CSV; index = False leaves the row index out.
catcat_edge.to_csv('F:/wikipedia-data/outputs/for-graphframes/catcat_edge.csv', index = False)

### Article to category data to edge dataframe input

In [None]:
# analogous to above

# Explode dataframe
# Make string adjustments - here also strip everything from a '#' onwards
# Add wikipedia ids

In [None]:
%%time
# Explode dataframe: one output row per element of each list-like
# 'parentCategories' entry; the other columns are repeated on every such row.
catart_edge = art_origData.explode("parentCategories")

In [None]:
# Relabel for the edge table: the article itself is the destination (dst),
# each parent category is the source (src), and the article's id goes with dst.
catart_edge = catart_edge.rename(
    columns={
        'title': 'dst',
        'parentCategories': 'src',
        'id': 'dst-wikipediaID',
    }
)

In [None]:
%%time
# String adjustments: prefix each parent-category entry with "Category:", then
# cut it at the first "|" and at the first "#", removing any such character
# that is still left afterwards.
# The patterns are raw strings: "\|" and "\#" inside normal string literals are
# invalid escape sequences (DeprecationWarning, SyntaxWarning from Python 3.12).
# They are compiled once and applied in a single pass over the column, in the
# same order as the original five passes.

_src_cleanup_patterns = (
    re.compile(r"(\|)(.+)"),   # first pipe plus the rest of that line
    re.compile(r"(\|)"),       # any remaining pipe
    re.compile(r"(\#)(.+)"),   # first hash plus the rest of that line
    re.compile(r"(\#)"),       # any remaining hash
)


def _clean_article_category(value):
    """Return "Category:" + str(value) with pipe/hash suffixes and stray marks removed."""
    text = "Category:" + str(value)
    for pattern in _src_cleanup_patterns:
        text = pattern.sub('', text)
    return text


# Missing entries go through str() as well and end up as "Category:nan".
catart_edge['src'] = catart_edge['src'].apply(_clean_article_category)

In [None]:
%%time
# Drop rows with 'src' = 'Category:nan' i.e. those that have no parent categories.
# A boolean mask is used instead of drop() by index label: after explode() the
# index labels repeat for every parent of the same article, so dropping by label
# would also remove any other row sharing a label with a matched row.
# .copy() makes the result an independent frame, so later column assignments
# do not raise chained-assignment warnings.

catart_edge = catart_edge.loc[catart_edge["src"] != "Category:nan"].copy()

In [None]:
%%time
# Look up the wiki id of every source category via tryIDmap; titles it cannot
# resolve receive its sentinel value instead.

catart_edge["src-wikipediaID"] = catart_edge["src"].map(lambda src_title: tryIDmap(str(src_title)))

In [None]:
%%time
# Label the rows as category-to-article edges and add the three metric
# columns, all NaN at this stage.
catart_edge["type"] = "cat2art"
for _metric_col in ("intersection", "union", "similarity"):
    catart_edge[_metric_col] = np.nan

In [None]:
%%time
# Put the columns into the edge-file order and renumber the rows from 0.

_edge_columns = ["src", "dst", "src-wikipediaID", "dst-wikipediaID", "type", "intersection", "union", "similarity"]
catart_edge = catart_edge[_edge_columns].reset_index(drop=True)

In [None]:
%%time
# Display the category-to-article edge table for a visual check.
catart_edge

In [None]:
%%time
# Write the category-to-article edges to CSV; index = False leaves the row index out.
catart_edge.to_csv('F:/wikipedia-data/outputs/for-graphframes/catart_edge.csv', index = False)

### TBD - Article to article links data to edge dataframe input

In [None]:
# analogous to above

# Necessary string edits
## Remove all entries containing :
## Some links are to redirects --> need to be connected to article
## Remove all after pipe and #


### Append vertex and edge dataframes to csvs

In [None]:
%%time
# Stack category and article vertices into one table with a fresh 0..n-1 index.
# pd.concat replaces DataFrame.append (deprecated in pandas 1.4, removed in 2.0).
all_vertex = pd.concat([cat_vertex, art_vertex], ignore_index=True)

In [None]:
# Display the combined vertex table for a visual check.
all_vertex

In [None]:
%%time
# Stack both edge tables into one table with a fresh 0..n-1 index.
# pd.concat replaces DataFrame.append (deprecated in pandas 1.4, removed in 2.0).
all_edge = pd.concat([catcat_edge, catart_edge], ignore_index=True)

In [None]:
# Display the combined edge table for a visual check.
all_edge

In [None]:
%%time
# Write the combined vertex and edge tables to CSV; index = False leaves the row index out.
all_vertex.to_csv('F:/wikipedia-data/outputs/for-graphframes/all_vertex.csv', index = False)
all_edge.to_csv('F:/wikipedia-data/outputs/for-graphframes/all_edge.csv', index = False)