In [3]:
import re
import pandas as pd
import csv

# Categories

## Space-delimited to comma-delimited

In [4]:
# Lines are matched against "<prefix>:<category>; <id> <id> ...".
# Compile the pattern once rather than on every loop iteration.
category_line_re = re.compile(r'^\w+:(?P<cat>[^;]+);\s(?P<IDs>[\d\s]+)')

# NOTE(review): no encoding is passed, so both files use the platform default
# -- confirm that matches the encoding of the original dump.
with open('original data/wiki-topcats-categories.txt') as file, \
        open('node_cats.csv', 'w', newline='') as csv_file:
    # initialize writer and write column names
    writer = csv.writer(csv_file)
    writer.writerow(['Id', 'Category'])

    # stream the file line by line instead of loading it all with readlines()
    for line in file:
        match = category_line_re.search(line)
        # skip lines that do not follow the expected layout (e.g. blank lines)
        # instead of crashing on `None.group(...)`
        if match is None:
            continue

        category = match.group('cat')
        # str.split() with no argument splits on any run of whitespace and
        # ignores the trailing newline, so no empty Id is produced at the end
        # of a line (the old sub-then-split(',') wrote one empty-Id row per
        # category)
        page_ids = match.group('IDs').split()

        # one (Id, Category) row per page listed under this category
        writer.writerows([page_id, category] for page_id in page_ids)

In [5]:
# Top categories
# Load the (Id, Category) pairs, keeping Id as text, and discard any row
# that has a missing value.
cats_df = pd.read_csv('node_cats.csv', dtype={'Id': 'str'}).dropna()

# number of articles per category (ten most frequent)
cats_df['Category'].value_counts().head(10)

Category
Living_people                            418223
Year_of_birth_missing_(living_people)     34721
English-language_films                    22699
American_films                            15302
American_film_actors                      13938
Main_Belt_asteroids                       13704
Black-and-white_films                     12174
American_television_actors                11661
The_Football_League_players                9467
English_footballers                        9237
Name: count, dtype: int64

## Group each ID's categories

In [6]:
# page IDs of every row whose category is Living_people
living_mask = cats_df['Category'] == 'Living_people'
people_ids = cats_df.loc[living_mask, 'Id'].tolist()

# collapse to one row per Id, gathering that Id's categories into a list.
# NOTE: this rebinds cats_df, so the cell above must be re-run before
# running this cell a second time.
categories_by_id = cats_df.groupby('Id')['Category'].agg(list)
cats_df = categories_by_id.reset_index()

# Nodes

## Space-delimited to comma-delimited

In [7]:
# A usable line contains "<id> <label>": digits, one whitespace character,
# then at least one more character.  Compile the pattern once up front.
page_line_re = re.compile(r'(?P<id>\d+)\s(?P<label>.+)')

# NOTE(review): no encoding is passed, so both files use the platform default
# -- confirm that matches the encoding of the original dump.
with open('original data/wiki-topcats-page-names.txt') as file, \
        open('all_nodes.csv', 'w', newline='') as csv_file:
    # initialize writer and write column names
    writer = csv.writer(csv_file)
    writer.writerow(['Id', 'Label'])

    # stream the file line by line instead of loading it all with readlines()
    for line in file:
        # select id and label
        match = page_line_re.search(line)

        # skip lines with missing id or label; an explicit None check replaces
        # the bare `except:`, which also swallowed KeyboardInterrupt and any
        # unrelated error
        if match is None:
            continue

        writer.writerow([match.group('id'), match.group('label')])

## Select living_people and add their categories

In [8]:
nodes_df = (
    pd.read_csv('all_nodes.csv', dtype={'Id': 'str'})
    # select living_people: keep rows whose Id appears in people_ids
    .loc[lambda frame: frame['Id'].isin(people_ids)]
    # merge categories into nodes (matching on Id)
    .merge(cats_df, on='Id')
    .set_index('Id')
)

nodes_df

Unnamed: 0_level_0,Label,Category
Id,Unnamed: 1_level_1,Unnamed: 2_level_1
52,Hung Huang,"[Living_people, Chinese_actors, Vassar_College..."
62,Richard Cytowic,"[Living_people, People_from_Trenton,_New_Jerse..."
64,Cretien van Campen,"[Living_people, Utrecht_University_alumni]"
66,James Wannerton,"[People_from_Blackpool, Living_people]"
70,Marissa Paternoster,"[Living_people, Musicians_from_New_Jersey, Ame..."
...,...,...
1791474,Stan McGarvey,"[Living_people, Year_of_birth_missing_(living_..."
1791478,Sergey Alexeyevich Kiselyov,"[Living_people, Russian_footballers]"
1791482,Bobby Kerr (footballer),"[Blackpool_F.C._players, Sunderland_A.F.C._pla..."
1791486,Peter Grummitt,"[Brighton_&_Hove_Albion_F.C._players, English_..."


# Read and filter edges

In [9]:
# Read the whitespace-separated edge list; both endpoints are kept as text.
# The separator is a raw string: '\s' in a plain literal is an invalid escape
# sequence and triggers a warning on current Python versions.
edges_df = pd.read_csv(
    'original data/wiki-topcats.txt',
    sep=r'\s+',
    names=['Source', 'Target'],
    dtype={'Source': 'str', 'Target': 'str'},
)
# filter where source and target are in people_ids
both_in_people = edges_df['Source'].isin(people_ids) & edges_df['Target'].isin(people_ids)
edges_df = edges_df[both_in_people].set_index('Source')
edges_df

Unnamed: 0_level_0,Target
Source,Unnamed: 1_level_1
52,401135
52,1069112
52,1163551
62,12162
62,167659
...,...
1790898,1790901
1791431,330401
1791488,1495519
1791488,1496048


# Export CSVs

In [10]:
# Write the filtered node table to disk.
nodes_df.to_csv(path_or_buf='filtered_nodes.csv')
# Write the filtered edge table to disk.
edges_df.to_csv(path_or_buf='filtered_edges.csv')