In [1]:
import re
import pandas as pd
import csv

# Categories

## Space-delimited to comma-delimited

In [2]:
with open('original data/wiki-topcats-categories.txt') as file:
    with open('exported csvs/node_cats.csv', 'w', newline='') as csv_file:
        # initialize writer and write column names
        writer = csv.writer(csv_file)
        writer.writerow(['Id', 'Category'])
        
        lines = file.readlines()
        for line in lines:
            regex = re.search(r'^\w+:(?P<cat>[^;]+);\s(?P<IDs>[\d\s]+)', line)
            id_list = re.sub('\s', ',', regex.group('IDs')).split(',')
            
            for id in id_list:
                csv_format = [[id, regex.group('cat')]]
                writer.writerows(csv_format)

In [3]:
cats_df = pd.read_csv('exported csvs/node_cats.csv', dtype={'Id': 'str'})
cats_df = cats_df.dropna()
cats_df

Unnamed: 0,Id,Category
0,301,Buprestoidea
1,302,Buprestoidea
2,303,Buprestoidea
3,304,Buprestoidea
4,305,Buprestoidea
...,...,...
4776888,1791426,Tachinidae
4776889,1791427,Tachinidae
4776890,1791428,Tachinidae
4776891,1791429,Tachinidae


## Create lists of node IDs in people and science categories

In [125]:
# create list of people page IDs 
people_ids = cats_df[cats_df['Category'] == 'Living_people']['Id'].values.tolist()
# create list of science page IDs
branches = ['physics', 'chemistry', 'biology', 'engineering', 'math', 'psychology', 'sociology']
sci_cat_match = '|'.join(branches)
sci_ids = cats_df[cats_df['Category'].str.contains(sci_cat_match, case=False)]['Id'].unique().tolist()

In [126]:
combined_id = people_ids + sci_ids
cats_df = cats_df[cats_df['Id'].isin(combined_id)]
cats_df

Unnamed: 0,Id,Category
546,1058,People_from_Worcester
547,1059,People_from_Worcester
550,76515,People_from_Worcester
552,78094,People_from_Worcester
553,79069,People_from_Worcester
...,...,...
4774243,1788135,Sri_Lankan_Tamil_politicians
4774244,1788136,Sri_Lankan_Tamil_politicians
4774252,1788253,Sri_Lankan_Tamil_politicians
4774255,1788288,Sri_Lankan_Tamil_politicians


## Group each ID's categories into single column

In [127]:
# combine each id's categories into one column
cats_df = cats_df.groupby('Id')['Category'].agg(list).reset_index()
cats_df

Unnamed: 0,Id,Category
0,1000004,"[Living_people, German_musicians, German_compo..."
1,1000005,"[Living_people, People_from_Hamburg, People_fr..."
2,1000006,"[Living_people, People_from_Budapest]"
3,1000007,"[Living_people, Year_of_birth_missing_(living_..."
4,1000008,"[People_from_Tel_Aviv, Israeli_Jews, Living_pe..."
...,...,...
433453,999989,"[Living_people, German_actors, German_televisi..."
433454,999990,"[Living_people, People_from_Frankfurt, German-..."
433455,999991,"[Living_people, People_from_Frankfurt, German-..."
433456,999992,"[Living_people, German_singers]"


# Nodes

## Space-delimited to comma-delimited

In [None]:
with open('original data/wiki-topcats-page-names.txt') as file:
    with open('exported csvs/all_nodes.csv', 'w', newline='') as csv_file:
        # initialize writer and write column names
        writer = csv.writer(csv_file)
        writer.writerow(['Id', 'Label'])
        
        lines = file.readlines()
        for line in lines:
            # select id and label
            regex = re.search(r'(?P<id>\d+)\s(?P<label>.+)', line)
            
            # skip lines with missing id or label
            try:
                regex.group('id')
                regex.group('label')
            except:
                continue
            
            # create list to write to csv
            csv_format = [[regex.group('id'), regex.group('label')]]
            writer.writerows(csv_format)

## Select people/science nodes and add their categories

In [None]:
nodes_df = pd.read_csv('exported csvs/all_nodes.csv', dtype={'Id': 'str'})

# select living_people
people_nodes = nodes_df[nodes_df['Id'].isin(people_ids)]
# merge categories into nodes
people_nodes = people_nodes.merge(cats_df, on='Id')
people_nodes = people_nodes.set_index('Id')

# select science IDs
sci_nodes = nodes_df[nodes_df['Id'].isin(sci_ids)]
# merge categories into nodes
sci_nodes = sci_nodes.merge(cats_df, on='Id')
sci_nodes = sci_nodes.set_index('Id')

In [130]:
people_nodes

Unnamed: 0_level_0,Label,Category
Id,Unnamed: 1_level_1,Unnamed: 2_level_1
52,Hung Huang,"[Living_people, Chinese_actors, Vassar_College..."
62,Richard Cytowic,"[Living_people, People_from_Trenton,_New_Jerse..."
64,Cretien van Campen,"[Living_people, Utrecht_University_alumni]"
66,James Wannerton,"[People_from_Blackpool, Living_people]"
70,Marissa Paternoster,"[Living_people, Musicians_from_New_Jersey, Ame..."
...,...,...
1791474,Stan McGarvey,"[Living_people, Year_of_birth_missing_(living_..."
1791478,Sergey Alexeyevich Kiselyov,"[Living_people, Russian_footballers]"
1791482,Bobby Kerr (footballer),"[Blackpool_F.C._players, Sunderland_A.F.C._pla..."
1791486,Peter Grummitt,"[Brighton_&_Hove_Albion_F.C._players, English_..."


In [131]:
sci_nodes

Unnamed: 0_level_0,Label,Category
Id,Unnamed: 1_level_1,Unnamed: 2_level_1
4,Zariski's main theorem,"[Mathematical_theorems, Algebraic_geometry]"
5,FultonHansen connectedness theorem,[Mathematical_theorems]
8,Bing's recognition theorem,"[Mathematical_theorems, Geometric_topology]"
175,NPDGamma,"[Nuclear_physics, Quantum_mechanics, Quantum_f..."
182,Krener's theorem,"[Control_theory, Mathematical_theorems]"
...,...,...
1789793,Responsible autonomy,"[Sociology, Management]"
1789808,Nuage,[Cell_biology]
1790812,Social inertia,"[Social_psychology, Sociology]"
1791445,Whitney extension theorem,"[Mathematical_theorems, Mathematical_analysis]"


# Load and filter edges

In [132]:
edges_df = pd.read_csv('original data/wiki-topcats.txt', sep='\s+', names=['Source', 'Target'], dtype={'Source': 'str', 'Target': 'str'})

# filter where source and target are in people_ids
people_edges = edges_df[((edges_df['Source'].isin(people_ids)) & (edges_df['Target'].isin(people_ids)))]
people_edges = people_edges.set_index('Source')

# filter where source and target are in people_ids
sci_edges = edges_df[((edges_df['Source'].isin(sci_ids)) & (edges_df['Target'].isin(sci_ids)))]
sci_edges = sci_edges.set_index('Source')

print(people_edges)
print(sci_edges)

          Target
Source          
52        401135
52       1069112
52       1163551
62         12162
62        167659
...          ...
1790898  1790901
1791431   330401
1791488  1495519
1791488  1496048
1791488  1498126

[1218406 rows x 1 columns]
          Target
Source          
8        1101610
8        1102709
8        1102847
175        16146
175        16175
...          ...
1791445  1102719
1791465     5579
1791465     5644
1791465     5654
1791465    11049

[96771 rows x 1 columns]


# Export CSVs

In [None]:
people_nodes.to_csv('exported csvs/people_nodes.csv')
people_edges.to_csv('exported csvs/people_edges.csv')
sci_nodes.to_csv('exported csvs/sci_nodes.csv')
sci_edges.to_csv('exported csvs/sci_edges.csv')