In [12]:
import pandas as pd
import numpy as np
from urllib.parse import unquote

# Location of the shortest-path distance matrix file. The raw-string prefix keeps
# the Windows backslashes literal. NOTE(review): this is an absolute, machine-specific path.
file_path = r'C:\Users\tonyh\Desktop\ADA\ada-2023-project-oddbits\Data\wikispeedia_paths-and-graph\shortest-path-distance-matrix.txt'

# Load every line, then glue the data lines back together, leaving out any line
# that begins with '#'
with open(file_path, 'r') as file:
    lines = list(file)
    input_matrix = ''.join(filter(lambda line: not line.startswith('#'), lines))

# Trim surrounding whitespace from the whole text and cut it into one string per row
rows = input_matrix.strip().split('\n')

# Empty container that will receive one list of distances per row
distances = list()

In [9]:
# Parse each row of the matrix text into a list of distances.
# Every character of a row is read on its own: a digit becomes that integer,
# '_' becomes None, and any other character is ignored.
# The list is rebuilt from scratch here (instead of appending to a list created
# in another cell) so that re-running this cell does not duplicate every row.
distances = [
    [int(char) if char.isdigit() else None
     for char in row
     if char.isdigit() or char == '_']
    for row in rows
]

# Convert the distances list to a NumPy array for easier computation
distance_matrix = np.array(distances)

In [22]:
# Compute the closeness centrality of every article from the distance matrix.
# Closeness is taken here as 1 / farness, where farness is the sum of the
# distances from an article to every other article for which a distance exists.
# NOTE(review): entries that are None are simply left out of the sum, so an
# article with few non-None distances gets a small farness and therefore a
# high score -- confirm this is the intended treatment.
num_articles = len(distance_matrix)

# Work on a float copy so that None becomes NaN and each row can be summed in
# one call instead of indexing the array element by element in a Python loop.
distance_values = np.array(distance_matrix, dtype=float)
np.fill_diagonal(distance_values, np.nan)  # Exclude the node itself
farness = np.nansum(distance_values, axis=1)

# Rows whose farness is 0 keep a centrality of 0 (avoids dividing by zero).
closeness = np.zeros(num_articles)
np.divide(1.0, farness, out=closeness, where=farness != 0)
centrality = closeness.tolist()

# Create a DataFrame with article index and centrality values
df_centrality = pd.DataFrame({
    'Article': range(num_articles),
    'Closeness Centrality': centrality
})

# Show the DataFrame
display(df_centrality)

Unnamed: 0,Article,Closeness Centrality
0,0,0.000073
1,1,0.000079
2,2,0.000078
3,3,0.000074
4,4,0.000073
...,...,...
4599,4599,0.000088
4600,4600,0.000083
4601,4601,0.000078
4602,4602,0.000064


In [23]:
articles_file_path = r'C:\Users\tonyh\Desktop\ADA\ada-2023-project-oddbits\Data\wikispeedia_paths-and-graph\articles.tsv'

# Read the articles from the articles.tsv file
with open(articles_file_path, 'r') as articles_file:
    articles_lines = articles_file.readlines()

# Extract the URL-decoded article names, one per line.
# Lines starting with '#', a line reading exactly 'articles', and blank lines are
# skipped. A blank line would otherwise become an empty name and shift every
# following name by one position relative to the centrality values.
article_names = []
for line in articles_lines:
    name = line.strip()
    if not name or line.startswith('#') or name == 'articles':
        continue
    article_names.append(unquote(name))

In [24]:
# Pair each article name with its centrality value, column by column
centrality_columns = {
    'Article': article_names,
    'Closeness Centrality': centrality,
}
df_centrality = pd.DataFrame(centrality_columns)

# Show the resulting table
display(df_centrality)

Unnamed: 0,Article,Closeness Centrality
0,Áedán_mac_Gabráin,0.000073
1,Åland,0.000079
2,Édouard_Manet,0.000078
3,Éire,0.000074
4,Óengus_I_of_the_Picts,0.000073
...,...,...
4599,Zionism,0.000088
4600,Zirconium,0.000083
4601,Zoroaster,0.000078
4602,Zuid-Gelders,0.000064


In [61]:
categories_file_path = r'C:\Users\tonyh\Desktop\ADA\ada-2023-project-oddbits\Data\wikispeedia_paths-and-graph\categories.tsv'

# Read the categories from the categories.tsv file
with open(categories_file_path, 'r') as categories_file:
    categories_lines = categories_file.readlines()

# Map each article to the last dotted component of the first category on its
# line that starts with 'subject.Countries'. Articles without such a category
# are left out of the dictionary.
# Each non-comment line is split on tabs; every field is URL-decoded and
# stripped, the first field is the article and the remaining fields are categories.
# The former fallback that looked for 'subject.Countries.' was removed: it only
# ran when nothing started with 'subject.Countries', so it could never match.
article_first_categories = {}
for line in categories_lines:
    if line.startswith('#'):
        continue
    article, *categories = map(str.strip, map(unquote, line.strip().split('\t')))
    first_category = next(
        (cat.split('.')[-1] for cat in categories if cat.startswith('subject.Countries')),
        None,
    )
    if first_category is not None:
        article_first_categories[article] = first_category

In [62]:
# Look up the stored category of every article (None when the article has none)
first_category_column = list(map(article_first_categories.get, article_names))

# Assemble names, categories and centrality values into a single table
df_centrality_first_categories = pd.DataFrame({
    'Article': article_names,
    'First Category': first_category_column,
    'Closeness Centrality': centrality,
})

# Show the resulting table
display(df_centrality_first_categories)

Unnamed: 0,Article,First Category,Closeness Centrality
0,Áedán_mac_Gabráin,,0.000073
1,Åland,Countries,0.000079
2,Édouard_Manet,,0.000078
3,Éire,Countries,0.000074
4,Óengus_I_of_the_Picts,,0.000073
...,...,...,...
4599,Zionism,,0.000088
4600,Zirconium,,0.000083
4601,Zoroaster,,0.000078
4602,Zuid-Gelders,,0.000064


In [63]:
# Keep only the rows whose 'First Category' is exactly "Countries"
is_country = df_centrality_first_categories['First Category'].eq('Countries')
countries_df = df_centrality_first_categories[is_country]

# Show the filtered table
display(countries_df)

Unnamed: 0,Article,First Category,Closeness Centrality
1,Åland,Countries,0.000079
3,Éire,Countries,0.000074
94,Abkhazia,Countries,0.000081
101,Abu_Dhabi,Countries,0.000080
127,Afghanistan,Countries,0.000087
...,...,...,...
4448,West_Bank,Countries,0.000082
4453,Western_Sahara,Countries,0.000082
4568,Yemen,Countries,0.000090
4587,Zambia,Countries,0.000086


In [66]:
# Order the country rows by closeness centrality, lowest first
sorted_countries_df = countries_df.sort_values(by='Closeness Centrality')

display(sorted_countries_df)

Unnamed: 0,Article,First Category,Closeness Centrality
1594,French_Polynesia,Countries,0.000073
3695,Serbia_and_Montenegro,Countries,0.000074
3,Éire,Countries,0.000074
4257,Turkish_Republic_of_Northern_Cyprus,Countries,0.000076
3808,South_Ossetia,Countries,0.000076
...,...,...,...
2433,Lebanon,Countries,0.000093
1694,Germany,Countries,0.000094
4293,United_Kingdom,Countries,0.000095
331,Argentina,Countries,0.000095
