In [2]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import os
import ast
import networkx as nx
from collections import Counter
os.chdir("./../..")
print(os.getcwd())


/home/gabri/Desktop/ADA/ada-2024-project-theadacuates


In [None]:
df_raw = pd.read_csv('./data/output/base_data/raw_processed.csv')   
df_processed = pd.read_csv('./data/output/base_data/paths_processed.csv')

FileNotFoundError: [Errno 2] No such file or directory: 'data/output/base_data/raw_processed.csv'

# 1. Compare Row counts

In [34]:
raw_row_count = df_raw.shape[0]
processed_row_count = df_processed.shape[0]
removed_paths = raw_row_count - processed_row_count

print(f"Total rows in raw data: {raw_row_count}")
print(f"Total rows in processed data: {processed_row_count}")
print(f"Paths removed during preprocessing: {removed_paths}")


Total rows in raw data: 76193
Total rows in processed data: 74140
Paths removed during preprocessing: 2053


# 2. Compare Path Completion Status

In [35]:
# Count finished vs. unfinished in raw data
raw_finished_counts = df_raw['finished'].value_counts()
print("Raw Data - Finished/Unfinished Counts:\n", raw_finished_counts)

# Count finished vs. unfinished in processed data
processed_finished_counts = df_processed['finished'].value_counts()
print("Processed Data - Finished/Unfinished Counts:\n", processed_finished_counts)


Raw Data - Finished/Unfinished Counts:
 finished
True     51318
False    24875
Name: count, dtype: int64
Processed Data - Finished/Unfinished Counts:
 finished
True     51280
False    22860
Name: count, dtype: int64


# 3. Check Balance between finished and unfinished path


In [36]:
# Calculate percentage of finished vs. unfinished paths in each dataset
raw_finished_ratio = raw_finished_counts[True] / raw_row_count
processed_finished_ratio = processed_finished_counts[True] / processed_row_count

print(f"Percentage of finished paths in raw data: {raw_finished_ratio:.2%}")
print(f"Percentage of finished paths in processed data: {processed_finished_ratio:.2%}")


Percentage of finished paths in raw data: 67.35%
Percentage of finished paths in processed data: 69.17%


# 4. Find Most Frequent pages

In [49]:
# Convert path_list strings to actual lists if necessary
df_processed['path_list'] = df_processed['path_list'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)

# Flatten the list of pages and count occurrences of each page
all_pages = [page for path in df_processed['path_list'] for page in path]
page_counts = Counter(all_pages)

# Display the top 20 most frequent pages
top_20_pages = page_counts.most_common(20)
print("Top 20 Most Frequent Pages (Hubs):\n", top_20_pages)


Top 20 Most Frequent Pages (Hubs):
 [('<', 33307), ('United States', 12438), ('Europe', 5606), ('United Kingdom', 5321), ('England', 4550), ('Earth', 4209), ('Africa', 3586), ('World War II', 2930), ('North America', 2590), ('Animal', 2467), ('Human', 2402), ('Mammal', 2331), ('Brain', 2324), ('France', 2247), ('Germany', 2221), ('English language', 2026), ('Computer', 1951), ('Science', 1918), ('Theatre', 1901), ('Asteroid', 1892)]


# Find most frequent pages using PageRank

In [52]:
# Unweighted, directed graph
G_unweighted = nx.DiGraph()
for path in df_processed['path_list']:
    nx.add_path(G_unweighted, path)

# Weighted, directed graph based on the frequency of connections
G_weighted = nx.DiGraph()
for path in df_processed['path_list']:
    for i in range(len(path) - 1):
        source, target = path[i], path[i + 1]
        if G_weighted.has_edge(source, target):
            G_weighted[source][target]['weight'] += 1
        else:
            G_weighted.add_edge(source, target, weight=1)

In [53]:
# Degree, Betweenness, Closeness Centrality, and PageRank for weighted graph
degree_centrality = nx.degree_centrality(G_weighted)
betweenness_centrality = nx.betweenness_centrality(G_weighted, weight='weight')
closeness_centrality = nx.closeness_centrality(G_weighted)
pagerank = nx.pagerank(G_weighted, weight='weight')

# Combine into DataFrame for easy comparison
metrics_df = pd.DataFrame({
    'Degree Centrality': degree_centrality,
    'Betweenness Centrality': betweenness_centrality,
    'Closeness Centrality': closeness_centrality,
    'PageRank': pagerank
}).sort_values(by='PageRank', ascending=False)

print("Top Pages by Network Metrics:\n", metrics_df.head(10))

Top Pages by Network Metrics:
                 Degree Centrality  Betweenness Centrality  \
<                        1.290500                0.539928   
United States            0.366595                0.022428   
Europe                   0.203159                0.013779   
United Kingdom           0.201484                0.011669   
England                  0.175879                0.004324   
Africa                   0.122996                0.011625   
Earth                    0.082556                0.002377   
World War II             0.150275                0.006469   
North America            0.088777                0.005367   
France                   0.137114                0.005085   

                Closeness Centrality  PageRank  
<                           0.748388  0.072120  
United States               0.580175  0.028445  
Europe                      0.535289  0.012747  
United Kingdom              0.532153  0.012233  
England                     0.507222  0.010827  
Afr

# 5. Number of ingoing and outgoing links for the top hubs

In [54]:
# Select the top 10 pages by PageRank from metrics_df
top_pages = metrics_df.head(10).index  # Extract the top page names based on PageRank

# Calculate ingoing and outgoing link counts for top pages
ingoing_outgoing_links = []
for page in top_pages:
    ingoing_links = G_weighted.in_degree(page, weight='weight')  # Number of ingoing links (weighted)
    outgoing_links = G_weighted.out_degree(page, weight='weight')  # Number of outgoing links (weighted)
    ingoing_outgoing_links.append((page, ingoing_links, outgoing_links))

# Create a DataFrame for better visualization
links_df = pd.DataFrame(ingoing_outgoing_links, columns=['Page', 'Ingoing Links', 'Outgoing Links'])
print("Top Pages with Ingoing and Outgoing Links:\n", links_df)


Top Pages with Ingoing and Outgoing Links:
              Page  Ingoing Links  Outgoing Links
0               <          33307           31088
1   United States          12389           11854
2          Europe           5565            5414
3  United Kingdom           5300            5034
4         England           4411            4195
5          Africa           3490            3425
6           Earth           4168            4035
7    World War II           2895            2766
8   North America           2547            2432
9          France           2225            2077


# 6. Categories

In [None]:
# Load categories data 
df_categories = pd.read_csv('./data/output/base_data/categories_processed.csv')
df_categories = df_categories[['article_name', 'article_category_1']]
df_categories = df_categories.rename(columns={'article_category_1': 'primary_category'})

# Remove duplicates to ensure each article has a unique category
df_categories = df_categories.drop_duplicates(subset=['article_name'])

# Merge to add start category based on `source_link`
df_processed = df_processed.merge(
    df_categories,
    left_on='source_link',
    right_on='article_name',
    how='left'
).rename(columns={'primary_category': 'start_category'})

# Merge to add end category based on `target_link`
df_processed = df_processed.merge(
    df_categories,
    left_on='target_link',
    right_on='article_name',
    how='left'
).rename(columns={'primary_category': 'end_category'})

# Drop redundant columns from merging
df_processed = df_processed.drop(columns=['article_name_x', 'article_name_y'])

# Count paths that start in one category and end in another
category_path_counts = df_processed.groupby(['start_category', 'end_category']).size().unstack(fill_value=0)

# Display the result
print("Number of Paths by Start and End Categories:\n", category_path_counts)

Number of Paths by Start and End Categories:
 end_category             Art  Business Studies  Citizenship  Countries  \
start_category                                                           
Art                        6                14           22         11   
Business Studies          18                42           55         46   
Citizenship               16                63          144         82   
Countries                  7                31           71         65   
Design and Technology     32                54          131         75   
Everyday life             44               144          366        203   
Geography                 90               247          589        412   
History                   56                99          281        179   
IT                         6                24           49         65   
Language and literature   11                32          104         54   
Mathematics                3                11           42       

In [59]:
# Top Start Categories
top_start_categories = df_processed['start_category'].value_counts().head(10)
print("Top Start Categories:\n", top_start_categories)

# Top Finish Categories
top_finish_categories = df_processed['end_category'].value_counts().head(10)
print("\nTop Finish Categories:\n", top_finish_categories)



Top Start Categories:
 start_category
Science                    20316
Geography                  12725
Everyday life               7815
People                      6832
History                     5970
Design and Technology       4371
Language and literature     3920
Citizenship                 2915
Religion                    1989
Countries                   1708
Name: count, dtype: int64

Top Finish Categories:
 end_category
Science                    17823
Geography                  13745
Everyday life               8400
History                     7359
People                      7226
Design and Technology       4723
Citizenship                 2991
Religion                    2167
Language and literature     2027
Countries                   1992
Name: count, dtype: int64


In [60]:
# Assuming `top_pages` contains the names of the top hub pages
# and `df_categories` has columns 'article_name' and 'primary_category'

# Filter df_categories to only include the top hub pages
hub_categories = df_categories[df_categories['article_name'].isin(top_pages)]

# Count the most common categories among the top hubs
top_hub_categories = hub_categories['primary_category'].value_counts().head(10)

print("Most Common Categories Among Top Hubs:\n", top_hub_categories)

Most Common Categories Among Top Hubs:
 primary_category
Geography    4
Countries    3
Science      1
History      1
Name: count, dtype: int64
