In [1]:
from src.data_loader import *
from src.helpers import *

In [2]:
df_categories = read_categories()
df_finished = read_finished_paths()
df_unfinished = read_unfinished_paths()

# Drop all rows with Pikachu in target source and path (because pikachu doens't have category)
# migth handle it better by puting none for the pickachu value or be a category by itself
df_finished = df_finished.loc[~df_finished['path'].str.contains('Pikachu'), :]
df_unfinished = df_unfinished.loc[~df_unfinished['target'].str.contains('Pikachu'), :]
df_unfinished = df_unfinished.loc[~df_unfinished['path'].str.contains('Pikachu'), :]

df_categories_filtered = filter_most_specific_category(df_categories)

Finished Paths 
Number of rows before filtering: 51318
Invalid articles found in path: {'Wikipedia Text of the GNU Free Documentation License', 'Pikachu'}
Number of rows after filtering: 51210 

Unfinished Paths 
Number of rows before filtering: 24875
Invalid target articles found: {'The Rock', 'Bogota', 'Great', 'Usa', 'Sportacus', 'Rss', 'Mustard', 'Test', 'Georgia', 'Netbook', 'Black ops 2', 'Fats', 'The', 'Macedonia', 'Kashmir', 'Western Australia', 'Pikachu', 'Rat', 'Christmas', 'Adolph Hitler', 'Charlottes web', 'Long peper', 'English', ' Zebra', 'Podcast'}
Invalid articles found in path: {'Wikipedia Text of the GNU Free Documentation License', 'Pikachu'}
Number of rows after filtering: 24721 



In [3]:
tasks_finished = pd.DataFrame()

tasks_finished['source'] = df_finished['path'].apply(lambda x: x.split(';')[0])
tasks_finished['target'] = df_finished['path'].apply(lambda x: x.split(';')[-1])
df_unfinished['source'] = df_unfinished['path'].apply(lambda x: x.split(';')[0])
tasks_finished = pd.concat([tasks_finished, df_unfinished[['source', 'target']]], ignore_index=True)

tasks_finished = tasks_finished.drop_duplicates(subset=['source', 'target'])
tasks_finished

Unnamed: 0,source,target
0,14th century,African slave trade
3,14th century,Greece
4,14th century,John F. Kennedy
6,14th century,Fire
7,14th century,Rainbow
...,...,...
75924,Wine,History of post-Soviet Russia
75925,Turks and Caicos Islands,Iraq War
75926,Franz Kafka,Cholera
75927,Modern history,Hollandic


In [4]:
links = read_links()
# One pair source-target has an article which is not in links
# Get articles from both linkSource and linkTarget columns
unique_nodes = set(links['linkSource']).union(set(links['linkTarget']))
# Keep rows where both source and target are in the articles set
tasks_finished = tasks_finished[tasks_finished['source'].isin(unique_nodes) & tasks_finished['target'].isin(unique_nodes)]

tasks_finished


Unnamed: 0,source,target
0,14th century,African slave trade
3,14th century,Greece
4,14th century,John F. Kennedy
6,14th century,Fire
7,14th century,Rainbow
...,...,...
75924,Wine,History of post-Soviet Russia
75925,Turks and Caicos Islands,Iraq War
75926,Franz Kafka,Cholera
75927,Modern history,Hollandic


In [5]:
import networkx as nx

# Build the directed graph from the links
G = nx.DiGraph()
G.add_edges_from(links[['linkSource', 'linkTarget']].itertuples(index=False, name=None))

# Find shortest paths for each unique (source, target) pair
def find_shortest_path(row):
    source, target = row['source'], row['target']
    try:
        # Use NetworkX to find the shortest path
        path = nx.shortest_path(G, source=source, target=target)
    except nx.NetworkXNoPath:
        path = None  # If no path exists
    return path

tasks_finished.loc[:, 'shortest_path'] = tasks_finished.apply(find_shortest_path, axis=1)
tasks_finished

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tasks_finished.loc[:, 'shortest_path'] = tasks_finished.apply(find_shortest_path, axis=1)


Unnamed: 0,source,target,shortest_path
0,14th century,African slave trade,"[14th century, Lithuania, Pope John Paul II, A..."
3,14th century,Greece,"[14th century, England, Greece]"
4,14th century,John F. Kennedy,"[14th century, China, United States, John F. K..."
6,14th century,Fire,"[14th century, China, Gunpowder, Fire]"
7,14th century,Rainbow,"[14th century, Time, Isaac Newton, Rainbow]"
...,...,...,...
75924,Wine,History of post-Soviet Russia,"[Wine, Judaism, Khazars, History of post-Sovie..."
75925,Turks and Caicos Islands,Iraq War,"[Turks and Caicos Islands, Canada, NATO, Iraq ..."
75926,Franz Kafka,Cholera,"[Franz Kafka, 20th century, Paris, Cholera]"
75927,Modern history,Hollandic,"[Modern history, Earth, Afrikaans, Hollandic]"


In [6]:
matrix_path_length = read_shortest_path_matrix()
matrix_path_length

article,Áedán mac Gabráin,Åland,Édouard Manet,Éire,Óengus I of the Picts,€2 commemorative coins,10th century,11th century,12th century,13th century,...,Ziad Jarrah,Zimbabwe,Zinc,Zinc chloride,Zion National Park,Zionism,Zirconium,Zoroaster,Zuid-Gelders,Zulu
article,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Áedán mac Gabráin,0,-1,-1,-1,-1,-1,3,3,3,3,...,4,3,3,4,4,3,4,4,4,2
Åland,-1,0,-1,-1,-1,-1,2,2,2,2,...,4,2,3,4,4,3,4,3,3,3
Édouard Manet,-1,-1,0,-1,-1,-1,3,3,2,2,...,4,3,2,3,4,3,4,3,3,3
Éire,-1,-1,-1,0,-1,-1,3,3,3,3,...,4,2,2,3,4,3,4,4,3,3
Óengus I of the Picts,-1,-1,-1,-1,0,-1,2,2,3,2,...,4,2,3,4,4,3,4,3,3,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Zionism,-1,-1,-1,-1,-1,-1,2,2,2,2,...,3,2,2,3,3,0,3,3,3,2
Zirconium,-1,-1,-1,-1,-1,-1,3,3,3,3,...,3,3,2,2,3,3,0,3,4,3
Zoroaster,-1,-1,-1,-1,-1,-1,2,2,2,2,...,3,2,2,3,4,3,3,0,3,3
Zuid-Gelders,-1,-1,-1,-1,-1,-1,3,3,3,3,...,4,3,3,4,4,3,5,4,0,3


In [7]:
def compare_with_matrix(row):
    source, target = row['source'], row['target']
    # Retrieve the corresponding matrix path length for source-target
    matrix_length = matrix_path_length.loc[source, target]
    
    # Compute the path length from shortest_path, if it exists
    computed_length = len(row['shortest_path']) -1 if row['shortest_path'] is not None else -1
    
    matches_matrix = computed_length == matrix_length
    return computed_length, matrix_length, matches_matrix

tasks_finished[['computed_length', 'matrix_length', 'matches_matrix']] = tasks_finished.apply(compare_with_matrix, axis=1, result_type='expand')


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tasks_finished[['computed_length', 'matrix_length', 'matches_matrix']] = tasks_finished.apply(compare_with_matrix, axis=1, result_type='expand')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tasks_finished[['computed_length', 'matrix_length', 'matches_matrix']] = tasks_finished.apply(compare_with_matrix, axis=1, result_type='expand')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pyd

In [8]:
# Check if any values in the matches_matrix column are False
any_false = not tasks_finished['matches_matrix'].all()

if any_false:
    print("There are rows where computed length does not match the matrix length.")
else:
    print("All computed lengths match the matrix lengths.")

tasks_finished=tasks_finished.dropna()
tasks_finished


All computed lengths match the matrix lengths.


Unnamed: 0,source,target,shortest_path,computed_length,matrix_length,matches_matrix
0,14th century,African slave trade,"[14th century, Lithuania, Pope John Paul II, A...",3,3,True
3,14th century,Greece,"[14th century, England, Greece]",2,2,True
4,14th century,John F. Kennedy,"[14th century, China, United States, John F. K...",3,3,True
6,14th century,Fire,"[14th century, China, Gunpowder, Fire]",3,3,True
7,14th century,Rainbow,"[14th century, Time, Isaac Newton, Rainbow]",3,3,True
...,...,...,...,...,...,...
75924,Wine,History of post-Soviet Russia,"[Wine, Judaism, Khazars, History of post-Sovie...",3,3,True
75925,Turks and Caicos Islands,Iraq War,"[Turks and Caicos Islands, Canada, NATO, Iraq ...",3,3,True
75926,Franz Kafka,Cholera,"[Franz Kafka, 20th century, Paris, Cholera]",3,3,True
75927,Modern history,Hollandic,"[Modern history, Earth, Afrikaans, Hollandic]",3,3,True


In [9]:
def analyze_categories_paths(df_paths, df_categories, omit_loops=False):
    """
    Analyze the paths to find common paths.
    Optionally omit consecutive repetitions of the same category in paths.
    """
    # Map articles to main categories
    article_to_category = dict(zip(df_categories['article'], df_categories['level_1']))
    
    category_paths = []
    path_counts = {}
    
    for path in df_paths['shortest_path']:
        articles = path
        categories = [article_to_category.get(article, article) for article in articles]

        # Remove consecutive duplicate categories if omit_loops is True
        if omit_loops:
            categories = [category for i, category in enumerate(categories) 
                          if i == 0 or category != categories[i - 1]]

        # Create a string representation of the category path
        category_path = ' -> '.join(categories)
        category_paths.append(category_path)
        
        # Count path occurrences
        if category_path in path_counts:
            path_counts[category_path] += 1
        else:
            path_counts[category_path] = 1
    
    # Most common paths
    sorted_paths = sorted(path_counts.items(), key=lambda x: x[1], reverse=True)
    df_common_paths = pd.DataFrame(sorted_paths, columns=['Category Path', 'Count'])
    
    return df_common_paths

In [10]:
optimal_paths = analyze_categories_paths(tasks_finished, df_categories_filtered, omit_loops=False)
optimal_paths.to_csv("optimal_paths.csv", index=False)

In [11]:
# Calculate position frequencies (normalized or not)
df_position_data = get_position_frequencies(optimal_paths, max_position=15, normalize=True)

# Plot interactive histogram (line or bar plot, with options for normalization and stacking)
plot_position_interactive(df_position_data, plot_type="line", normalized=True)

In [12]:
# Calculate position frequencies (normalized or not)
df_position_data = get_position_frequencies(optimal_paths, max_position=15)

# Plot interactive histogram (line or bar plot, with options for normalization and stacking)
plot_position_interactive(df_position_data, plot_type="line")