In [1]:
import pandas as pd

import plotly.express as px
import plotly.graph_objects as go

In [2]:
# Read the cleaned finished / unfinished path tables from disk.
finished_paths, unfinished_paths = map(
    pd.read_csv,
    (
        "data/clean_data/clean_finished_paths.csv",
        "data/clean_data/clean_unfinished_paths.csv",
    ),
)

# Show (rows, columns) of each table, one per line.
print(finished_paths.shape, unfinished_paths.shape, sep="\n")

(51249, 16)
(24802, 17)


In [3]:
# Count how many finished paths exist for every (source, target) combination.
popular_pairs = (
    finished_paths
    .groupby(["source", "target"])
    .size()
    .reset_index(name="count")
)

# Keep the ten combinations with the highest counts and display them.
top_pairs = popular_pairs.sort_values(by="count", ascending=False).head(10)
top_pairs

Unnamed: 0,source,target,count
2318,Asteroid,Viking,1043
4427,Brain,Telephone,1040
25683,Theatre,Zebra,905
21238,Pyramid,Bean,642
3206,Batman,Wood,148
3825,Bird,Great_white_shark,138
3201,Batman,The_Holocaust,119
3822,Bird,Adolf_Hitler,107
3498,Beer,Sun,99
3174,Batman,Banana,69


In [4]:
# Number of finished paths that belong to the top pairs (sum of their counts)
top_pairs_count = top_pairs["count"].sum()
print(top_pairs_count)

# Total number of finished paths (row count of finished_paths)
total_games = finished_paths.shape[0]
print(total_games)


# Plot the counts of the top 10 pairs as a horizontal bar chart.
# `assign` rebinds top_pairs to a new frame instead of writing a column into
# the slice returned by `.head(10)`; later cells rely on `source_target`.
top_pairs = top_pairs.assign(source_target=top_pairs["source"] + " -> " + top_pairs["target"])
fig = px.bar(top_pairs, x="count", y="source_target", orientation='h', title="Top 10 pairs of source and target", height=500)
fig.show()

# popular_pairs already holds one row per distinct (source, target) pair,
# so its length is the number of pairs; no need to group again.
print("Number of pairs in whole dataset: ")
nb_pairs = popular_pairs.shape[0]
print(nb_pairs)


4310
51249


Number of pairs in whole dataset: 
28701


In [5]:
# Attach a category to the source and target article of every top pair and
# save the result. categories.tsv is read as two tab-separated columns:
# the article name and its dot-separated category path.
categories = pd.read_csv("data/wikispeedia_paths-and-graph/categories.tsv", sep="\t", names=["article", "category"])

# Rename the lookup columns up front so each merge yields the final column
# name directly (no `_x` / `_y` suffixes to rename and drop afterwards).
source_categories = categories.rename(columns={"article": "source", "category": "source_category"})
target_categories = categories.rename(columns={"article": "target", "category": "target_category"})

# Dropping category columns left over from a previous run keeps this cell
# idempotent: re-running it no longer produces duplicate column names.
# NOTE(review): a left merge emits one row per matching row in categories, so
# an article listed several times in categories.tsv multiplies the rows of
# top_pairs - confirm this is intended.
top_pairs = (
    top_pairs
    .drop(columns=["source_category", "target_category"], errors="ignore")
    .merge(source_categories, on="source", how="left")
    .merge(target_categories, on="target", how="left")
)

# Keep only the second component of the dotted category path. The `.str`
# accessor propagates NaN (article missing from categories, or a path without
# a dot) instead of raising like `x.split(".")[1]` would.
for category_column in ("source_category", "target_category"):
    top_pairs[category_column] = top_pairs[category_column].str.split(".").str[1]

save_path = "data/clean_data/clean_top_pairs.csv"
top_pairs.to_csv(save_path, index=False)

In [6]:
# Create a dataframe for the alluvial plot: one row per
# (source_category, target_category) combination with the number of
# top_pairs rows that fall into it.
alluvial_data = top_pairs.groupby(['source_category', 'target_category']).size().reset_index(name='count')

# Color used for each category; every entry is distinct so two categories
# can never be confused in the diagram.
category_colors = {
    'Arts': 'blue',
    'History': 'red',
    'Geography': 'green',
    'Mathematics': 'orange',
    'Everyday_life': 'purple',
    'People': 'pink',
    'Philosophy': 'gray',
    'Reference': 'brown',
    'Science': 'olive',
    'Society': 'magenta',
    'Language_and_literature': 'lime',
    'Design_and_Technology': 'teal',
    'Art': 'cyan',
    'Business_Studies': 'gold',
}
# Used for any category that is missing from the palette above, so an
# unexpected category degrades to a neutral color instead of a KeyError.
fallback_color = 'lightgray'

# One node per distinct category on each side, in order of first appearance.
# Source nodes come first; target nodes are offset by the number of source
# nodes so a category present on both sides gets two separate nodes.
source_labels = list(dict.fromkeys(alluvial_data['source_category']))
target_labels = list(dict.fromkeys(alluvial_data['target_category']))
node_labels = source_labels + target_labels
source_node = {cat: position for position, cat in enumerate(source_labels)}
target_node = {cat: len(source_labels) + position for position, cat in enumerate(target_labels)}

# Create the alluvial plot
fig_alluvial = go.Figure(data=[go.Sankey(
    node=dict(
        pad=15,
        thickness=20,
        line=dict(color="black", width=0.5),
        label=node_labels,
        color=[category_colors.get(cat, fallback_color) for cat in node_labels]
    ),
    link=dict(
        source=[source_node[cat] for cat in alluvial_data['source_category']],
        target=[target_node[cat] for cat in alluvial_data['target_category']],
        value=alluvial_data['count'].tolist(),
        # Each link takes the color of the category it starts from.
        color=[category_colors.get(cat, fallback_color) for cat in alluvial_data['source_category']]
    )
)])

fig_alluvial.update_layout(title_text="Alluvial Plot of Source Category to Target Category", font_size=10)
fig_alluvial.show()

In [7]:
# Plot a bar chart comparing how often each general category appears as the
# source and as the target of finished paths.

# `.count()` tallies the non-null values of every column per group; the tally
# of the `source` (resp. `target`) column is renamed to `count` and used as
# the bar height. NOTE(review): this equals the number of paths per category
# only if `source` / `target` are never null - confirm.
source_category_distribution = (
    finished_paths
    .groupby('source_general_category')
    .count()
    .rename(columns={'source': 'count'})
    .reset_index()
    .sort_values(by='count', ascending=False)
)
target_category_distribution = (
    finished_paths
    .groupby('target_general_category')
    .count()
    .rename(columns={'target': 'count'})
    .reset_index()
    .sort_values(by='count', ascending=False)
)

fig = go.Figure()
fig.add_trace(go.Bar(
    x=source_category_distribution['source_general_category'],
    y=source_category_distribution['count'],
    marker_color='blue',
    name='Source Category'
    ))
fig.add_trace(go.Bar(
    x=target_category_distribution['target_general_category'],
    y=target_category_distribution['count'],
    marker_color='red',
    name='Target Category'
    ))
# The figure holds both the source and the target trace, so the title and
# x-axis label name both rather than only the source side.
fig.update_layout(
    title_text='Source and Target Category Distribution in Finished Paths',
    xaxis_title='Category',
    yaxis_title='Count',
    )

fig.show()