# Imports

In [33]:
import pandas as pd
import matplotlib.pyplot as plt
from helpers import *

# Pre-processing
Need to synthetise the preprocessing part in this notebook

In [34]:
dfs = preprocessing()

result_dict = create_dictionary_from_tsv("dataset/wikispeedia_paths-and-graph/categories.tsv")

## Palettes

In [35]:
from bokeh.palettes import TolPRGn, Category20c,Iridescent
from bokeh.plotting import figure, show

# Get a list of 21 colors
colors = Category20c

# Print the list of colors
print(Iridescent[23][2:])

('#F5F3C1', '#EAF0B5', '#DDECBF', '#D0E7CA', '#C2E3D2', '#B5DDD8', '#A8D8DC', '#9BD2E1', '#8DCBE4', '#81C4E7', '#7BBCE7', '#7EB2E4', '#88A5DD', '#9398D2', '#9B8AC4', '#9D7DB2', '#9A709E', '#906388', '#805770', '#684957', '#46353A')


# Data Exploration

## Articles repartition

In [36]:
# main categories
categories = dfs['categories'].copy()
# Selection of all the people articles
people_selection = categories[categories['category'].str.contains('subject.People.')]

#Proportion of people vs other categories
len_people = len(people_selection.article.unique())
len_total_article = len(categories.article.unique())
print('There are {:.2f}% of people articles ({} out of {})'.format(len_people/len_total_article*100, len_people, len_total_article))

# Peoples categories
people_main_categories = people_selection.category.str.split('.').apply(lambda x: x[2]).value_counts()
people_main_categories

There are 14.70% of people articles (676 out of 4598)


category
Historical_figures                       108
Writers_and_critics                       71
Political_People                          67
Sports_and_games_people                   45
Monarchs_of_Great_Britain                 43
Performers_and_composers                  43
Religious_figures_and_leaders             39
USA_Presidents                            37
Artists                                   36
Engineers_and_inventors                   25
Actors_models_and_celebrities             25
Human_Scientists                          23
Philosophers                              23
Military_People                           22
Astronomers_and_physicists                21
Geographers_and_explorers                 21
Mathematicians                            15
Computing_People                           9
Chemists                                   8
Producers_directors_and_media_figures      8
Name: count, dtype: int64

# Target repartition

In [37]:
def transform_path_main_category(path, dict_article_target):
    split_path = path.split(';')
    split_cat = []

    for art in split_path:
         if art[0] != '.' : 
            try: art in dict_article_target.keys()
            except IndexError : print('Warning: not valid article:', art)
            if art in dict_article_target.keys():
                split_cat.append(dict_article_target[art]['main_subject'])
    
    return split_cat

## Finished path

In [38]:
finished_path = dfs['paths_finished'].copy()
finished_path['target'] = [path.split(';')[-1] for path in finished_path.path]
target_list = finished_path.target
list_people = people_selection.article.values
len_target_people = len([target for target in target_list if target in list_people])
len_target_all = len(finished_path)
print('There are {:.2f}% of target that are in the people category({} out of {})'.format(len_target_people/len_target_all*100, len_target_people, len_target_all))


finished_path_people = finished_path.loc[finished_path.target.isin(list_people), :]
finished_path_people = finished_path_people.drop(['timestamp', 'durationInSec'], axis = 1)
finished_path_people['path_categories'] = [transform_path_main_category(path, result_dict[0]) for path in finished_path_people['path']]
display(finished_path_people)




There are 11.52% of target that are in the people category(5913 out of 51318)


Unnamed: 0,path,target,path_categories
4,14th_century;Italy;Roman_Catholic_Church;HIV;R...,John_F._Kennedy,"[History, Geography, Religion, Science, People..."
5,14th_century;Europe;North_America;United_State...,John_F._Kennedy,"[History, Geography, Geography, Geography, Cit..."
36,14th_century;England;London;Spanish_Armada;Eng...,Elizabeth_I_of_England,"[History, Geography, Geography, History, Geogr..."
37,14th_century;England;James_I_of_England;Elizab...,Elizabeth_I_of_England,"[History, Geography, People, People]"
38,14th_century;Hundred_Years'_War;Henry_VI_of_En...,Elizabeth_I_of_England,"[History, History, People, History, People, Pe..."
...,...,...,...
51248,StarCraft;Macintosh;IPod;ITunes;Music;Ludwig_v...,Wolfgang_Amadeus_Mozart,"[Everyday_life, IT, Design_and_Technology, Des..."
51278,Thrush_(bird);Bird;Vertebrate;Human;United_Sta...,Michael_Jordan,"[Science, Science, Science, Science, Business_..."
51290,Tropical_Storm_Matthew_(2004);United_States;Me...,Edward_Jenner,"[Geography, Geography, Science, Science, Scien..."
51306,"William_Thomson,_1st_Baron_Kelvin;Glasgow;Lond...",William_Shakespeare,"[People, Geography, Geography, Geography, Lang..."


In [59]:
path_all = pd.DataFrame(finished_path_people.path.str.split(';').explode())
people_path = path_all.copy()[path_all.path.isin(list_people)]
print('There are {:.2f}% of article that are people in paths to people target ({} out of {})'.format(len(people_path)/len(path_all)*100, len(people_path), len(path_all)))
print('The people ratio: %people_in_path/%people_article {:.2f}\nThe non-people ratio: %non_people_in_path/%non_people_article {:.2f}'.format(
    len(people_path)/len(path_all)/(len_people/len_total_article),
    (1-len(people_path))/len(path_all)/((1-len_people)/len_total_article)))
len_people_unique = len(people_path.path.unique())
print('Within the percentage of article that are people, there are {:.2f}% of people articles that are represented ({} out of {}).'.format(len_people_unique/len_people*100,len_people_unique, len_people))

graph_df = pd.DataFrame({'DF':['Dataset'], 'leading to':['all'],'percentage_people':[len_target_people/len_target_all], 'percentage_other':[1-(len_target_people/len_target_all)]})
graph_df.loc[len(graph_df.index)] = ['Paths', 'all', len_people/len_total_article, 1-len_people/len_total_article]
graph_df.loc[len(graph_df.index)] = ['Paths', 'people',len(people_path)/len(path_all), 1-len(people_path)/len(path_all)]
display(graph_df)

import plotly.graph_objects as go
from plotly.subplots import make_subplots

labels = ['people', 'non-people']

# Create subplots, using 'domain' type for pie charts
specs = [[{'type':'domain'}, {'type':'domain'}, {'type':'domain'}]]
fig = make_subplots(rows=1, cols=3, specs=specs,subplot_titles=['In the whole dataset', 'In paths players took','In paths finishing by people'])

# Define pie charts
fig.add_trace(go.Pie(labels=labels, values=[graph_df['percentage_people'][0],graph_df['percentage_other'][0]], name='Starry Night'), 1, 1)
fig.add_trace(go.Pie(labels=labels, values=[graph_df['percentage_people'][1],graph_df['percentage_other'][1]], name='Starry Night'), 1, 2)
fig.add_trace(go.Pie(labels=labels, values=[graph_df['percentage_people'][2],graph_df['percentage_other'][2]], name='Starry Night'), 1, 3)

# Tune layout and hover info
fig.update_traces(hoverinfo='label+percent+name', textinfo='percent')
fig.update(layout_title_text='Percentage of articles about people',
           layout_showlegend=False)

fig = go.Figure(fig)
fig.show()

There are 25.49% of article that are people in paths to people target (9626 out of 37761)
The people ratio: %people_in_path/%people_article 1.73
The non-people ratio: %non_people_in_path/%non_people_article 1.74
Within the percentage of article that are people, there are 81.36% of people articles that are represented (550 out of 676).


Unnamed: 0,DF,leading to,percentage_people,percentage_other
0,Dataset,all,0.115223,0.884777
1,Paths,all,0.14702,0.85298
2,Paths,people,0.254919,0.745081


## Unfinished path

In [None]:
unfinished_path = dfs['paths_unfinished'].copy()
unfinished_path['target'] = [path.split(';')[-1] for path in unfinished_path.path]
target_list = unfinished_path.target
list_people = people_selection.article.values
len_target_people = len([target for target in target_list if target in list_people])
len_target = len(finished_path)
print('There are {:.2f}% of target that are in the people category ({} out of {})'.format(len_target_people/len_target*100, len_target_people, len_target))

unfinished_path_people = unfinished_path.loc[unfinished_path.target.isin(list_people), :]
unfinished_path_people = unfinished_path_people.drop(['timestamp', 'durationInSec'], axis = 1)
unfinished_path_people.loc[:, 'path_categories'] = [transform_path_main_category(path, result_dict[0]) for path in unfinished_path_people['path']]

article_path = pd.DataFrame(unfinished_path_people.path.str.split(';').explode())
people_path = article_path.copy()[article_path.path.isin(list_people)]
print('There are {:.2f}% of article that are people in paths to people target ({} out of {})'.format(len(people_path)/len(article_path)*100, len(people_path), len(article_path)))

len_people_unique = len(people_path.path.unique())
print('Within the percentage of article that are people, there are {:.2f}% of people articles that are represented ({} out of {}).'.format(len_people_unique/len_people*100,len_people_unique, len_people))

## People main categories color palette

In [None]:
from bokeh.palettes import Iridescent

color_len = len(people_main_categories)
palette_len = 23
colors = Iridescent[palette_len][palette_len-color_len:]

# Palette
colors_people_main_categories = dict(zip(people_main_categories.index, colors))
print(colors_people_main_categories)

# del color_len, palette_len, colors

# Categories repartition

In [None]:
def create_dictionary_from_tsv(file_path):
    """  Creates a dictionary from a TSV file
    parameter:
        file_path: str
            path to the TSV file
    return:
        all_article: list of str
            name of each articles added as key
        data_dict: dictionary
            article name as key and 3 first subject as values
    """
    data_dict = {}

    with open(file_path, 'r', newline='', encoding='utf-8') as tsvfile:
        reader = csv.reader(tsvfile, delimiter='\t')
        
        all_articles = []
        for row in reader:
            if len(row) == 2:
                article, subjects = row
                subjects_list = subjects.split('.')
                article = unquote(article)
                all_articles.append(article)

                data_dict[article] = {
                    'main_subject': unquote(subjects_list[1]) if len(subjects_list) >= 2 else None,
                    'secondary_subject': unquote(subjects_list[2]) if len(subjects_list) >= 3 else None,
                    'tertiary_subject': unquote(subjects_list[3]) if len(subjects_list) >= 4 else None
                }
    
    return data_dict, all_articles

In [None]:
# import csv
# from urllib.parse import unquote
# from helpers import *
# import networkx as nx

# # Function to create the dictionary from a TSV file
# def create_dictionary_from_tsv(file_path):
#     data_dict = {}
#     G = nx.DiGraph()
#     with open(file_path, 'r', newline='', encoding='utf-8') as tsvfile:
#         reader = csv.reader(tsvfile, delimiter='\t')
        
#         all_articles = []
#         for row in reader:
#             if len(row) == 2:
#                 article, subjects = row
#                 subjects_list = subjects.split('.')
#                 for i in range(len(subjects_list)-1):
#                     G.add_edge(unquote(subjects_list[i]), unquote(subjects_list[i+1]))
                

#                 article = unquote(article)
#                 all_articles.append(article)

#                 data_dict[article] = {
#                     'main_subject': unquote(subjects_list[1]) if len(subjects_list) >= 2 else None,
#                     'secondary_subject': unquote(subjects_list[2]) if len(subjects_list) >= 3 else None,
#                     'tertiary_subject': unquote(subjects_list[3]) if len(subjects_list) >= 4 else None
#                 }
#     G.remove_edges_from(nx.selfloop_edges(G))
    
#     return data_dict, all_articles, G

# # Example usage
# result_dict, all_articles, graph_cat = create_dictionary_from_tsv("dataset/wikispeedia_paths-and-graph/categories.tsv")

# # Print the result dictionary
# for article, subjects in result_dict.items():
#     print(f"Article: {article}")
#     print(f"Main Subject: {subjects['main_subject']}")
#     print(f"Secondary Subject: {subjects['secondary_subject']}")
#     print(f"Tertiary Subject: {subjects['tertiary_subject']}")
#     print("\n")


## Plot the main categories of people 

In [None]:
people_main_categories

In [None]:
import plotly.express as px

# Create a pie chart using Plotly Express
fig = px.pie(values=people_main_categories.values, names=people_main_categories.index.str.replace('_', ' ' ),
             title='Main category repartition of the people',
             hole=0.1, color_discrete_sequence= px.colors.sequential.Cividis #YlOrRd, Greens_r, YlOrRd(_r), RdBu_r, Blues(_r), Cividis
             #color_discrete_map =  colors_people_main_categories
             )
fig.show()

#fig.write_html("docs/Pie_main_cat_people.html")


In [None]:
individual_category = categories.category.str.split('.').explode().unique()
print('There are {} categories.'.format(len(individual_category)-1))
individual_category

In [None]:
finished_path_people['target_second_category'] = [result_dict[0][t]['secondary_subject'] for t in finished_path_people.target]
finished_path_people['four_last'] = [row.path_categories[-1:-4] + [row.target_second_category] for i, row in finished_path_people.iterrows()]
finished_path_people['one_last_category'] = [''.join(row[-1:-2]) for row in finished_path_people.four_last]
heatmap_ = finished_path_people[['target_second_category', 'one_last_category']]

In [None]:
heatmap_.target_second_category.unique()

In [None]:
import plotly.express as px

# Assuming you want to count occurrences of combinations
heatmap_data = heatmap_.groupby(['target_second_category', 'one_last_category']).size().reset_index(name='Count')

# Create heatmap
fig = px.imshow(pd.pivot_table(heatmap_data, values='Count', index='target_second_category', columns='one_last_category', fill_value=0),
                labels=dict(x='one_last_category', y='target_second_category', color='Count'),
                x=heatmap_data['one_last_category'].unique(),
                y=heatmap_data['target_second_category'].unique())

# Show the plot
fig.show()


In [None]:
path_dict = {}
for category in people_main_categories.index:
    category_path = [path for path in finished_path_people.four_last if path[-1]==category]
    path_dict[category]= category_path

In [None]:
def get_dict_from_list(path_list):

    N0 = pd.DataFrame({'category':[path[-1] for path in path_list if len(path)>0]}).value_counts().to_frame().reset_index()
    N1 = pd.DataFrame({'category':[path[-2] for path in path_list if len(path)>1]}).value_counts().to_frame().reset_index()
    N2 = pd.DataFrame({'category':[path[-3] for path in path_list if len(path)>2]}).value_counts().to_frame().reset_index()
    N3 = pd.DataFrame({'category':[path[-4] for path in path_list if len(path)>3]}).value_counts().to_frame().reset_index()
    long_paths = [path for path in path_list if len(path) > 0]

    return {'list':path_list, 'long_path_list': long_paths, 'N0': N0, 'N1': N1, 'N2': N2, 'N3': N3}

def update_dict_with_counts(path_dict):
    new_path_dict = {}
    for cat in path_dict.keys():
        new_path_dict[cat] = get_dict_from_list(path_dict[cat])
    return new_path_dict

new_path_dict = update_dict_with_counts(path_dict)



In [None]:
# color_dict = {'Science':'#48b059', 'Geography':'#3d8039', 'People':'#b44624','History':'#ff8544','Everyday_life':'#e25d90', 'Design_and_Technology':'#8b1e9c','Countries':'#a7d145','Citizenship':'#fced3e', 'Language_and_literature':'#6bc6ff', 'Religion':'#1d59a8', 'Music':'#ffa7dc', 'Business_Studies':'#ffec6f','IT':'#008756','Mathematics':'#6bffce','Art':'#e0025e'}

fig,axes = plt.subplots(8,2, figsize=(50,100))
axes = axes.flatten()

i=0
for cat in path_dict.keys():
    line_width = 50/len(path_dict[cat])
    for path in path_dict[cat]:
        x_path = np.arange(start = 4 - len(path), stop = 4, step = 1)
        axes[i].plot(x_path, path, alpha = line_width)
    axes[i].set_title(cat, size = 20)
    axes[i].set_xticks(np.arange(4), ['N-3', 'N-2', 'N-1', 'N'])
    axes[i].tick_params(labelsize = 18)
    i+=1
plt.tight_layout()
plt.show()

In [None]:
plt.plot(finished_path_people['3_last'])

In [None]:
path_dict.keys()

In [None]:
color_dict = {'Historical_figures':'#48b059', 'Writers_and_critics':'#3d8039', 'Political_People':'#b44624','Sports_and_games_people':'#ff8544','Performers_and_composers':'#e25d90', 'Monarchs_of_Great_Britain':'#8b1e9c','Religious_figures_and_leaders':'#a7d145','USA_Presidents':'#fced3e', 'Artists':'#6bc6ff', 'Engineers_and_inventors':'#1d59a8', 'Actors_models_and_celebrities':'#ffa7dc', 'Human_Scientists':'#ffec6f','Military_People':'#008756','Mathematics':'#6bffce','Art':'#e0025e'}

fig,axes = plt.subplots(8,2, figsize=(50,100))
axes = axes.flatten()

i=0
for cat in path_dict.keys():
    line_width = 50/len(path_dict[cat])
    for path in path_dict[cat]:
        x_path = np.arange(start = 4 - len(path), stop = 4, step = 1)
        axes[i].plot(x_path, path,  alpha = line_width)
    axes[i].set_title(cat, size = 20)
    axes[i].set_xticks(np.arange(4), ['N-3', 'N-2', 'N-1', 'N'])
    axes[i].tick_params(labelsize = 18)
    i+=1
plt.tight_layout()
plt.show()