In [1]:
import seaborn as sn
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
from scipy.cluster.hierarchy import linkage, leaves_list
from scipy.stats import kurtosis
import ipywidgets as widgets
from IPython.display import display
import plotly.graph_objects as go

from src.utils.HTMLParser import HTMLParser
from src.data.data_loader import *
from src.utils.helpers import *
from src.models.networks import *

from src.models.similarity_matrices import *

# Create the project's HTML parser and restore its pickled state, so the articles do not
# have to be parsed again (presumably the data written by `parser.save_pickle()` — TODO confirm).
parser = HTMLParser()
parser.load_pickle()

In [12]:
# Load every dataset used in this notebook through the project's `read_*` helpers.
# The "Unfinished Paths" / "Finished Paths" filtering summary in this cell's output is
# presumably printed by the two path loaders — TODO confirm.
df_article_names = read_articles() 
# Per-article HTML statistics come from the parser rather than from a reader function.
df_html_stats = parser.get_df_html_stats()
df_categories = read_categories()
df_links = read_links()
df_shortest_path = read_shortest_path_matrix()
df_unfinished = read_unfinished_paths()
df_finished = read_finished_paths() 
# df_sm feeds the 'cosine_similarity' column and df_scat the 'categories_similarity'
# column that are added to the path dataframes further down.
df_sm = read_similartiy_matrix() 
df_scat = read_categories_matrix()

Unfinished Paths
---------------- 
Number of rows before filtering: 24875
Invalid target articles found: {'Fats', 'Macedonia', 'Usa', 'Bogota', 'Podcast', 'Sportacus', ' Zebra', 'Rss', 'The', 'Great', 'Adolph Hitler', 'Black ops 2', 'Test', 'Long peper', 'Pikachu', 'Charlottes web', 'Christmas', 'Western Australia', 'Rat', 'Netbook', 'English', 'Mustard', 'Georgia', 'Kashmir', 'The Rock'}
Invalid articles found in path: {'Private Peaceful', 'Wikipedia Text of the GNU Free Documentation License', 'Pikachu', 'Osteomalacia', 'Local community'}
Number of 'timeout' games with a duration of less than 30 minutes: 7
Number of rows after filtering: 24709 

Finished Paths
-------------- 
Number of rows before filtering: 51318
Invalid articles found in path: {'Pikachu', 'Osteomalacia', 'Wikipedia Text of the GNU Free Documentation License', 'Local community'}
Number of rows after filtering: 51196


- **Parse the articles only once and save the parsed information to the pickle file.**  
Avoid calling ``parser.parse_all()`` on every call to the ``mean_link_position_per_category()`` function.

In [3]:
# One-off step, intentionally left commented out: parse only the articles whose name
# appears in df_article_names, then save the parser state to the pickle file.
# The recorded run below took about 8.5 minutes, so uncomment only to rebuild the pickle.
# indices = [i for i in parser.article_names.index if parser.article_names[i] in df_article_names.values]
# parser.parse_selection(indices)
# parser.save_pickle()

100%|██████████| 4584/4584 [08:34<00:00,  8.90it/s]


In [16]:
# Colour palette that makes 'Countries' and 'Geography' stand out: those two get accent
# colours, every other category (including the extra 'Others' bucket) gets a shade of grey.
categories_others = [
    'Art',
    'Business Studies',
    'Citizenship',
    'Countries',
    'Design and Technology',
    'Everyday life',
    'Geography',
    'History',
    'IT',
    'Language and literature',
    'Mathematics',
    'Music',
    'People',
    'Religion',
    'Science',
    'Others',
]

# Accent colours for the two highlighted categories
highlight_colors = {'Countries': '#2CB5AE', 'Geography': '#16A2F3'}

# Categories without an accent colour, in their original list order
non_custom_categories = [cat for cat in categories_others if cat not in highlight_colors]

# One grey per non-highlighted category, with grey levels evenly spaced from 0.2 to 0.4
num_greys = len(categories_others) - len(highlight_colors)
grey_shades = [mcolors.to_hex((level, level, level)) for level in np.linspace(0.2, 0.4, num_greys)]
grey_palette = {cat: shade for cat, shade in zip(non_custom_categories, grey_shades)}

# Final palette: accent colours first, then the greys
palette_category_dict = dict(highlight_colors)
palette_category_dict.update(grey_palette)

In [4]:
# Link counts per article: in-degree = times the article is a link target,
# out-degree = times it is a link source.
in_degree = df_links.groupby('linkTarget').size().reset_index(name="in_degree")
out_degree = df_links.groupby('linkSource').size().reset_index(name="out_degree")

# Attach both counts to the article list; articles missing from df_links end up with
# NaN after the left merges, which is turned into a count of 0.
df_article = (
    pd.DataFrame(df_article_names)
    .copy()
    .merge(in_degree, left_on='article', right_on='linkTarget', how='left')
    .merge(out_degree, left_on='article', right_on='linkSource', how='left')
    .drop(columns=['linkTarget', 'linkSource'])
    .fillna(0)
    .astype({'in_degree': 'int', 'out_degree': 'int'})
)

# Add the HTML statistics; the inner merge keeps only articles present in both frames
# and joins on the columns the two frames share.
df_html_stats = df_html_stats.rename(columns={'article_name': 'article'})
df_article = df_article.merge(df_html_stats, how='inner')

# Add the level-1 category of each article (NaN when the article has no category entry)
category_map = dict(zip(df_categories["article"], df_categories["level_1"]))
df_article["category"] = df_article["article"].map(category_map)

In [5]:
# `df_article` (with in/out degree, HTML statistics and category) is already built by the
# previous cell. This cell used to repeat that code verbatim, which only recomputed the
# same values, so it now just previews the result.
df_article.head()

In [6]:
# Add the same set of metrics to both path dataframes (unfinished first, then finished).
# `find_shortest_distance` is a project helper called once per row with a lookup matrix;
# presumably it reads the row's source/target pair — TODO confirm.
for paths_df in (df_unfinished, df_finished):
    paths_df['cosine_similarity'] = paths_df.apply(
        lambda row: find_shortest_distance(row, df_sm), axis=1
    )
    paths_df['shortest_path'] = paths_df.apply(
        lambda row: find_shortest_distance(row, df_shortest_path), axis=1
    )
    # Number of ';'-separated entries in the path string ('<' entries are counted too)
    paths_df['path_length'] = paths_df['path'].map(lambda path: path.count(';') + 1)
    # Number of '<' characters in the path string
    paths_df['back_clicks'] = paths_df['path'].map(lambda path: path.count('<'))
    paths_df['categories_similarity'] = paths_df.apply(
        lambda row: find_shortest_distance(row, df_scat), axis=1
    )

## Where are the Geography and Countries links located in a page?

In [7]:
from tqdm import tqdm

def mean_link_position_per_category(parser, df_categories, category=("Countries", "Geography")):
    """Mean relative position, per parsed article, of its links to the given categories.

    For every article in ``parser.parsed_articles``, the links listed under
    ``"total_links"`` are filtered down to those whose level-1 category is in
    ``category``. Each kept link is located with ``parser.find_link_positions`` and
    its first occurrence is divided by that page's ``total_links`` value.

    Parameters
    ----------
    parser : object
        Must expose ``parsed_articles`` (mapping article -> dict with a
        ``"total_links"`` iterable of link names) and
        ``find_link_positions(article, link)`` returning a dict with the keys
        ``"article_link_position"`` (sequence) and ``"total_links"`` (number).
    df_categories : pandas.DataFrame
        Needs the columns ``"article"`` and ``"level_1"``.
    category : str or iterable of str, default ("Countries", "Geography")
        Level-1 category name(s) to keep. A single string is treated as one
        category name.

    Returns
    -------
    list
        One value per parsed article, in iteration order of
        ``parser.parsed_articles``. The value is NaN when the article has no link
        to the requested categories, and also when any of its kept links could
        not be located (``np.mean`` propagates NaN).
    """
    # A bare string would otherwise be substring-matched by `in`; wrap it instead.
    wanted_categories = {category} if isinstance(category, str) else set(category)

    article_to_category = dict(zip(df_categories['article'], df_categories['level_1']))

    position_voyage = []
    for article, data in tqdm(parser.parsed_articles.items()):
        # Links of this article that belong to one of the requested categories
        targets = [
            link for link in data["total_links"]
            if article_to_category.get(link) in wanted_categories
        ]

        position = []
        for target in targets:
            info = parser.find_link_positions(article, target)
            occurrences = info['article_link_position']
            # Only the first occurrence of the link is used
            position.append(
                occurrences[0] / info['total_links'] if len(occurrences) != 0 else np.nan
            )

        # np.mean([]) also yields NaN but emits "Mean of empty slice" RuntimeWarnings
        position_voyage.append(np.mean(position) if position else np.nan)
    return position_voyage

In [21]:
# Summary statistics of the mean link position for links pointing to 'People' articles
people_link_positions = mean_link_position_per_category(parser, df_categories, category=["People"])
pd.DataFrame(people_link_positions).describe()

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
100%|██████████| 4599/4599 [00:22<00:00, 205.39it/s]


Unnamed: 0,0
count,2681.0
mean,0.509034
std,0.233797
min,0.020833
25%,0.333333
50%,0.5
75%,0.673077
max,1.0


In [22]:
# Mean link position per article, computed separately for each category of the palette list
link_per_cat = {}

for category in categories_others:
    print(category)  # marks which category the following progress bar belongs to
    category_positions = mean_link_position_per_category(parser, df_categories, category=[category])
    link_per_cat[category] = category_positions

Art


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
100%|██████████| 4599/4599 [00:01<00:00, 3596.15it/s]


Business Studies


100%|██████████| 4599/4599 [00:04<00:00, 1106.62it/s]


Citizenship


100%|██████████| 4599/4599 [00:14<00:00, 314.76it/s]


Countries


100%|██████████| 4599/4599 [00:34<00:00, 132.49it/s]


Design and Technology


100%|██████████| 4599/4599 [00:04<00:00, 965.12it/s] 


Everyday life


100%|██████████| 4599/4599 [00:14<00:00, 322.94it/s]


Geography


100%|██████████| 4599/4599 [01:59<00:00, 38.51it/s] 


History


100%|██████████| 4599/4599 [00:22<00:00, 205.05it/s]


IT


100%|██████████| 4599/4599 [00:02<00:00, 2149.10it/s]


Language and literature


100%|██████████| 4599/4599 [00:11<00:00, 415.91it/s]


Mathematics


100%|██████████| 4599/4599 [00:01<00:00, 2461.85it/s]


Music


100%|██████████| 4599/4599 [00:02<00:00, 1797.81it/s]


People


100%|██████████| 4599/4599 [00:23<00:00, 194.28it/s]


Religion


100%|██████████| 4599/4599 [00:10<00:00, 454.68it/s]


Science


100%|██████████| 4599/4599 [00:42<00:00, 108.90it/s]


Others


100%|██████████| 4599/4599 [00:00<00:00, 50301.46it/s]


In [23]:
# One column per category, one row per parsed article (mean link position, NaN when absent).
# In the recorded output the 'Others' column is empty (count 0).
df = pd.DataFrame(link_per_cat)
df.describe()

Unnamed: 0,Art,Business Studies,Citizenship,Countries,Design and Technology,Everyday life,Geography,History,IT,Language and literature,Mathematics,Music,People,Religion,Science,Others
count,372.0,1093.0,2133.0,2224.0,1241.0,2122.0,4159.0,2566.0,416.0,1958.0,347.0,414.0,2681.0,1416.0,3005.0,0.0
mean,0.500919,0.459936,0.472866,0.51426,0.518863,0.572223,0.465889,0.470308,0.586041,0.417649,0.507558,0.564922,0.509034,0.511915,0.462983,
std,0.271207,0.277006,0.250323,0.243503,0.258508,0.268075,0.1811,0.220326,0.266998,0.285513,0.260628,0.276636,0.233797,0.261169,0.210527,
min,0.013889,0.003279,0.007353,0.012195,0.013514,0.005051,0.011905,0.009836,0.053521,0.006061,0.013158,0.009615,0.020833,0.018519,0.006757,
25%,0.281724,0.230233,0.279481,0.327845,0.307692,0.363636,0.341531,0.308528,0.382003,0.171054,0.303301,0.322046,0.333333,0.294118,0.328571,
50%,0.5,0.434783,0.447368,0.502727,0.510204,0.588132,0.444444,0.444444,0.592215,0.375,0.483333,0.559028,0.5,0.482759,0.437821,
75%,0.721285,0.666667,0.645354,0.697146,0.717557,0.8,0.571132,0.608817,0.809921,0.630049,0.718254,0.833333,0.673077,0.719618,0.582143,
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,


## Where do people click on a page?

In [11]:
# Mean relative position of the clicked links along each finished path.
# Rows are iterated by value and the result is assigned as a whole column, so this does
# not depend on df_finished having a 0..n-1 index after the rows removed by filtering.
finished_positions = []
for path in df_finished['path']:
    articles = path.split(';')

    position = []
    for source, target in zip(articles, articles[1:]):
        # Steps that start from or lead to a back-click marker carry no link to locate
        if source == '<' or target == '<':
            continue
        info = parser.find_link_positions(source, target)
        occurrences = info['article_link_position']
        # First occurrence of the link, relative to the page's total_links value
        position.append(
            occurrences[0] / info['total_links'] if len(occurrences) != 0 else np.nan
        )

    # Paths without any usable click get NaN (np.mean([]) would also warn)
    finished_positions.append(np.mean(position) if position else np.nan)

df_finished['position'] = finished_positions

In [25]:
# Distribution of the mean clicked-link position over finished paths (NaN rows are not counted)
df_finished["position"].describe()

count    51189.000000
mean         0.337180
std          0.151484
min          0.004376
25%          0.228583
50%          0.326736
75%          0.432948
max          1.000000
Name: position, dtype: float64

In [None]:
# Mean relative position of the clicked links along each unfinished path.
# Rows are iterated by value and the result is assigned as a whole column, so this does
# not depend on df_unfinished having a 0..n-1 index after the rows removed by filtering.
unfinished_positions = []
for path in df_unfinished['path']:
    articles = path.split(';')

    position = []
    for source, target in zip(articles, articles[1:]):
        # Steps that start from or lead to a back-click marker carry no link to locate
        if source == '<' or target == '<':
            continue
        info = parser.find_link_positions(source, target)
        occurrences = info['article_link_position']
        # First occurrence of the link, relative to the page's total_links value
        position.append(
            occurrences[0] / info['total_links'] if len(occurrences) != 0 else np.nan
        )

    # Paths without any usable click get NaN (np.mean([]) would also warn)
    unfinished_positions.append(np.mean(position) if position else np.nan)

df_unfinished['position'] = unfinished_positions

In [None]:
# Distribution of the mean clicked-link position over unfinished paths (NaN rows are not counted)
df_unfinished["position"].describe()

In [None]:
# Stack finished and unfinished paths. ignore_index gives the combined frame a fresh
# 0..n-1 index; without it the row labels of the two inputs are kept and can repeat,
# which makes label-based access on all_paths ambiguous.
all_paths = pd.concat([df_finished, df_unfinished], ignore_index=True)
all_paths["position"].describe()

In [73]:
# Long-format frame for the comparison plot: clicked-link positions from the paths
# (single pseudo-category "All") next to the per-category link positions in articles.
all_paths["category"] = "All"
all_paths["Legend :"] = "Clicked Link Position in Paths"

# Wide (one column per category) -> long, dropping the NaN entries
df_melted = (
    df.melt(var_name='category', value_name='position')
    .dropna()
    .assign(**{"Legend :": "Article Link Position in Articles"})
)

clicked_positions = all_paths[["category", "position", "Legend :"]]
df_comparison_path_category = pd.concat([clicked_positions, df_melted])



In [None]:
# Display the combined frame (columns: category, position, "Legend :") that feeds the box plot
df_comparison_path_category

In [None]:
import plotly.express as px

# One box per category, split by legend entry: clicked links in paths vs. links in articles
fig = px.box(
    df_comparison_path_category,
    x="category",
    y="position",
    color="Legend :",
    title="Position of the clicked link in articles compared to position of each category in articles",
)
fig.update_xaxes(tickangle=45)

# Fixed-size wide figure; boxgap / boxgroupgap control the spacing between boxes
layout_options = dict(
    autosize=False,
    width=1500,
    height=500,
    boxgroupgap=0.2,
    boxgap=0,
)
fig.update_layout(**layout_options)
fig.show()