# Project Milestone 3

# Blue click, Red click: Decoding the matrix behind the political bias of Wikispeedia 

## Import Libraries 

In [1]:
import pandas as pd
import tarfile
import gzip
import matplotlib.pyplot as plt
import seaborn as sns
import urllib.parse
import requests
from bs4 import BeautifulSoup 
import os
from Functions import *
import numpy as np
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
import mplcursors
import mpldatacursor
from math import pi
from plotly.subplots import make_subplots
import statsmodels.api as sm

## Download the data and extract it

In [2]:
value=download_extract_data() #Put the data in the data folder that is ignored to github push

Repository data already exist


## 1. Load and Filter of the used data : 
## Un/finished path, Categories, Plain texts and Results

### Finished and Unfinished Path

In [3]:
#define paths
PATH_FINISHED="./data/wikispeedia_paths-and-graph/wikispeedia_paths-and-graph/paths_finished.tsv"
PATH_UNFINISHED = "./data/wikispeedia_paths-and-graph/wikispeedia_paths-and-graph/paths_unfinished.tsv"

#finished path data
finished_path=pd.read_csv(PATH_FINISHED, header=14, delimiter="\t", names=["hashedIpAddress","timestamp","durationInSec","path","rating"])
#convert timestamp to datetime
finished_path["timestamp"]=pd.to_datetime(finished_path["timestamp"], unit ='s')

#unfinished path data
unfinished_path=pd.read_csv(PATH_UNFINISHED, header=16, delimiter="\t", names=["hashedIpAddress","timestamp","durationInSec","path_start","path_goal","rating"])
#convert timestamp to datetime
unfinished_path["timestamp"]=pd.to_datetime(unfinished_path["timestamp"], unit ='s')

unfinished_path = unfinished_path.rename(columns={'path_start' : 'path'})

finished_path.head()
unfinished_path.head()
print(f"Finished paths : First date {finished_path.timestamp.dt.date.min()} \t Last Date {finished_path.timestamp.dt.date.max()} ")
print(f"Unfinished paths : First date {unfinished_path.timestamp.dt.date.min()}  Last Date {unfinished_path.timestamp.dt.date.max()} ")


Finished paths : First date 2008-08-18 	 Last Date 2014-01-15 
Unfinished paths : First date 2011-02-07  Last Date 2014-01-15 


### Categories Dataset

In [4]:
PATH_CATEGORIES = "./data/wikispeedia_paths-and-graph/wikispeedia_paths-and-graph/categories.tsv"

categories = pd.read_csv(PATH_CATEGORIES, header = 12, delimiter='\t',names=["article","categories"])

categories['article'] = categories['article'].apply(lambda x :(urllib.parse.unquote(x , encoding = 'utf-8'))) # Decode article names

people = categories[categories['categories'].str.contains('subject.People')].copy() # Dataset of people
people['categories'] = people['categories'].apply(lambda x : x[len('subject.People.'):]) # Remove useless string

counts = people.groupby('categories').count()
counts = counts.reset_index()
counts.columns = ["Name", "Count"]
counts=counts.sort_values("Count", ascending=True)

fig = px.bar(counts, x="Count", y="Name", orientation='h')
fig.update_layout(
    title="Number of article for each people's category",
    margin=dict(l=50, r=50, t=50, b=50)
)
fig.show()

pas dans la data story

### Plain text articles

In [5]:
plain_articles=load_plain_article()

### Results csv

In [6]:
#Political People classification party 
politicien = pd.read_csv('results/political_affiliation.csv', delimiter= ';')

#Political Bias in Text
bias = pd.read_csv('results/political_bias.csv')

## 2. General analysis (Milestone 2)

### Articles without any category

In [7]:
titles_categories = set(categories['article'])
titles_plain_text = set(plain_articles['title'])

#Unique title that doesn't appear in both dataframe
unique_titles = (titles_categories.symmetric_difference(titles_plain_text))

#Result
print("There is %d articles that doesn't appear in the categories data : \n%s"%(len(unique_titles), unique_titles))

There is 6 articles that doesn't appear in the categories data : 
{'Friend_Directdebit', 'Sponsorship_Directdebit', 'Directdebit', 'Pikachu', 'Donation', 'Wowpurchase'}


## Analyse of the un/finished path 

### Finished path, defining hubs, paths and internal path : without the beggining and finishing page

In [8]:
#Concatenate everything into a single character string
all_paths = finished_path['path'].apply(lambda x: x.split(';'))

#Keeping the start and end path and transforming into a dictionnary 
paths_flat = [path for sublist in all_paths for path in sublist]
paths_count = pd.Series(paths_flat).value_counts().to_dict()
paths_count.pop("<") # we remove the backward symbol


#Get rid of the first and last element
internal_paths = all_paths.apply(lambda x: x[1:-1] if len(x) > 2 else [])
internal_paths_flat = [path for sublist in internal_paths for path in sublist]
internal_paths_count = pd.Series(internal_paths_flat).value_counts().to_dict()
internal_paths_count.pop("<")

#evaluate the first and last page selection:
#df_start_end=pd.merge(df_paths_count,df_internal_paths_count,on="title", how="left")

print('counts for the finished paths:',paths_count)

print('counts for the internal finished paths:',internal_paths_count)

counts for the finished paths: {'United_States': 8896, 'Europe': 4362, 'United_Kingdom': 3904, 'England': 3332, 'Earth': 3223, 'Africa': 2796, 'World_War_II': 2301, 'North_America': 1884, 'Germany': 1769, 'Animal': 1713, 'Human': 1642, 'Mammal': 1622, 'France': 1617, 'Computer': 1552, 'Science': 1497, 'English_language': 1430, 'Periodic_table': 1413, 'Brain': 1324, 'Atlantic_Ocean': 1324, 'Telephone': 1252, 'India': 1236, 'Bird': 1210, 'Viking': 1198, 'Agriculture': 1185, 'Plant': 1180, 'Asia': 1176, 'Asteroid': 1174, 'China': 1126, 'Christianity': 1097, 'Japan': 1086, 'United_Nations': 1064, 'Internet': 1047, 'Zebra': 1042, 'Theatre': 1038, 'Russia': 1019, 'Australia': 1003, 'Chemistry': 991, 'London': 976, 'Sun': 932, 'Canada': 922, 'South_America': 920, 'Water': 905, 'European_Union': 891, 'Adolf_Hitler': 883, 'Solar_System': 880, 'Italy': 867, 'President_of_the_United_States': 850, 'Physics': 848, 'Pacific_Ocean': 844, 'Chemical_element': 841, 'Computer_science': 834, 'Biology': 80

### Same for the unfinished paths


In [9]:
all_paths_unfinished = unfinished_path['path'].apply(lambda x: x.split(';'))

#Keeping the start and end path and transforming into a dictionnary 
paths_flat_unfinished = [path for sublist in all_paths_unfinished for path in sublist]
paths_count_unfinished = pd.Series(paths_flat_unfinished).value_counts().to_dict()
paths_count_unfinished.pop("<") # we remove the backward symbol


#Get rid of the first and last element
internal_paths_unfinished= all_paths_unfinished.apply(lambda x: x[1:-1] if len(x) > 2 else [])
internal_paths_flat_unfinished = [path for sublist in internal_paths_unfinished for path in sublist]
internal_paths_count_unfinished = pd.Series(internal_paths_flat_unfinished).value_counts().to_dict()
internal_paths_count_unfinished.pop("<")

print('counts for the unfinished paths',paths_count_unfinished)

print('counts for the internal unfinished paths',internal_paths_count_unfinished)

counts for the unfinished paths {'United_States': 3553, 'United_Kingdom': 1424, 'Europe': 1249, 'England': 1226, 'Brain': 1100, 'Earth': 993, 'Pyramid': 961, 'Theatre': 932, 'Africa': 794, 'Asteroid': 768, 'Human': 761, 'Animal': 758, 'Mammal': 713, 'North_America': 710, 'France': 631, 'World_War_II': 630, 'English_language': 604, 'Agriculture': 561, 'India': 541, 'Plant': 528, 'Mexico': 517, 'Film': 474, 'Internet': 463, 'Germany': 457, 'Atlantic_Ocean': 435, 'London': 431, 'Science': 428, 'Christianity': 414, 'Computer': 409, 'Computer_science': 382, 'California': 380, 'Television': 374, 'Bird': 372, 'Culture': 370, 'Italy': 361, 'Batman': 360, 'Music': 359, 'United_Nations': 346, 'China': 338, 'Scotland': 335, 'Great_Britain': 328, 'Biology': 327, 'Russia': 323, 'Asia': 321, 'Japan': 319, 'Canada': 298, 'Australia': 297, 'Roman_Empire': 297, '20th_century': 292, 'Latin': 290, 'Pacific_Ocean': 290, 'Sport': 289, 'Fish': 288, 'British_Empire': 286, 'Technology': 285, 'Food': 280, 'New

### See if all articles of our dataset are used in the path dataset 

In [10]:
print('Number of articles used in finished path wikispeedia : %d'%(len(paths_count)))
print('Number of articles used in unfinished path wikispeedia : %d'%(len(paths_count_unfinished)))
print("Number of articles in the total Dataset : %d"%(len(plain_articles)))

if len(paths_count) != len(plain_articles):
    print("There are %d articles that are not used for the finished paths"%(len(plain_articles)-len(paths_count)))
else:
    print("All articles are used in the finished path of the users")

if len(paths_count_unfinished) != len(plain_articles):
    print("There are %d articles that are not used for the unfinished paths"%(len(plain_articles)-len(paths_count_unfinished)))
else:
    print("All articles are used in the unfinished path of the users")

Number of articles used in finished path wikispeedia : 4169
Number of articles used in unfinished path wikispeedia : 4061
Number of articles in the total Dataset : 4604
There are 435 articles that are not used for the finished paths
There are 543 articles that are not used for the unfinished paths


### Graph of the access of used pages for finished paths

In [11]:
pages = list(paths_count.keys())
counts = list(paths_count.values())
num_pages = len(pages)
page_indices = list(range(num_pages))


fig = go.Figure()
fig.add_trace(go.Bar(
    x=page_indices,
    y=counts,
    marker=dict(color='blue'),
    hovertemplate='<b>Number of pages:</b> %{x}<br><b>Number of access:</b> %{y}<extra></extra>'
))

fig.update_layout(
    xaxis=dict(title='Number of pages'),
    yaxis=dict(title='Number of access in a log scale', type='log'),
    title='Number of page access in finished paths (log scale)',
    width=900,
    height=500,
    plot_bgcolor='rgba(240, 240, 240, 0.7)', 
    paper_bgcolor='lightgrey',  
)

fig.show()

pas dans la data story

### Same for the unfinished paths

In [12]:
pages_unfinished = list(paths_count_unfinished.keys())
counts_unfinished = list(paths_count_unfinished.values())
num_pages_unfinished = len(pages_unfinished)
page_indices_unfinished = list(range(num_pages_unfinished))


fig = go.Figure()
fig.add_trace(go.Bar(
    x=page_indices_unfinished,
    y=counts_unfinished,
    marker=dict(color='blue'),
    hovertemplate='<b>Number of pages:</b> %{x}<br><b>Number of access:</b> %{y}<extra></extra>'
))

fig.update_layout(
    xaxis=dict(title='Number of pages'),
    yaxis=dict(title='Number of access in a log scale', type='log'),
    title='Number of page access in unfinished paths (log scale)',
    width=900,
    height=500,
    plot_bgcolor='rgba(240, 240, 240, 0.7)', 
    paper_bgcolor='lightgrey',  
)

fig.show()

pas dans la data story

### Top 25 of hubs for finished and unfinished path <span style="color:red">(Supprimer ?)</span>

In [13]:
top_25_fin_hubs = dict(sorted(paths_count.items(), key=lambda item: item[1], reverse=True)[:25])
top_25_unfin_hubs = dict(sorted(paths_count_unfinished.items(), key=lambda item: item[1], reverse=True)[:25])

In [14]:
pages_fin = list(top_25_fin_hubs.keys())
counts_fin = list(top_25_fin_hubs.values())
pages_unfin = list(top_25_unfin_hubs.keys())
counts_unfin = list(top_25_unfin_hubs.values())


fig = make_subplots(rows=2, cols=1, subplot_titles=('Top 25 pages in finished path', 'Top 25 pages in unfinished path'))
fig.add_trace(go.Bar(x=pages_fin, y=counts_fin, marker=dict(color='blue'), name='Number of access',
                     hovertemplate='Number of access: %{y}<extra></extra>'), row=1, col=1)
fig.add_trace(go.Bar(x=pages_unfin, y=counts_unfin, marker=dict(color='red'), name='Number of access',
                     hovertemplate='Number of access: %{y}<extra></extra>'), row=2, col=1)
fig.update_layout(
    height=1000,
    width=900,
    showlegend=False  
)

fig.update_xaxes(title_text='Title page', row=1, col=1)
fig.update_xaxes(title_text='Title page', row=2, col=1)
fig.update_yaxes(title_text='Number of access', row=1, col=1)
fig.update_yaxes(title_text='Number of access', row=2, col=1)
fig.update_xaxes(tickangle=90, tickmode='array', row=1, col=1)
fig.update_xaxes(tickangle=90, tickmode='array', row=2, col=1)

fig.show()


pas dans la data story

We can see that a lots of hubs are used in both un/finished path but some are new (for exemple "Brain", "pyramid" and other ones), note that the number of acces in the unfinished path is lower than the finished path one (less data for unfinished paths data). 

## Political people representation in the un/finished paths

### Political people representation in the finished paths

In [15]:
#Hubs with all paths for finished one (including start and end articles)
all_hubs_fin = dict(sorted(paths_count.items(), key=lambda item: item[1], reverse=True))

hubs_people_fin = {key: value for key, value in all_hubs_fin.items() if key in people['article'].tolist()}
hubs_political_people_fin = {key: value for key, value in hubs_people_fin.items() if key in people[people['categories'] == 'Political_People']['article'].tolist()}

#Hubs with only the pathing for finished one (excluding start and end articles)
all_hubs_internal_fin = dict(sorted(internal_paths_count.items(), key=lambda item: item[1], reverse=True))

hubs_people_internal_fin = {key: value for key, value in all_hubs_internal_fin.items() if key in people['article'].tolist()}
hubs_political_people_internal_fin = {key: value for key, value in hubs_people_internal_fin.items() if key in people[people['categories'] == 'Political_People']['article'].tolist()}

In [16]:
# full path
pages_fin = list(hubs_political_people_fin.keys())
counts_fin = list(hubs_political_people_fin.values())

# internal path
pages2_fin = list(hubs_political_people_internal_fin.keys())
counts2_fin = list(hubs_political_people_internal_fin.values())

# difference between the two sets 
common_keys_fin = set(hubs_political_people_fin.keys()) & set(hubs_political_people_internal_fin.keys())
diff_items_fin = {key: hubs_political_people_fin[key] - hubs_political_people_internal_fin[key] for key in common_keys_fin}
diff_items_fin = dict(sorted(diff_items_fin.items(), key=lambda item: item[1], reverse=True))
diff_values_fin = list(diff_items_fin.values())
median_diff_fin = np.median(diff_values_fin)

fig = make_subplots(rows=3, cols=1, subplot_titles=(
    'Political people Pages most present in finished paths',
    'Political people Pages most present in the inner finished paths',
    'Difference in access between the two sets'
))
fig.add_trace(go.Bar(x=pages_fin, y=counts_fin, marker=dict(color='blue'), name='Number of access',
                     hovertemplate='Number of access: %{y}<extra></extra>'), row=1, col=1)
fig.add_trace(go.Bar(x=pages2_fin, y=counts2_fin, marker=dict(color='red'), name='Number of access',
                     hovertemplate='Number of access: %{y}<extra></extra>'), row=2, col=1)
fig.add_trace(go.Bar(x=list(diff_items_fin.keys()), y=list(diff_items_fin.values()), marker=dict(color='green'),
                     name='Difference in access',
                     hovertemplate='Difference in access: %{y}<extra></extra>'), row=3, col=1)
fig.update_layout(
    height=2000,
    width=900,
    showlegend=False  
)

fig.update_xaxes(title_text='Pages', row=1, col=1)
fig.update_xaxes(title_text='Pages', row=2, col=1)
fig.update_xaxes(title_text='Pages', row=3, col=1)
fig.update_yaxes(title_text='Number of access', row=1, col=1)
fig.update_yaxes(title_text='Number of access', row=2, col=1)
fig.update_yaxes(title_text='Difference in number of accesses', row=3, col=1)
fig.update_xaxes(tickangle=90, tickmode='array', row=1, col=1)
fig.update_xaxes(tickangle=90, tickmode='array', row=2, col=1)
fig.update_xaxes(tickangle=90, tickmode='array', row=3, col=1)
fig.show()

print('Median difference between the full path and the inner path for the finished paths: ', median_diff_fin)


Median difference between the full path and the inner path for the finished paths:  20.0


The median here corresponds to the randomness in the choice of the starting and ending pages. Here we can see that each political article page is choosen around 20 times randomly by the wikispeedia algorithm. 
This can allow us to highlight the political pages choosen by the user to start or to end their games.
For some political people, there is a strong influence of the human on the selection of the page for starting or ending the wikispeedia game

### Political people representation in the unfinished user paths

In [17]:
#Hubs with all paths for finished one (including start and end articles)
all_hubs_unfin = dict(sorted(paths_count_unfinished.items(), key=lambda item: item[1], reverse=True))

hubs_people_unfin = {key: value for key, value in all_hubs_unfin.items() if key in people['article'].tolist()}
hubs_political_people_unfin = {key: value for key, value in hubs_people_unfin.items() if key in people[people['categories'] == 'Political_People']['article'].tolist()}

#Hubs with only the pathing for finished one (excluding start and end articles)
all_hubs_internal_unfin = dict(sorted(internal_paths_count_unfinished.items(), key=lambda item: item[1], reverse=True))

hubs_people_internal_unfin = {key: value for key, value in all_hubs_internal_unfin.items() if key in people['article'].tolist()}
hubs_political_people_internal_unfin = {key: value for key, value in hubs_people_internal_unfin.items() if key in people[people['categories'] == 'Political_People']['article'].tolist()}

In [18]:
# full path
pages_unfin = list(hubs_political_people_unfin.keys())
counts_unfin = list(hubs_political_people_unfin.values())

# internal path
pages2_unfin = list(hubs_political_people_internal_unfin.keys())
counts2_unfin = list(hubs_political_people_internal_unfin.values())

# difference between the two sets 
common_keys_unfin = set(hubs_political_people_unfin.keys()) & set(hubs_political_people_internal_unfin.keys())
diff_items_unfin = {key: hubs_political_people_unfin[key] - hubs_political_people_internal_unfin[key] for key in common_keys_unfin}
diff_items_unfin = dict(sorted(diff_items_unfin.items(), key=lambda item: item[1], reverse=True))
diff_values_unfin = list(diff_items_unfin.values())
median_diff_unfin = np.median(diff_values_unfin)

fig = make_subplots(rows=3, cols=1, subplot_titles=(
    'Political people Pages most present in unfinished paths',
    'Political people Pages most present in the inner unfinished paths',
    'Difference in access between the two sets'
))
fig.add_trace(go.Bar(x=pages_unfin, y=counts_unfin, marker=dict(color='blue'), name='Number of access',
                    hovertemplate='Number of access: %{y}<extra></extra>'), row=1, col=1)
fig.add_trace(go.Bar(x=pages2_unfin, y=counts2_unfin, marker=dict(color='red'), name='Number of access',
                    hovertemplate='Number of access: %{y}<extra></extra>'), row=2, col=1)
fig.add_trace(go.Bar(x=list(diff_items_unfin.keys()), y=list(diff_items_unfin.values()), marker=dict(color='green'),
                    name='Difference in access',
                    hovertemplate='Difference in access: %{y}<extra></extra>'), row=3, col=1)
fig.update_layout(
    height=2000,
    width=900,
    showlegend=False 
)

fig.update_xaxes(title_text='Pages', row=1, col=1)
fig.update_xaxes(title_text='Pages', row=2, col=1)
fig.update_xaxes(title_text='Pages', row=3, col=1)
fig.update_yaxes(title_text='Number of access', row=1, col=1)
fig.update_yaxes(title_text='Number of access', row=2, col=1)
fig.update_yaxes(title_text='Difference in number of accesses', row=3, col=1)
fig.update_xaxes(tickangle=90, tickmode='array', row=1, col=1)
fig.update_xaxes(tickangle=90, tickmode='array', row=2, col=1)
fig.update_xaxes(tickangle=90, tickmode='array', row=3, col=1)
fig.show()

print('Median difference between the full path and the inner path for the unfinished paths: ', median_diff_unfin)

Median difference between the full path and the inner path for the unfinished paths:  7.0


The median here corresponds to the randomness in the choice of the starting and ending pages. Here we can see that each political article page is choosen around 7 times randomly by the wikispeedia algorithm. 
This can allow us to highlight the political pages choosen by the user to start or to end their games.
For some political people, there is a strong influence of the human on the selection of the page for starting or ending the wikispeedia game

<span style="color:red">On doit creuser bien plus pour montrer que ce n'est pas du au hasard --> faire une analyse plus approfondie </span>

### Difference of entry/finish between all articles in the finished and unfinished paths <span style="color:red">(Supprimer ?)</span>

In [19]:
path_diff_dict_fin = {}
path_diff_dict_unfin = {}

# create a dict to store the difference between the two sets
for key in paths_count:
    if key in internal_paths_count:
        path_diff_dict_fin[key] = paths_count[key] - internal_paths_count[key]
for key in paths_count_unfinished:
    if key in internal_paths_count_unfinished:
        path_diff_dict_unfin[key] = paths_count_unfinished[key] - internal_paths_count_unfinished[key]

sorted_diff_fin = sorted(path_diff_dict_fin.items(), key=lambda x: abs(x[1]), reverse=True)
sorted_diff_fin = dict(sorted_diff_fin[0:30])
sorted_diff_unfin = sorted(path_diff_dict_unfin.items(), key=lambda x: abs(x[1]), reverse=True)
sorted_diff_unfin = dict(sorted_diff_unfin[0:30])

fig = make_subplots(rows=2, cols=1, subplot_titles=(
    'Top 30 pages that the first or last page are used for the finished path',
    'Top 30 pages that the first or last page are used for the unfinished path'
))
fig.add_trace(go.Bar(x=list(sorted_diff_fin.keys()), y=list(sorted_diff_fin.values()), marker=dict(color='blue'),
                     name='Difference in access', hovertemplate='Difference in access: %{y}<extra></extra>'), row=1, col=1)
fig.add_trace(go.Bar(x=list(sorted_diff_unfin.keys()), y=list(sorted_diff_unfin.values()), marker=dict(color='red'),
                     name='Difference in access', hovertemplate='Difference in access: %{y}<extra></extra>'), row=2, col=1)
fig.update_layout(
    height=1200,
    width=900,
    showlegend=False  
)

fig.update_xaxes(title_text='Pages', row=1, col=1)
fig.update_xaxes(title_text='Pages', row=2, col=1)
fig.update_yaxes(title_text='Difference in number of accesses', row=1, col=1)
fig.update_yaxes(title_text='Difference in number of accesses', row=2, col=1)
fig.update_xaxes(tickangle=90, tickmode='array', row=1, col=1)
fig.update_xaxes(tickangle=90, tickmode='array', row=2, col=1)

fig.show()

## Classification of Political Personalities

In [20]:
politicien.head(10)

Unnamed: 0,article,political_affiliation
0,Adolf_Hitler,Right
1,Andrew_Dickson_White,Right
2,Ariel_Sharon,Right
3,Armia_Krajowa,Right
4,Ayaan_Hirsi_Ali,Right
5,Barack_Obama,Left
6,Benito_Mussolini,Right
7,"Benjamin_Disraeli,_1st_Earl_of_Beaconsfield",Center
8,Bill_Clinton,Left
9,Boyle_Roche,Center


In [21]:
# visualisation of the political affiliation distribution
affiliation_counts = {"Right": 0, "Left": 0, "Center": 0}

for affiliation in politicien['political_affiliation']:
    if affiliation in affiliation_counts:
        affiliation_counts[affiliation] += 1

tot_politicien = politicien.shape[0]

data_plot = pd.DataFrame({'Affiliation': list(affiliation_counts.keys()), 'Count': list(affiliation_counts.values())})
colors = {
    'Right': 'lightblue',
    'Left': 'lightcoral',
    'Center': 'gray'
}

fig = px.pie(data_plot, values='Count', names='Affiliation', 
             color='Affiliation',  
             color_discrete_map=colors,  
             title='Political Affiliation Distribution of the Political People',
             hole=0.4,  
             labels={'Affiliation': 'Political Class', 'Count': 'Number of People'},  
             template='plotly_white')
fig.update_traces(hoverinfo='label+percent', textinfo='value+percent', pull=[0.02, 0.02, 0.02])
fig.update_traces(marker=dict(line=dict(color='gray', width=2))) 
fig.update_layout(
    margin=dict(t=50, b=50, l=50, r=50),
    hovermode='closest'
)
fig.update_layout(
    xaxis=dict(
        fixedrange=False
    ),
    yaxis=dict(
        fixedrange=False
    ),
    autosize=True,
    margin=dict(autoexpand=True),
)

fig.add_annotation(
    go.layout.Annotation(
        text=f"Total : {tot_politicien}",
        x=0.5,
        y=0.5,
        showarrow=False,
        font=dict(size=13)
    )
)
fig.show()
fig.write_html("./plots/repart_bias_pol_2a3_1.html")

2a3


In [22]:
fig = make_subplots(rows=1, cols=1, subplot_titles=('Difference in access between the two sets for the unfinished paths',))

# Vérification de l'existence des clés dans le DataFrame 'politicien'
valid_keys = [key for key in diff_items_unfin.keys() if key in politicien['article'].tolist()]

# Récupération des couleurs associées aux affiliations politiques des politiciens
colors = [politicien.loc[politicien['article'] == p, 'political_affiliation']
          .map({'Right': 'lightblue', 'Left': 'lightcoral', 'Center': 'lightgray'})
          .values[0] for p in valid_keys]

fig.add_trace(go.Bar(x=list(valid_keys), y=list(diff_items_unfin.values()), 
                     marker=dict(color=colors),
                     name='Difference in access',
                     hovertemplate='Difference in access: %{y}<extra></extra>'), row=1, col=1)


fig.update_layout(
    height=700,
    width=900,
    showlegend=False 
)

fig.update_xaxes(title_text='Pages', row=1, col=1)
fig.update_yaxes(title_text='Difference in number of accesses', row=1, col=1)
fig.update_xaxes(tickangle=90, tickmode='array', row=1, col=1)


fig.show()

3b1


la meme pour les finished paths

In [23]:
fig = make_subplots(rows=1, cols=1, subplot_titles=('Difference in access between the two sets for the finished paths (log scale)',))

# Vérification de l'existence des clés dans le DataFrame 'politicien'
valid_keys = [key for key in diff_items_fin.keys() if key in politicien['article'].tolist()]

# Récupération des couleurs associées aux affiliations politiques des politiciens
colors = [politicien.loc[politicien['article'] == p, 'political_affiliation']
          .map({'Right': 'lightblue', 'Left': 'lightcoral', 'Center': 'lightgray'})
          .values[0] for p in valid_keys]

fig.add_trace(go.Bar(x=list(valid_keys), y=np.log(list(diff_items_fin.values())), 
                     marker=dict(color=colors),
                     name='Difference in access in a log scale',
                     hovertemplate='Difference in access: %{y}<extra></extra>'), row=1, col=1)


fig.update_layout(
    height=700,
    width=900,
    showlegend=False 
)

fig.update_xaxes(title_text='Pages', row=1, col=1)
fig.update_yaxes(title_text='Difference in number of accesses (log scale)', row=1, col=1)
fig.update_xaxes(tickangle=90, tickmode='array', row=1, col=1)


fig.show()
fig.write_html("./plots/repart_pol_path_3b1.html")

In [24]:
fig = make_subplots(rows=1, cols=1, subplot_titles=('Difference in access between the two sets for the finished paths (log scale)',))

# Vérification de l'existence des clés dans le DataFrame 'politicien'
valid_keys = [key for key in diff_items_fin.keys() if key in politicien['article'].tolist()]

# Récupération des couleurs associées aux affiliations politiques des politiciens
colors = [politicien.loc[politicien['article'] == p, 'political_affiliation']
          .map({'Right': 'lightblue', 'Left': 'lightcoral', 'Center': 'lightgray'})
          .values[0] for p in valid_keys]

fig.add_trace(go.Bar(y=list(valid_keys), x=np.log(list(diff_items_fin.values())), 
                     orientation='h',
                     marker=dict(color=colors),
                     name='Difference in access in a log scale',
                     hovertemplate='Difference in access: %{x}<extra></extra>'), row=1, col=1)

fig.update_layout(
    height=900,
    width=700,
    showlegend=False 
)

fig.update_yaxes(title_text='Pages', row=1, col=1)
fig.update_xaxes(title_text='Difference in number of accesses (log scale)', row=1, col=1)

fig.show()
fig.write_html("./plots/repart_pol_path_3b1.html")

## Analyses of those results

Assumption : The value of Adolf Hitler is significantly different from the rest of the personal figures, We will perform a T test to see if it's statistically significant or not. The null hypothesis is that there isn't any significant difference between Adolf and the other political figures. 

In [25]:
from scipy.stats import ttest_1samp
values = list(diff_items_fin.values())
val_hitler=values[0]
values = values[1:] #Removing the value we wanna test (first one)
t_statistic, p_value = ttest_1samp(values,val_hitler)
print(f'T-Statistic: {t_statistic}')
print(f'P-Value: {p_value}')

#Check P value
if p_value < 0.05:
    print('The average score of Adolf Hitler is significantly different from the group.')
else:
    print('There is no significant difference in the average score of Adolf Hitler.')

T-Statistic: -241.84873351530135
P-Value: 3.509049768286252e-62
The average score of Adolf Hitler is significantly different from the group.


From this result we can reject the null hypothesis and confirm that there is statistically a significant difference between Adolf Hitler and the other political figures. 

## Classification of all articles on the political bias present in the text 

This classification is explained in the README file and has been made in an other file. Here we are using the result of this Machine Learning prediction model.

In [26]:
bias.head(10)

Unnamed: 0,title,Bias
0,Áedán_mac_Gabráin,2
1,Åland,2
2,Édouard_Manet,2
3,Éire,2
4,Óengus_I_of_the_Picts,2
5,€2_commemorative_coins,2
6,10th_century,2
7,11th_century,2
8,12th_century,2
9,13th_century,2


In [27]:
bias["Bias"]= bias["Bias"].replace({0:"left", 1:"center", 2:"right"})

## Right- and Left-leaning articles on wikispeedia 

In [28]:
bias_counts = bias['Bias'].value_counts()
total_values = bias_counts.sum()


colors = {
    'right': 'lightblue',
    'left': 'lightcoral',
    'center': 'gray'
}

fig = px.pie(bias_counts, 
             labels=bias_counts.index, 
             values=bias_counts.values,
             title='Political Bias repartition',
             names=bias_counts.index, 
             hole=0.4, 
             template='plotly_white')
fig.update_traces(marker=dict(line=dict(color='gray', width=2))) 
fig.update_traces(hoverinfo='label+percent', textinfo='value+percent', pull=[0.02] * len(bias_counts))
fig.update_layout(
    xaxis=dict(
        fixedrange=False
    ),
    yaxis=dict(
        fixedrange=False
    ),
    autosize=True,
    margin=dict(autoexpand=True),
)
fig.update_layout(
    margin=dict(t=50, b=50, l=50, r=50),
    hovermode='closest'
)
fig.add_annotation(
    go.layout.Annotation(
        text=f"Total : {total_values}",
        x=0.5,
        y=0.5,
        showarrow=False,
        font=dict(size=13)
    )
)

# Affecter les mêmes couleurs que dans le graphique précédent
fig.update_traces(marker=dict(colors=[colors[section] for section in bias_counts.index]))

fig.show()
fig.write_html("./plots/repart_bias_2a1.html")

rajouter zone de texte avec le nombre total article et modifier pour que ça ressemble au premier
rajouter le biais pour juste les political people --> 2a3

2.a.1

same but only for the politicien articles

In [29]:
merged_politicien = pd.merge(politicien, bias, how='left', left_on='article', right_on='title')
merged_politicien.drop(columns=['title'], inplace=True)


In [30]:
merged_politicien

Unnamed: 0,article,political_affiliation,Bias
0,Adolf_Hitler,Right,left
1,Andrew_Dickson_White,Right,right
2,Ariel_Sharon,Right,right
3,Armia_Krajowa,Right,right
4,Ayaan_Hirsi_Ali,Right,right
...,...,...,...
62,William_Howard_Taft,Right,left
63,William_Pitt_the_Younger,Right,center
64,Winston_Churchill,Right,left
65,Zachary_Taylor,Center,right


In [31]:
# visualisation of the political affiliation distribution
affiliation_counts = {"right": 0, "left": 0, "center": 0}

for affiliation in merged_politicien['Bias']:
    if affiliation in affiliation_counts:
        affiliation_counts[affiliation] += 1

tot_politicien_merged = merged_politicien.shape[0]

data_plot = pd.DataFrame({'Affiliation': list(affiliation_counts.keys()), 'Count': list(affiliation_counts.values())})
colors = {
    'right': 'lightblue',
    'left': 'lightcoral',
    'center': 'gray'
}

fig = px.pie(data_plot, values='Count', names='Affiliation', 
             color='Affiliation',  
             color_discrete_map=colors,  
             title='Bias of the article of the political people',
             hole=0.4,  
             labels={'Bias': 'Political Class', 'Count': 'Number of People'},  
             template='plotly_white')
fig.update_traces(hoverinfo='label+percent', textinfo='value+percent', pull=[0.02, 0.02, 0.02])
fig.update_traces(marker=dict(line=dict(color='gray', width=2))) 
fig.update_layout(
    margin=dict(t=50, b=50, l=50, r=50),
    hovermode='closest'
)
fig.update_layout(
    xaxis=dict(
        fixedrange=False
    ),
    yaxis=dict(
        fixedrange=False
    ),
    autosize=True,
    margin=dict(autoexpand=True),
)

fig.add_annotation(
    go.layout.Annotation(
        text=f"Total : {tot_politicien_merged}",
        x=0.5,
        y=0.5,
        showarrow=False,
        font=dict(size=13)
    )
)
fig.show()

now we do the heatmap

In [32]:
import plotly.figure_factory as ff
import pandas as pd

# Votre DataFrame fusionné : merged_politicien

# Créer une matrice pour le heatmap
matrix = [
    [((merged_politicien['Bias'] == 'right') & (merged_politicien['political_affiliation'] == 'Left')).sum(),
     ((merged_politicien['Bias'] == 'right') & (merged_politicien['political_affiliation'] == 'Center')).sum(),
     ((merged_politicien['Bias'] == 'right') & (merged_politicien['political_affiliation'] == 'Right')).sum()],
    [((merged_politicien['Bias'] == 'left') & (merged_politicien['political_affiliation'] == 'Left')).sum(),
     ((merged_politicien['Bias'] == 'left') & (merged_politicien['political_affiliation'] == 'Center')).sum(),
     ((merged_politicien['Bias'] == 'left') & (merged_politicien['political_affiliation'] == 'Right')).sum()],
    [((merged_politicien['Bias'] == 'center') & (merged_politicien['political_affiliation'] == 'Left')).sum(),
     ((merged_politicien['Bias'] == 'center') & (merged_politicien['political_affiliation'] == 'Center')).sum(),
     ((merged_politicien['Bias'] == 'center') & (merged_politicien['political_affiliation'] == 'Right')).sum()]
]

# Étiquettes pour les lignes et les colonnes
x_labels = ['Left', 'Center', 'Right']
y_labels = ['Right', 'Center', 'Left']

# Utiliser une échelle de couleurs prédéfinie (choisissez-en une parmi la liste)
colorscale = 'reds'  # Par exemple, 'ylgnbu' parmi les échelles de couleurs disponibles

# Créer le heatmap avec Plotly en utilisant l'échelle de couleurs spécifiée
fig = ff.create_annotated_heatmap(
    z=matrix,
    x=x_labels,
    y=y_labels,
    colorscale=colorscale  # Utiliser l'échelle de couleurs choisie
)

fig.update_layout(
    title='Heatmap: Bias vs Political Affiliation',
    xaxis=dict(title='Political Affiliation'),
    yaxis=dict(title='Bias'),
)

fig.show()
fig.write_html("./plots/repart_bias_pol_2a3_2.html")

In [33]:
df_categories = categories.copy()
df_categories.head(20)

Unnamed: 0,article,categories
0,Áedán_mac_Gabráin,subject.People.Historical_figures
1,Åland,subject.Countries
2,Åland,subject.Geography.European_Geography.European_...
3,Édouard_Manet,subject.People.Artists
4,Éire,subject.Countries
5,Éire,subject.Geography.European_Geography.European_...
6,Óengus_I_of_the_Picts,subject.History.British_History.British_Histor...
7,Óengus_I_of_the_Picts,subject.People.Historical_figures
8,€2_commemorative_coins,subject.Business_Studies.Currency
9,10th_century,subject.History.General_history


In [34]:
# one hot encoding for all of the categories
df_categories['categories'] = df_categories['categories'].str.replace('subject.', '')
encoded_categories = df_categories['categories'].str.get_dummies(sep='.')

# concatenating the article names with the encoded categories
categories_encoded = pd.concat([df_categories['article'], encoded_categories], axis=1)

# combination of all of the categories for each article
categories_combined = categories_encoded.groupby('article').max().reset_index()

In [35]:
#Merging the categories_combined df with the political bias df 

all_articles_with_bias_and_categories = pd.merge(categories_combined, bias, left_on = 'article', right_on = 'title', how = 'left')
all_articles_with_bias_and_categories = all_articles_with_bias_and_categories.drop('title', axis = 1)

In [36]:
#Visualisation of proportions for each category

df = all_articles_with_bias_and_categories.copy()

right_df = df[df['Bias'] == 'right'].drop(columns=['article', 'Bias'])
right_df = right_df.sum()
right_df.name = 'right'
left_df = df[df['Bias'] == 'left'].drop(columns=['article', 'Bias'])
left_df = left_df.sum()
left_df.name ='left'
center_df = df[df['Bias'] == 'center'].drop(columns=['article', 'Bias'])
center_df = center_df.sum()
center_df.name = 'center'

plot_df = pd.merge(right_df, left_df, left_index=True, right_index=True, how='inner')
plot_df = pd.merge(plot_df, center_df, left_index=True, right_index=True, how='inner')
plot_df.head()

total_counts = plot_df.sum(axis=1)
proportions_df = plot_df.divide(total_counts, axis=0)

plot_df.describe()

Unnamed: 0,right,left,center
count,141.0,141.0,141.0
mean,66.460993,5.134752,9.51773
std,129.014978,13.883814,22.128715
min,3.0,0.0,0.0
25%,17.0,0.0,1.0
50%,29.0,1.0,3.0
75%,61.0,3.0,8.0
max,982.0,116.0,193.0


## Political analysis by article type

In [37]:
# modification of the df for plotly
melted_df = proportions_df.reset_index().melt(id_vars='index', var_name='Bias', value_name='Proportion')
melted_df.columns = ['Category', 'Bias', 'Proportion']

colors = {
    'right': 'lightblue',
    'left': 'lightcoral',
    'center': 'gray'
}

fig = px.bar(melted_df, x='Category', y='Proportion', color='Bias', barmode='stack',
             title='Stacked Histogram of Proportions',  color_discrete_map=colors)
fig.update_layout(
    xaxis=dict(title='Category'),
    yaxis=dict(title='Proportion'),
    legend=dict(title='Bias', font=dict(size=15)),
    font=dict(size=12),
    height=900,
    width=900,
    showlegend=True
)



fig.show()
fig.write_html("./plots/repart_bias_categ_2a2.html")

2a2

In [38]:
index_left = proportions_df.sort_values(by='left', ascending=False).index[:6]
fig_left = make_subplots(rows=3, cols=2, subplot_titles=index_left,
                         specs=[[{'type':'domain'}]*2]*3)
fig_left.update_layout(title='Categories with biggest proportion of left-biased articles', width=900, height=900)
for i, category in enumerate(index_left):
    pie = go.Pie(labels=plot_df.columns, values=plot_df.loc[category], hole=0.4,
                 hovertemplate='%{label}: %{percent:.1%}',
                 insidetextorientation='horizontal', textinfo='none', hoverinfo='label')
    fig_left.add_trace(pie, row=(i // 2) + 1, col=(i % 2) + 1)
fig_left.show()


index_right = proportions_df.sort_values(by='right', ascending=False).index[:6]
fig_right = make_subplots(rows=3, cols=2, subplot_titles=index_right,
                         specs=[[{'type':'domain'}]*2]*3)
fig_right.update_layout(title='Categories with biggest proportion of right-biased articles', width=900, height=900)
for i, category in enumerate(index_right):
    pie = go.Pie(labels=plot_df.columns, values=plot_df.loc[category], hole=0.4,
                 hovertemplate='%{label}: %{percent:.1%}',
                 insidetextorientation='horizontal', textinfo='none', hoverinfo='label')
    fig_right.add_trace(pie, row=(i // 2) + 1, col=(i % 2) + 1)
fig_right.show()


index_center = proportions_df.sort_values(by='center', ascending=False).index[:6]
fig_center = make_subplots(rows=3, cols=2, subplot_titles=index_center,
                         specs=[[{'type':'domain'}]*2]*3)
fig_center.update_layout(title='Categories with biggest proportion of center-biased articles', width=900, height=900)
for i, category in enumerate(index_center):
    pie = go.Pie(labels=plot_df.columns, values=plot_df.loc[category], hole=0.4,
                 hovertemplate='%{label}: %{percent:.1%}',
                 insidetextorientation='horizontal', textinfo='none', hoverinfo='label')
    fig_center.add_trace(pie, row=(i // 2) + 1, col=(i % 2) + 1)
fig_center.show()

## Analysis of results

In [39]:
#analyse the results of general analysis wikispeedia (between political figures and political bias)

## Analyse un/finished path with political bias

In [40]:
finished_path_with_bias = finished_path.copy()
unfinished_path_with_bias = unfinished_path.copy()

# Define a mapping from categorical bias to numerical values
bias_mapping = {
    'left': -1,
    'center': 0,
    'right': 1
}

# Function to convert bias labels to numbers, calculate averages, and then convert back to categorical bias
def calculate_path_biases(path, all_articles_df):
    articles = path.split(';')  # Adjust delimiter if needed
    biases = all_articles_df[all_articles_df['article'].isin(articles)]['Bias']
    
    # Map the categorical biases to numerical values
    numerical_biases = biases.map(bias_mapping)
    
    # Calculate the average numerical bias
    average_numerical_bias = numerical_biases.mean()
    # Get the bias of the start article, assuming the first article in the list
    start_numerical_bias = numerical_biases.iloc[0] if len(numerical_biases) > 0 else None
    #Get the bias of the end article, assuming the last article in the list
    end_numerical_bias = numerical_biases.iloc[-1] if len(numerical_biases) > 0 else None

    # Convert the average numerical bias back to a categorical bias
    average_bias_label = numerical_to_categorical_bias(average_numerical_bias)
    
    # Convert the start numerical bias back to a categorical bias if necessary
    start_bias_label = numerical_to_categorical_bias(start_numerical_bias) if start_numerical_bias is not None else None
    #Convert the end numerical bias back to a categorical bias if necessary
    end_bias_label = numerical_to_categorical_bias(end_numerical_bias) if end_numerical_bias is not None else None

    return average_numerical_bias, average_bias_label, start_bias_label, end_bias_label

# Helper function to convert numerical bias back to categorical
def numerical_to_categorical_bias(numerical_bias):
    if numerical_bias < -1/3:
        return 'left'
    elif -1/3 <= numerical_bias <= 1/3:
        return 'center'
    else:
        return 'right'

# Apply the function finished path
finished_path_with_bias[['average_numerical_bias','average_bias_label', 'start_bias_label', 'end_bias_label']] = \
    finished_path_with_bias['path'].apply(lambda x: pd.Series(calculate_path_biases(x, all_articles_with_bias_and_categories)))

# Apply the function unfinished path
unfinished_path_with_bias[['average_numerical_bias', 'average_bias_label', 'start_bias_label', 'end_bias_label']] = \
    unfinished_path_with_bias['path'].apply(lambda x: pd.Series(calculate_path_biases(x, all_articles_with_bias_and_categories)))

finished_path_with_bias.head()
unfinished_path_with_bias.head()

Unnamed: 0,hashedIpAddress,timestamp,durationInSec,path,path_goal,rating,average_numerical_bias,average_bias_label,start_bias_label,end_bias_label
0,26141fd878806294,2011-02-07 05:14:11,1805,Julius_Caesar,Caracas,timeout,1.0,right,right,right
1,2b015fb8181c48f2,2011-02-07 15:00:19,1818,Malawi;Democracy;Alexander_the_Great,First_Crusade,timeout,1.0,right,right,right
2,53a53bc244e08a6a,2011-02-07 16:06:01,49,Paraguay,Mount_St._Helens,restart,1.0,right,right,right
3,53a53bc244e08a6a,2011-02-07 17:18:25,1808,Paraguay;Bolivia,Mount_St._Helens,timeout,1.0,right,right,right
4,131600803df4895e,2011-02-07 17:42:37,2009,Agriculture;History_of_the_world;China;Yangtze...,Grand_Canal_of_China,timeout,0.75,right,right,right


In [41]:
print(finished_path_with_bias['average_bias_label'].value_counts())
print(finished_path_with_bias['start_bias_label'].value_counts())

average_bias_label
right     49199
center     2087
left         32
Name: count, dtype: int64
start_bias_label
right     45801
center     3292
left       2224
Name: count, dtype: int64


In [42]:

# Function to create histograms with colored bias zones
def create_colored_histogram(df, title):
    # Initialize figure
    fig = go.Figure()

    # Define colors for bias zones
    colors = {
        'left': '#FF9AA2',  # Pastel red
        'center': '#B5B5B5',  # Pastel grey
        'right': '#A3D8F4'  # Pastel greenish-blue
    }

    # Add histogram for each bias zone with specific color
    for label, color in colors.items():
        # Filter the dataframe for the specific bias zone
        zone_df = df[(df['average_numerical_bias'] > -1/3 if label != 'left' else df['average_numerical_bias'] <= -1/3) &
                     (df['average_numerical_bias'] <= 1/3 if label != 'right' else df['average_numerical_bias'] > 1/3)]
        # Add trace for the zone
        fig.add_trace(go.Histogram(x=zone_df['average_numerical_bias'], name=label.capitalize(),
                                   marker_color=color, nbinsx=30, opacity=0.6))

    # Add vertical lines to demarcate the bias zones
    fig.add_vline(x=-1/3, line_width=2, line_dash="dash", line_color="grey")
    fig.add_vline(x=1/3, line_width=2, line_dash="dash", line_color="grey")

    # Update layout
    fig.update_layout(title=title, xaxis_title='Average Numerical Bias', yaxis_title='Count',
                      barmode='stack')

    # Add annotations for start and end counts in each zone
    for label, color in colors.items():
        start_count = df[df['start_bias_label'] == label].shape[0]
        end_count = df[df['end_bias_label'] == label].shape[0]
        # Place annotation in the middle top of the graph in their respective zone
        zone_center = {'left': 0.2, 'center': 0.5, 'right': 0.8}[label]
        fig.add_annotation(x=zone_center, y=1.02, text=f"Start: {start_count}<br>End: {end_count}",
                        showarrow=False, xref="paper", yref="paper", bgcolor=color, font=dict(color='black'),
                        align="center")
    return fig

# Create the colored histogram for finished paths
fig_finished_colored = create_colored_histogram(finished_path_with_bias, 'Finished Paths: Average Numerical Bias with Colored Bias Zones')

# Create the colored histogram for unfinished paths
fig_unfinished_colored = create_colored_histogram(unfinished_path_with_bias, 'Unfinished Paths: Average Numerical Bias with Colored Bias Zones')

# Show the plots
fig_finished_colored.show()
fig_unfinished_colored.show()


In [43]:
# Combine the dataframes with a new column to distinguish between them
finished_path_with_bias['finished'] = 1
unfinished_path_with_bias['finished'] = 0
all_paths_with_bias = pd.concat([finished_path_with_bias, unfinished_path_with_bias])

# Create the box plot
fig = go.Figure()

# Add traces for finished and unfinished paths
fig.add_trace(go.Box(y=all_paths_with_bias[all_paths_with_bias['finished'] == 1]['average_numerical_bias'], name='Finished',marker_color='lightseagreen'))

fig.add_trace(go.Box(y=all_paths_with_bias[all_paths_with_bias['finished'] == 0]['average_numerical_bias'], name='Unfinished',marker_color='coral'))

# Define colors for the bias zones
colors = {
    'left': '#FF9AA2',  # Pastel red
    'center': '#B5B5B5',  # Pastel grey
    'right': '#A3D8F4'   # Pastel blue
}

# Add shapes for background colors
fig.update_layout(
    shapes=[
        # Left Bias Zone
        dict(type="rect",x0=0,y0=-1/3,x1=1,y1=-1.25,xref='paper',yref='y',fillcolor=colors['left'],opacity=0.3,layer="below",line_width=0,),
        # Center Bias Zone
        dict(type="rect", x0=0,y0=-1/3, x1=1, y1=1/3, xref='paper', yref='y',fillcolor=colors['center'],opacity=0.3,layer="below",line_width=0,),
        # Right Bias Zone
        dict(type="rect",x0=0,y0=1/3,x1=1,y1=1.25,xref='paper',yref='y',fillcolor=colors['right'],opacity=0.3,layer="below",line_width=0,),
    ]
)

# Update layout
fig.update_layout(
    title='Comparison of Average Numerical Bias in Finished vs. Unfinished Paths',
    yaxis_title='Average Numerical Bias',
    boxmode='group'
)

# Show the plot
fig.show()


# Prepare data
# Clean the 'durationInSec' column for finished paths, converting to numeric and dropping NaN values
finished_path_with_bias['durationInSec'] = pd.to_numeric(finished_path_with_bias['durationInSec'], errors='coerce')
# Drop any rows that contain NaN values in the 'durationInSec' or 'average_numerical_bias' columns
finished_path_with_bias = finished_path_with_bias.dropna(subset=['durationInSec', 'average_numerical_bias', 'rating'])
finished_path_with_bias = finished_path_with_bias.replace([np.inf, -np.inf], np.nan).dropna(subset=['durationInSec', 'average_numerical_bias', 'rating'])

# Drop any rows that contain NaN values in the 'average_numerical_bias' column
all_paths_with_bias = all_paths_with_bias.dropna(subset=['average_numerical_bias', 'durationInSec', 'rating'])
# Check for any infinite values and handle them
all_paths_with_bias = all_paths_with_bias.replace([np.inf, -np.inf], np.nan).dropna(subset=['average_numerical_bias', 'durationInSec', 'rating'])

# Logistic Regression with OLS
X_logistic = all_paths_with_bias[['average_numerical_bias']]  # Predictor
X_logistic = sm.add_constant(X_logistic)  # Adds a constant term to the predictor
y_logistic = all_paths_with_bias['finished']  # Response

model_logistic = sm.Logit(y_logistic, X_logistic).fit()
print(model_logistic.summary())

Optimization terminated successfully.
         Current function value: 0.690490
         Iterations 3
                           Logit Regression Results                           
Dep. Variable:               finished   No. Observations:                53179
Model:                          Logit   Df Residuals:                    53177
Method:                           MLE   Df Model:                            1
Date:                Mon, 18 Dec 2023   Pseudo R-squ.:               0.0001066
Time:                        19:32:17   Log-Likelihood:                -36720.
converged:                       True   LL-Null:                       -36723.
Covariance Type:            nonrobust   LLR p-value:                  0.005142
                             coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------------------
const                      0.0705      0.028      2.549      0.011       0.016      

In [44]:

# Calculate the mean and standard deviation
mean_duration = all_paths_with_bias['durationInSec'].mean()
std_duration = all_paths_with_bias['durationInSec'].std()

# Set a threshold for outliers (e.g., 3 standard deviations from the mean)
threshold = 3
lower_limit = mean_duration - (threshold * std_duration)
upper_limit = mean_duration + (threshold * std_duration)

# Filter out outliers
filtered_data = all_paths_with_bias[(all_paths_with_bias['durationInSec'] <= upper_limit)]


# Define colors for the bias zones
colors = {
    'left': '#FF9AA2',  # Pastel red
    'center': '#B5B5B5',  # Pastel grey
    'right': '#A3D8F4'   # Pastel blue
}

# Define a function to assign color based on bias value
def get_color(value):
    if value < -1/3:
        return colors['left']
    elif value > 1/3:
        return colors['right']
    else:
        return colors['center']

# Create a histogram with mean duration per bias bin
fig = go.Figure()

# Define bins for 'average_numerical_bias'
bins = np.linspace(-1, 1, 20)
filtered_data['bias_bin'] = pd.cut(filtered_data['average_numerical_bias'], bins)

# Calculate the mean duration for each bin
mean_duration_per_bin = filtered_data.groupby('bias_bin')['durationInSec'].mean().reset_index()
mean_duration_per_bin['bias_bin_mid'] = mean_duration_per_bin['bias_bin'].apply(lambda x: x.mid)

# Add bars to the figure
for i, row in mean_duration_per_bin.iterrows():
    fig.add_trace(go.Bar(
        x=[row['bias_bin_mid']],
        y=[row['durationInSec']],
        name=str(row['bias_bin']),
        marker_color=get_color(row['bias_bin_mid'])
    ))

# Create a line plot to overlay
line_fig = go.Figure(data=go.Scatter(
    x=mean_duration_per_bin['bias_bin_mid'], 
    y=mean_duration_per_bin['durationInSec'], 
    mode='lines',
    name='Trend Line'
))

# Combine the histogram and line plot
fig.add_trace(line_fig.data[0])

# Update layout
fig.update_layout(
    xaxis=dict(title='Average Numerical Bias', range=[-1, 1]),
    yaxis_title='Mean Duration (Sec)',
    barmode='group',
    showlegend=False
)

# Show the plot
fig.show()



# Linear Regression with OLS on finished paths
X_linear = finished_path_with_bias[['average_numerical_bias']]  # Predictor
X_linear = sm.add_constant(X_linear)  # Adds a constant term to the predictor
y_linear = finished_path_with_bias['durationInSec']  # Response

model_linear = sm.OLS(y_linear, X_linear).fit()
print(model_linear.summary())

                            OLS Regression Results                            
Dep. Variable:          durationInSec   R-squared:                       0.000
Model:                            OLS   Adj. R-squared:                 -0.000
Method:                 Least Squares   F-statistic:                    0.5086
Date:                Mon, 18 Dec 2023   Prob (F-statistic):              0.476
Time:                        19:32:17   Log-Likelihood:            -2.0241e+05
No. Observations:               28500   AIC:                         4.048e+05
Df Residuals:                   28498   BIC:                         4.048e+05
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                             coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------------
const                    153

In [79]:
#Spider Plot 
# Convert 'durationInSec' to float
filtered_data_spider = filtered_data.copy()
filtered_data_spider['durationInSec'] = filtered_data['durationInSec'].astype(float)

# Ensure 'finished' is numeric
filtered_data_spider['finished'] = pd.to_numeric(filtered_data['finished'], errors='coerce')

# Function to count back clicks
def count_back_clicks(path):
    return path.count('<') if isinstance(path, str) else 0

# List to store the metrics for each bias label
metrics_list = []

# Calculate the metrics for each average_bias_label
for label in ['right', 'left', 'center']:
    # Filter the data for the current label
    data_label = filtered_data_spider[filtered_data_spider['average_bias_label'] == label]

    # Exclude rows where 'rating' is 'timeout' or 'restart'
    data_label_valid_ratings = data_label[~data_label['rating'].isin(['timeout', 'restart'])]

    # Convert 'rating' to float after excluding invalid ratings
    data_label_valid_ratings['rating'] = data_label_valid_ratings['rating'].astype(float)

    # Calculate the metrics
    percentage_finished = data_label['finished'].mean() * 100  # Mean will give the proportion, multiply by 100 for percentage
    
    # Total paths are the sum of all semicolons + 1 for each path (since the last article won't have a semicolon)
    total_paths = data_label['path'].str.count(';').sum() + len(data_label)
    total_back_clicks = data_label['path'].apply(count_back_clicks).sum()
    percentage_back_clicks = (total_back_clicks / total_paths) * 100

    mean_rating = data_label_valid_ratings['rating'].mean()
    mean_time = data_label['durationInSec'].mean()

    # Add the metrics to the list
    metrics_list.append({
        'average_bias_label': label,
        'percentage_finished': percentage_finished,
        'percentage_back_clicks': percentage_back_clicks,
        'mean_rating': mean_rating,
        'mean_time': mean_time
    })

# Create a DataFrame from the list
metrics_df = pd.DataFrame(metrics_list)

metrics_df


Unnamed: 0,average_bias_label,percentage_finished,percentage_back_clicks,mean_rating,mean_time
0,right,54.680267,7.552235,2.28674,457.382678
1,left,3.787879,1.405622,1.2,672.252525
2,center,43.389199,5.287897,1.857511,497.970205


In [80]:

# Initialize subplots
fig = make_subplots(rows=1, cols=len(metrics_df.columns) - 1, shared_xaxes=True)

# Column names excluding 'average_bias_label'
metric_columns = metrics_df.columns[1:]

# Add a bar chart to each subplot
for i, metric in enumerate(metric_columns, start=1):
    for _, row in metrics_df.iterrows():
        fig.add_trace(
            go.Bar(name=row['average_bias_label'], x=[metric], y=[row[metric]], 
                   marker_color=['lightblue', 'lightcoral ', 'lightgray'][['right', 'left', 'center'].index(row['average_bias_label'])]),
            row=1, col=i
        )


# Update y-axes to have a bit more than the maximum value
for i, metric in enumerate(metric_columns, start=1):
    max_val = metrics_df[metric].max()
    fig.update_yaxes(title_text=metric, row=1, col=i, range=[0, max_val + max_val * 0.1])  # 10% more than the max

# Update layout
fig.update_layout(
    title_text="Metrics Comparison by Bias Label",
    barmode='group'
)

# Show the plot
fig.show()


In [90]:


# Initialize the radar chart
fig = go.Figure()

# Define the categories for the radar chart
categories = list(metrics_df.columns[1:])  # Exclude the label column

# Find the maximum value for each metric to set the axis limits
axis_limits = [metrics_df[metric].max() * 1.1 for metric in categories]  # 10% more than the max

# Define the colors for the labels
label_colors = {
    'right': 'lightblue',
    'left': 'lightcoral',
    'center': 'lightgray'
}

# Add a trace for each bias label in the order of 'right', 'center', 'left'
bias_labels_order = ['right', 'center', 'left']
for bias_label in bias_labels_order:
    for index, row in metrics_df.iterrows():
        if row['average_bias_label'] == bias_label:
            label_color = label_colors.get(row['average_bias_label'], 'lightgray')

            fig.add_trace(go.Scatterpolar(
                r=[row[metric] / axis_limits[i] for i, metric in enumerate(categories)],
                theta=categories,
                fill='toself',
                name=row['average_bias_label'],
                line=dict(color=label_color),  # Set label color
            ))


# Update the layout of the radar chart
fig.update_layout(
    polar=dict(
        radialaxis=dict(
            visible=True,
            range=[0, 1]
        )
    ),
    title="Scaled Metrics by Bias Label"
)

# Show the plot
fig.show()
fig.write_html("./plots/repart_bias_araigne_3a.html")

In [91]:

# First, create a dictionary mapping article titles to their political biases
title_to_bias = pd.Series(all_articles_with_bias_and_categories.Bias.values, index=all_articles_with_bias_and_categories.article).to_dict()

# Function to convert a path to a list of biases
def path_to_biases(path):
    articles = path.split(';')  # Split the path into individual article titles
    biases = [title_to_bias.get(article.strip(), 'Unknown') for article in articles]  # Map each title to its bias, stripping whitespace
    return biases

# Apply this function to each path and explode the resulting lists into a single series
biases_series = filtered_data['path'].apply(path_to_biases).explode()

# Count the occurrences of each political bias
bias_counts = biases_series.value_counts()

# Create the pie chart with Plotly Express
fig = px.pie(
    names=bias_counts.index,
    values=bias_counts.values,
    title='Distribution of Articles by Political Bias',
    color_discrete_map={
        'right': 'lightblue',
        'left': 'lightcoral',
        'center': 'lightgrey',
        'Unknown': 'silver'  
    }
)

fig.show()



## Analysis Between bias and backclicks

### Finished paths


In [49]:
#Columns needed : Path, Article, Bias --> 
#The article before the < is the one they rewent on after backclick

In [50]:
def extract_page_before_backclick(path):
    words=[]
    if '<' in path:
        while '<' in path:
            index = path.index('<')
            
            words.append(path[index-1])
            path = path[:index]+ path[index+1:] #Remove the backclick 

        return words
    else:
        return ["None"]

In [51]:
paths=all_paths.copy()
paths["backclicks"]=paths.apply(extract_page_before_backclick)


In [52]:
flat_data = [word for sublist in paths['backclicks'] for word in sublist]
back_counts = pd.Series(flat_data).value_counts().reset_index()
back_counts.columns = ["title", "Num_back"]
back_counts=back_counts.drop(0)
back_counts["title"]=back_counts["title"].apply(lambda x :(urllib.parse.unquote(x , encoding = 'utf-8')))
print("The total of backclicks in the finished paths is : %i times"%(back_counts["Num_back"].sum()))
display(back_counts.head(10))

The total of backclicks in the finished paths is : 20561 times


Unnamed: 0,title,Num_back
1,United_States,228
2,United_Kingdom,193
3,England,179
4,Animal,151
5,Europe,116
6,France,113
7,North_America,109
8,World_War_II,105
9,Great_Britain,104
10,Chemical_element,104


In [53]:
back_counts_final= pd.merge(back_counts, bias[["title", "Bias"]], how="left", on="title")
#display(back_counts_final)

In [54]:
back_counts_final["Bias"].value_counts() #Number of articles of each category

Bias
right     2073
center     249
left       124
Name: count, dtype: int64

In [55]:
back_counts_final=back_counts_final.groupby('Bias').agg({'Num_back': 'sum'}).reset_index() #we can change "sum" to "mean" or "median" but doesn't change results
back_counts_final

Unnamed: 0,Bias,Num_back
0,center,1660
1,left,620
2,right,18281


In [56]:
bias_counts = back_counts_final

fig = px.pie(bias_counts, 
             labels=bias_counts["Bias"], 
             values=bias_counts["Num_back"], 
             title='Political Bias repartition for backclicks in finished paths',
             names=bias_counts["Bias"], 
             hole=0.4)
fig.update_traces(textposition='inside', textinfo='percent+label', pull=[0.005]*len(bias_counts))
fig.update_layout(showlegend=True)
fig.update_layout(margin=dict(t=50, b=50, l=50, r=50))
fig.show()

So here we can see the number of backclicks on articles depending on the political bias. This represent the pages that the user went back on after selecting a another one. Remember the percentage of right classified articles in wikispeedia is 82%. The first, naive observation that we can make is : We can therefore assume that the political bias of the articles does not seem to have an impact on the way users play.  --> more analyse ?

### Unfinished paths


In [57]:
#same code as above
paths=all_paths_unfinished.copy()
paths["backclicks"]=paths.apply(extract_page_before_backclick)
flat_data = [word for sublist in paths['backclicks'] for word in sublist]
back_counts = pd.Series(flat_data).value_counts().reset_index()
back_counts.columns = ["title", "Num_back"]
back_counts=back_counts.drop(0)
back_counts["title"]=back_counts["title"].apply(lambda x :(urllib.parse.unquote(x , encoding = 'utf-8')))
print("The total of backclicks in the unfinished paths is : %i times"%(back_counts["Num_back"].sum()))
display(back_counts.head(10))
back_counts_final= pd.merge(back_counts, bias[["title", "Bias"]], how="left", on="title")
#display(back_counts_final)


The total of backclicks in the unfinished paths is : 12907 times


Unnamed: 0,title,Num_back
1,United_States,194
2,United_Kingdom,114
3,Mammal,113
4,England,95
5,Animal,94
6,Sport,78
7,Herbivore,70
8,Africa,70
9,Vertebrate,68
10,Film,67


In [58]:
back_counts_final["Bias"].value_counts() #Number of articles of each category

Bias
right     1771
center     198
left        92
Name: count, dtype: int64

In [59]:
back_counts_final=back_counts_final.groupby('Bias').agg({'Num_back': 'sum'}).reset_index() #we can change "sum" to "mean" or "median" but doesn't change results
back_counts_final

Unnamed: 0,Bias,Num_back
0,center,1005
1,left,354
2,right,11548


In [60]:
bias_counts = back_counts_final

fig = px.pie(bias_counts, 
             labels=bias_counts["Bias"], 
             values=bias_counts["Num_back"], 
             title='Political Bias repartition for backclicks in unfinished paths',
             names=bias_counts["Bias"], 
             hole=0.4)
fig.update_traces(textposition='inside', textinfo='percent+label', pull=[0.005]*len(bias_counts))
fig.update_layout(showlegend=True)
fig.update_layout(margin=dict(t=50, b=50, l=50, r=50))
fig.show()

Approximatively the same results between finished and unfinished paths. No apparent distinction between un/finished paths. There is 20561-12907= 7654 backclicks more in the finished paths but also more data.

## Analysis of biases through the paths 

In [61]:
article_with_bias = bias.copy()
paths_copy = all_paths.copy()

# Decode the titles and remove backlicks from the paths 
paths_copy = paths_copy.apply(lambda x : [urllib.parse.unquote(item , encoding = 'utf-8') for item in x if item != '<'])


In [62]:
def check_condition(first_bias, next_bias, matrix,n):
    # Function to map the transition from different biases in pathes to a matrix in this form : 

    # [ RR RL RC
    #   LR LL LC
    #   CR CL CC] 
    # Where R means right biased, L left bias and C means no bias
    # For example the position [0,2] is RC meaning the proportion of steps in paths going from Right-biased to not biased articles

    if first_bias == 'right':
        i =0
    elif first_bias == 'left':
        i =1
    elif first_bias == 'center':
        i = 2

    if next_bias== 'right':
        j =0
    elif next_bias == 'left':
        j =1
    elif next_bias == 'center':
        j= 2

    

    matrix[n-1,i,j] += 1 
    
    return matrix

In [63]:
# Create a dictionary for quick access
article_bias_dict = dict(zip(article_with_bias['title'], article_with_bias['Bias']))

N = 8
n_plus_1_matrix = np.zeros((N, 3, 3))

for path_number, path in enumerate(paths_copy):
    #print(path_number)
    article_number_in_path = len(path)

    for n in range(N):
        for art_number in range(article_number_in_path - n):
            first_article = path[art_number]
            next_article = path[art_number + n]

            first_bias = article_bias_dict.get(first_article, None)
            next_bias = article_bias_dict.get(next_article, None)


            if first_bias is not None and next_bias is not None:
                n_plus_1_matrix = check_condition(first_bias, next_bias, n_plus_1_matrix, n)

#print(n_plus_1_matrix)



In [64]:
# [ RR RL RC
#   LR LL LC
#   CR CL CC] 


norm_matr = np.zeros(np.shape(n_plus_1_matrix)) # Allocate variable

for i in range(8): 
    # Normalize each row of the matrices so the sum accounts to 1
    norm_matr[i]= n_plus_1_matrix[i]/n_plus_1_matrix[i].sum(axis=1)[:, np.newaxis]



# Step in path leading to right-biased next article
to_right = norm_matr[:, :, 0]
r_r = to_right[:-1, 0]
l_r = to_right[:-1, 1]
c_r = to_right[:-1, 2]

# leading to left-biased
to_left = norm_matr[:, :, 1]
r_l = to_left[:-1, 0]
l_l = to_left[:-1, 1]
c_l = to_left[:-1, 2]

# leading to not biased
to_center = norm_matr[:, :, 2]
r_c = to_center[:-1, 0]
l_c = to_center[:-1, 1]
c_c = to_center[:-1, 2]

colors = {
    'left': '#FF9AA2',  # Pastel red
    'center': '#B5B5B5',  # Pastel grey
    'right': '#A3D8F4'   # Pastel blue
}

#################
# Make plot 
x = [x for x in range(1,7)]

fig = make_subplots(rows=1, cols=3, subplot_titles=['To Right', 'To Left', 'To Center'])

# First subplot
fig.add_trace(go.Scatter(x=x, y=r_r, mode='lines+markers', name='right', line=dict(color=colors['right'])), row=1, col=1)
fig.add_trace(go.Scatter(x=x, y=l_r, mode='lines+markers', name='left', line=dict(color=colors['left'])), row=1, col=1)
fig.add_trace(go.Scatter(x=x, y=c_r, mode='lines+markers', name='center', line=dict(color=colors['center'])), row=1, col=1)
# horizontal line for better visualization 
fig.add_trace(go.Scatter( x = [1,6] , y = [0.822, 0.822], mode='lines',    line=dict(color=colors['right']), 
                            name='Expected frequency', showlegend=True))

fig.update_xaxes(title_text='N + x', row=1, col=1)
fig.update_yaxes(title_text='Probability', row=1, col=1)

# Second subplot
fig.add_trace(go.Scatter(x=x, y=r_l, mode='lines+markers', name='right' , showlegend=False,   line=dict(color=colors['right'])), row=1, col=2)
fig.add_trace(go.Scatter(x=x, y=l_l, mode='lines+markers', name= 'left', showlegend=False,  line=dict(color=colors['left'])), row=1, col=2)
fig.add_trace(go.Scatter(x=x, y=c_l, mode='lines+markers', name= 'center', showlegend=False,   line=dict(color=colors['center'])), row=1, col=2)
# horizontal line for better visualization 
fig.add_trace(go.Scatter( x = [1,6] , y = [0.116, 0.116], mode='lines', line=dict(color=colors['left']), 
                            name='Expected frequency', showlegend=True), row=1, col=2)
fig.update_xaxes(title_text='N + x', row=1, col=2)




# Third subplot
fig.add_trace(go.Scatter(x=x, y=r_c, mode='lines+markers', name='right', showlegend=False,  line=dict(color=colors['right'])), row=1, col=3)
fig.add_trace(go.Scatter(x=x, y=l_c, mode='lines+markers', name='left',  showlegend=False, line=dict(color=colors['left'])), row=1, col=3)
fig.add_trace(go.Scatter(x=x, y=c_c, mode='lines+markers', name='center', showlegend=False,  line=dict(color=colors['center'])), row=1, col=3)
# horizontal line for better visualization 
fig.add_trace(go.Scatter( x = [1,6] , y = [0.06, 0.06], mode='lines',    line=dict(color=colors['center']),
                            name='Expected frequency', showlegend=True), row=1, col=3)
fig.update_xaxes(title_text='N + x', row=1, col=3)


fig.update_layout(title_text='', showlegend=True, legend=dict(orientation="v"))
fig.show()
fig.write_html("./plots/bias_n_3b3.html")

<strong> PROPOSITION 2 : DIFFERENCE AVEC EXPECTED FREQUENCY (jpense c'est moins bien) :


In [65]:

right_prop = 0.822
left_prop = 0.16
center_prop = 0.062
norm_matr = np.zeros(np.shape(n_plus_1_matrix)) # Allocate variable

for i in range(8): 
    # Normalize each row of the matrices so the sum accounts to 1
    norm_matr[i]= n_plus_1_matrix[i]/n_plus_1_matrix[i].sum(axis=1)[:, np.newaxis]



# Step in path leading to right-biased next article
to_right = norm_matr[:, :, 0] - right_prop
r_r = to_right[:-1, 0]
l_r = to_right[:-1, 1]
c_r = to_right[:-1, 2]

# leading to left-biased
to_left = norm_matr[:, :, 1] - left_prop
r_l = to_left[:-1, 0]
l_l = to_left[:-1, 1]
c_l = to_left[:-1, 2]

# leading to not biased
to_center = norm_matr[:, :, 2] - center_prop
r_c = to_center[:-1, 0]
l_c = to_center[:-1, 1]
c_c = to_center[:-1, 2]

colors = {
    'left': '#FF9AA2',  # Pastel red
    'center': '#B5B5B5',  # Pastel grey
    'right': '#A3D8F4'   # Pastel blue
}

#################
# Make plot 
x = [x for x in range(1,7)]

fig = make_subplots(rows=1, cols=3, subplot_titles=['To Right', 'To Left', 'To Center'])

# First subplot
fig.add_trace(go.Scatter(x=x, y=r_r, mode='lines+markers', name='right', line=dict(color=colors['right'])), row=1, col=1)
fig.add_trace(go.Scatter(x=x, y=l_r, mode='lines+markers', name='left', line=dict(color=colors['left'])), row=1, col=1)
fig.add_trace(go.Scatter(x=x, y=c_r, mode='lines+markers', name='center', line=dict(color=colors['center'])), row=1, col=1)
#fig.add_trace(go.Scatter( x = [1,6] , y = [0.822, 0.822], mode='lines',    line=dict(color='#A3D8F4'), name='Expected frequency', showlegend=True))

fig.update_xaxes(title_text='N + x', row=1, col=1)
fig.update_yaxes(title_text='Probability', row=1, col=1)

# Second subplot
fig.add_trace(go.Scatter(x=x, y=r_l, mode='lines+markers', name='right' , showlegend=False,   line=dict(color=colors['right'])), row=1, col=2)
fig.add_trace(go.Scatter(x=x, y=l_l, mode='lines+markers', name= 'left', showlegend=False,  line=dict(color=colors['left'])), row=1, col=2)
fig.add_trace(go.Scatter(x=x, y=c_l, mode='lines+markers', name= 'center', showlegend=False,   line=dict(color=colors['center'])), row=1, col=2)
#fig.add_trace(go.Scatter( x = [1,6] , y = [0.116, 0.116], mode='lines', line=dict(color=colors['left']), 
                            #name='Expected frequency', showlegend=True), row=1, col=2)
fig.update_xaxes(title_text='N + x', row=1, col=2)




# Third subplot
fig.add_trace(go.Scatter(x=x, y=r_c, mode='lines+markers', name='right', showlegend=False,  line=dict(color=colors['right'])), row=1, col=3)
fig.add_trace(go.Scatter(x=x, y=l_c, mode='lines+markers', name='left',  showlegend=False, line=dict(color=colors['left'])), row=1, col=3)
fig.add_trace(go.Scatter(x=x, y=c_c, mode='lines+markers', name='center', showlegend=False,  line=dict(color=colors['center'])), row=1, col=3)
#fig.add_trace(go.Scatter( x = [1,6] , y = [0.06, 0.06], mode='lines',    line=dict(color=colors['center']),
                            #name='Expected frequency', showlegend=True), row=1, col=3)
fig.update_xaxes(title_text='N + x', row=1, col=3)


fig.update_layout(title_text='', showlegend=True, legend=dict(orientation="v"))
fig.show()
