In [None]:
#necessary imports
import numpy as np
import matplotlib.pyplot as plt

## NDLM'ADA - Look up! Stars and how we reach them
##### *Timo Achard, Romane Clerc, Louise Font, Julie Korber, Emeric Martin*

# Table of contents
* [Data Explanation](#section1)
    * [General pre processing](#sub_section_1_1)
    * [Data repartition](#sub_section_1_2)
        * [Article categories](#sub_section_1_2_1)
        * [People categories](#sub_section_1_2_2)
    * [Gender retrieval](#sub_section_1_3)
        * [Gender repartition in categories](#sub_section_1_3_1)
    * [Graph connection and article representation](#sub_section_1_4)

* [Page Rank](#section2)
    * [General results](#sub_section_2_1)
    * [PageRank and gender](#sub_section_2_2)
* [Links analysis](#section3)
    * [Last links analysis](#sub_section_3_1)
* [Influence of link position](#section4)
    * [Distribution of position](#sub_section_4_1)
    * [Distribution of influence](#sub_section_4_2)
* [Influence of genre](#section5)
    * [Unbalanced data](#sub_section_5_1)
    * [Birth data explanation](#sub_section_5_2)


## 1. Data Explanation <a class="anchor" id="section1"></a>

### 1.1 General pre processing <a id="sub_section_1_1"></a>

In [None]:
# IMPORTS

import pandas as pd
import matplotlib.pyplot as plt
from urllib.parse import unquote
import numpy as np
import plotly.express as px

# To extract all the URLs from the HTML page
from bs4 import BeautifulSoup
import os

import networkx as nx
from datetime import datetime

# Draw the graph
import seaborn as sns

from helpers import (
    extract_links,
    change_characters,
    path_to_name,
    get_gender_for_name,
    add_all_genders
    )   

# Plot colours shared by every figure of the notebook.

# One colour per gender label
color_male, color_female, color_unknown = 'red', 'green', 'grey'

# Palette of 20 distinct colours, one per category.
# Tried to have colorblind-friendly colors (unsure about having 20 different
# colors, even for non-colorblind people...)
colors_categories = [
    "#F3C300", "#875692", "#F38400", "#A1CAF1",
    "#BE0032", "#C2B280", "#848482", "#008856",
    "#E68FAC", "#0067A5", "#F99379", "#604E97",
    "#F6A600", "#B3446C", "#DCD300", "#882D17",
    "#8DB600", "#654522", "#E25822", "#2B3D26",
]

### Downloading the dataset

In [None]:
# Folder holding the TSV files, and the files to read from it
folder_path = "dataset/wikispeedia_paths-and-graph/"
file_paths = [
    "paths_finished.tsv",
    "paths_unfinished.tsv",
    "categories.tsv",
    "articles.tsv",
    "links.tsv",
]

# Key under which each file is stored in `dfs` (same order as `file_paths`)
data_frames_names = ["paths_finished", "paths_unfinished", "categories", "article", "links"]

# Column names given to each file (the files are read with header=None)
dfs_headers = [
    ["hashedIpAddress", "timestamp", "durationInSec", "path", "rating"],
    ["hashedIpAddress", "timestamp", "durationInSec", "path", "target", "type"],
    ["article", "category"],
    ["article"],
    ["linkSource", "linkTarget"],
]

# Number of leading lines skipped in each file
# (presumably the descriptive header of each TSV — confirm against the raw files)
dfs_skiprows = [16, 17, 13, 12, 12]

# Read every file into a DataFrame, keyed by its name
dfs = {
    frame_name: pd.read_csv(
        folder_path + file_name,
        sep='\t',
        header=None,
        names=column_names,
        skiprows=range(n_skipped),
    )
    for frame_name, file_name, column_names, n_skipped
    in zip(data_frames_names, file_paths, dfs_headers, dfs_skiprows)
}

In [None]:
# Apply the project helper `change_characters` to every text column, in a fixed order.
# (helper imported from `helpers`; its exact transformation is defined there)

# Finished paths: clean the path, then drop the columns that are not used
dfs = change_characters(dfs, 'paths_finished', 'path')
dfs['paths_finished'] = dfs['paths_finished'].drop(columns=['hashedIpAddress', 'rating'])

# Unfinished paths and categories
for frame_name, column_name in (
    ('paths_unfinished', 'path'),
    ('paths_unfinished', 'target'),
    ('categories', 'article'),
    ('categories', 'category'),
):
    dfs = change_characters(dfs, frame_name, column_name)

# Shortcut to the cleaned categories table, taken before the remaining frames are processed
categories = dfs['categories']

# Articles and links
for frame_name, column_name in (
    ('article', 'article'),
    ('links', 'linkSource'),
    ('links', 'linkTarget'),
):
    dfs = change_characters(dfs, frame_name, column_name)

### 1.2 Data repartition <a id="sub_section_1_2"></a>

#### 1.2.1 Article categories <a id="sub_section_1_2_1"></a>

In [None]:
# Split every dotted category path (e.g. 'subject.People.Artists') into its
# levels, ignoring the common 'subject.' root.
category_levels = [category.replace('subject.', '').split('.') for category in categories['category']]

# First (broadest) level and, when it exists, second level of each category
reduced_categories_list = [levels[0] for levels in category_levels]
reduced_categories_list_2 = [levels[1] if len(levels) > 1 else None for levels in category_levels]

# Work on a deep copy so that `categories` itself is left untouched
reduced_categories = categories.copy(deep=True).assign(
    first_category=reduced_categories_list,
    second_category=reduced_categories_list_2,
)

# Overall distribution of each level
first_category_distribution = reduced_categories['first_category'].value_counts()
second_category_distribution = reduced_categories['second_category'].value_counts()

# Count the rows of every (first, second) pair.
# Counting per pair (instead of per second-level name alone) prevents a
# second-level name shared by two first-level categories from being given the
# pooled count in both bars. Categories without a second level get an explicit
# label so that they are not silently left out of the chart.
merged = (
    reduced_categories
    .assign(second_category=reduced_categories['second_category'].fillna('No subcategory'))
    .groupby(['first_category', 'second_category'])
    .size()
    .reset_index(name='count')
    .sort_values('count', ascending=False, ignore_index=True)
)

# Keep only the distinct (first, second) pairs
reduced_categories = reduced_categories.drop(columns=['article', 'category']).drop_duplicates()

# Stacked bar chart: one bar per first-level category, split by second level
fig = px.bar(merged, x="first_category", y="count", color="second_category", title="Categories represented in our dataset")
fig.update_layout(barmode='stack', xaxis={'categoryorder':'total descending'})
fig.show()
# Export html for the website
fig.write_html("docs/overall_dataset_bar.html")

#### 1.2.2 People categories <a id="sub_section_1_2_2"></a>

##### Extracting people data

In [None]:

# Selection of all the articles whose category path mentions 'People' at any level
all_people_selection = categories[categories['category'].str.contains('People')].copy()

# Separate the people rows into "main" people (category containing the literal
# text 'subject.People') and "secondary" people (all the others).
# regex=False makes the '.' a literal dot instead of "any character".
is_main_people = all_people_selection['category'].str.contains('subject.People', regex=False)
main_people_selection = all_people_selection[is_main_people].copy()
# Complement taken with the boolean mask, so that rows duplicated in the
# source table are not dropped by accident.
second_people_selection = all_people_selection[~is_main_people].copy()

# Print a summary (number of rows, columns, dtypes) of each selection.
# Note: .info() does not check that the articles are unique.
all_people_selection.info()
main_people_selection.info()
second_people_selection.info()

In [None]:
# Number of rows for each people category
category_counts = main_people_selection['category'].value_counts()

# Short category names, without the 'subject.People.' prefix
short_category_names = [cat.replace('subject.People.', '') for cat in category_counts.index]

# One row per category, indexed by its short name, with explicit column names
# (does not rely on the column name pandas gives to value_counts results)
people_categories = pd.DataFrame(
    {'count': category_counts.to_numpy(), 'category': short_category_names},
    index=short_category_names,
)

# Polar bar chart with one bar per category.
# The palette is passed through `color_discrete_sequence`: given to `color`,
# the hex codes would be read as data labels and plotly's default colours
# would be used instead (and the list length would have to match the number
# of rows exactly).
fig = px.bar_polar(
    people_categories,
    r="count",
    theta="category",
    color="category",
    color_discrete_sequence=colors_categories,
)
fig.update_layout(showlegend=False)
fig.show()
# Export html for the website
fig.write_html("docs/people_categories.html")



### 1.3 Gender retrieval <a id="sub_section_1_3"></a>

In [None]:
# Replace the main people selection by the result of the project helper
# `add_all_genders` (imported from `helpers`; presumably it adds gender
# information for each person — confirm in helpers.py).
# NOTE(review): this cell overwrites its own input, so running it twice feeds
# the already-processed frame back into the helper — confirm this is harmless.
main_people_selection = add_all_genders(main_people_selection)

#### 1.3.1 Gender repartition in categories <a id="sub_section_1_3_1"></a>

### 1.4 Graph connection and article representation <a id="sub_section_1_4"></a>

## 2 Page Rank <a id="section2"></a>

### 2.1 General results <a id="sub_section_2_1"></a>

### 2.2 Page Rank and gender <a id="sub_section_2_2"></a>


## 3 Links analysis <a id="section3"></a>


### 3.1 Last links analysis <a id="sub_section_3_1"></a>


## 4 Influence of link position <a id="section4"></a>


### 4.1 Distribution of position <a id="sub_section_4_1"></a>


### 4.2 Distribution of influence <a id="sub_section_4_2"></a>


## 5 Influence of genre <a id="section5"></a>


### 5.1 Unbalanced data <a id="sub_section_5_1"></a>


### 5.2 Birth data explanation <a id="sub_section_5_2"></a>