In [None]:
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [None]:
# Load the article -> continent table; the first CSV column becomes the index.
continents_csv = os.path.join("Data", "continents.csv")
df_continents = pd.read_csv(continents_csv, index_col=0)

display(df_continents)

### Colors for all plots

In [None]:
# Give every continent one colour from the 'husl' palette; the resulting
# mapping is reused by the plots so a continent always has the same colour.
continents = df_continents['continent'].unique()
random_colors = sns.color_palette('husl', n_colors=len(continents))
continents_colors = dict(zip(continents, random_colors))
print(continents_colors)

### Articles per continent

In [None]:
# Number of rows (articles) per continent, largest group first.
continents_grouped = (
    df_continents
    .groupby(['continent'])
    .size()
    .sort_values(ascending=False)
)
continents_grouped

In [None]:
# Bar chart of the number of articles per continent.
# Bar colours are looked up in the shared `continents_colors` mapping, so no
# new palette needs to be generated here.
ax = plt.bar(
    continents_grouped.index,
    continents_grouped.values,
    color=[continents_colors[continent] for continent in continents_grouped.index],
)
plt.xticks(rotation=45)

plt.xlabel('Continent')
plt.ylabel('Count')
# The bars show article counts (group sizes of df_continents), not categories.
plt.title('Number of articles by continent')

## Number of articles by category

In [None]:
# Load the article -> category table. The file has no header row, so the two
# column names are supplied here; text following '#' is treated as a comment.
categories_path = os.path.join("Data", "wikispeedia_paths-and-graph", "categories.tsv")
categories = pd.read_csv(
    categories_path,
    sep="\t",
    header=None,
    names=["article", "category"],
    skip_blank_lines=True,
    comment="#",
    encoding="UTF-8",
)

categories.head()

In [None]:
# Keep only articles present in both tables (inner join on the article name),
# then replace any missing value with an empty string.
articles = categories.merge(df_continents, on="article", how="inner").fillna("")
articles.head()

In [None]:
# The second dot-separated component of 'category' is used as the main category.
main_cats = [category.split('.')[1] for category in articles['category'].values]

articles['main_category'] = main_cats
display(articles)

In [None]:
# Number of articles for every (main category, continent) pair.
grouping_keys = ['main_category', 'continent']
articles_cat_continent = articles.groupby(grouping_keys).size()
# Spot-check a single pair.
display(articles_cat_continent.loc[('Geography', 'International')])

**TODO:** This plot still needs to be checked.

In [None]:
# Show the full series of article counts per (main_category, continent) pair.
articles_cat_continent

In [None]:
# Stacked horizontal bar chart: for every main category, the number of
# articles that belong to each continent.

# A dedicated name is used for the category labels so that the `categories`
# DataFrame loaded from categories.tsv is not overwritten by this cell.
main_categories = articles['main_category'].unique()
continents = np.sort(articles['continent'].unique())

#In case we do not want international
continents = continents[continents != 'International']

category_positions = np.arange(len(main_categories))
bar_width = 0.5

# For each continent: article count per main category, 0 when the
# (category, continent) pair does not occur in articles_cat_continent.
continents_values = {
    continent: [
        articles_cat_continent.get((category, continent), 0)
        for category in main_categories
    ]
    for continent in continents
}

# Same numbers as a table: one row per continent, one column per category.
df_continent_frequencies = pd.DataFrame(continents_values).T
df_continent_frequencies.columns = main_categories
display(df_continent_frequencies)

fig, ax = plt.subplots()

# Left edge of the next segment in each category's stacked bar.
bottom = np.zeros(len(main_categories))

for continent in continents:
    ax.barh(
        category_positions,
        continents_values[continent],
        label=continent,
        color=continents_colors[continent],
        edgecolor='w',
        height=bar_width,
        left=bottom,
    )
    bottom += continents_values[continent]

ax.set_yticks(category_positions)
ax.set_yticklabels(main_categories)
ax.set_xlabel('Frequency')
ax.set_ylabel('Category')
ax.set_title('Frequency of Continents in Each Category')

# Display legend
ax.legend()

plt.show()


## Length of articles

In [None]:
# Load the list of article names (single unnamed column in the file).
# The first 11 lines are skipped — presumably the file's comment header; TODO confirm.
articles_df = pd.read_csv(
    "Data/wikispeedia_paths-and-graph/articles.tsv",
    sep="\t",
    header=None,
    names=["article"],
    skiprows=11,
)
articles_df.head()

In [None]:
# Compute the length (in words) of every article and store it in a new
# dataframe called analysis_df, to be used during the initial analysis.

analysis_df = articles_df.copy()

# Folder containing one '<article>.txt' file per article.
# The top-level folder is spelled "Data", like every other path in this
# notebook, so the lookup also works on case-sensitive file systems.
plaintext_path = os.path.join("Data", "plaintext_articles")

# Word count of each article, in the same order as analysis_df
word_counts = []

for article in analysis_df['article']:
    file_path = os.path.join(plaintext_path, article + '.txt')

    with open(file_path, 'r', encoding='utf-8') as file:
        file.readline()  # Skip the first line because it contains the word #copyright
        content = file.read()

    # Words are the whitespace-separated tokens of the remaining text
    word_counts.append(len(content.split()))

# Add the new column 'length' to analysis_df
analysis_df['length'] = word_counts

In [None]:
# Preview the per-article word counts, then report (rows, columns).
length_preview = analysis_df.head()
display(length_preview)
print(analysis_df.shape)

In [None]:
# Merge analysis with continents
analysis_df = pd.merge(analysis_df, df_continents, on=["article"])
analysis_df.head()

In [None]:
# Compute the average length of article for each continent, longest first.
# The 'length' column is selected explicitly: the first positional argument of
# GroupBy.mean is `numeric_only`, not a column name, so mean('length') would
# average every numeric column rather than only 'length'.
continent_length = (
    analysis_df
    .groupby('continent')[['length']]
    .mean()
    .sort_values('length', ascending=False)
)
continent_length

In [None]:
# Bar chart of the average article length per continent.
# The 'length' column is used directly instead of reshaping the raw values to
# a fixed size, so the plot works for any number of continents.
ax = plt.bar(
    continent_length.index,
    continent_length['length'],
    color=[continents_colors[continent] for continent in continent_length.index],
)
plt.xticks(rotation=45)

plt.xlabel('Continent')
plt.ylabel('Length')
plt.title('Average length of articles by continent')

## Pagerank

In [None]:
# Read the PageRank table from the Data folder
pagerank_csv = os.path.join("Data", "pagerank.csv")
pagerank_df = pd.read_csv(pagerank_csv)

In [None]:
# Preview the PageRank table and report (rows, columns).
pagerank_preview = pagerank_df.head()
display(pagerank_preview)
print(pagerank_df.shape)
print("Note that the pageRank has less rows!!")


In [None]:
# Attach the PageRank data to every article (left join: all articles are kept).
# Articles without a PageRank entry get 0. Only the columns coming from
# pagerank_df are filled, so missing values in the other columns
# (e.g. 'continent') are not silently turned into 0.
pagerank_columns = [column for column in pagerank_df.columns if column != "Articles"]
analysis_df = (
    pd.merge(analysis_df, pagerank_df, left_on=["article"], right_on=["Articles"], how="left")
    .fillna({column: 0 for column in pagerank_columns})
    .drop(["Articles"], axis=1)  # duplicate of the 'article' join key
)

In [None]:
# Preview the merged table; the cell output is its (rows, columns) shape.
merged_preview = analysis_df.head()
display(merged_preview)
analysis_df.shape

In [None]:
# Compute the mean PageRank per continent, highest first.
# The 'PageRank' column is selected explicitly: the first positional argument
# of GroupBy.mean is `numeric_only`, not a column name, so mean("PageRank")
# would average every numeric column rather than only 'PageRank'.
mean_pagerank_continent = (
    analysis_df
    .groupby("continent")[["PageRank"]]
    .mean()
    .sort_values(by='PageRank', ascending=False)
)

In [None]:
# Bar chart of the average PageRank per continent, coloured with the shared
# continent colour mapping.
ax = plt.bar(
    mean_pagerank_continent.index,
    mean_pagerank_continent['PageRank'],
    color=[continents_colors[name] for name in mean_pagerank_continent.index],
)
plt.xticks(rotation=45)

plt.xlabel('Continent')
plt.ylabel('Pagerank')
plt.title('Average PageRank by continent')

## Analysis of paths

Group paths by continent: each path is assigned to the continent of its GOAL (target) article.

Compute the number of "backclicks" in each path

Compute the length of each path

In [None]:
# Load the finished and unfinished paths. Neither file has a header row, so the
# column names are given here; the leading 16 / 17 lines of the files are skipped.
paths_finished_df = pd.read_csv(
    "Data/wikispeedia_paths-and-graph/paths_finished.tsv",
    sep="\t",
    header=None,
    names=["hashedIpAddress", "timestamp", "durationInSec", "path", "rating"],
    skiprows=16,
)
paths_unfinished_df = pd.read_csv(
    "Data/wikispeedia_paths-and-graph/paths_unfinished.tsv",
    sep="\t",
    header=None,
    names=["hashedIpAddress", "timestamp", "durationInSec", "unf_path", "target", "motif"],
    skiprows=17,
)

finished_path = paths_finished_df['path']
unfinished_path = paths_unfinished_df['unf_path']

# Target article of a finished path: its last ';'-separated element
paths_finished_df["target"] = finished_path.str.split(';').str[-1]

# Number of backclicks: occurrences of '<' in the path string
paths_finished_df['backclicks'] = finished_path.str.count('<')
paths_unfinished_df['backclicks'] = unfinished_path.str.count('<')

# Length of each path: number of ';'-separated elements plus the number of '<'.
# NOTE(review): if each '<' is itself a ';'-separated element, backclicks are
# counted twice here — confirm this is intended.
paths_finished_df['path_steps'] = finished_path.str.count(';') + 1 + finished_path.str.count('<')
paths_unfinished_df['path_steps'] = unfinished_path.str.count(';') + 1 + unfinished_path.str.count('<')

In [None]:
# Attach continent information to every path by joining the path's target
# article on df_continents (inner join); the redundant 'article' key column
# is dropped afterwards.

analysis_fin_paths = (
    paths_finished_df
    .merge(df_continents, left_on="target", right_on="article")
    .drop("article", axis=1)
)
analysis_unf_paths = (
    paths_unfinished_df
    .merge(df_continents, left_on="target", right_on="article")
    .drop("article", axis=1)
)

In [None]:
# Preview both merged path tables (finished first) together with their shapes.
for paths_table in (analysis_fin_paths, analysis_unf_paths):
    display(paths_table.head())
    print(paths_table.shape)

In [None]:
# Per-continent summary table. The first assigned column fixes the index
# (the continents of mean_pagerank_continent); later columns align on it.
fin_by_continent = analysis_fin_paths.groupby("continent")
unf_by_continent = analysis_unf_paths.groupby("continent")

continent_data = pd.DataFrame()
continent_data["av_pagerank"] = mean_pagerank_continent['PageRank']

# Average number of backclicks in finished / unfinished paths
continent_data["av_fin_backcliks"] = fin_by_continent["backclicks"].mean()
continent_data["av_unf_backcliks"] = unf_by_continent["backclicks"].mean()

# Average number of path steps in finished / unfinished paths
continent_data["av_fin_steps"] = fin_by_continent["path_steps"].mean()
continent_data["av_unf_steps"] = unf_by_continent["path_steps"].mean()

# Number of finished and unfinished paths for each continent
continent_data["fin_paths"] = analysis_fin_paths["continent"].value_counts()
continent_data["unf_paths"] = analysis_unf_paths["continent"].value_counts()

# Share of finished paths among all paths of the continent, in percent
paths_per_continent = continent_data["fin_paths"] + continent_data["unf_paths"]
continent_data['per_fin_paths'] = (continent_data["fin_paths"] / paths_per_continent) * 100

In [None]:
# Order continents by their percentage of finished paths, highest first.
continent_data = continent_data.sort_values('per_fin_paths', ascending=False)
display(continent_data)
print(continent_data.shape)

In [None]:
# Bar chart of the percentage of finished paths per continent, coloured with
# the shared continent colour mapping.
ax = plt.bar(
    continent_data.index,
    continent_data['per_fin_paths'],
    color=[continents_colors[name] for name in continent_data.index],
)
plt.xticks(rotation=45)

plt.xlabel('Continent')
plt.ylabel('Percentage')
plt.title('Percentage of finished paths by continent')

In [None]:
# Total number of loaded paths, finished and unfinished together.
total_paths = sum(len(paths) for paths in (paths_finished_df, paths_unfinished_df))

In [None]:
# How often each article is the goal of a finished / unfinished path
# (articles that never appear get 0).
goal_counts_fin = analysis_fin_paths["target"].value_counts()
goal_counts_unf = analysis_unf_paths["target"].value_counts()
analysis_df["n_as_goal_in_fin"] = analysis_df["article"].map(goal_counts_fin).fillna(0)
analysis_df["n_as_goal_in_unf"] = analysis_df["article"].map(goal_counts_unf).fillna(0)

# How often each article appears anywhere in a finished / unfinished path:
# split every path on ';', put one element per row and count the elements.
visit_counts_fin = analysis_fin_paths['path'].str.split(';').explode().value_counts()
visit_counts_unf = analysis_unf_paths['unf_path'].str.split(';').explode().value_counts()
analysis_df["n_in_fin_paths"] = analysis_df["article"].map(visit_counts_fin).fillna(0)
analysis_df["n_in_unf_paths"] = analysis_df["article"].map(visit_counts_unf).fillna(0)

# Probability of finding an article (we can compare this with the pagerank).
# NOTE(review): the counts come from the continent-merged path tables while
# total_paths is computed elsewhere — confirm both cover the same set of paths.
appearances = analysis_df["n_in_fin_paths"] + analysis_df["n_in_unf_paths"]
analysis_df["prob_finding"] = appearances / total_paths

In [None]:
# Show the complete per-article analysis table and its (rows, columns) shape.
display(analysis_df)
table_shape = analysis_df.shape
print(table_shape)

In [None]:
# To do: Some plots
# Show the frequency of the articles