# Related subjects
## 1st step: subject frequency

In this step we're going to analyse the most frequent subjects in the links of a People page

### Preprocessing

In [2]:
import pandas as pd
import numpy as np
from urllib.parse import unquote
import matplotlib.pyplot as plt

In [3]:
# Load the dataset TSV files from disk into a dict of DataFrames keyed by name
folder_path = "dataset/wikispeedia_paths-and-graph/"
file_paths = [
    "paths_finished.tsv",
    "paths_unfinished.tsv",
    "categories.tsv",
    "articles.tsv",
    "links.tsv",
]
data_frames_names = [
    "paths_finished",
    "paths_unfinished",
    "categories",
    "article",
    "links",
]
# Column names applied to each file (the files are read with header=None)
dfs_headers = [
    ["hashedIpAddress", "timestamp", "durationInSec", "path", "rating"],
    ["hashedIpAddress", "timestamp", "durationInSec", "path", "target", "type"],
    ["article", "category"],
    ["article"],
    ["linkSource", "linkTarget"],
]
# Number of leading lines skipped in each file
# (presumably the descriptive header of each TSV - verify against the files)
dfs_skiprows = [16, 17, 13, 12, 12]

dfs = {}
for frame_name, file_name, column_names, n_skipped in zip(
    data_frames_names, file_paths, dfs_headers, dfs_skiprows
):
    dfs[frame_name] = pd.read_csv(
        folder_path + file_name,
        sep='\t',
        header=None,
        names=column_names,
        skiprows=range(n_skipped),
    )

In [4]:
# Short aliases: both names point at the very same DataFrame objects held in
# `dfs` (plain dict lookups, no copy is made).
categories, links = dfs["categories"], dfs["links"]

In [5]:
def change_characters(dataframe, dataset_name, column_name):
    """Percent-decode every value of one column of one dataset, in place.

    Parameters
    ----------
    dataframe : mapping of dataset name -> DataFrame
        Container indexed first by ``dataset_name`` and then by ``column_name``.
    dataset_name : str
        Key of the dataset whose column is decoded.
    column_name : str
        Column whose values are passed through ``urllib.parse.unquote``.

    Returns
    -------
    The same ``dataframe`` container that was passed in (the selected column
    is overwritten with its decoded values).
    """
    table = dataframe[dataset_name]
    table[column_name] = list(map(unquote, table[column_name]))
    return dataframe

In [6]:
# Percent-decode the article/category names and both ends of every link
for dataset_name, column_name in [
    ('categories', 'article'),
    ('categories', 'category'),
    ('links', 'linkSource'),
    ('links', 'linkTarget'),
]:
    dfs = change_characters(dfs, dataset_name, column_name)


In [7]:
# Derive the top-level subject of each category string: strip the 'subject.'
# marker and keep only the part before the first remaining dot.
reduced_categories_list = []
for full_category in categories['category']:
    without_marker = full_category.replace('subject.', '')
    reduced_categories_list.append(without_marker.split('.')[0])

# Work on a deep copy so the original `categories` frame is left untouched,
# and attach the simplified subject as a new column.
reduced_categories = categories.copy(deep=True).assign(simple_category=reduced_categories_list)


### Links going from People

In [8]:
# Collapse the links table to one row per source article holding the list of
# its outgoing link targets, then inner-join it onto reduced_categories so
# that every (article, category) row carries that article's outgoing links.
links_from_article = pd.merge(
    reduced_categories,
    (
        links.groupby('linkSource')['linkTarget']
        .agg(list)
        .reset_index()
        .rename(columns={'linkSource': 'article', 'linkTarget': 'links'})
    ),
    on='article',
    how='inner',
)
links_from_article


Unnamed: 0,article,category,simple_category,links
0,Áedán_mac_Gabráin,subject.History.British_History.British_Histor...,History,"[Bede, Columba, Dál_Riata, Great_Britain, Irel..."
1,Áedán_mac_Gabráin,subject.People.Historical_figures,People,"[Bede, Columba, Dál_Riata, Great_Britain, Irel..."
2,Åland,subject.Countries,Countries,"[20th_century, Baltic_Sea, Crimean_War, Curren..."
3,Åland,subject.Geography.European_Geography.European_...,Geography,"[20th_century, Baltic_Sea, Crimean_War, Curren..."
4,Édouard_Manet,subject.People.Artists,People,"[Absinthe, Beer, Claude_Monet, Diego_Velázquez..."
...,...,...,...,...
5185,Zirconium,subject.Science.Chemistry.Chemical_elements,Science,"[Aluminium, Arabic_language, Australia, Bicycl..."
5186,Zoroaster,subject.People.Religious_figures_and_leaders,People,"[18th_century, 9th_century, Afghanistan, Age_o..."
5187,Zuid-Gelders,subject.Geography.European_Geography,Geography,"[Brabantian, Dutch_language, East_Flemish, Hol..."
5188,Zuid-Gelders,subject.Language_and_literature.Languages,Language_and_literature,"[Brabantian, Dutch_language, East_Flemish, Hol..."


In [9]:
def find_top_categories(article_list, category_table=None, top_n=3):
    """Return the most frequent simple categories among the given articles.

    Parameters
    ----------
    article_list : list-like of str
        Article names to look up in the ``article`` column of the table.
    category_table : pandas.DataFrame, optional
        Table with ``article`` and ``simple_category`` columns. When omitted,
        the notebook-level ``reduced_categories`` frame is used, which keeps
        existing single-argument calls (e.g. through ``Series.apply``) working.
    top_n : int, default 3
        Number of categories to return.

    Returns
    -------
    pandas.Series
        Exactly ``top_n`` entries ordered from most to least frequent; missing
        positions are filled with ``None`` when fewer categories are found.
    """
    if category_table is None:
        # Fall back to the lookup table built earlier in the notebook.
        category_table = reduced_categories

    is_listed = category_table['article'].isin(article_list)
    article_categories = category_table.loc[is_listed, 'simple_category']
    top_categories = article_categories.value_counts().nlargest(top_n).index.tolist()
    # Pad so every call yields the same number of entries (needed when the
    # results of an apply are expanded into fixed columns).
    top_categories += [None] * (top_n - len(top_categories))
    return pd.Series(top_categories)


In [10]:
# For every row, rank the subjects of the articles its outgoing links point to
outgoing_top3 = links_from_article['links'].apply(find_top_categories)
outgoing_top3.columns = ['top_category1', 'top_category2', 'top_category3']

# Swap the raw link lists for the three ranked-subject columns
outgoing_subjects = links_from_article.drop(columns=['links']).join(outgoing_top3)
outgoing_subjects

Unnamed: 0,article,category,simple_category,top_category1,top_category2,top_category3
0,Áedán_mac_Gabráin,subject.History.British_History.British_Histor...,History,Geography,History,Citizenship
1,Áedán_mac_Gabráin,subject.People.Historical_figures,People,Geography,History,Citizenship
2,Åland,subject.Countries,Countries,Geography,Citizenship,Countries
3,Åland,subject.Geography.European_Geography.European_...,Geography,Geography,Citizenship,Countries
4,Édouard_Manet,subject.People.Artists,People,Geography,People,Countries
...,...,...,...,...,...,...
5185,Zirconium,subject.Science.Chemistry.Chemical_elements,Science,Science,Geography,Countries
5186,Zoroaster,subject.People.Religious_figures_and_leaders,People,People,Geography,Countries
5187,Zuid-Gelders,subject.Geography.European_Geography,Geography,Language_and_literature,Geography,
5188,Zuid-Gelders,subject.Language_and_literature.Languages,Language_and_literature,Language_and_literature,Geography,


In [11]:
# Keep only the rows categorised as People; the constant simple_category
# column is then dropped.
is_people_row = outgoing_subjects['simple_category'] == 'People'
subjects_from_people = outgoing_subjects.loc[is_people_row].drop(columns=['simple_category'])

# Share of People rows per most frequent outgoing subject
subjects_from_people.value_counts('top_category1', normalize=True)

top_category1
Geography                  0.631350
People                     0.127721
History                    0.085631
Science                    0.065312
Everyday_life              0.020319
Religion                   0.020319
Language_and_literature    0.018868
Citizenship                0.010160
Countries                  0.008708
IT                         0.005806
Mathematics                0.002903
Music                      0.002903
Name: proportion, dtype: float64

We can see that, with this classification, 63% of the People pages come out as most related to Geography, which is very unlikely.

### Links going to People
Now we can do the same thing with only articles leading to a People page.

In [12]:
#reduce the rows of the lnks dataframe by creating a list of outgoing links
links_to_article = links.groupby('linkTarget')['linkSource'].agg(list).reset_index()
links_to_article = links_to_article.rename(columns={"linkTarget": "article", "linkSource": "source"}).copy(deep=True)
#merge it with reduced_categories to get a fuller dataframe
links_to_article = pd.merge(reduced_categories, links_to_article, on='article', how='inner')
links_to_article


Unnamed: 0,article,category,simple_category,source
0,10th_century,subject.History.General_history,History,"[11th_century, 12th_century, 15th_century, 16t..."
1,11th_century,subject.History.General_history,History,"[10th_century, 12th_century, 15th_century, 16t..."
2,12th_century,subject.History.General_history,History,"[11th_century, 13th_century, 15th_century, 16t..."
3,13th_century,subject.History.General_history,History,"[11th_century, 12th_century, 14th_century, 15t..."
4,14th_century,subject.History.General_history,History,"[11th_century, 12th_century, 13th_century, 15t..."
...,...,...,...,...
4712,Zirconium,subject.Science.Chemistry.Chemical_elements,Science,"[Calcium, Cerium, Diamond_simulant, Gas_metal_..."
4713,Zoroaster,subject.People.Religious_figures_and_leaders,People,"[Friedrich_Nietzsche, Greco-Buddhism, Iran, Ir..."
4714,Zuid-Gelders,subject.Geography.European_Geography,Geography,"[Brabantian, Dutch_language, East_Flemish, Hol..."
4715,Zuid-Gelders,subject.Language_and_literature.Languages,Language_and_literature,"[Brabantian, Dutch_language, East_Flemish, Hol..."


In [13]:
# For every row, rank the subjects of the articles that link to it
ingoing_top3 = links_to_article['source'].apply(find_top_categories)
ingoing_top3.columns = ['top_category1', 'top_category2', 'top_category3']

# Swap the raw source lists for the three ranked-subject columns
ingoing_subjects = links_to_article.drop(columns=['source']).join(ingoing_top3)
ingoing_subjects

Unnamed: 0,article,category,simple_category,top_category1,top_category2,top_category3
0,10th_century,subject.History.General_history,History,History,Geography,Countries
1,11th_century,subject.History.General_history,History,History,Geography,Countries
2,12th_century,subject.History.General_history,History,History,Geography,Countries
3,13th_century,subject.History.General_history,History,History,Geography,Religion
4,14th_century,subject.History.General_history,History,History,Geography,Countries
...,...,...,...,...,...,...
4712,Zirconium,subject.Science.Chemistry.Chemical_elements,Science,Science,Design_and_Technology,
4713,Zoroaster,subject.People.Religious_figures_and_leaders,People,People,Religion,Geography
4714,Zuid-Gelders,subject.Geography.European_Geography,Geography,Language_and_literature,Geography,
4715,Zuid-Gelders,subject.Language_and_literature.Languages,Language_and_literature,Language_and_literature,Geography,


In [14]:
# Keep only the rows categorised as People; the constant simple_category
# column is then dropped.
is_people_target = ingoing_subjects['simple_category'] == 'People'
subjects_to_people = ingoing_subjects.loc[is_people_target].drop(columns=['simple_category'])

# Share of People rows per most frequent incoming subject
subjects_to_people.value_counts('top_category1', normalize=True)

top_category1
People                     0.368760
History                    0.222222
Geography                  0.099839
Science                    0.072464
Everyday_life              0.043478
Language_and_literature    0.038647
Religion                   0.038647
Music                      0.025765
Art                        0.024155
Citizenship                0.022544
IT                         0.016103
Design_and_Technology      0.011272
Mathematics                0.008052
Countries                  0.004831
Business_Studies           0.003221
Name: proportion, dtype: float64

This representation seems much more convincing, as the distribution is more even.

In [15]:
# Display the People rows together with their three most frequent incoming subjects
subjects_to_people

Unnamed: 0,article,category,top_category1,top_category2,top_category3
76,Abel_Tasman,subject.People.Geographers_and_explorers,Geography,Countries,People
85,Abraham_Lincoln,subject.People.USA_Presidents,People,Geography,History
107,Adam_Smith,subject.People.Historical_figures,Citizenship,Business_Studies,People
111,Adolf_Hitler,subject.People.Political_People,History,People,Geography
131,Agamemnon,subject.People.Historical_figures,History,Religion,People
...,...,...,...,...,...
4701,Zhang_Qian,subject.People.Historical_figures,History,Geography,
4703,Zheng_He,subject.People.Historical_figures,Geography,History,Countries
4704,Ziad_Jarrah,subject.People.Historical_figures,History,,
4710,Zionism,subject.People.Political_People,People,Geography,History


In [16]:
# Independent copy of the rows whose full category is the
# producers/directors/media-figures People sub-category
is_media_figure = subjects_to_people['category'] == "subject.People.Producers_directors_and_media_figures"
perofrmers = subjects_to_people.loc[is_media_figure].copy(deep=True)

In [17]:
# Display the producers/directors/media-figures subset
# NOTE(review): the name looks like a typo of "performers" - rename it together
# with the cell that defines it.
perofrmers

Unnamed: 0,article,category,top_category1,top_category2,top_category3
145,Akira_Kurosawa,subject.People.Producers_directors_and_media_f...,Everyday_life,,
171,Alfred_Hitchcock,subject.People.Producers_directors_and_media_f...,People,Everyday_life,Design_and_Technology
334,Arnold_Schwarzenegger,subject.People.Producers_directors_and_media_f...,People,Everyday_life,History
1171,David_Attenborough,subject.People.Producers_directors_and_media_f...,Geography,People,Countries
4065,Sydney_Newman,subject.People.Producers_directors_and_media_f...,Everyday_life,,
4508,Walt_Disney,subject.People.Producers_directors_and_media_f...,People,Everyday_life,Geography


### TF-IDF

In [18]:
# Independent copy of the People rows of ingoing_subjects (simple_category is kept here)
links_to_people = ingoing_subjects.loc[ingoing_subjects['simple_category'] == 'People'].copy(deep=True)

In [19]:
# Display the People rows with their ranked incoming subjects
links_to_people

Unnamed: 0,article,category,simple_category,top_category1,top_category2,top_category3
76,Abel_Tasman,subject.People.Geographers_and_explorers,People,Geography,Countries,People
85,Abraham_Lincoln,subject.People.USA_Presidents,People,People,Geography,History
107,Adam_Smith,subject.People.Historical_figures,People,Citizenship,Business_Studies,People
111,Adolf_Hitler,subject.People.Political_People,People,History,People,Geography
131,Agamemnon,subject.People.Historical_figures,People,History,Religion,People
...,...,...,...,...,...,...
4701,Zhang_Qian,subject.People.Historical_figures,People,History,Geography,
4703,Zheng_He,subject.People.Historical_figures,People,Geography,History,Countries
4704,Ziad_Jarrah,subject.People.Historical_figures,People,History,,
4710,Zionism,subject.People.Political_People,People,People,Geography,History


In [42]:
# NOTE(review): this cell does not run on a fresh kernel as shown here:
#   - `TfidfVectorizer` is not imported in any visible cell (presumably
#     sklearn.feature_extraction.text.TfidfVectorizer - confirm and add the
#     import to the imports cell at the top);
#   - `documents` is not defined in any visible cell - TODO build it (likely
#     from `links_to_people`) before this cell;
#   - the execution count (42) is out of sequence with the cells above, which
#     suggests leftover kernel state.

# Create the TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer()

# Fit and transform the documents
tfidf_matrix = tfidf_vectorizer.fit_transform(documents)

# Get feature names (terms)
feature_names = tfidf_vectorizer.get_feature_names_out()