# Extraction of topics from Wikipedia pages

In [65]:
# https://github.com/taynaud/python-louvain

import IPython
import numpy as np
import networkx as nx
import requests
import pandas as pd
import csv
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import time
import community
from fonctions import *


pd.set_option('mode.chained_assignment', None)

path = 'Results/EN_20181216_20181231/'

region = "fr"

In [2]:
### Importing the nodes as a DataFrame for easier manipulations ###

# dataFrame = dataFrame = pd.read_csv(path + 'nodes.csv', encoding='utf-8')

graph = nx.read_graphml(path+'graph.graphml')
dataFrame = pd.DataFrame.from_dict(dict(graph.nodes(data=True)), orient='index')
dataFrame['Id'] = dataFrame.index
dataFrame.rename(columns={'label': 'Label'}, inplace=True)
dataFrame['Label'] = dataFrame['Label'].astype('string')
dataFrame['Id'] = dataFrame['Id'].astype('int64')
# dataFrame.drop(columns=['size', 'r', 'g', 'b', 'x', 'y'], inplace=True)

degree = dict(nx.degree(graph, nbunch=None, weight=10))
partition = community.best_partition(graph, randomize=True, weight='Weight', resolution=1.5)
btw_cent = nx.betweenness_centrality(graph,normalized=False)
                           

dataFrame['Degree'] = pd.DataFrame.from_dict(degree, orient='index')[0]
dataFrame['modularity_class'] = pd.DataFrame.from_dict(partition, orient='index')[0]
dataFrame['betweenesscentrality'] = pd.DataFrame.from_dict(btw_cent, orient='index')[0]

add_graph_attribute(graph, dataFrame, 'Degree')
add_graph_attribute(graph, dataFrame, 'modularity_class')
add_graph_attribute(graph, dataFrame, 'betweenesscentrality')

dataFrame.sort_values(by = ['modularity_class'], inplace=True, ascending = [True])
dataFrame.sort_values(by = ['Id'], inplace=True, ascending = [True])
dataFrame.to_csv(path + 'nodes.csv', encoding='utf-8')

print(dataFrame.info())
display(dataFrame)

print("Number of clusters: ", dataFrame['modularity_class'].max())

<class 'pandas.core.frame.DataFrame'>
Index: 4834 entries, 61 to 12697556
Data columns (total 11 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Label                 4834 non-null   string 
 1   size                  4834 non-null   float64
 2   r                     4834 non-null   int64  
 3   g                     4834 non-null   int64  
 4   b                     4834 non-null   int64  
 5   x                     4834 non-null   float64
 6   y                     4834 non-null   float64
 7   Id                    4834 non-null   int64  
 8   Degree                4834 non-null   int64  
 9   modularity_class      4834 non-null   int64  
 10  betweenesscentrality  4834 non-null   float64
dtypes: float64(4), int64(6), string(1)
memory usage: 453.2+ KB
None


Unnamed: 0,Label,size,r,g,b,x,y,Id,Degree,modularity_class,betweenesscentrality
61,Advanced_Micro_Devices,12.686567,192,192,192,9.234199,-16.520315,61,3,7,0.000000
103,Aton,20.746270,0,134,135,-10.841246,0.702035,103,9,13,2101.806936
206,Alfred_Nobel,11.343284,114,121,10,22.848211,24.862890,206,2,16,22046.317749
316,Lampe_à_incandescence_halogène,10.000000,114,121,10,-14.543655,-12.765390,316,1,16,0.000000
341,Amenhotep_III,31.492537,0,134,135,-10.848696,-14.589257,341,17,13,5114.899646
...,...,...,...,...,...,...,...,...,...,...,...
11983100,Thierry_Coste,23.432835,193,77,88,6.475411,-18.242767,11983100,11,8,46361.497269
11986417,Ugo_Humbert,14.029851,192,192,192,-8.752216,9.236148,11986417,4,7,0.000000
12470864,Équarrissage,10.000000,235,44,170,6.380493,19.105633,12470864,1,31,0.000000
12480619,Saison_9_de_Clem,12.686567,185,97,239,7.033388,-18.969147,12480619,3,4,0.000000


Number of clusters:  40


In [3]:
### Extract nodes labels from Wikipedia queries ###



# URL for the quieries
urls = "https://"+region+".wikipedia.org/w/api.php?action=query&prop=pageprops&format=json&pageids="


i=0
Nb_pages = len(dataFrame)


### Associate the 'Qid' value of Wikipedia pages in the DataFrame ###
### Give '-1' value if an error has occured during the query      ###

for pageId in np.int64(dataFrame['Id']):
    response = requests.get(urls + str(pageId)).json()
    try:
        Qid = (list(findkeys(response, 'wikibase_item'))[0])
        Title = (list(findkeys(response, 'title'))[0])
    except IndexError:
        dataFrame.loc[dataFrame['Id'] == pageId, 'Qid'] = '-1'
        i+=1
        print(i,"/", Nb_pages, dataFrame.loc[dataFrame['Id'] == pageId, 'Label'], "\t error")
    else:
        dataFrame.loc[dataFrame['Id'] == pageId, 'Qid'] = Qid
        dataFrame.loc[dataFrame['Id'] == pageId, 'Label'] = Title
        i+=1
        print(i,"/", Nb_pages, dataFrame.loc[dataFrame['Id'] == pageId, 'Label'].values[0])

# Clear the loading display
IPython.display.clear_output(wait=False)

        
# Save the DataFrame with their associated Qids
dataFrame.to_csv(path + 'pages_Qids.csv', encoding='utf-8')

dataFrame

Unnamed: 0,Label,size,r,g,b,x,y,Id,Degree,modularity_class,betweenesscentrality,Qid
61,Advanced Micro Devices,12.686567,192,192,192,9.234199,-16.520315,61,3,7,0.000000,Q128896
103,Aton,20.746270,0,134,135,-10.841246,0.702035,103,9,13,2101.806936,Q130227
206,Alfred Nobel,11.343284,114,121,10,22.848211,24.862890,206,2,16,22046.317749,Q23810
316,Lampe à incandescence halogène,10.000000,114,121,10,-14.543655,-12.765390,316,1,16,0.000000,Q1114190
341,Amenhotep III,31.492537,0,134,135,-10.848696,-14.589257,341,17,13,5114.899646,Q42606
...,...,...,...,...,...,...,...,...,...,...,...,...
11983100,Thierry Coste,23.432835,193,77,88,6.475411,-18.242767,11983100,11,8,46361.497269,Q56280447
11986417,Ugo Humbert,14.029851,192,192,192,-8.752216,9.236148,11986417,4,7,0.000000,Q54007306
12470864,Équarrissage,10.000000,235,44,170,6.380493,19.105633,12470864,1,31,0.000000,Q62649386
12480619,Saison 9 de Clem,12.686567,185,97,239,7.033388,-18.969147,12480619,3,4,0.000000,Q65223375


In [4]:
### Extracting topic pages from database API ###


dataFrame = pd.read_csv(path + 'pages_Qids.csv')

# Match_topic_API = pd.DataFrame(columns = ['Qid', 'topic', 'probability', 'page_id', 'page_title', 'modularity_class', 'wiki_db'])
Match_topic_API = pd.DataFrame()


                
### The API's URL from the topic is extracted ###         
urls = "http://86.119.25.229:5000/api/v1/wikidata/topic?qid="
threshold = '0.1'

    
tmps1=time.time()
    
a=0 
n = dataFrame.index[-1]
for pageIndex in dataFrame.index:
    pageQid = dataFrame.at[pageIndex, 'Qid']
    pageModularity = dataFrame.at[pageIndex, 'modularity_class']
    response = requests.get(urls + pageQid + "&threshold=" + threshold).json()
    scores = list(findkeys(response, 'score'))
    topics = list(findkeys(response, 'topic'))
    
    try:
        page_title = response['name']
    except KeyError:
        page_title = dataFrame.at[pageIndex, 'Label']
    for i in range(len(scores)):
        page = dataFrame.iloc[pageIndex]
        page['Topic'] = topics[i]
        page['Probability'] = scores[i]
        Match_topic_API = Match_topic_API.append(page, ignore_index=True)
    print(a, "/",n)
    a+=1
    
    
# Clear the loading display
IPython.display.clear_output(wait=False)
tmps2=time.time()
print("Execution time = ",(tmps2-tmps1), " s")

# Save the results 
Match_topic_API.drop(columns = ['Unnamed: 0'], inplace = True )
Match_topic_API.to_csv(path + 'pages_topic.csv', encoding='utf-8')

Match_topic_API

Execution time =  2187.063416481018  s


Unnamed: 0,Degree,Id,Label,Probability,Qid,Topic,b,betweenesscentrality,g,modularity_class,r,size,x,y
0,3.0,61.0,Advanced Micro Devices,0.909917,Q128896,History_and_Society.Business_and_economics,192.0,0.0,192.0,7.0,192.0,12.686567,9.234199,-16.520315
1,3.0,61.0,Advanced Micro Devices,0.835494,Q128896,STEM.STEM*,192.0,0.0,192.0,7.0,192.0,12.686567,9.234199,-16.520315
2,3.0,61.0,Advanced Micro Devices,0.782673,Q128896,STEM.Computing,192.0,0.0,192.0,7.0,192.0,12.686567,9.234199,-16.520315
3,3.0,61.0,Advanced Micro Devices,0.392347,Q128896,STEM.Technology,192.0,0.0,192.0,7.0,192.0,12.686567,9.234199,-16.520315
4,3.0,61.0,Advanced Micro Devices,0.242178,Q128896,Geography.Regions.Americas.North_America,192.0,0.0,192.0,7.0,192.0,12.686567,9.234199,-16.520315
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22563,3.0,12480619.0,Saison 9 de Clem,0.938134,Q65223375,Culture.Media.Television,239.0,0.0,97.0,4.0,185.0,12.686567,7.033388,-18.969147
22564,3.0,12480619.0,Saison 9 de Clem,0.000000,Q65223375,Geography.Regions.Europe.Europe*,239.0,0.0,97.0,4.0,185.0,12.686567,7.033388,-18.969147
22565,3.0,12480619.0,Saison 9 de Clem,0.000000,Q65223375,Geography.Regions.Europe.Western_Europe,239.0,0.0,97.0,4.0,185.0,12.686567,7.033388,-18.969147
22566,1.0,12697556.0,Christophe Pélissier,0.569863,Q66309156,Culture.Biography.Biography*,183.0,0.0,168.0,0.0,0.0,10.000000,15.158192,17.343578


In [66]:
### Counting the number of topics per cluster and weighting them ###
### Finding the 3 most weighted topics per cluster with the higher degree page ###

df = pd.read_csv(path + 'pages_topic.csv', index_col = 'Unnamed: 0')


modularity_df = pd.DataFrame({'Main_Topics':[], 'Higher_Weighted_Article':[]})

df = count_topic(df)
df = weight_topic(df)


df.sort_values(by = ['modularity_class', 'Weight'], inplace=True, ascending = [True, False])


df.to_csv(path + 'pages_main_topic.csv', encoding='utf-8')

total_weight = df[['Weight', 'modularity_class']].groupby(['modularity_class']).sum()
total_weight = total_weight.reindex(pd.RangeIndex(total_weight.index.max() + 1)).fillna(value=0)

nb_topic = int( df['modularity_class'].max() ) + 1
for i in range(nb_topic):
    
    # If the cluster has no topic or only Biography/Geography. Ignored for other cases
    if (len(df.loc[df['modularity_class'] == i]) == 0):
        mod = {'Main_Topics':'Biography/Geography', 'Higher_Weighted_Article':''}
        modularity_df = modularity_df.append(mod, ignore_index=True)
        continue 
        
    
    df.loc[df['modularity_class'] == i, 'Total Weight'] = total_weight.iloc[i].values
    df['Ratio Weight'] = df['Weight'] / df['Total Weight']
    
    
    list_topic = []
    str_main_topics = ''
    idx_topic = 0
    max_idx_topic = len(df.loc[df['modularity_class'] == i])
    while ( (len(list_topic) < 3) and (idx_topic < max_idx_topic) ):
        str_topic = df.loc[df['modularity_class'] == i, 'Subtopic'].values[idx_topic]
        if (str_topic not in list_topic):
            list_topic.append(str_topic)
            str_main_topics +=  '/ ' + str_topic + ' (' + str((df.loc[(df['modularity_class'] == i) & (df['Subtopic'] == str_topic), 'Degree Ratio'].values[0]).round(decimals=2)) + ') '
        idx_topic += 1
    str_main_topics = str_main_topics[2:]
    
    
    
  
    page_id_max_weight = df.loc[df['modularity_class'] == i, 'Weight'].idxmax()
    str_weight = df.loc[page_id_max_weight, 'Label'] + ' (' + str(int(df.loc[page_id_max_weight, 'Weight'])) + ')'
        
    mod = {'Main_Topics':str_main_topics, 'Higher_Weighted_Article':str_weight}
    modularity_df = modularity_df.append(mod, ignore_index=True)


df.rename_axis("Index", inplace=True)    
display(df)    

display(modularity_df)


df.to_csv(path + 'pages_main_topic.csv', encoding='utf-8')

modularity_df.to_csv(path + 'topic_by_degree.csv', encoding='utf-8')

Unnamed: 0_level_0,Degree,Id,Label,Probability,Qid,Topic,b,betweenesscentrality,g,modularity_class,...,x,y,Count,Total,Degree Ratio,Subtopic,Main topic,Weight,Total Weight,Ratio Weight
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5800,27.0,2077098.0,Carry On Abroad,0.992194,Q1305714,Culture.Media.Media*,65.0,31074.223361,15.0,0.0,...,23.794855,11.315010,281.0,2551.0,0.110153,Media*,Culture,7527.776759,345497.645873,0.021788
9603,25.0,6555711.0,Carry On Matron,0.996186,Q449215,Culture.Media.Media*,65.0,10727.910872,15.0,0.0,...,-9.459207,-15.256686,281.0,2551.0,0.110153,Media*,Culture,6998.203427,345497.645873,0.020255
7190,25.0,3078987.0,Carry On Girls,0.988678,Q1941014,Culture.Media.Media*,65.0,33680.425914,15.0,0.0,...,-4.103602,-11.444030,281.0,2551.0,0.110153,Media*,Culture,6945.464893,345497.645873,0.020103
5365,23.0,1790524.0,Carry On Camping,0.991948,Q449207,Culture.Media.Media*,65.0,3564.034693,15.0,0.0,...,16.415956,-23.133680,281.0,2551.0,0.110153,Media*,Culture,6410.959979,345497.645873,0.018556
7748,23.0,3694594.0,Carry On Regardless,0.982568,Q2783043,Culture.Media.Media*,65.0,19252.279594,15.0,0.0,...,17.325123,-21.941740,281.0,2551.0,0.110153,Media*,Culture,6350.334068,345497.645873,0.018380
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17517,4.0,43761938.0,Philomena Lee,0.998911,Q18044571,Culture.Biography.Biography*,190.0,0.333333,119.0,46.0,...,-23.616882,-16.140127,5.0,36.0,0.138889,Biography*,Culture,0.000000,28.626024,0.000000
17518,4.0,43761938.0,Philomena Lee,0.803184,Q18044571,Culture.Biography.Women,190.0,0.333333,119.0,46.0,...,-23.616882,-16.140127,1.0,36.0,0.027778,Women,Culture,0.000000,28.626024,0.000000
17520,4.0,43761938.0,Philomena Lee,0.459371,Q18044571,Geography.Regions.Europe.Northern_Europe,190.0,0.333333,119.0,46.0,...,-23.616882,-16.140127,6.0,36.0,0.166667,Northern_Europe,Geography,0.000000,28.626024,0.000000
17521,4.0,43761938.0,Philomena Lee,0.445540,Q18044571,STEM.STEM*,190.0,0.333333,119.0,46.0,...,-23.616882,-16.140127,2.0,36.0,0.055556,STEM*,STEM,0.000000,28.626024,0.000000


Unnamed: 0,Main_Topics,Higher_Weighted_Article
0,Media* (0.11) / Films (0.04) / Politics_and_go...,Carry On Abroad (7527)
1,Media* (0.21) / Films (0.13) / Literature (0.03),Swing Time (film) (113)
2,Visual_arts* (0.06) / Architecture (0.03) / Mi...,British Museum (842)
3,Politics_and_government (0.11) / Military_and_...,Jimmy Panetta (2247)
4,Sports (0.3) / Media* (0.03) / Education (0.02),College Football Playoff (71343)
5,Media* (0.29) / Music (0.24) / Television (0.02),Weezer (174)
6,Sports (0.29) / Media* (0.03) / Television (0.0),2015 Chicago Bears season (29653)
7,Media* (0.18) / Literature (0.09) / Books (0.03),Prozac Nation (17)
8,Sports (0.2) / Society (0.04) / Military_and_w...,2015 World Junior Ice Hockey Championships ros...
9,Media* (0.13) / Television (0.07) / Literature...,The A.B.C. Murders (1101)


In [67]:
### Saving to nodes attributes into the graph ###

graph = nx.read_graphml(path+'graph.graphml')

df = pd.read_csv(path + 'pages_main_topic.csv', index_col = 'Index')
nodes = pd.DataFrame()

df.sort_values(by = ['Id', 'Weight'], inplace=True, ascending = [True, False])

list_id = list(dict.fromkeys(df['Id']))

# Keeping only the node with the higher Weight
for Id in list_id:
    id_max = df.loc[df['Id'] == Id, 'Weight'].idxmax()
    nodes = nodes.append(df.loc[df.index == id_max], ignore_index=True)

    
display(nodes)

    
add_graph_attribute(graph, nodes, 'Qid')
add_graph_attribute(graph, nodes, 'Topic')
add_graph_attribute(graph, nodes, 'Main topic')
add_graph_attribute(graph, nodes, 'Subtopic')
add_graph_attribute(graph, nodes, 'Weight')


nodes.to_csv(path + 'filled_nodes.csv', encoding='utf-8')

nx.write_graphml_lxml(graph, path + 'filled_graph.graphml')  



Unnamed: 0,Degree,Id,Label,Probability,Qid,Topic,b,betweenesscentrality,g,modularity_class,...,x,y,Count,Total,Degree Ratio,Subtopic,Main topic,Weight,Total Weight,Ratio Weight
0,4.0,1624.0,Andrew Johnson,0.754925,Q8612,History_and_Society.Politics_and_government,225.0,42319.000000,71.0,3.0,...,-11.056788,11.840440,125.0,1117.0,0.111907,Politics_and_government,History_and_Society,377.462506,101708.759164,0.003711
1,3.0,1676.0,Alfonso XII of Spain,0.239359,Q32347,History_and_Society.Military_and_warfare,65.0,15884.000000,15.0,0.0,...,-12.763531,-8.452114,72.0,2551.0,0.028224,Military_and_warfare,History_and_Society,51.701619,345497.645873,0.000150
2,2.0,2238.0,Antipope John XXIII,0.658428,Q294599,History_and_Society.History,145.0,10590.000000,163.0,37.0,...,9.662183,-2.322066,12.0,232.0,0.051724,History,History_and_Society,15.802261,3519.285405,0.004490
3,6.0,3080.0,Alexandria,0.422515,Q87,History_and_Society.History,145.0,135255.140440,163.0,37.0,...,-11.561155,-24.925330,12.0,232.0,0.051724,History,History_and_Society,30.421055,3519.285405,0.008644
4,4.0,3112.0,Arianespace,0.637041,Q309867,History_and_Society.Transportation,192.0,31755.000000,192.0,26.0,...,-5.084677,-22.843502,19.0,309.0,0.061489,Transportation,History_and_Society,48.415100,3698.089128,0.013092
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5288,1.0,59513950.0,Kaappaan,0.994099,Q55784446,Culture.Media.Media*,192.0,0.000000,192.0,35.0,...,-7.935714,-6.040924,43.0,267.0,0.161049,Media*,Culture,42.746255,6081.646166,0.007029
5289,6.0,59516195.0,2018 MBC Entertainment Awards,0.777310,Q60776336,Culture.Media.Media*,0.0,3652.585365,122.0,11.0,...,-7.822823,12.803268,80.0,745.0,0.107383,Media*,Culture,373.108749,24825.724812,0.015029
5290,13.0,59518104.0,2021 World Junior Ice Hockey Championships,0.718604,Q60213024,Culture.Sports,156.0,0.000000,77.0,8.0,...,-10.691543,-19.281908,95.0,469.0,0.202559,Sports,Culture,887.476417,54768.802556,0.016204
5291,10.0,59529602.0,UFC 236,0.986787,Q60767232,Culture.Sports,175.0,808.121424,145.0,17.0,...,-18.224728,-21.509924,157.0,1415.0,0.110954,Sports,Culture,1549.255904,213428.362182,0.007259
