# Extraction of topics from Wikipedia pages

In [1]:
# https://github.com/taynaud/python-louvain

import IPython
import numpy as np
import networkx as nx
import requests
import pandas as pd
import csv
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import time
import community
from fonctions import *


pd.set_option('mode.chained_assignment', None)

path = 'Results/FR_20180816_20180831/'

region = "fr"

In [2]:
### Importing the nodes as a DataFrame for easier manipulations ###

# dataFrame = dataFrame = pd.read_csv(path + 'nodes.csv', encoding='utf-8')

graph = nx.read_graphml(path+'graph.graphml')
dataFrame = pd.DataFrame.from_dict(dict(graph.nodes(data=True)), orient='index')
dataFrame['Id'] = dataFrame.index
dataFrame.rename(columns={'label': 'Label'}, inplace=True)
dataFrame['Label'] = dataFrame['Label'].astype('string')
dataFrame['Id'] = dataFrame['Id'].astype('int64')
# dataFrame.drop(columns=['size', 'r', 'g', 'b', 'x', 'y'], inplace=True)

degree = dict(nx.degree(graph, nbunch=None, weight=10))
partition = community.best_partition(graph, randomize=True, weight='Weight', resolution=1.5)
btw_cent = nx.betweenness_centrality(graph,normalized=False)
                           

dataFrame['Degree'] = pd.DataFrame.from_dict(degree, orient='index')[0]
dataFrame['modularity_class'] = pd.DataFrame.from_dict(partition, orient='index')[0]
dataFrame['betweenesscentrality'] = pd.DataFrame.from_dict(btw_cent, orient='index')[0]

add_graph_attribute(graph, dataFrame, 'Degree')
add_graph_attribute(graph, dataFrame, 'modularity_class')
add_graph_attribute(graph, dataFrame, 'betweenesscentrality')

dataFrame.sort_values(by = ['modularity_class'], inplace=True, ascending = [True])
dataFrame.sort_values(by = ['Id'], inplace=True, ascending = [True])
dataFrame.to_csv(path + 'nodes.csv', encoding='utf-8')

print(dataFrame.info())
display(dataFrame)

print("Number of clusters: ", dataFrame['modularity_class'].max())

<class 'pandas.core.frame.DataFrame'>
Index: 4834 entries, 61 to 12697556
Data columns (total 11 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Label                 4834 non-null   string 
 1   size                  4834 non-null   float64
 2   r                     4834 non-null   int64  
 3   g                     4834 non-null   int64  
 4   b                     4834 non-null   int64  
 5   x                     4834 non-null   float64
 6   y                     4834 non-null   float64
 7   Id                    4834 non-null   int64  
 8   Degree                4834 non-null   int64  
 9   modularity_class      4834 non-null   int64  
 10  betweenesscentrality  4834 non-null   float64
dtypes: float64(4), int64(6), string(1)
memory usage: 453.2+ KB
None


Unnamed: 0,Label,size,r,g,b,x,y,Id,Degree,modularity_class,betweenesscentrality
61,Advanced_Micro_Devices,12.686567,192,192,192,9.234199,-16.520315,61,3,7,0.000000
103,Aton,20.746270,0,134,135,-10.841246,0.702035,103,9,13,2101.806936
206,Alfred_Nobel,11.343284,114,121,10,22.848211,24.862890,206,2,16,22046.317749
316,Lampe_à_incandescence_halogène,10.000000,114,121,10,-14.543655,-12.765390,316,1,16,0.000000
341,Amenhotep_III,31.492537,0,134,135,-10.848696,-14.589257,341,17,13,5114.899646
...,...,...,...,...,...,...,...,...,...,...,...
11983100,Thierry_Coste,23.432835,193,77,88,6.475411,-18.242767,11983100,11,8,46361.497269
11986417,Ugo_Humbert,14.029851,192,192,192,-8.752216,9.236148,11986417,4,7,0.000000
12470864,Équarrissage,10.000000,235,44,170,6.380493,19.105633,12470864,1,31,0.000000
12480619,Saison_9_de_Clem,12.686567,185,97,239,7.033388,-18.969147,12480619,3,4,0.000000


Number of clusters:  40


In [3]:
### Extract nodes labels from Wikipedia queries ###



# URL for the quieries
urls = "https://"+region+".wikipedia.org/w/api.php?action=query&prop=pageprops&format=json&pageids="


i=0
Nb_pages = len(dataFrame)


### Associate the 'Qid' value of Wikipedia pages in the DataFrame ###
### Give '-1' value if an error has occured during the query      ###

for pageId in np.int64(dataFrame['Id']):
    response = requests.get(urls + str(pageId)).json()
    try:
        Qid = (list(findkeys(response, 'wikibase_item'))[0])
        Title = (list(findkeys(response, 'title'))[0])
    except IndexError:
        dataFrame.loc[dataFrame['Id'] == pageId, 'Qid'] = '-1'
        i+=1
        print(i,"/", Nb_pages, dataFrame.loc[dataFrame['Id'] == pageId, 'Label'], "\t error")
    else:
        dataFrame.loc[dataFrame['Id'] == pageId, 'Qid'] = Qid
        dataFrame.loc[dataFrame['Id'] == pageId, 'Label'] = Title
        i+=1
        print(i,"/", Nb_pages, dataFrame.loc[dataFrame['Id'] == pageId, 'Label'].values[0])

# Clear the loading display
IPython.display.clear_output(wait=False)

        
# Save the DataFrame with their associated Qids
dataFrame.to_csv(path + 'pages_Qids.csv', encoding='utf-8')

dataFrame

Unnamed: 0,Label,size,r,g,b,x,y,Id,Degree,modularity_class,betweenesscentrality,Qid
61,Advanced Micro Devices,12.686567,192,192,192,9.234199,-16.520315,61,3,7,0.000000,Q128896
103,Aton,20.746270,0,134,135,-10.841246,0.702035,103,9,13,2101.806936,Q130227
206,Alfred Nobel,11.343284,114,121,10,22.848211,24.862890,206,2,16,22046.317749,Q23810
316,Lampe à incandescence halogène,10.000000,114,121,10,-14.543655,-12.765390,316,1,16,0.000000,Q1114190
341,Amenhotep III,31.492537,0,134,135,-10.848696,-14.589257,341,17,13,5114.899646,Q42606
...,...,...,...,...,...,...,...,...,...,...,...,...
11983100,Thierry Coste,23.432835,193,77,88,6.475411,-18.242767,11983100,11,8,46361.497269,Q56280447
11986417,Ugo Humbert,14.029851,192,192,192,-8.752216,9.236148,11986417,4,7,0.000000,Q54007306
12470864,Équarrissage,10.000000,235,44,170,6.380493,19.105633,12470864,1,31,0.000000,Q62649386
12480619,Saison 9 de Clem,12.686567,185,97,239,7.033388,-18.969147,12480619,3,4,0.000000,Q65223375


In [4]:
### Extracting topic pages from database API ###


dataFrame = pd.read_csv(path + 'pages_Qids.csv')

# Match_topic_API = pd.DataFrame(columns = ['Qid', 'topic', 'probability', 'page_id', 'page_title', 'modularity_class', 'wiki_db'])
Match_topic_API = pd.DataFrame()


                
### The API's URL from the topic is extracted ###         
urls = "http://86.119.25.229:5000/api/v1/wikidata/topic?qid="
threshold = '0.1'

    
tmps1=time.time()
    
a=0 
n = dataFrame.index[-1]
for pageIndex in dataFrame.index:
    pageQid = dataFrame.at[pageIndex, 'Qid']
    pageModularity = dataFrame.at[pageIndex, 'modularity_class']
    response = requests.get(urls + pageQid + "&threshold=" + threshold).json()
    scores = list(findkeys(response, 'score'))
    topics = list(findkeys(response, 'topic'))
    
    try:
        page_title = response['name']
    except KeyError:
        page_title = dataFrame.at[pageIndex, 'Label']
    for i in range(len(scores)):
        page = dataFrame.iloc[pageIndex]
        page['Topic'] = topics[i]
        page['Probability'] = scores[i]
        Match_topic_API = Match_topic_API.append(page, ignore_index=True)
    print(a, "/",n)
    a+=1
    
    
# Clear the loading display
IPython.display.clear_output(wait=False)
tmps2=time.time()
print("Execution time = ",(tmps2-tmps1), " s")

# Save the results 
Match_topic_API.drop(columns = ['Unnamed: 0'], inplace = True )
Match_topic_API.to_csv(path + 'pages_topic.csv', encoding='utf-8')

Match_topic_API

Execution time =  2187.063416481018  s


Unnamed: 0,Degree,Id,Label,Probability,Qid,Topic,b,betweenesscentrality,g,modularity_class,r,size,x,y
0,3.0,61.0,Advanced Micro Devices,0.909917,Q128896,History_and_Society.Business_and_economics,192.0,0.0,192.0,7.0,192.0,12.686567,9.234199,-16.520315
1,3.0,61.0,Advanced Micro Devices,0.835494,Q128896,STEM.STEM*,192.0,0.0,192.0,7.0,192.0,12.686567,9.234199,-16.520315
2,3.0,61.0,Advanced Micro Devices,0.782673,Q128896,STEM.Computing,192.0,0.0,192.0,7.0,192.0,12.686567,9.234199,-16.520315
3,3.0,61.0,Advanced Micro Devices,0.392347,Q128896,STEM.Technology,192.0,0.0,192.0,7.0,192.0,12.686567,9.234199,-16.520315
4,3.0,61.0,Advanced Micro Devices,0.242178,Q128896,Geography.Regions.Americas.North_America,192.0,0.0,192.0,7.0,192.0,12.686567,9.234199,-16.520315
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22563,3.0,12480619.0,Saison 9 de Clem,0.938134,Q65223375,Culture.Media.Television,239.0,0.0,97.0,4.0,185.0,12.686567,7.033388,-18.969147
22564,3.0,12480619.0,Saison 9 de Clem,0.000000,Q65223375,Geography.Regions.Europe.Europe*,239.0,0.0,97.0,4.0,185.0,12.686567,7.033388,-18.969147
22565,3.0,12480619.0,Saison 9 de Clem,0.000000,Q65223375,Geography.Regions.Europe.Western_Europe,239.0,0.0,97.0,4.0,185.0,12.686567,7.033388,-18.969147
22566,1.0,12697556.0,Christophe Pélissier,0.569863,Q66309156,Culture.Biography.Biography*,183.0,0.0,168.0,0.0,0.0,10.000000,15.158192,17.343578


In [5]:
### Counting the number of topics per cluster and weighting them ###
### Finding the 3 most weighted topics per cluster with the higher degree page ###

df = pd.read_csv(path + 'pages_topic.csv', index_col = 'Unnamed: 0')


modularity_df = pd.DataFrame({'Main_Topics':[], 'Higher_Weighted_Article':[]})

df = count_topic(df)
df = weight_topic(df)


df.sort_values(by = ['modularity_class', 'Weight'], inplace=True, ascending = [True, False])


df.to_csv(path + 'pages_main_topic.csv', encoding='utf-8')

total_weight = df[['Weight', 'modularity_class']].groupby(['modularity_class']).sum()
total_weight = total_weight.reindex(pd.RangeIndex(total_weight.index.max() + 1)).fillna(value=0)

nb_topic = int( df['modularity_class'].max() ) + 1
for i in range(nb_topic):
    
    # If the cluster has no topic or only Biography/Geography. Ignored for other cases
    if (len(df.loc[df['modularity_class'] == i]) == 0):
        mod = {'Main_Topics':'Biography/Geography', 'Higher_Weighted_Article':''}
        modularity_df = modularity_df.append(mod, ignore_index=True)
        continue 
        
    
    df.loc[df['modularity_class'] == i, 'Total Weight'] = total_weight.iloc[i].values
    df['Ratio Weight'] = df['Weight'] / df['Total Weight']
    
    
    list_topic = []
    str_main_topics = ''
    idx_topic = 0
    max_idx_topic = len(df.loc[df['modularity_class'] == i])
    while ( (len(list_topic) < 3) and (idx_topic < max_idx_topic) ):
        str_topic = df.loc[df['modularity_class'] == i, 'Subtopic'].values[idx_topic]
#         if ( (str_topic not in list_topic) and ( df.loc[df['Subtopic'] == str_topic, 'Degree Ratio'].values[0] >= 0.1) ):
        if (str_topic not in list_topic):
            list_topic.append(str_topic)
            str_main_topics +=  '/ ' + str_topic + ' (' + str((df.loc[(df['modularity_class'] == i) & (df['Subtopic'] == str_topic), 'Degree Ratio'].values[0]).round(decimals=2)) + ') '
        idx_topic += 1
    str_main_topics = str_main_topics[2:]
    
    
    
  
    page_id_max_weight = df.loc[df['modularity_class'] == i, 'Weight'].idxmax()
    str_weight = df.loc[page_id_max_weight, 'Label'] + ' (' + str(int(df.loc[page_id_max_weight, 'Weight'])) + ')'
        
    mod = {'Main_Topics':str_main_topics, 'Higher_Weighted_Article':str_weight}
    modularity_df = modularity_df.append(mod, ignore_index=True)


df.rename_axis("Index", inplace=True)    
display(df)    

display(modularity_df)


df.to_csv(path + 'pages_main_topic.csv', encoding='utf-8')

modularity_df.to_csv(path + 'topic_by_degree.csv', encoding='utf-8')

Unnamed: 0_level_0,Degree,Id,Label,Probability,Qid,Topic,b,betweenesscentrality,g,modularity_class,...,x,y,Count,Total,Degree Ratio,Subtopic,Main topic,Weight,Total Weight,Ratio Weight
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
20392,46.0,8832723.0,Ligue Europa 2015-2016,0.997378,Q15699191,Culture.Sports,183.0,256312.982905,168.0,0.0,...,10.280470,-19.780142,593.0,2562.0,0.231460,Sports,Culture,27206.486612,1.438008e+06,0.018920
20510,43.0,9176589.0,Ligue Europa 2016-2017,0.999522,Q18740965,Culture.Sports,183.0,251813.992847,168.0,0.0,...,-22.432297,12.595013,593.0,2562.0,0.231460,Sports,Culture,25486.815292,1.438008e+06,0.017724
17426,38.0,4841475.0,Championnat de France de football de National 2,0.822199,Q1061291,Culture.Sports,0.0,184938.056551,163.0,0.0,...,21.288603,10.837981,593.0,2562.0,0.231460,Sports,Culture,18527.436003,1.438008e+06,0.012884
2996,28.0,78008.0,Coupe de la Ligue anglaise de football,0.997894,Q11152,Culture.Sports,183.0,125370.286004,168.0,0.0,...,12.424314,24.239010,593.0,2562.0,0.231460,Sports,Culture,16569.036743,1.438008e+06,0.011522
4848,27.0,155996.0,Valenciennes Football Club,0.771854,Q212269,Culture.Sports,0.0,224630.098639,163.0,0.0,...,22.299297,-8.519827,593.0,2562.0,0.231460,Sports,Culture,12358.146494,1.438008e+06,0.008594
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8279,3.0,430383.0,Lothar von Trotha,0.448679,Q65898,Geography.Regions.Europe.Europe*,220.0,0.000000,160.0,40.0,...,-11.817418,-8.908286,2.0,24.0,0.083333,Europe*,Geography,0.000000,3.870772e+01,0.000000
8280,3.0,430383.0,Lothar von Trotha,0.000000,Q65898,Geography.Regions.Africa.Africa*,220.0,0.000000,160.0,40.0,...,-11.817418,-8.908286,4.0,24.0,0.166667,Africa*,Geography,0.000000,3.870772e+01,0.000000
12144,4.0,1097429.0,Héréros,0.406322,Q170088,Geography.Regions.Africa.Africa*,220.0,14490.000000,160.0,40.0,...,-12.905053,24.918957,4.0,24.0,0.166667,Africa*,Geography,0.000000,3.870772e+01,0.000000
12149,4.0,1097429.0,Héréros,0.000000,Q170088,Geography.Geographical,220.0,14490.000000,160.0,40.0,...,-12.905053,24.918957,1.0,24.0,0.041667,Geographical,Geography,0.000000,3.870772e+01,0.000000


Unnamed: 0,Main_Topics,Higher_Weighted_Article
0,Sports (0.23) / Media* (0.01) / Society (0.01),Ligue Europa 2015-2016 (27206)
1,Media* (0.1) / Films (0.04) / Military_and_war...,Paris brûle-t-il ? (film) (4441)
2,Sports (0.1) / Politics_and_government (0.03) ...,Tournoi de tennis de Miami (306)
3,Sports (0.22) / Visual_arts* (0.01) / Architec...,Rugby à XV (5376)
4,Media* (0.11) / Television (0.05) / Films (0.02),Les Cinq Dernières Minutes (2585)
5,Media* (0.15) / Music (0.1) / Society (0.02),Pop (musique) (12200)
6,Politics_and_government (0.08) / Society (0.07...,Organisation des Nations unies (1831)
7,Sports (0.21) / Society (0.02) / Business_and_...,Nicolas Mahut (3142)
8,Politics_and_government (0.07) / Media* (0.05)...,Élection présidentielle française de 2012 (15616)
9,Sports (0.04) / Media* (0.03) / STEM* (0.04),Championnats du monde de cyclisme sur route (178)


In [6]:
### Saving to nodes attributes into the graph ###

df = pd.read_csv(path + 'pages_main_topic.csv', index_col = 'Index')
nodes = pd.DataFrame()

df.sort_values(by = ['Id', 'Weight'], inplace=True, ascending = [True, False])

list_id = list(dict.fromkeys(df['Id']))

# Keeping only the node with the higher Weight
for Id in list_id:
    id_max = df.loc[df['Id'] == Id, 'Weight'].idxmax()
    nodes = nodes.append(df.loc[df.index == id_max], ignore_index=True)

    
display(nodes)

    
add_graph_attribute(graph, nodes, 'Qid')
add_graph_attribute(graph, nodes, 'Topic')
add_graph_attribute(graph, nodes, 'Main topic')
add_graph_attribute(graph, nodes, 'Subtopic')
add_graph_attribute(graph, nodes, 'Weight')


nodes.to_csv(path + 'filled_nodes.csv', encoding='utf-8')

nx.write_graphml_lxml(graph, path + 'filled_graph.graphml')  



Unnamed: 0,Degree,Id,Label,Probability,Qid,Topic,b,betweenesscentrality,g,modularity_class,...,x,y,Count,Total,Degree Ratio,Subtopic,Main topic,Weight,Total Weight,Ratio Weight
0,3.0,61.0,Advanced Micro Devices,0.909917,Q128896,History_and_Society.Business_and_economics,192.0,0.000000,192.0,7.0,...,9.234199,-16.520315,2.0,410.0,0.004878,Business_and_economics,History_and_Society,5.459502,8.803993e+04,0.000062
1,9.0,103.0,Aton,0.936295,Q130227,History_and_Society.History,135.0,2101.806936,134.0,13.0,...,-10.841246,0.702035,43.0,303.0,0.141914,History,History_and_Society,362.346178,1.353902e+04,0.026763
2,2.0,206.0,Alfred Nobel,0.294225,Q23810,STEM.STEM*,10.0,22046.317749,121.0,16.0,...,22.848211,24.862890,92.0,592.0,0.155405,STEM*,STEM,54.137396,4.797915e+04,0.001128
3,1.0,316.0,Lampe à incandescence halogène,0.822199,Q1114190,STEM.STEM*,10.0,0.000000,121.0,16.0,...,-14.543655,-12.765390,92.0,592.0,0.155405,STEM*,STEM,75.642323,4.797915e+04,0.001577
4,17.0,341.0,Amenhotep III,0.978395,Q42606,History_and_Society.History,135.0,5114.899646,134.0,13.0,...,-10.848696,-14.589257,43.0,303.0,0.141914,History,History_and_Society,715.206516,1.353902e+04,0.052826
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4827,11.0,11983100.0,Thierry Coste,0.970698,Q56280447,Culture.Biography.Biography*,88.0,46361.497269,77.0,8.0,...,6.475411,-18.242767,371.0,3674.0,0.100980,Biography*,Culture,0.000000,5.648353e+05,0.000000
4828,4.0,11986417.0,Ugo Humbert,0.985506,Q54007306,Culture.Sports,192.0,0.000000,192.0,7.0,...,-8.752216,9.236148,85.0,410.0,0.207317,Sports,Culture,335.072181,8.803993e+04,0.003806
4829,1.0,12470864.0,Équarrissage,0.112805,Q62649386,History_and_Society.Military_and_warfare,170.0,0.000000,44.0,31.0,...,6.380493,19.105633,19.0,171.0,0.111111,Military_and_warfare,History_and_Society,2.143303,1.643471e+03,0.001304
4830,3.0,12480619.0,Saison 9 de Clem,0.974831,Q65223375,Culture.Media.Media*,239.0,0.000000,97.0,4.0,...,7.033388,-18.969147,125.0,1136.0,0.110035,Media*,Culture,365.561686,4.963712e+04,0.007365
