# Extraction of topics from Wikipedia pages

In [87]:
# https://github.com/taynaud/python-louvain

import IPython
import numpy as np
import networkx as nx
import requests
import pandas as pd
import csv
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import time
import community
from fonctions import *


pd.set_option('mode.chained_assignment', None)

path = 'Results/FR_20181216_20181231/'

region = "en"

In [2]:
### Importing the nodes as a DataFrame for easier manipulations ###

# dataFrame = dataFrame = pd.read_csv(path + 'nodes.csv', encoding='utf-8')

graph = nx.read_graphml(path+'graph.graphml')
dataFrame = pd.DataFrame.from_dict(dict(graph.nodes(data=True)), orient='index')
dataFrame['Id'] = dataFrame.index
dataFrame.rename(columns={'label': 'Label'}, inplace=True)
dataFrame['Label'] = dataFrame['Label'].astype('string')
dataFrame['Id'] = dataFrame['Id'].astype('int64')
# dataFrame.drop(columns=['size', 'r', 'g', 'b', 'x', 'y'], inplace=True)

degree = dict(nx.degree(graph, nbunch=None, weight=10))
partition = community.best_partition(graph, randomize=True, weight='Weight', resolution=1.5)
btw_cent = nx.betweenness_centrality(graph,normalized=False)
                           

dataFrame['Degree'] = pd.DataFrame.from_dict(degree, orient='index')[0]
dataFrame['modularity_class'] = pd.DataFrame.from_dict(partition, orient='index')[0]
dataFrame['betweenesscentrality'] = pd.DataFrame.from_dict(btw_cent, orient='index')[0]

# add_graph_attribute(graph, dataFrame, 'Degree')
# add_graph_attribute(graph, dataFrame, 'modularity_class')
# add_graph_attribute(graph, dataFrame, 'betweenesscentrality')

dataFrame.sort_values(by = ['modularity_class'], inplace=True, ascending = [True])
dataFrame.sort_values(by = ['Id'], inplace=True, ascending = [True])
dataFrame.to_csv(path + 'nodes.csv', encoding='utf-8')

print(dataFrame.info())
display(dataFrame)

print("Number of clusters: ", dataFrame['modularity_class'].max())

<class 'pandas.core.frame.DataFrame'>
Index: 4834 entries, 61 to 12697556
Data columns (total 11 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Label                 4834 non-null   string 
 1   size                  4834 non-null   float64
 2   r                     4834 non-null   int64  
 3   g                     4834 non-null   int64  
 4   b                     4834 non-null   int64  
 5   x                     4834 non-null   float64
 6   y                     4834 non-null   float64
 7   Id                    4834 non-null   int64  
 8   Degree                4834 non-null   int64  
 9   modularity_class      4834 non-null   int64  
 10  betweenesscentrality  4834 non-null   float64
dtypes: float64(4), int64(6), string(1)
memory usage: 453.2+ KB
None


Unnamed: 0,Label,size,r,g,b,x,y,Id,Degree,modularity_class,betweenesscentrality
61,Advanced_Micro_Devices,12.686567,192,192,192,9.234199,-16.520315,61,3,7,0.000000
103,Aton,20.746270,0,134,135,-10.841246,0.702035,103,9,13,2101.806936
206,Alfred_Nobel,11.343284,114,121,10,22.848211,24.862890,206,2,16,22046.317749
316,Lampe_à_incandescence_halogène,10.000000,114,121,10,-14.543655,-12.765390,316,1,16,0.000000
341,Amenhotep_III,31.492537,0,134,135,-10.848696,-14.589257,341,17,13,5114.899646
...,...,...,...,...,...,...,...,...,...,...,...
11983100,Thierry_Coste,23.432835,193,77,88,6.475411,-18.242767,11983100,11,8,46361.497269
11986417,Ugo_Humbert,14.029851,192,192,192,-8.752216,9.236148,11986417,4,7,0.000000
12470864,Équarrissage,10.000000,235,44,170,6.380493,19.105633,12470864,1,31,0.000000
12480619,Saison_9_de_Clem,12.686567,185,97,239,7.033388,-18.969147,12480619,3,4,0.000000


Number of clusters:  40


In [3]:
### Extract nodes labels from Wikipedia queries ###



# URL for the quieries
urls = "https://"+region+".wikipedia.org/w/api.php?action=query&prop=pageprops&format=json&pageids="


i=0
Nb_pages = len(dataFrame)


### Associate the 'Qid' value of Wikipedia pages in the DataFrame ###
### Give '-1' value if an error has occured during the query      ###

for pageId in np.int64(dataFrame['Id']):
    response = requests.get(urls + str(pageId)).json()
    try:
        Qid = (list(findkeys(response, 'wikibase_item'))[0])
        Title = (list(findkeys(response, 'title'))[0])
    except IndexError:
        dataFrame.loc[dataFrame['Id'] == pageId, 'Qid'] = '-1'
        i+=1
        print(i,"/", Nb_pages, dataFrame.loc[dataFrame['Id'] == pageId, 'Label'], "\t error")
    else:
        dataFrame.loc[dataFrame['Id'] == pageId, 'Qid'] = Qid
        dataFrame.loc[dataFrame['Id'] == pageId, 'Label'] = Title
        i+=1
        print(i,"/", Nb_pages, dataFrame.loc[dataFrame['Id'] == pageId, 'Label'].values[0])

# Clear the loading display
IPython.display.clear_output(wait=False)

        
# Save the DataFrame with their associated Qids
dataFrame.to_csv(path + 'pages_Qids.csv', encoding='utf-8')

dataFrame

Unnamed: 0,Label,size,r,g,b,x,y,Id,Degree,modularity_class,betweenesscentrality,Qid
61,Advanced Micro Devices,12.686567,192,192,192,9.234199,-16.520315,61,3,7,0.000000,Q128896
103,Aton,20.746270,0,134,135,-10.841246,0.702035,103,9,13,2101.806936,Q130227
206,Alfred Nobel,11.343284,114,121,10,22.848211,24.862890,206,2,16,22046.317749,Q23810
316,Lampe à incandescence halogène,10.000000,114,121,10,-14.543655,-12.765390,316,1,16,0.000000,Q1114190
341,Amenhotep III,31.492537,0,134,135,-10.848696,-14.589257,341,17,13,5114.899646,Q42606
...,...,...,...,...,...,...,...,...,...,...,...,...
11983100,Thierry Coste,23.432835,193,77,88,6.475411,-18.242767,11983100,11,8,46361.497269,Q56280447
11986417,Ugo Humbert,14.029851,192,192,192,-8.752216,9.236148,11986417,4,7,0.000000,Q54007306
12470864,Équarrissage,10.000000,235,44,170,6.380493,19.105633,12470864,1,31,0.000000,Q62649386
12480619,Saison 9 de Clem,12.686567,185,97,239,7.033388,-18.969147,12480619,3,4,0.000000,Q65223375


In [4]:
### Extracting topic pages from database API ###


dataFrame = pd.read_csv(path + 'pages_Qids.csv')

# Match_topic_API = pd.DataFrame(columns = ['Qid', 'topic', 'probability', 'page_id', 'page_title', 'modularity_class', 'wiki_db'])
Match_topic_API = pd.DataFrame()


                
### The API's URL from the topic is extracted ###         
urls = "http://86.119.25.229:5000/api/v1/wikidata/topic?qid="
threshold = '0.1'

    
tmps1=time.time()
    
a=0 
n = dataFrame.index[-1]
for pageIndex in dataFrame.index:
    pageQid = dataFrame.at[pageIndex, 'Qid']
    pageModularity = dataFrame.at[pageIndex, 'modularity_class']
    response = requests.get(urls + pageQid + "&threshold=" + threshold).json()
    scores = list(findkeys(response, 'score'))
    topics = list(findkeys(response, 'topic'))
    
    try:
        page_title = response['name']
    except KeyError:
        page_title = dataFrame.at[pageIndex, 'Label']
    for i in range(len(scores)):
        page = dataFrame.iloc[pageIndex]
        page['Topic'] = topics[i]
        page['Probability'] = scores[i]
        Match_topic_API = Match_topic_API.append(page, ignore_index=True)
    print(a, "/",n)
    a+=1
    
    
# Clear the loading display
IPython.display.clear_output(wait=False)
tmps2=time.time()
print("Execution time = ",(tmps2-tmps1), " s")

# Save the results 
Match_topic_API.drop(columns = ['Unnamed: 0'], inplace = True )
Match_topic_API.to_csv(path + 'pages_topic.csv', encoding='utf-8')

Match_topic_API

Execution time =  2187.063416481018  s


Unnamed: 0,Degree,Id,Label,Probability,Qid,Topic,b,betweenesscentrality,g,modularity_class,r,size,x,y
0,3.0,61.0,Advanced Micro Devices,0.909917,Q128896,History_and_Society.Business_and_economics,192.0,0.0,192.0,7.0,192.0,12.686567,9.234199,-16.520315
1,3.0,61.0,Advanced Micro Devices,0.835494,Q128896,STEM.STEM*,192.0,0.0,192.0,7.0,192.0,12.686567,9.234199,-16.520315
2,3.0,61.0,Advanced Micro Devices,0.782673,Q128896,STEM.Computing,192.0,0.0,192.0,7.0,192.0,12.686567,9.234199,-16.520315
3,3.0,61.0,Advanced Micro Devices,0.392347,Q128896,STEM.Technology,192.0,0.0,192.0,7.0,192.0,12.686567,9.234199,-16.520315
4,3.0,61.0,Advanced Micro Devices,0.242178,Q128896,Geography.Regions.Americas.North_America,192.0,0.0,192.0,7.0,192.0,12.686567,9.234199,-16.520315
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22563,3.0,12480619.0,Saison 9 de Clem,0.938134,Q65223375,Culture.Media.Television,239.0,0.0,97.0,4.0,185.0,12.686567,7.033388,-18.969147
22564,3.0,12480619.0,Saison 9 de Clem,0.000000,Q65223375,Geography.Regions.Europe.Europe*,239.0,0.0,97.0,4.0,185.0,12.686567,7.033388,-18.969147
22565,3.0,12480619.0,Saison 9 de Clem,0.000000,Q65223375,Geography.Regions.Europe.Western_Europe,239.0,0.0,97.0,4.0,185.0,12.686567,7.033388,-18.969147
22566,1.0,12697556.0,Christophe Pélissier,0.569863,Q66309156,Culture.Biography.Biography*,183.0,0.0,168.0,0.0,0.0,10.000000,15.158192,17.343578


In [88]:
### Counting the number of topics per cluster and weighting them ###
### Finding the 3 most weighted topics per cluster with the higher degree page ###

df = pd.read_csv(path + 'pages_topic.csv', index_col = 'Unnamed: 0')


modularity_df = pd.DataFrame({'Main_Topics':[], 'Higher_Weighted_Article':[]})

df = count_topic(df)
df = weight_topic(df)


df.sort_values(by = ['modularity_class', 'Weight'], inplace=True, ascending = [True, False])


df.to_csv(path + 'pages_main_topic.csv', encoding='utf-8')

total_weight = df[['Weight', 'modularity_class']].groupby(['modularity_class']).sum()
total_weight = total_weight.reindex(pd.RangeIndex(total_weight.index.max() + 1)).fillna(value=0)

nb_topic = int( df['modularity_class'].max() ) + 1
for i in range(nb_topic):
    
    # If the cluster has no topic or only Biography/Geography. Ignored for other cases
    if (len(df.loc[df['modularity_class'] == i]) == 0):
        mod = {'Main_Topics':'Biography/Geography', 'Higher_Weighted_Article':''}
        modularity_df = modularity_df.append(mod, ignore_index=True)
        continue 
        
    
    df.loc[df['modularity_class'] == i, 'Total Weight'] = total_weight.iloc[i].values
    df['Ratio Weight'] = df['Weight'] / df['Total Weight']
    
    
    list_topic = []
    str_main_topics = ''
    idx_topic = 0
    max_idx_topic = len(df.loc[df['modularity_class'] == i])
    while ( (len(list_topic) < 3) and (idx_topic < max_idx_topic) ):
        str_topic = df.loc[df['modularity_class'] == i, 'Subtopic'].values[idx_topic]
        if (str_topic not in list_topic):
            list_topic.append(str_topic)
            str_main_topics +=  '/ ' + str_topic + ' (' + str((df.loc[(df['modularity_class'] == i) & (df['Subtopic'] == str_topic), 'Degree Ratio'].values[0]).round(decimals=2)) + ') '
        idx_topic += 1
    str_main_topics = str_main_topics[2:]
    
    
    
  
    page_id_max_weight = df.loc[df['modularity_class'] == i, 'Weight'].idxmax()
    str_weight = df.loc[page_id_max_weight, 'Label'] + ' (' + str(int(df.loc[page_id_max_weight, 'Weight'])) + ')'
        
    mod = {'Main_Topics':str_main_topics, 'Higher_Weighted_Article':str_weight}
    modularity_df = modularity_df.append(mod, ignore_index=True)


df.rename_axis("Index", inplace=True)    
display(df)    

display(modularity_df)


df.to_csv(path + 'pages_main_topic.csv', encoding='utf-8')

modularity_df.to_csv(path + 'topic_by_degree.csv', encoding='utf-8')



Unnamed: 0_level_0,Degree,Id,Label,Modularity Class,Probability,Qid,Topic,b,betweenesscentrality,g,...,x,y,Count,Total,Degree Ratio,Subtopic,Main topic,Weight,Total Weight,Ratio Weight
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4047,10.0,4710854.0,Men in Black 3,11.0,0.996633,Q327713,Culture.Media.Media*,13.0,3661.964530,95.0,...,18.356537,-11.146811,34.0,243.0,0.139918,Media*,Culture,338.855360,3582.865565,0.094577
2458,6.0,882112.0,Les Valeurs de la famille Addams,11.0,0.997459,Q204725,Culture.Media.Media*,13.0,3191.215691,95.0,...,-2.558718,1.323892,34.0,243.0,0.139918,Media*,Culture,203.481659,3582.865565,0.056793
1232,6.0,268902.0,Men in Black (univers de fiction),11.0,0.962683,Q6816260,Culture.Media.Media*,13.0,662.204281,95.0,...,-17.605244,-3.038025,34.0,243.0,0.139918,Media*,Culture,196.387361,3582.865565,0.054813
4046,10.0,4710854.0,Men in Black 3,11.0,0.997209,Q327713,Culture.Media.Films,13.0,3661.964530,95.0,...,18.356537,-11.146811,14.0,243.0,0.057613,Films,Culture,139.609270,3582.865565,0.038966
4709,4.0,7462852.0,Mr Selfridge,11.0,0.975587,Q4127467,Culture.Media.Media*,13.0,1651.000000,95.0,...,-5.574798,8.121538,34.0,243.0,0.139918,Media*,Culture,132.679825,3582.865565,0.037032
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5135,7.0,10655252.0,Orlane Kanor,9.0,0.114098,Q28803922,Geography.Regions.Europe.Europe*,116.0,0.714286,157.0,...,-2.078572,10.376554,19.0,94.0,0.202128,Europe*,Geography,0.000000,1735.266002,0.000000
5264,7.0,12105251.0,Pauletta Foppa,9.0,0.997029,Q57409544,Culture.Biography.Biography*,116.0,317.684524,157.0,...,-18.402145,20.109282,16.0,94.0,0.170213,Biography*,Culture,0.000000,1735.266002,0.000000
5265,7.0,12105251.0,Pauletta Foppa,9.0,0.129413,Q57409544,Culture.Biography.Women,116.0,317.684524,157.0,...,-18.402145,20.109282,13.0,94.0,0.138298,Women,Culture,0.000000,1735.266002,0.000000
5267,7.0,12105251.0,Pauletta Foppa,9.0,0.000000,Q57409544,Geography.Regions.Europe.Europe*,116.0,317.684524,157.0,...,-18.402145,20.109282,19.0,94.0,0.202128,Europe*,Geography,0.000000,1735.266002,0.000000


Unnamed: 0,Main_Topics,Higher_Weighted_Article
0,Media* (0.14) / Films (0.06) / Literature (0.07),Men in Black 3 (338)
1,Media* (0.15) / Music (0.12) / Performing_arts...,"Âge tendre, la tournée des idoles (474)"
2,Media* (0.11) / Visual_arts* (0.04) / Fashion ...,Paris Première (225)
3,Media* (0.15) / Films (0.1) / Literature (0.06),Shrek 2 (439)
4,Society (0.06) / Media* (0.07) / History (0.05),Alexandre II (empereur de Russie) (170)
5,Media* (0.13) / Films (0.06) / History (0.04),"Guerre et Paix (film, 1956) (1608)"
6,Sports (0.06) / Media* (0.03) / Visual_arts* (...,Tournoi de Wimbledon (156)
7,Sports (0.15) / History (0.03) / Military_and_...,Southampton Football Club (475)
8,Sports (0.14) / Media* (0.03) / History (0.02),ASM Clermont Auvergne (834)
9,Visual_arts* (0.15) / Fashion (0.15) / History...,Miss Tahiti (6077)


In [89]:
### Saving to nodes attributes into the graph ###

graph = nx.read_graphml(path+'graph.graphml')

df = pd.read_csv(path + 'pages_main_topic.csv', index_col = 'Index')
nodes = pd.DataFrame()

df.sort_values(by = ['Id', 'Weight'], inplace=True, ascending = [True, False])

list_id = list(dict.fromkeys(df['Id']))

# Keeping only the node with the higher Weight
for Id in list_id:
    id_max = df.loc[df['Id'] == Id, 'Weight'].idxmax()
    nodes = nodes.append(df.loc[df.index == id_max], ignore_index=True)

    
display(nodes)

add_graph_attribute(graph, nodes, 'Degree')
add_graph_attribute(graph, nodes, 'modularity_class')
add_graph_attribute(graph, nodes, 'betweenesscentrality')    
add_graph_attribute(graph, nodes, 'Qid')
add_graph_attribute(graph, nodes, 'Topic')
add_graph_attribute(graph, nodes, 'Main topic')
add_graph_attribute(graph, nodes, 'Subtopic')
add_graph_attribute(graph, nodes, 'Weight')


nodes.to_csv(path + 'filled_nodes.csv', encoding='utf-8')

nx.write_graphml_lxml(graph, path + 'filled_graph.graphml')  


Unnamed: 0,Degree,Id,Label,Modularity Class,Probability,Qid,Topic,b,betweenesscentrality,g,...,x,y,Count,Total,Degree Ratio,Subtopic,Main topic,Weight,Total Weight,Ratio Weight
0,3.0,1668.0,Kanji,13.0,0.938134,Q82772,Culture.Linguistics,107.0,4400.000000,0.0,...,15.642170,5.252995,1.0,36.0,0.027778,Linguistics,Culture,2.814402,33.896623,0.083029
1,1.0,2175.0,Noël Mamère,16.0,0.492198,Q965109,History_and_Society.Politics_and_government,160.0,0.000000,96.0,...,19.000130,-14.803636,12.0,262.0,0.045802,Politics_and_government,History_and_Society,5.906378,5053.344695,0.001169
2,2.0,4813.0,Kirghizistan,7.0,0.122533,Q813,History_and_Society.Politics_and_government,47.0,1103.000000,42.0,...,-23.728676,-2.896586,3.0,267.0,0.011236,Politics_and_government,History_and_Society,0.735199,3072.677352,0.000239
3,6.0,5073.0,Paul VI,6.0,0.924152,Q16975,Culture.Philosophy_and_religion,0.0,1469.100000,129.0,...,3.840822,0.071065,19.0,315.0,0.060317,Philosophy_and_religion,Culture,105.353310,2003.183375,0.052593
4,8.0,5074.0,Jean XXIII,6.0,0.934405,Q23873,Culture.Philosophy_and_religion,0.0,16762.750000,129.0,...,-9.019573,-10.458631,19.0,315.0,0.060317,Philosophy_and_religion,Culture,142.029583,2003.183375,0.070902
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1099,1.0,12232890.0,Kay Summersby,17.0,0.699264,Q6380290,History_and_Society.Military_and_warfare,233.0,0.000000,124.0,...,16.104740,-4.378316,10.0,372.0,0.026882,Military_and_warfare,History_and_Society,6.992645,2060.629614,0.003393
1100,75.0,12251484.0,Vaimalama Chaves,2.0,0.880807,Q59773587,Culture.Visual_arts.Fashion,183.0,3513.066425,168.0,...,16.471916,-2.976573,79.0,538.0,0.146840,Fashion,Culture,5218.782078,662827.013564,0.007874
1101,1.0,12251867.0,Tournée des quatre tremplins 2018-2019,8.0,0.991433,Q60148844,Culture.Sports,172.0,0.000000,126.0,...,22.461647,-12.265573,32.0,143.0,0.223776,Sports,Culture,31.725842,4375.221779,0.007251
1102,1.0,12267004.0,Lady Gaga Enigma,11.0,0.173298,Q56010926,Culture.Media.Media*,13.0,0.000000,95.0,...,-5.700338,9.479058,34.0,243.0,0.139918,Media*,Culture,5.892139,3582.865565,0.001645
