In [1]:
import json
import unicodedata
from collections import Counter

import pandas as pd
import requests

In [53]:
# Key of the author being disambiguated. Later cells compare it with each
# author's `coaut_key` (first-name initial + last name, normalized).
input_coaut_key = 'folland'
url = 'https://scanr-api.enseignementsup-recherche.gouv.fr/api/v2/publications/search'

# Search the scanR publications index on the authors' full name.
response = requests.post(
    url,
    json={
        'query': '"frédéric olland"',
        'pageSize': 500,
        'searchFields': ['authors.fullName'],
        'sourceFields': ['title', 'id', 'authors', 'source.journalIssns', 'keywords', 'type'],
    },
    # Without a timeout a stalled connection would hang the kernel forever.
    timeout=60,
)
# Fail on an HTTP error here rather than on an obscure KeyError('results') below.
response.raise_for_status()
data = response.json()

# Flatten each hit to the handful of fields the clustering cells use.
publis = []
for d in data['results']:
    value = d['value']
    publis.append({
        'id': value['id'],
        'title': value['title']['default'],
        'authors': value['authors'],
        'type': value['type'],
        'issns': value.get('source', {}).get('journalIssns', []),
        'keywords': value.get('keywords', {}).get('default', []),
    })

# Cache the result; the context manager guarantees the file is flushed and closed.
with open('publis.json', 'w') as cache_file:
    json.dump(publis, cache_file)

In [54]:
#publis = json.load(open('publis.json', 'r'))

In [55]:
# Count authors per publication and drop the nested `person` record of every
# author. Only for thesis records (id containing 'these') is the person id
# kept, promoted to the author's own `id` field.
for p in publis:
    authors = p.get('authors', [])
    p['nb_authors'] = len(authors)
    is_thesis = 'these' in p['id']
    for a in authors:
        if 'person' not in a:
            continue
        if is_thesis:
            a['id'] = a['person']['id']
        del a['person']

# Publications with the most authors come first.
publis = sorted(publis, key=lambda p: p['nb_authors'], reverse=True)

In [56]:
def normalize(x):
    """Return a canonical matching key for a name or keyword.

    The string is lower-cased, every diacritic is removed (not only
    ``è``/``é``), and all whitespace, hyphens and dots are dropped, so that
    spelling variants such as ``"Jean-Noël"`` and ``"jean noel"`` yield the
    same key.

    Args:
        x: the string to normalize.

    Returns:
        The normalized string (possibly empty).
    """
    # NFKD splits an accented letter into base letter + combining mark(s);
    # dropping the combining marks leaves the unaccented letter.
    decomposed = unicodedata.normalize('NFKD', x.lower())
    unaccented = ''.join(ch for ch in decomposed if not unicodedata.combining(ch))
    # split() without argument cuts on any whitespace run (tabs, newlines, ...),
    # which also covers the leading/trailing strip.
    compact = ''.join(unaccented.split())
    return compact.replace('-', '').replace('.', '')

In [57]:
# Tabular view of the publications (one row per entry of `publis`);
# the cell output is the number of publications.
df = pd.DataFrame(publis)
len(df)

3

In [58]:
def merge_clusters(proposed_clusters, target, entity_to_cluster, cluster_to_entities):
    """Merge every cluster of ``proposed_clusters`` into ``target``, in place.

    Args:
        proposed_clusters: iterable of cluster ids to absorb (may contain
            ``target`` itself, which is left in place).
        target: id of the surviving cluster; it is created in
            ``cluster_to_entities`` if it does not exist yet.
        entity_to_cluster: dict entity -> cluster id, updated in place.
        cluster_to_entities: dict cluster id -> list of entities, updated
            in place.

    Returns:
        None. Both mappings are mutated.
    """
    print(f"merging {proposed_clusters} to {target}")

    # Re-point every entity of an absorbed cluster (values only: safe while iterating).
    for aut, cluster in entity_to_cluster.items():
        if cluster in proposed_clusters:
            entity_to_cluster[aut] = target

    # Move the entity lists as well, so the reverse index stays consistent with
    # entity_to_cluster instead of silently losing the absorbed entities.
    merged = list(cluster_to_entities.get(target, []))
    already_there = set(merged)
    for c in proposed_clusters:
        if c == target:
            continue
        # pop with a default: an unknown or already-merged cluster is not an error.
        for entity in cluster_to_entities.pop(c, []):
            if entity not in already_there:
                already_there.add(entity)
                merged.append(entity)
    cluster_to_entities[target] = merged

In [59]:
def get_main_modality(x):
    """Return the strictly most frequent element of ``x``.

    Args:
        x: iterable of hashable values.

    Returns:
        The value that occurs strictly more often than any other one, or
        ``None`` when ``x`` is empty or the two most frequent values are tied.
    """
    top_2 = Counter(x).most_common(2)
    if not top_2:
        return None
    # A single distinct value is trivially the winner; otherwise require a strict lead.
    if len(top_2) == 1 or top_2[0][1] > top_2[1][1]:
        return top_2[0][0]
    return None

In [60]:
# Greedy clustering of the publications. Each publication is described by its
# "linked entities" (co-author keys, journal ISSNs, normalized keywords) and
# joins the cluster its already-seen entities point to, or opens a new one.
entity_to_cluster = {}     # entity -> cluster id
cluster_to_entities = {}   # cluster id -> list of entities

for p in publis:
    entity_linked = []
    for a in p.get('authors', []):
        if a.get('lastName') and a.get('firstName'):
            first_name = normalize(a.get('firstName'))
            if not first_name:
                # The first name normalized to nothing (e.g. only punctuation):
                # there is no initial to build a key from.
                continue
            co_aut = first_name[0] + normalize(a.get('lastName'))
            a['coaut_key'] = co_aut
            # Skip the searched author and keys too short to be discriminating.
            if co_aut != input_coaut_key and len(co_aut) > 4:
                entity_linked.append(co_aut)
    for issn in p.get('issns', []):
        entity_linked.append(issn)
    for kw in p.get('keywords', []):
        entity_linked.append(normalize(kw))

    entity_linked = list(set(entity_linked))
    p['entity_linked'] = entity_linked
    if not entity_linked:
        continue

    # Clusters already associated with this publication's entities
    # (one entry per known entity, so duplicates act as votes).
    possible_clusters = [
        entity_to_cluster[entity]
        for entity in entity_linked
        if entity in entity_to_cluster
    ]

    current_cluster = None
    nb_distinct = len(set(possible_clusters))
    if nb_distinct > 1:
        # Majority vote; None on a tie, which opens a new cluster below.
        current_cluster = get_main_modality(possible_clusters)
    elif nb_distinct == 1:
        current_cluster = possible_clusters[0]

    if current_cluster is None:
        current_cluster = len(cluster_to_entities)
        cluster_to_entities[current_cluster] = []

    # Only unseen entities are assigned; known ones keep their first cluster.
    for entity in entity_linked:
        if entity not in entity_to_cluster:
            entity_to_cluster[entity] = current_cluster

    cluster_to_entities[current_cluster] += entity_linked
    cluster_to_entities[current_cluster] = list(set(cluster_to_entities[current_cluster]))
                   

In [61]:
# Tag each publication with its cluster when all of its linked entities agree
# on a single one; otherwise (no entity, or conflicting clusters) use None.
for p in publis:
    clusters = list({entity_to_cluster[entity] for entity in p['entity_linked']})
    p['cluster'] = clusters[0] if len(clusters) == 1 else None

In [62]:
def get_id(cluster_id):
    """Return the identifier of the searched author found in a cluster.

    Scans the publications of ``publis`` tagged with ``cluster_id`` and
    collects the ``id`` of every author whose ``coaut_key`` equals
    ``input_coaut_key``.

    Args:
        cluster_id: value compared with each publication's ``cluster`` field.

    Returns:
        The identifier when exactly one distinct id is found; ``None`` when
        none is found, or when several distinct ids are found (a warning is
        printed in that case).
    """
    known_ids = []
    for p in publis:
        if p['cluster'] != cluster_id:
            continue
        # Default to [] like the other cells, so a record without authors is skipped.
        for a in p.get('authors', []):
            if 'coaut_key' in a and a['coaut_key'] == input_coaut_key and 'id' in a:
                known_ids.append(a['id'])
    if len(set(known_ids)) > 1:
        print(f"cluster {cluster_id} seems mixed up")
        return None
    if known_ids:
        return known_ids[0]
    return None

In [63]:
# Rename every cluster in which the searched author carries a known identifier
# to that identifier. The keys are snapshotted first because merge_clusters
# mutates cluster_to_entities.
existing_clusters = list(cluster_to_entities.keys())
for cluster in existing_clusters:
    idref = get_id(cluster)
    if idref:
        # merge_clusters takes both mappings explicitly; omitting them raises TypeError.
        merge_clusters([cluster], idref, entity_to_cluster, cluster_to_entities)

merging [0] to idref183975154


In [64]:
# Propagate identified clusters: when a publication straddles several clusters
# and exactly one of them is an idref-labelled cluster, merge the others into
# it. Repeat (at most 10 passes) because a merge can unlock further merges.
for _ in range(10):
    merged_any = False
    for p in publis:
        # Sorted (by string form) so the joined label below is deterministic.
        clusters = sorted({entity_to_cluster[c] for c in p['entity_linked']}, key=str)

        if len(clusters) > 1:
            matching_ids = [c for c in clusters if 'idref' in str(c)]
            if len(matching_ids) == 1:
                merge_clusters(clusters, matching_ids[0], entity_to_cluster, cluster_to_entities)
                # Every entity of this publication now points to the target.
                clusters = matching_ids
                merged_any = True

        # Label is a string: the single cluster, or the ';'-joined candidates
        # when the conflict could not be resolved.
        p['cluster'] = ';'.join(str(c) for c in clusters) if clusters else None
    if not merged_any:
        # Nothing changed during a full pass: further passes would be no-ops.
        break

In [65]:
# Final table of publications; the cell output is the number of publications
# per value of the `cluster` column.
df_res = pd.DataFrame(publis)
df_res.cluster.value_counts()

idref183975154    2
1                 1
Name: cluster, dtype: int64

In [68]:
# Inspect the publications labelled '1' (compared as a string).
df_res[df_res.cluster=='1']#.sample(10)

Unnamed: 0,id,title,authors,type,issns,keywords,nb_authors,entity_linked,cluster
1,doi10.3917/redp.244.0537,Self-selection into export markets: Does produ...,"[{'role': 'author', 'firstName': 'Xi', 'lastNa...",journal-article,"[0373-2630, 2105-2883]",[],2,"[2105-2883, 0373-2630, xchen]",1


In [None]:
# Scratch check: str.replace removes every space, not only the first one.
"tol de ".replace(" ", '')

In [None]:
# Authors of the first publication labelled '0'.
# NOTE(review): the value_counts output shown earlier lists no '0' label, so
# this selection would be empty and `.values[0]` would raise IndexError -
# confirm which label should be inspected here.
df_res[df_res.cluster=='0'].authors.values[0]

In [None]:
# Build an encoded feature table (missing values -> 0) and keep only the
# columns whose total is at least 5.
# NOTE(review): `data_to_cluster` is not defined in any visible cell - confirm
# where it comes from, otherwise this cell fails on a fresh kernel.
df_encoded = pd.DataFrame(data_to_cluster).fillna(0)
cols_to_drop = [c for c in list(df_encoded.columns) if df_encoded[c].sum() < 5]
df_encoded = df_encoded.drop(cols_to_drop, axis=1)
df_encoded

In [None]:
# TF-IDF features computed on the publication titles (uni- and bi-grams,
# English stop words removed, at most 100 features).
from sklearn.feature_extraction.text import TfidfVectorizer
vec = TfidfVectorizer(use_idf=True, stop_words='english', strip_accents='unicode', ngram_range=(1,2), max_features=100)
matrix = vec.fit_transform([p['title'] for p in publis])
# get_feature_names() was removed in scikit-learn 1.2; use its replacement when
# available and fall back to the old name on older releases.
if hasattr(vec, 'get_feature_names_out'):
    feature_names = vec.get_feature_names_out()
else:
    feature_names = vec.get_feature_names()
df_title = pd.DataFrame(matrix.toarray(), columns=feature_names)
# Drop features shorter than 5 characters.
cols_to_drop = [c for c in df_title.columns if len(c) < 5]
df_title = df_title.drop(cols_to_drop, axis=1)
df_title

In [None]:
# TF-IDF features computed on the co-author keys of each publication
# (one "document" per publication, keys joined by spaces).
from sklearn.feature_extraction.text import TfidfVectorizer
vec = TfidfVectorizer(use_idf=True, strip_accents='unicode', max_features=100)

co_author_docs = []
for p in publis:
    co_authors = p.get('co_authors')
    if co_authors is None:
        # NOTE(review): no visible cell sets p['co_authors'], so the original
        # lookup raised KeyError. Fall back to the `coaut_key` of every author
        # other than the searched one - confirm this is the intended content.
        co_authors = [
            a['coaut_key']
            for a in p.get('authors', [])
            if 'coaut_key' in a and a['coaut_key'] != input_coaut_key
        ]
    co_author_docs.append(' '.join(co_authors))

matrix = vec.fit_transform(co_author_docs)
# get_feature_names() was removed in scikit-learn 1.2; use its replacement when
# available and fall back to the old name on older releases.
if hasattr(vec, 'get_feature_names_out'):
    feature_names = vec.get_feature_names_out()
else:
    feature_names = vec.get_feature_names()
df_co = pd.DataFrame(matrix.toarray(), columns=feature_names)
# Drop features shorter than 5 characters.
cols_to_drop = [c for c in df_co.columns if len(c) < 5]
df_co = df_co.drop(cols_to_drop, axis=1)
df_co

In [None]:
# Feature matrix used by the clustering cells: co-author features and title
# features side by side (column-wise concatenation).
df_to_cluster = pd.concat([df_co, df_title], axis=1)
#df_to_cluster = df_co
clustered_cols = df_to_cluster.columns

# Optional standardization of the features, currently disabled.
#from sklearn.preprocessing import StandardScaler
#scaler = StandardScaler()
#df_to_cluster = pd.DataFrame(scaler.fit_transform(df_to_cluster))
#df_to_cluster.columns = clustered_cols
df_to_cluster

In [None]:
# Hierarchical clustering of the feature matrix into 5 clusters; the cell
# output is the number of rows per label.
from sklearn.cluster import AgglomerativeClustering
clustering = AgglomerativeClustering(n_clusters=5)
clusters = clustering.fit_predict(df_to_cluster)
labels = pd.DataFrame(clusters)
labeled = pd.concat([df_to_cluster, labels], axis=1).rename({0: 'labels'}, axis=1)
labeled.labels.value_counts()

In [None]:
# K-means clustering of the feature matrix into 10 clusters; the labels are
# appended to the publication table and the feature matrix.
from sklearn.cluster import KMeans
# K-means starts from random centroids: fix the seed so a re-run of the
# notebook yields the same labels.
clustering = KMeans(n_clusters=10, random_state=0)
clusters = clustering.fit_predict(df_to_cluster)
labels = pd.DataFrame(clusters)
labeled = pd.concat([df, df_to_cluster,labels],axis=1)
labeled = labeled.rename({0:'labels'},axis=1)
labeled.labels.value_counts()

In [None]:
# Rows assigned to label 1 by the last clustering run.
labeled[labeled.labels==1]

In [None]:
# Display the publications table.
df

In [None]:
# Display the labelled table produced by the last clustering run.
labeled

In [None]:
# importing necessary libraries
import pandas as pd
import numpy as np

from kmodes.kmodes import KModes
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Display the publications table.
df

In [None]:
# Number of title features kept after filtering.
len(df_title.columns)