In [185]:
import pandas as pd
import networkx as nx
import numpy as np
import matplotlib.pyplot as plt
import itertools
from collections import Counter

from inflection import singularize 
import nltk
import difflib

In [10]:
excel_path_and_filename = "../../../Downloads/SMEs_Scopus_2013-2017.xlsx"
rawDF = pd.read_excel(excel_path_and_filename)
df = rawDF[['Author Keywords', 'Year', 'Abstract', 'Index Keywords']]

In [248]:
def basic_filter_Series(i_Series):
    """
    주관적이지 않은 필터링, 아주 기본적인 filtering. 
    split 부터 keyword list of list로 변환하여 리턴
    """
    r_Series = i_Series.copy().fillna("").apply(lambda s: s.strip().lower().split(";"))
    def replace_sp_chr(input_s):
        return "".join(map(lambda c: c if 'a'<=c and c<='z' else c if '0'<=c and c<='9'else " ", input_s)).strip()
    def remove_double_space(input_s):
        while "  " in input_s:
            input_s = input_s.replace("  ", " ")
        return input_s.strip()
    r_Series = r_Series.apply(
        lambda ks: list(map(
            lambda k: remove_double_space(replace_sp_chr(k)), ks)))
    
    all_kwd_set = set(itertools.chain.from_iterable(list(r_Series)))
    to_singular_dict = {}
    for kwd in all_kwd_set:
        singularized_kwd = singularize(kwd)
        if singularized_kwd !=kwd and singularized_kwd in all_kwd_set:
            to_singular_dict[kwd] = singularized_kwd
    """remove blank string"""
    r_Series = r_Series.apply(lambda ks:filter(lambda k: True if k!="" else False, ks))
    """singularize """
    r_Series = r_Series.apply(
        lambda ks: sorted(list(set(map(
            lambda k: to_singular_dict[k].strip() if k in to_singular_dict.keys() else k.strip(), ks
        )))))    
    return r_Series
def make_graph_from_series(i_Series):
    rG = nx.Graph()
    rG.add_nodes_from(
        (n[0], {'weight':n[1]}) for n in Counter(itertools.chain.from_iterable(i_Series)).most_common())
    edges = []
    for x in i_Series:
        if len(x)!=0:
            edges += [(x[i], x[j]) for i in range(0, len(x)-1) for j in range(i+1, len(x))]
    rG.add_edges_from(
        [(e[0][0], e[0][1], {'weight':e[1]}) for e in Counter(edges).most_common()])
    return rG
def syntactical_simialrity_dict(i_Series, above_node_w=10, above_sim=0.9):
    kwd_counter = itertools.chain.from_iterable(i_Series)
    kwd_count_dct = {w:c for w, c in Counter(kwd_counter).most_common() if c >= above_node_w}
    print("for computation efficienty, cut down node got below weight, remaining node is {}".format(len(kwd_count_dct)))
    """
    """
    kwd_changed_dct = {}
    for w1 in sorted(kwd_count_dct.keys()):
        for w2 in sorted(kwd_count_dct.keys()):
            if w1 < w2 and w1[0]==w2[0] and " " in w1 and " " in w2:
                """중복을 피하고, 처음 캐릭터가 같고, 해당 단어가 복합어일 것 
                """
                sim_v = difflib.SequenceMatcher(None,w1, w2).ratio()
                if sim_v >= above_sim:
                    if kwd_count_dct[w1] >= kwd_count_dct[w2]:
                        kwd_changed_dct[w2]=w1
                    else:
                        kwd_changed_dct[w1]=w2
    def make_non_transitive(input_dct):
        print('solving transivity')
        non_transvitiy_kwd_dict = {}
        for k, v in input_dct.items():
            while v in input_dct.keys():
                v = input_dct[v]
            non_transvitiy_kwd_dict[k] = v
        return non_transvitiy_kwd_dict
    return make_non_transitive(kwd_changed_dct)

def transform_by_dict(i_Series, input_dct):
    print('keyword set size: {}'.format(len(set(itertools.chain.from_iterable(i_Series)))))
    r_S = i_Series.apply(lambda ks: list(set([input_dct[k] if k in input_dct.keys() else k for k in ks])))
    print('keyword set size: {}'.format(len(set(itertools.chain.from_iterable(r_S)))))
    return r_S
temp = basic_filter_Series(rawDF['Author Keywords'])
import time
start = time.time()
temp = transform_by_dict(temp, syntactical_simialrity_dict(temp, 6, 0.9))
end = time.time()
print(end-start)
temp.head()
g = make_graph_from_series(temp)

print("complete")

for computation efficienty, cut down node got below weight, remaining node is 1077
solving transivity
keyword set size: 22883
keyword set size: 22842
2.0763649940490723
complete


for computation efficienty, cut down node got below weight, remaining node is 1077
solving transivity
keyword set size: 22883
keyword set size: 22842


495

In [277]:
temp = basic_filter_Series(rawDF['Author Keywords'])
temp = transform_by_dict(temp, syntactical_simialrity_dict(temp, 6, 0.9))
g = make_graph_from_series(temp)
"""node가 너무 많으면, 계산이 어려워져서 일정 이상의 weight를 가진 node만이 의미있는 node라고 가정했습니다. 
"""
g.remove_nodes_from( [n[0] for n in g.copy().nodes(data=True) if n[1]['weight']<=8] )

temp_df = pd.DataFrame(
    nx.adjacency_matrix(g).toarray(), index=[n for n in g.nodes()], columns=[n for n in g.nodes()]
)
print("the dimension of dataframe is {}".format(temp_df.shape[0]))

###

from scipy.spatial.distance import euclidean, jaccard
def make_close_pair_lst(adj_df, dist_func):
    r_lst = []
    for i in range(0, len(adj_df)-1):
        for j in range(i+1, len(adj_df)):
            r_lst.append(
                (adj_df.index[i], adj_df.index[j], dist_func(adj_df.iloc()[i], adj_df.iloc()[j]))
            )
    return sorted(r_lst, key=lambda x: x[2])
        
print("top 10 closest pair: euclidean space::")
"""euclidean distance의 경우는 표준화가 필요함. 
"""
for p in make_close_pair_lst(temp_df.apply(lambda col: (col)/(col.max())).fillna(0), euclidean)[:10]:
    print(p)
print("top 10 closest pair: jaccard space::")
for p in make_close_pair_lst(temp_df, jaccard)[:10]:
    print(p)

for computation efficienty, cut down node got below weight, remaining node is 1077
solving transivity
keyword set size: 22883
keyword set size: 22842
the dimension of dataframe is 639
top 10 closest pair: euclidean space::
('binary mixture', 'metallacarboranes', 0.0)
('binary mixture', 'flotation', 0.0)
('metallacarboranes', 'flotation', 0.0)
('binary mixture', 'carbapenemase', 0.0125)
('carbapenemase', 'metallacarboranes', 0.0125)
('carbapenemase', 'flotation', 0.0125)
('transient stability', 'battery', 0.29829779253058464)
('transient stability', 'carbapenemase', 0.3305048960211117)
('transient stability', 'binary mixture', 0.3307411923149668)
('transient stability', 'metallacarboranes', 0.3307411923149668)
top 10 closest pair: jaccard space::


  np.double(np.bitwise_or(u != 0, v != 0).sum()))


('exploration', 'exploitation', 0.29629629629629628)
('pecking order theory', 'trade off theory', 0.45454545454545453)
('microstructure', 'niti', 0.63636363636363635)
('transient stability', 'power quality', 0.66666666666666663)
('power quality', 'ac loss', 0.66666666666666663)
('superelasticity', 'actuator', 0.66666666666666663)
('superelasticity', 'nitinol', 0.66666666666666663)
('industrial symbiosis', 'circular economy', 0.66666666666666663)
('usability', 'requirements engineering', 0.69230769230769229)
('ahp', 'analytic hierarchy process', 0.70588235294117652)


In [None]:
temp_auth = basic_filter_Series(rawDF['Author Keywords'])
temp_auth = transform_by_dict(temp_auth, syntactical_simialrity_dict(temp_auth, 6, 0.9))

temp_ind = basic_filter_Series(rawDF['Index Keywords'])
temp_ind = transform_by_dict(temp_ind, syntactical_simialrity_dict(temp_ind, 6, 0.9))

temp_df = pd.DataFrame({'Author Keywrods':temp_auth, 'Index Keywords':temp_ind})
temp_df.head()
#temp = transform_by_dict(temp, syntactical_simialrity_dict(temp, 6, 0.9))
def make_bigraph_from_series(iS, jS):
    if len(iS)!=len(jS):
        print("different length of Series")
        return None
    rG = nx.Graph()
    edges = []
    def make_edges_from_bipartite_sets(setA, setB):
        if len(setA)==[] or len(setB)==[]:
            return []
        else:
            return [(n1, n2+"(i)") for n1 in setA for n2 in setB]
    for i in range(0, len(iS)):
        if i%100==0:
            print("complete {}".format(i))
        edges+=make_edges_from_bipartite_sets(iS.iloc()[i], iS.iloc()[j])
    rG.add_edges_from([(e[0][0], e[0][1], {'weight':e[1]}) for e in Counter(edges).most_common()])
    return rG
biG = make_bigraph_from_series(temp_auth, temp_ind)

for computation efficienty, cut down node got below weight, remaining node is 1077
solving transivity
keyword set size: 22883
keyword set size: 22842


In [129]:
"""
가장 중요한 node를 삭제하고, 몇 가지의 그룹으로 나누어지는지 확인해보자. 
일종의 클러스터링 분석이라고 생각함. 만약 이 아이가 없어도 커넥션이 유지되면 이 그룹을 나름 긴밀한 구조라고 평가할 수도 있음 
하나의 키워드만 삭제해도, 커넥션이 유지되지 못함.
이런 식으로 빈도가 가장 높은 키워드(이걸 빈도로 하는게 적합한지는 모르겠는데, degree centrality로 하는게 더 적합할 수도 있고 
하나를 지워도 여전히 네트워크에 아무런 타격이 없는 경우가, 해당 네트워크가 어느 정도 안정화되어 있다 라고 평가할 수 있지 않을까 싶음. 
그러나, 이걸 이론적으로 증명하려면 조금 어렵다는 생각이 들기는 하는ㄷ. 
"""
print(len(g.nodes(data=True)))
g1 = g.copy()
for i in range(0, 100):
    before = nx.is_connected(g1)
    removed_k = max(nx.degree_centrality(g1).items(), key=lambda k: k[1])
    g1.remove_node(removed_k[0])
    after = nx.is_connected(g1)
    if nx.is_connected(g1)==True:
        print("after try {}: remove top degree centrality doesn't matter".format(i))
        break
    else:
        g1 = max(nx.connected_component_subgraphs(g1), key=lambda subG: len(subG.nodes()))
print(nx.is_connected(g1))
print(len(g1.nodes(data=True)))

22883
after try 51: remove top degree centrality doesn't matter
True
19363


In [184]:
def make_graph_dense(i_G):
    """graph에서 degree centrality가 가장 높은 node를 삭제해도, graph는 여전히 connected여야 함
    그럴때마다 잘려나가는 node는 모두 삭제함. 
    """
    tempG = i_G.copy()
    tempG = max(nx.connected_component_subgraphs(tempG), key=lambda subG: len(subG.nodes()))
    removed_nodes = set(i_G.nodes())
    for i in range(0, 10):
        top_deg_node = max(nx.degree_centrality(tempG).items(), key=lambda k: k[1])[0]
        print("try {}: remove top degree node: {}".format(i, top_deg_node))
        removed_nodes.remove(top_deg_node)# 이 node는 마지막에 다시 넣어줘야 함.
        tempG.remove_nodes_from([top_deg_node])
        if nx.is_connected(tempG)==True:
            print("after try {}: remove top degree centrality doesn't matter".format(i))
            break
        else:
            tempG = max(nx.connected_component_subgraphs(tempG), key=lambda subG: len(subG.nodes()))
    """만약 100번을 넘어도 커넥션이 유지되지 않는다면 이를 메세지로 알려주는 것이 필요한데."""
    return tempG # 이걸 가지고 해보자 한번. 
    removed_nodes.difference_update(set(tempG.nodes()))
    rG = i_G.copy()
    rG.remove_nodes_from(removed_nodes)
    """단 이렇게 변형했을때, node의 weight attribute는 그대로 유지됨. 뭐 근데 별 의미없을 수 잇지만."""
    return rG
denseG = make_graph_dense(g)

Exception ignored in: 'zmq.backend.cython.message.Frame.__dealloc__'
Traceback (most recent call last):
  File "zmq/backend/cython/checkrc.pxd", line 12, in zmq.backend.cython.checkrc._check_rc (zmq/backend/cython/message.c:4294)
KeyboardInterrupt

try 0: remove top degree node: sme
try 1: remove top degree node: small and medium sized enterprise
try 2: remove top degree node: innovation
try 3: remove top degree node: small and medium enterprise
try 4: remove top degree node: entrepreneurship
try 5: remove top degree node: small and medium sized enterprises sme
try 6: remove top degree node: performance
try 7: remove top degree node: small to medium sized enterprise
try 8: remove top degree node: small and medium enterprises sme
try 9: remove top degree node: cloud computing





In [None]:
denseG.nodes(data=True)

In [179]:
tempG = denseG.copy()
tempG.remove_nodes_from(
    [n[0] for n in tempG.nodes(data=True) if n[1]['weight'] < 50]
)
print(len(tempG.nodes()))
adj_df = pd.DataFrame(nx.adjacency_matrix(tempG).toarray(), index=tempG.nodes(), columns=tempG.nodes())


from scipy.spatial.distance import euclidean

dist_lst = []
for i in range(0, len(adj_df)-1):
    for j in range(i+1, len(adj_df)):
        dist_lst.append(
            (adj_df.index[i], adj_df.index[j], euclidean(adj_df.iloc()[i], adj_df.iloc()[j]))
        )
sorted(dist_lst, key=lambda x: x[2], reverse=True)[:20]

59


[('sme', 'innovation', 412.37846694511097),
 ('sme', 'superconducting magnetic energy storage', 397.28453279733907),
 ('sme', 'superconducting magnetic energy storage smes', 397.25306795542815),
 ('sme', 'shape memory effect', 397.25306795542815),
 ('sme', 'south africa', 393.39166234174309),
 ('sme', 'small medium enterprise', 392.96437497564585),
 ('sme', 'risk', 392.44744871128927),
 ('sme', 'sustainable development', 392.30982654019766),
 ('sme', 'trust', 391.97066216746373),
 ('sme', 'erp', 391.92218615434365),
 ('sme', 'survey', 391.59545451907383),
 ('sme', 'risk management', 391.22116507162542),
 ('sme', 'productivity', 391.18154353190027),
 ('sme', 'social media', 390.0487149062281),
 ('sme', 'indium', 389.61647809095547),
 ('sme', 'intellectual capital', 389.35459416834931),
 ('sme', 'export', 389.31606696872916),
 ('sme', 'e commerce', 389.28267364474368),
 ('sme', 'barrier', 389.06683230519661),
 ('sme', 'internationalisation', 388.93187064060459)]

In [266]:
pd.DataFrame({"a":[i for i in range(0, 10)]}).shape[0]

10