In [418]:
import pandas as pd
import networkx as nx
import numpy as np
import matplotlib.pyplot as plt
import itertools
from collections import Counter

from inflection import singularize 
import nltk
import difflib

from scipy.spatial.distance import euclidean, jaccard

In [10]:
excel_path_and_filename = "../../../Downloads/SMEs_Scopus_2013-2017.xlsx"
rawDF = pd.read_excel(excel_path_and_filename)
df = rawDF[['Author Keywords', 'Year', 'Abstract', 'Index Keywords']]

In [427]:
"""
주관적이지 않은 필터링, 아주 기본적인 filtering. 
split 부터 keyword list of list로 변환하여 리턴
"""
def basic_filter_Series(i_Series):
    r_Series = i_Series.copy().fillna("").apply(lambda s: s.strip().lower().split(";"))
    def replace_sp_chr(input_s):
        return "".join(map(lambda c: c if 'a'<=c and c<='z' else c if '0'<=c and c<='9'else " ", input_s)).strip()
    def remove_double_space(input_s):
        while "  " in input_s:
            input_s = input_s.replace("  ", " ")
        return input_s.strip()
    r_Series = r_Series.apply(
        lambda ks: list(map(
            lambda k: remove_double_space(replace_sp_chr(k)), ks)))
    
    all_kwd_set = set(itertools.chain.from_iterable(list(r_Series)))
    to_singular_dict = {}
    for kwd in all_kwd_set:
        singularized_kwd = singularize(kwd)
        if singularized_kwd !=kwd and singularized_kwd in all_kwd_set:
            to_singular_dict[kwd] = singularized_kwd
    """remove blank string"""
    r_Series = r_Series.apply(lambda ks:filter(lambda k: True if k!="" else False, ks))
    """singularize """
    r_Series = r_Series.apply(
        lambda ks: sorted(list(set(map(
            lambda k: to_singular_dict[k].strip() if k in to_singular_dict.keys() else k.strip(), ks
        )))))    
    return r_Series
"""
series로부터 그래프를 만들어주는 함수
"""
def make_graph_from_series(i_Series):
    rG = nx.Graph()
    rG.add_nodes_from(
        (n[0], {'weight':n[1]}) for n in Counter(itertools.chain.from_iterable(i_Series)).most_common())
    edges = []
    for x in i_Series:
        if len(x)!=0:
            edges += [(x[i], x[j]) for i in range(0, len(x)-1) for j in range(i+1, len(x))]
    rG.add_edges_from(
        [(e[0][0], e[0][1], {'weight':e[1]}) for e in Counter(edges).most_common()])
    return rG
"""
형태적으로 유사한 키워드를 찾아서 변환딕셔너리를 리턴. 
node의 weight가 above_node_w여야 하고, above_sim보다 유사도가 높아야 함
"""
def syntactical_simialrity_dict(i_Series, above_node_w=10, above_sim=0.9):
    kwd_counter = itertools.chain.from_iterable(i_Series)
    kwd_count_dct = {w:c for w, c in Counter(kwd_counter).most_common() if c >= above_node_w}
    print("for computation efficienty, cut down node got below weight, remaining node is {}".format(len(kwd_count_dct)))
    """
    """
    kwd_changed_dct = {}
    for w1 in sorted(kwd_count_dct.keys()):
        for w2 in sorted(kwd_count_dct.keys()):
            if w1 < w2 and w1[0]==w2[0] and " " in w1 and " " in w2:
                """중복을 피하고, 처음 캐릭터가 같고, 해당 단어가 복합어일 것 
                """
                sim_v = difflib.SequenceMatcher(None,w1, w2).ratio()
                if sim_v >= above_sim:
                    if kwd_count_dct[w1] >= kwd_count_dct[w2]:
                        kwd_changed_dct[w2]=w1
                    else:
                        kwd_changed_dct[w1]=w2
    """
    변환 딕셔너리를 non transitive하게, a==>b, b==>c 인 형태를 a==>c, b==>c 인 형태로 바꿔줌
    """
    def make_non_transitive(input_dct):
        print('solving transivity')
        non_transvitiy_kwd_dict = {}
        for k, v in input_dct.items():
            while v in input_dct.keys():
                v = input_dct[v]
            non_transvitiy_kwd_dict[k] = v
        return non_transvitiy_kwd_dict
    return make_non_transitive(kwd_changed_dct)

"""
입력받은 input_dct에 따라서 키워드를 변환하여 새로운 Series를 리턴
"""
def transform_by_dict(i_Series, input_dct):
    print()
    print("syntactically similar word를 변환해줍니다.")
    print('keyword set size: {}'.format(len(set(itertools.chain.from_iterable(i_Series)))))
    r_S = i_Series.apply(lambda ks: list(set([input_dct[k] if k in input_dct.keys() else k for k in ks])))
    print('keyword set size: {}'.format(len(set(itertools.chain.from_iterable(r_S)))))
    return r_S
"""
Series를 Counter로 변환한 다음, below_weight보다 큰 node만 남기고 Series를 리턴
"""
def drop_lower_n(i_Series, below_weight=10):
    print()
    print("series에서 weight가 {}와 같거나 작은 node를 삭제합니다.".format(below_weight))
    print('keyword set size: {}'.format(len(set(itertools.chain.from_iterable(i_Series)))))
    kwd_counter = {k:v for k, v in Counter(itertools.chain.from_iterable(i_Series)).most_common()}
    r_S = i_Series.apply(lambda ks: list(set([k for k in ks if kwd_counter[k] >= below_weight])))
    print('keyword set size: {}'.format(len(set(itertools.chain.from_iterable(r_S)))))
    return r_S
"""
불필요한 remove_node를 삭제함
"""
def remove_node_from_series(i_Series, remove_nodes):
    print("series에서 불필요한 node를 삭제합니다.")
    def func_remove_node(input_l):
        return [k for k in input_l if k not in remove_nodes]
    return i_Series.apply(func_remove_node)
"""
두 series로부터 bipartite graph를 만듬
"""
def make_bigraph_from_series(iS, jS):
    if len(iS)!=len(jS):
        print("different length of Series")
        return None
    rG = nx.Graph()
    edges = []
    def make_edges_from_bipartite_sets(setA, setB):
        if setA==[] or setB==[]:
            return []
        else:
            return [(n1, n2+"(i)") for n1 in setA for n2 in setB]
    for i in range(0, len(iS)):
        edges+=make_edges_from_bipartite_sets(iS.iloc()[i], jS.iloc()[i])
    rG.add_edges_from([(e[0][0], e[0][1], {'weight':e[1]}) for e in Counter(edges).most_common()])
    print('is bipartite: {}'.format(nx.is_bipartite(rG)))
    print('is connected: {}'.format(nx.is_connected(rG)))
    return rG


excel_path_and_filename = "../../../Downloads/SMEs_Scopus_2013-2017.xlsx"
rawDF = pd.read_excel(excel_path_and_filename)
df = rawDF[['Author Keywords', 'Year', 'Abstract', 'Index Keywords']].copy()

author_series = basic_filter_Series(df['Author Keywords'])
auth_syntactic_change_dict = syntactical_simialrity_dict(author_series, 10, 0.9)
author_series = transform_by_dict(author_series, auth_syntactic_change_dict)
author_series = drop_lower_n(author_series, 8)

ind_series = basic_filter_Series(df['Index Keywords'])
ind_syntactic_change_dict = syntactical_simialrity_dict(ind_series, 10, 0.9)
ind_series = transform_by_dict(ind_series, ind_syntactic_change_dict)
ind_series = drop_lower_n(ind_series, 15)

""" adj matrix
"""
"""
remove node because it is useless
이 부분은 clustering의 결과로 나온 cluster 중에서 분석하려고 하는 대상과 거리가 있는 키워드 묶음을 선정하여 기존 series에서 삭제하였다. 
"""
author_series = remove_node_from_series(author_series, 
['shape memory', 'phase transformation', 'thermomechanical treatment', 'sma', 'stimuli sensitive polymers', 'actuator', 'shape memory polymer', 'superelasticity', 'martensitic transformation', 'microstructure', 'niti', 'nitinol', 'shape memory alloy', 'shape memory effect']
+['fuzzy logic controller', 'power quality', 'energy management', 'battery', 'smart grid', 'energy storage', 'stability', 'photovoltaic', 'particle swarm optimization', 'power system', 'automatic generation control', 'transient stability', 'load frequency control', 'renewable energy', 'genetic algorithm', 'ac loss', 'high temperature superconductor', 'superconducting magnet', 'microgrid', 'power fluctuation', 'power']
+['superconducting magnetic energy storage smes', 'optimization']
)

authG = make_graph_from_series(author_series)
auth_adj_df = pd.DataFrame(
    nx.adjacency_matrix(authG).toarray(), index=[n for n in authG.nodes()], columns=[n for n in authG.nodes()]
)

"""bipartite graph and bipartite adjacency matrix 
"""
biG = make_bigraph_from_series(author_series, ind_series)
row_order = nx.bipartite.sets(biG)[0]
col_order = nx.bipartite.sets(biG)[1]
bi_df = pd.DataFrame(
    nx.bipartite.biadjacency_matrix(biG, row_order=row_order).toarray(),
    index = row_order, columns = col_order
)

""" adj matrix
"""
"""clustering
"""
n_clusters = 20

AGGmodel = cluster.AgglomerativeClustering(n_clusters=n_clusters)
bi_cluster_df = pd.DataFrame(
    # weight를 무시하고, 0, 1의 단일 연결로 하니까 더 잘되서 변환해서 거리를 잼
    {'kwd': bi_df.index, 'cluster': AGGmodel.fit_predict(bi_df.apply(lambda col: [1 if x>=1 else 0 for x in col]))} 
)

for i in range(0, n_clusters):
    kwd_in_cluster = list(bi_cluster_df[bi_cluster_df['cluster']==i]['kwd'])
    """너무 많지도 적지도 않은 클러스터를 확인
    """
    if len(kwd_in_cluster)>=2 and len(kwd_in_cluster)<50:
        print("size of cluster {}: {}".format(i, len(kwd_in_cluster)))
        print(kwd_in_cluster)

print("complete")

for computation efficienty, cut down node got below weight, remaining node is 583
solving transivity

syntactically similar word를 변환해줍니다.
keyword set size: 22883
keyword set size: 22862

series에서 weight가 8와 같거나 작은 node를 삭제합니다.
keyword set size: 22862
keyword set size: 746
for computation efficienty, cut down node got below weight, remaining node is 1357
solving transivity

syntactically similar word를 변환해줍니다.
keyword set size: 29149
keyword set size: 29127

series에서 weight가 15와 같거나 작은 node를 삭제합니다.
keyword set size: 29127
keyword set size: 821
series에서 불필요한 node를 삭제합니다.
is bipartite: True
is connected: True
size of cluster 0: 4
['entrepreneurship', 'small business', 'performance', 'sustainability']
size of cluster 1: 12
['risk management', 'management', 'supply chain management', 'malaysia', 'development', 'project management', 'developing country', 'implementation', 'risk', 'supply chain', 'risk assessment', 'simulation']
size of cluster 2: 8
['technology', 'internet', 'adoption', 'soci