- 지난번에 코딩할때, 제대로 모듈화도 하지 않고, 너무 막 코딩하여 이 노트북에서는 좀 정리를 해놓으려고 합니다. 
    - 아직 추가로 하지 못한 부분이 많아서, 해야할게 많거든요. 아무튼. 
    

In [38]:
"""
required library 
"""
import networkx as nx
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import itertools
from collections import Counter
from inflection import singularize 
from textblob import TextBlob
from sklearn.preprocessing import MinMaxScaler

"""
desc: centrality를 계산하는 함수들입니다. 
input: graph(undirected)
output: dictionary
"""
def return_weighted_degree_centrality(input_g, normalized=True):
    w_d_centrality = {n:0.0 for n in input_g.nodes()}
    for u, v, d in input_g.edges(data=True):
        w_d_centrality[u]+=d['weight']
        w_d_centrality[v]+=d['weight']
    if normalized==True:
        weighted_sum = sum(w_d_centrality.values())
        return {k:v/weighted_sum for k, v in w_d_centrality.items()}
    else:
        return w_d_centrality
def return_closeness_centrality(input_g):
    new_g_with_distance = input_g.copy()
    for u,v,d in new_g_with_distance.edges(data=True):
        if 'distance' not in d:
            d['distance'] = 1.0/d['weight']
    return nx.closeness_centrality(new_g_with_distance, distance='distance')
def return_betweenness_centrality(input_g):
    return nx.betweenness_centrality(input_g, weight='weight')
def return_pagerank(input_g):
    return nx.pagerank(input_g, weight='weight')

"""
desc: pd.DataFrame의 특정 칼럼(each element is list)의 모든 리스트를 합친 다음 Counter => pd.DataFrame
input: pd.DataFrame
output: pd.DataFrame
"""
def total_count_with(input_df, column_name='Author Keywords'):
    # 'Author Keywords' or 'Noun Phrases'
    r = itertools.chain.from_iterable(input_df[column_name])
    r = Counter(r).most_common()
    return pd.DataFrame(r, columns=[column_name, 'count'])
"""
desc: 전체적으로 listfmf filtering하여 리턴한다. 
input: pd.DataFrame(each element is list of kwd)
output: pd.DataFrame
"""
def filtering_auth_kwds(input_df,column_name='Author Keywords', above_n=3):
    """
    개별 node가 전체에서 1번 밖에 등장하지 않는 경우도 많은데, 이를 모두 고려해서 분석을 하면, 효율적이지 못한 계산이 된다. 
    따라서, 빈도가 일정 이상을 넘는 경우에 대해서만 고려하여 new_df를 수정하는 것이 필요하다. 
    """
    # singularize 
    input_df[column_name] = input_df[column_name].apply(lambda ks: [singularize(k).strip().lower() for k in ks])
    # drop low count kwd 
    filtered_kwds = total_count(input_df, column_name=column_name)
    filtered_kwds = set(filtered_kwds[filtered_kwds['count']>=above_n][column_name])
    input_df[column_name] = input_df[column_name].apply(lambda ks: list(filter(lambda k: True if k in filtered_kwds else False, ks)))
    # edge를 만들때 중복을 방지하기 위해서 sorting해둔다. 
    input_df[column_name] = input_df[column_name].apply(lambda l: sorted(list(set(l))))
    """
    word embeddingd 등 다른 데이터 전처리가 필요하다면 여기서 처리하는 것이 좋음. 
    """
    return input_df# 사실 굳이 return을 쓸 필요가 없음. 이미 내부에서 다 바꿔줌. 
"""
desc: 연도별로 상위 빈도 키워드를 50개씩 칼럼별로 보여줌
input: pd.DataFrame
output: pd.DataFrame(column 이름은 각 연도)
"""
def yearly_count_rank(input_df, column_name='Author Keywords', until_rank_n=50):
    r_dict = {}
    for year, year_df in input_df.groupby('Year'):
        r_dict[year] = list(total_count(year_df, column_name=column_name)[column_name])[:until_rank_n]
        if len(r_dict[year])<until_rank_n:
            for i in range(0, until_rank_n - len(r_dict[year])):
                r_dict[year].append("")
    return pd.DataFrame(r_dict)
"""
desc: df로부터 그래프를 생성하는 함수
input: pd.DataFrame
output: nx.Graph
"""
def make_graph(input_df, column_name='Author Keywords'):
    # make edges: edge가 중복으로 생기지 않게 하려면, 
    def make_edges_from_lst(lst):
        if len(lst)>1:
            return [(lst[i], lst[j]) for i in range(0, len(lst)-1) for j in range(i+1, len(lst))]
        else:
            return []
    nodes = total_count(input_df)
    new_nodes = []
    for i in range(0, len(nodes)):
        name = nodes[column_name].iloc()[i]
        w = nodes['count'].iloc()[i]
        new_nodes.append( (name, {'weight':w}) )
    nodes = new_nodes
    edges = itertools.chain.from_iterable(input_df[column_name].apply(make_edges_from_lst))
    edges = ((uv[0], uv[1], w) for uv, w in Counter(edges).most_common())
    G = nx.Graph()
    G.add_nodes_from(nodes)
    G.add_weighted_edges_from(edges)
    # graph에 대한 데이터 필터링이 필요할 수 있는데. 여기서. 
    return G
"""
일종의 main 함수입니다. 
raw_df를 넘기는데, 가능하면 해당 argument에서 복사해서 넘겨주는 게 좋을 것 같습니다. 혹시나 싶어서요. 
"""
def scopus_analysis(raw_df, outExcelname, outPPTname):
    r_df = raw_df.dropna()
    print("drop {} row".format(len(raw_df) - len(r_df)))
    r_df['Author Keywords'] = r_df['Author Keywords'].apply(lambda s: s.strip().split(";"))
    r_df['Noun Phrases'] = r_df['Abstract'].apply(lambda s: TextBlob(s).noun_phrases)
    print(r_df.head())
    r_df = filtering_auth_kwds(r_df, 'Author Keywords', 1)
    r_df = filtering_auth_kwds(r_df, 'Noun Phrases', 1)
    
    """
    여기서부터는 엑셀에 시트별로 내용을 넣어주는 부분 
    """
    writer = pd.ExcelWriter(outExcelname)
    total_count(r_df, column_name='Author Keywords').to_excel(writer, '01. 전체 저자 키워드 빈도 상위 키워드')
    total_count(r_df, column_name='Noun Phrases').to_excel(writer, '02. 전체 noun phrase 빈도 상위')
    yearly_count_rank(r_df, column_name='Author Keywords').to_excel(writer, '03. 연도별 저자 키워드 순위 변화')
    yearly_count_rank(r_df, column_name='Noun Phrases').to_excel(writer, '04. 연도별 noun phrase 순위 변화')
    print("빈도 시트 완료")
    
    kwdG = make_graph(r_df, 'Author Keywords')
    def make_centrality_df(inputG, cent_func):
        deg_cent = cent_func(inputG)
        deg_cent = sorted([(k, v) for k, v in deg_cent.items()], key=lambda x: x[1], reverse=True)
        return pd.DataFrame(deg_cent, columns=['kwd', 'centrality'])
    make_centrality_df(kwdG, return_weighted_degree_centrality).to_excel(writer, '05. 키워드 전체 네트워크 w.deg cent')
    make_centrality_df(kwdG, return_closeness_centrality).to_excel(writer, '06. 키워드 전체 네트워크 close cent')
    make_centrality_df(kwdG, return_betweenness_centrality).to_excel(writer, '07. 키워드 전체 네트워크 bet. cent')
        
    writer.save()
    

excel_path_and_filename = "../../../Downloads/SMEs_Scopus_2013-2017.xlsx"
df = pd.read_excel(excel_path_and_filename)
df = df[['Author Keywords', 'Year', 'Abstract']]

scopus_analysis(df.iloc()[:200], 'simple_report_for_SME.xlsx', 'simple_figure_for_SME.pptx')
print("complete")

drop 45 row


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


                                     Author Keywords  Year  \
0  [CSFs,  Factor analysis,  Indian manufacturing...  2017   
1  [Energy recovery,  Muzzle arc,  Pulsed supply,...  2017   
3  [internationalization,  market research,  orga...  2017   
5  [Contractors,  Entrepreneurship,  Indonesia,  ...  2017   
6  [BPGM-SME,  Improved UKF,  Multi-target tracki...  2017   

                                            Abstract  \
0  This research paper is to evaluate and present...   
1  According to the application requirements of t...   
3  This study shows a low-priced information coll...   
5  The success of entrepreneurship as an importan...   
6  The problem of multi-sensor detecting and mult...   

                                        Noun Phrases  
0  [research paper, online survey, medium enterpr...  
1  [according, application requirements, novel ci...  
3  [study shows, information collection process, ...  
5  [important driving force, business success, co...  
6  [firstly, do

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


빈도 시트 완료
complete
