In [1]:
# Required libraries
import ast
import json
import networkx as nx
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer

# def top_n_dt_matrix(docs, n=100):
#     # 문서를 문자열로 변환 (키워드를 밑줄로 연결)
#     docs_as_strings = ['_'.join(doc) for doc in docs]
    
#     # CountVectorizer 초기화 (binary=False로 설정하여 빈도 수 계산)
#     vectorizer = CountVectorizer(tokenizer=lambda x: x.split('_'), binary=False)
    
#     # DTM 생성
#     dtm = vectorizer.fit_transform(docs_as_strings)
    
#     # 각 단어의 빈도 수 계산
#     word_counts = np.sum(dtm, axis=0).A1  # A1은 np.array의 1차원 표현을 반환
#     print(word_counts)
    
#     # 빈도 수별로 단어 정렬 (내림차순)
#     top_indices = np.argsort(word_counts)[::-1][:n]  # 상위 n개 인덱스
    
#     # 상위 n개 단어에 해당하는 DTM 열 추출
#     top_n_dtm = dtm[:, top_indices]
    
#     # 상위 n개의 특성 이름
#     feature_names = vectorizer.get_feature_names_out()
#     top_n_feature_names = feature_names[top_indices]
    
#     return top_n_dtm.toarray(), top_n_feature_names

def dt_matrix(docs, min_df=1):
    """Build a binary document-term matrix from per-document keyword lists.

    Each keyword is treated as one indivisible term. The keyword lists are
    handed to CountVectorizer through a callable analyzer instead of being
    joined into a '_'-delimited string and split again: that round trip
    broke any keyword that itself contains an underscore and turned an empty
    document into a spurious '' term.

    Args:
        docs: Iterable of documents, each an iterable of keyword strings.
        min_df: Forwarded to CountVectorizer's ``min_df``.

    Returns:
        A ``(dtm, feature_names)`` tuple: the dense 0/1 array produced by
        ``fit_transform(...).toarray()`` and the array returned by
        ``get_feature_names_out()``.
    """
    def keywords_as_terms(doc):
        # A callable analyzer bypasses CountVectorizer's own preprocessing,
        # so lower-case here to keep the case-folding the string-based
        # pipeline applied by default. Empty keywords are dropped.
        return [keyword.lower() for keyword in doc if keyword]

    vectorizer = CountVectorizer(binary=True, analyzer=keywords_as_terms, min_df=min_df)

    dtm = vectorizer.fit_transform(docs).toarray()

    return dtm, vectorizer.get_feature_names_out()

def co_matrix(matrix):
    """Return the product of the transposed matrix with the matrix itself.

    For a document-term matrix this gives a term-by-term matrix whose entry
    (i, j) is the dot product of term columns i and j.
    """
    transposed = matrix.T
    return np.dot(transposed, matrix)

def create_graph_from_dtm(co_matrix, keywords, frequency):
    """Build a keyword graph from a co-occurrence matrix.

    Note: the ``co_matrix`` parameter shadows the module-level ``co_matrix``
    function inside this body; here it is the matrix, not the function.

    Args:
        co_matrix: Square array passed to ``nx.from_numpy_array``.
        keywords: Node labels, ordered like the rows of ``co_matrix``.
        frequency: Per-node values stored under the "frequency" attribute,
            ordered like the rows of ``co_matrix``.

    Returns:
        The relabelled graph returned by ``nx.relabel_nodes``.
    """
    graph = nx.from_numpy_array(co_matrix)

    # Attributes are attached while nodes still carry their integer ids,
    # so positional order is the only link between matrix row and frequency.
    nx.set_node_attributes(graph, dict(enumerate(frequency)), "frequency")

    # Swap the integer ids for the keyword labels.
    return nx.relabel_nodes(graph, dict(enumerate(keywords)))

def normalize_by_diagonal(matrix):
    """Divide every entry (i, j) by diagonal[i] and then by diagonal[j].

    Zero diagonal entries are replaced by 1 beforehand so the divisions never
    hit a zero denominator. The input matrix is not modified.

    Returns:
        A ``(normalized, diagonal)`` tuple, where ``diagonal`` is the
        zero-patched copy of the diagonal that was used as the divisor.
    """
    raw_diagonal = np.diag(matrix)

    # Same dtype as the original diagonal, with zeros swapped for ones.
    safe_diagonal = np.where(raw_diagonal == 0, 1, raw_diagonal).astype(raw_diagonal.dtype)

    # Rows first, then columns (same operation order as two separate divisions).
    by_rows = matrix / safe_diagonal[:, None]
    by_rows_and_columns = by_rows / safe_diagonal[None, :]

    return by_rows_and_columns, safe_diagonal

# def to_list(input_):
#     if '[' in input_:
#         return ast.literal_eval(input_)
#     elif ',' in input_:
#         return input_.split(', ')



In [2]:
# Load the keyphrase column and parse each cell from its string form
# (e.g. "['a', 'b']") into a Python list.
# Built as a single chain so no column is assigned on a sliced frame
# (avoids SettingWithCopyWarning) and the cell can be re-run safely.
file_name = "data/Hospitality_KT_Post"
data = (
    pd.read_csv(file_name + ".csv", index_col=0)[['Keyphrase']]
    .assign(Keyphrase=lambda frame: frame['Keyphrase'].map(ast.literal_eval))
    .reset_index(drop=True)
)
data

Unnamed: 0,Keyphrase
0,"[innisfree hotels, senior vice president of de..."
1,"[four seasons resort and residences whistler, ..."
2,"[auberge resorts collection, napa valley, hosp..."
3,"[executive vice president and general counsel,..."
4,"[hri hospitality, new orleans, hotel, nashvill..."
...,...
3247,"[uk employees feel undervalued at work, underv..."
3248,"[international tourism, pre-pandemic levels, a..."
3249,"[u.s. households, international vacations, mmg..."
3250,"[economic growth, unh, economic barometer, hou..."


In [3]:
# # 확인용
# '''
# print(len(set(sum(data['Keyphrase'].to_list(),[]))))

# matrix, feature_names = dt_matrix(data['Keyphrase'])

# coword_matrix = co_matrix(matrix)
# print(len(coword_matrix))
# '''

In [4]:
# %%time
# cut_off = int(len(data) * 0.005)
# matrix, feature_names = dt_matrix(data['Keyphrase'], cut_off)

# coword_matrix = co_matrix(matrix)
# coword_matrix

In [5]:
%%time
cut_off = int(len(data) * 0.005)
print(cut_off)
matrix, feature_names = dt_matrix(data['Keyphrase'], cut_off)

coword_matrix = co_matrix(matrix)
coword_matrix

16
CPU times: total: 37.6 s
Wall time: 37.8 s


array([[ 21,   0,   1, ...,   0,   0,   0],
       [  0,  36,   2, ...,   0,   0,   1],
       [  1,   2, 116, ...,   0,   0,   4],
       ...,
       [  0,   0,   0, ...,  35,  11,   1],
       [  0,   0,   0, ...,  11,  16,   1],
       [  0,   1,   4, ...,   1,   1,  49]], dtype=int64)

In [6]:
# matrix, feature_names = top_n_dt_matrix(data['Keyphrase'])

# coword_matrix = co_matrix(matrix)
# coword_matrix

In [7]:
# normalize_by_diagonal builds new arrays and leaves its argument untouched,
# so coword_matrix can be passed directly.
normalized_matrix, diagonal_values = normalize_by_diagonal(coword_matrix)

# Zero the diagonal in place so self co-occurrence carries no weight.
np.fill_diagonal(normalized_matrix, 0)
normalized_matrix

array([[0.        , 0.        , 0.00041051, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.00047893, ..., 0.        , 0.        ,
        0.00056689],
       [0.00041051, 0.00047893, 0.        , ..., 0.        , 0.        ,
        0.00070373],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.01964286,
        0.00058309],
       [0.        , 0.        , 0.        , ..., 0.01964286, 0.        ,
        0.00127551],
       [0.        , 0.00056689, 0.00070373, ..., 0.00058309, 0.00127551,
        0.        ]])

In [8]:
def min_max_scaling(array):
    """Rescale values to the [0, 1] range using the global min and max.

    Args:
        array: Numeric array-like.

    Returns:
        ``(array - min) / (max - min)``. When every value is identical the
        range is zero; the result is then all zeros instead of the NaNs a
        0/0 division would produce. Empty input yields an empty float array
        instead of raising.
    """
    if np.size(array) == 0:
        # np.min / np.max raise on empty input; there is nothing to scale.
        return np.asarray(array, dtype=float)

    min_val = np.min(array)
    max_val = np.max(array)

    value_range = max_val - min_val
    if value_range == 0:
        # Constant input: dividing the all-zero numerator by 1 yields zeros.
        value_range = 1

    # Min-Max scaling
    scaled_array = (array - min_val) / value_range

    return scaled_array

In [9]:
# Rescale the diagonal-normalised weights with (x - min) / (max - min) so the
# thresholds applied in later cells are expressed on a 0..1 scale.
scaled_array = min_max_scaling(normalized_matrix)
scaled_array.shape

(993, 993)

In [90]:
from scipy.sparse.csgraph import connected_components
from scipy.sparse import csr_matrix

# Number of keywords before any thresholding; the sweep below uses it to
# report how many nodes each threshold removes.
n = len(feature_names)

def remove_isolated_nodes(adj_matrix, features, diagonal_values, threshold):
    """Zero out edges at or below ``threshold`` and drop nodes left without edges.

    Args:
        adj_matrix: Square weight array; it is not modified.
        features: Node labels, ordered like the rows of ``adj_matrix``.
        diagonal_values: Per-node values, ordered like the rows of ``adj_matrix``.
        threshold: Only weights strictly greater than this are kept.

    Returns:
        ``(matrix, features, diagonal_values)`` restricted to the nodes that
        still have at least one non-zero entry in their row. The returned
        matrix is always the thresholded one. When no node is dropped,
        ``features`` and ``diagonal_values`` are returned unchanged;
        otherwise they are returned as lists.
    """
    filtered_matrix = np.where(adj_matrix > threshold, adj_matrix, 0)

    # True for rows that keep at least one edge after thresholding.
    keep_mask = filtered_matrix.any(axis=1)
    if keep_mask.all():
        # Nothing to drop, but the sub-threshold edges must still be removed,
        # so return the filtered matrix rather than the raw input.
        return filtered_matrix, features, diagonal_values

    new_matrix = filtered_matrix[keep_mask][:, keep_mask]
    # Boolean mask instead of `i not in isolated_nodes`, which scanned the
    # index array once per node.
    new_features = [feature for feature, keep in zip(features, keep_mask) if keep]
    new_diagonal_values = [value for value, keep in zip(diagonal_values, keep_mask) if keep]
    return new_matrix, new_features, new_diagonal_values

def largest_subgraph(adj_matrix, features, diagonal_values):
    """Restrict a weighted graph to its largest connected component.

    Args:
        adj_matrix: Square weight array (ndarray); non-zero entries are edges.
        features: Node labels, ordered like the rows of ``adj_matrix``.
        diagonal_values: Per-node values, ordered like the rows of ``adj_matrix``.

    Returns:
        ``(matrix, features, diagonal_values)`` for the nodes of the largest
        component. Always a 3-tuple, so callers can unpack unconditionally.
        An empty input is returned as-is.
    """
    if adj_matrix.size == 0:  # empty adjacency matrix: nothing to search
        return adj_matrix, features, diagonal_values

    graph_csr = csr_matrix(adj_matrix)
    n_components, labels = connected_components(csgraph=graph_csr, directed=False, return_labels=True)

    if n_components == 0:
        # Defensive: return the same 3-tuple shape as every other path
        # (this branch previously returned only two values).
        return np.empty((0, 0), dtype=adj_matrix.dtype), [], []

    # labels[i] is the component id of node i; pick the most frequent id.
    component_sizes = np.bincount(labels)
    largest_component_label = component_sizes.argmax()
    nodes_in_largest_component = np.where(labels == largest_component_label)[0]

    largest_subgraph_matrix = adj_matrix[nodes_in_largest_component][:, nodes_in_largest_component]
    largest_subgraph_features = [features[i] for i in nodes_in_largest_component]
    largest_diagonal_values = [diagonal_values[i] for i in nodes_in_largest_component]
    return largest_subgraph_matrix, largest_subgraph_features, largest_diagonal_values


# Sweep the edge-weight threshold from 0 to 1 in steps of 0.1.
# np.arange accumulates floating-point noise (e.g. 0.30000000000000004), so
# round to one decimal to compare against, and report, the intended values.
thresholds = np.round(np.arange(0, 1.01, 0.1), 1)
largest_subgraphs_results = []

for t in thresholds:
    new_matrix, new_features, new_diagonal_values = remove_isolated_nodes(scaled_array, feature_names, diagonal_values, t)
    largest_matrix, largest_features, largest_diagonal_values = largest_subgraph(new_matrix, new_features, new_diagonal_values)
    largest_subgraphs_results.append({
        'threshold': float(t),
        'remaining_nodes': len(largest_features),
        'removed_nodes': n - len(largest_features)
    })
largest_subgraphs_results  # size of the largest component at each threshold

[{'threshold': 0.0, 'remaining_nodes': 993, 'removed_nodes': 0},
 {'threshold': 0.1, 'remaining_nodes': 908, 'removed_nodes': 85},
 {'threshold': 0.2, 'remaining_nodes': 199, 'removed_nodes': 794},
 {'threshold': 0.30000000000000004,
  'remaining_nodes': 47,
  'removed_nodes': 946},
 {'threshold': 0.4, 'remaining_nodes': 31, 'removed_nodes': 962},
 {'threshold': 0.5, 'remaining_nodes': 12, 'removed_nodes': 981},
 {'threshold': 0.6000000000000001, 'remaining_nodes': 7, 'removed_nodes': 986},
 {'threshold': 0.7000000000000001, 'remaining_nodes': 6, 'removed_nodes': 987},
 {'threshold': 0.8, 'remaining_nodes': 6, 'removed_nodes': 987},
 {'threshold': 0.9, 'remaining_nodes': 4, 'removed_nodes': 989},
 {'threshold': 1.0, 'remaining_nodes': 0, 'removed_nodes': 993}]

In [135]:
# Apply the final edge-weight threshold and keep the largest connected
# component of what remains.
# NOTE(review): 0.20358 looks hand-tuned (it sits just above the 0.2 step of
# the sweep above); confirm how it was chosen and consider naming it as a constant.
new_matrix, new_features, new_diagonal_values = remove_isolated_nodes(scaled_array, feature_names, diagonal_values, 0.20358)
largest_matrix, largest_features, largest_diagonal_values = largest_subgraph(new_matrix, new_features, new_diagonal_values)

In [136]:
len(largest_matrix)

145

In [137]:
# 0이 아닌 요소들을 필터링합니다.
non_zero_elements = largest_matrix[largest_matrix != 0]

# 0이 아닌 요소들 중 최소값을 찾습니다.
non_zero_min = np.min(non_zero_elements)
non_zero_min

0.20412959543394327

In [138]:
largest_matrix.shape

(145, 145)

In [85]:
# Build the graph of the largest component: nodes are labelled with keywords and
# carry the (zero-patched) co-occurrence diagonal as their "frequency" attribute.
# NOTE(review): this cell's execution count (85) is lower than that of the cells
# that produce largest_matrix (135-138), so a saved G may be stale; re-run in order.
G = create_graph_from_dtm(largest_matrix, largest_features, largest_diagonal_values)

In [140]:
# Output path stem shared by the .gexf and .json exports.
# NOTE(review): the meaning of the 1005 suffix is not evident here; confirm.
save_path = f"{file_name}_{1005}"

In [141]:
# Export the graph in GEXF format next to the input data.
nx.write_gexf(G, f"{save_path}.gexf")

In [None]:
class NumpyEncoder(json.JSONEncoder):
    """JSON encoder that converts NumPy scalars and arrays to built-in types.

    NumPy integers, floats and booleans become ``int``, ``float`` and
    ``bool``; arrays become (nested) lists. Anything else is delegated to
    the base encoder, which raises ``TypeError`` for unsupported objects.
    """

    def default(self, obj):
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.bool_):
            # np.bool_ is not a subclass of bool, so the stock encoder rejects it.
            return bool(obj)
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        return super().default(obj)


# Serialise the graph to node-link form and compute a Kamada-Kawai layout.
# NOTE(review): this rebinds `data`, which earlier cells use for the keyphrase
# DataFrame; re-running those cells after this one will fail or misbehave.
# Consider a distinct name (and update the cells below that read `data`).
data = nx.node_link_data(G)
pos = nx.kamada_kawai_layout(G)

# with open(save_path+'.json', 'w') as outfile:
#     json.dump(data, outfile, cls=NumpyEncoder)

In [None]:
# Add the positions to the nodes in the data
# Attach each node's layout coordinates to its node-link record.
for node_record in data['nodes']:
    record_id = node_record['id']
    if record_id not in pos:
        # No layout entry for this node: leave the record untouched.
        continue
    node_record['position'] = pos[record_id]

In [None]:
# Write the node-link data (including positions) as JSON; NumpyEncoder
# converts the NumPy values that the stock encoder cannot serialise.
with open(f"{save_path}.json", 'w') as json_file:
    json.dump(data, json_file, cls=NumpyEncoder)

In [None]:
# Display the full node-link dict that was written to disk.
# NOTE(review): this prints every node and edge; consider showing a summary instead.
data