In [1]:
# 라이브러리 불러오기
import spacy
import pandas as pd
import re
from spacy.lang.en import STOP_WORDS
import matplotlib.pyplot as plt

In [2]:
# Define the tokenizer: load spaCy's small English pipeline.
# `nlp` is the pipeline object called on each text by the noun-extraction function.
nlp = spacy.load('en_core_web_sm')

In [3]:
# Domain-specific words to treat as stop words in addition to spaCy's English list
custom_stop_words = ["sports", "purpose", "sport", "http", "club", "clubs", "study", "results"]

# Extend the shared STOP_WORDS set in place with the custom entries
STOP_WORDS.update(custom_stop_words)

# Load the pipeline again now that the stop-word set has been extended
nlp = spacy.load("en_core_web_sm")

In [4]:
# Load the source data from Excel.
# NOTE(review): the variable is named `df_kr` but the file is "DataFrame_jp_v1.xlsx" — confirm which dataset is intended.
# NOTE(review): absolute, machine-specific path; the notebook will not run elsewhere without editing it.
df_kr = pd.read_excel(r"C:\Users\user\Desktop\토픽모델링_교수님\Data\DataFrame_jp_v1.xlsx")

In [5]:
# Work on a copy of the loaded frame (the original comment said "concat", but only a single frame is copied here)
df = df_kr.copy()
# Display the number of rows
len(df)

263

In [6]:
# Pull the "Abstract" column out into its own single-column frame
df_all = df.copy()
df_all_abstract = df_all[["Abstract"]].copy()
# Display the number of rows
len(df_all_abstract)

263

In [7]:
# Special-character removal function
def remove_special_chars(text):
    """Replace every character that is not a letter or whitespace with a space.

    Kept as-is: ASCII letters (a-z, A-Z), Hangul syllables (가-힣) and any
    whitespace character (including newlines and tabs). Everything else —
    digits, punctuation, symbols, other scripts — is replaced by a single
    space, one space per removed character, so runs of spaces may appear.

    Parameters
    ----------
    text : str
        Raw text to clean. Non-string values (e.g. ``None`` or the float NaN
        that pandas uses for empty cells) are treated as empty text.

    Returns
    -------
    str
        The cleaned text, or ``""`` when ``text`` is not a string.
    """
    # re.sub raises TypeError on non-string input; an empty cell should simply
    # produce an empty document instead of aborting the whole column apply.
    if not isinstance(text, str):
        return ""
    return re.sub(r'[^a-zA-Z가-힣\s]', ' ', text)

In [8]:
# Clean every abstract and store the result in a new "Cleaned_Abstract" column
df_all_abstract["Cleaned_Abstract"] = df_all_abstract["Abstract"].map(remove_special_chars)
# Display the frame with the original and cleaned text side by side
df_all_abstract

Unnamed: 0,Abstract,Cleaned_Abstract
0,The purpose of this literature review study wa...,The purpose of this literature review study wa...
1,This study focused on highly sustainable compr...,This study focused on highly sustainable compr...
2,The sustainable growth of comprehensive commun...,The sustainable growth of comprehensive commun...
3,This study attempts to clarify the factors of ...,This study attempts to clarify the factors of ...
4,The purpose of the present article was to desc...,The purpose of the present article was to desc...
...,...,...
258,As the management organization of the Extracur...,As the management organization of the Extracur...
259,The purpose of this paper is to : ⑴ identify t...,The purpose of this paper is to identify t...
260,The purpose of this study was to interpret the...,The purpose of this study was to interpret the...
261,The purpose of this paper is to conduct a lite...,The purpose of this paper is to conduct a lite...


In [9]:
# Noun extraction function (length 3 or more)
def extract_nouns_and_remove_stopwords(text):
    """Return the nouns found in ``text``, dropping short words and stop words.

    The text is run through the module-level ``nlp`` pipeline. A token is kept
    when its part-of-speech tag is ``NOUN``, its text is at least 3 characters
    long, and its lowercased text is not in ``STOP_WORDS``. Tokens are returned
    with their original casing, in document order.
    """
    kept = []
    for token in nlp(text):
        # Only common nouns are of interest
        if token.pos_ != 'NOUN':
            continue
        surface = token.text
        # Skip very short tokens
        if len(surface) < 3:
            continue
        # Stop-word lookup is case-insensitive
        if surface.lower() in STOP_WORDS:
            continue
        kept.append(surface)
    return kept

In [10]:
# Extract the nouns of every cleaned abstract (one token list per row, in row order)
abstract_noun = [
    extract_nouns_and_remove_stopwords(cleaned)
    for cleaned in df_all_abstract["Cleaned_Abstract"]
]

# Display the number of token lists
len(abstract_noun)

263

In [11]:
# Attach the cleaned text and the extracted noun lists to the main frame
df_all = df_all.assign(
    cleaned_abstract=df_all_abstract["Cleaned_Abstract"],
    tokenized_abstract=abstract_noun,
)

In [12]:
# Print the column names, non-null counts and dtypes of the combined frame
df_all.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 263 entries, 0 to 262
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   Country             263 non-null    object
 1   Title               263 non-null    object
 2   Publised            263 non-null    int64 
 3   Journal             263 non-null    object
 4   Abstract            263 non-null    object
 5   cleaned_abstract    263 non-null    object
 6   tokenized_abstract  263 non-null    object
dtypes: int64(1), object(6)
memory usage: 14.5+ KB


In [13]:
# Save the combined frame to Excel.
# (The original comment said "save frequencies", but this writes df_all itself,
# including the cleaned and tokenized abstract columns.)
df_all.to_excel(r"C:/Users/user/Desktop/토픽모델링_교수님/Data/df_jp_final_stopwords.xlsx")

## 단어 별 빈도 확인

In [14]:
# Inspect the extracted words: build a document-term matrix over the cleaned abstracts,
# keeping only terms that occur in at least 5 documents (min_df=5 is a document-frequency
# threshold; the original comment said "10 or more", which did not match the code).
from sklearn.feature_extraction.text import CountVectorizer
# token_pattern=None: the custom tokenizer replaces the default regex tokenization, and
# leaving token_pattern at its default makes scikit-learn warn that the parameter is unused.
# NOTE(review): CountVectorizer lowercases the text by default before calling the tokenizer,
# so spaCy tags lowercased text here, unlike the "tokenized_abstract" column — confirm this is intended.
cv = CountVectorizer(tokenizer=extract_nouns_and_remove_stopwords, min_df=5, token_pattern=None)
dtm = cv.fit_transform(df_all["cleaned_abstract"])



In [15]:
# Word list: the vocabulary terms kept by the fitted vectorizer
cv.get_feature_names_out()

array(['abilities', 'ability', 'absence', 'achievement', 'acquisition',
       'act', 'actions', 'activities', 'activity', 'addition',
       'administration', 'adulthood', 'adults', 'advantages', 'age',
       'agency', 'ages', 'aim', 'ambiguity', 'analyses', 'analysis',
       'answers', 'approach', 'approaches', 'area', 'areas', 'article',
       'articles', 'aspects', 'association', 'associations', 'athlete',
       'athletes', 'attention', 'attitude', 'attitudes', 'attributes',
       'author', 'authorities', 'authority', 'autonomy', 'awareness',
       'background', 'base', 'baseball', 'basis', 'basketball',
       'behavior', 'behaviors', 'benefit', 'benefits', 'board', 'boards',
       'body', 'boys', 'budget', 'building', 'burden', 'business',
       'businesses', 'capacity', 'capital', 'care', 'career', 'case',
       'cases', 'categories', 'category', 'center', 'challenge',
       'challenges', 'change', 'changes', 'characteristics', 'charge',
       'child', 'childhood', 'c

In [16]:
# Word-frequency frame: one row per vocabulary term with its total count over all documents
vocab_terms = cv.get_feature_names_out()
term_totals = dtm.sum(axis=0).flat  # column sums of the document-term matrix, flattened
word_count = pd.DataFrame({'word': vocab_terms, 'count': term_totals})

# Most frequent terms first
word_count_desc = word_count.sort_values(by='count', ascending=False)
word_count_desc.head(15)

Unnamed: 0,word,count
92,community,296
7,activities,214
432,school,184
159,education,165
331,organizations,158
77,children,132
182,exercise,132
303,members,132
409,research,130
469,students,127


In [17]:
# Save the word frequencies (sorted by count, descending) to Excel
word_count_desc.to_excel(r"C:/Users/user/Desktop/토픽모델링_교수님/Data/jp_extract_nouns_Frequency_stopwords.xlsx")

In [18]:
# Convert the frequency frame into a list of (word, count) tuples, keeping the sorted order
word_list = list(word_count_desc[['word', 'count']].itertuples(index=False, name=None))

In [19]:
# Collect every extracted token from all abstracts into one flat list (duplicates kept)
all_words = [
    token
    for token_list in df_all["tokenized_abstract"]
    for token in token_list
]

# One token per row in a single-column frame
all_words_df = pd.DataFrame(all_words, columns=["words"])

In [20]:
# Save all extracted words to Excel (duplicates kept; intended for building a word cloud)
all_words_df.to_excel(r"C:/Users/user/Desktop/토픽모델링_교수님/Data/jp_extract_nouns_stopwords.xlsx")