In [1]:
# 라이브러리 불러오기
import re

import matplotlib.pyplot as plt
import pandas as pd
import spacy
from sklearn.feature_extraction.text import CountVectorizer
from spacy.lang.en import STOP_WORDS

In [2]:
# Load spaCy's small English pipeline; it is used below to tokenize and POS-tag the abstracts
nlp = spacy.load('en_core_web_sm')

In [3]:
# Domain-specific words that should also be treated as stop words
custom_stop_words = ["sports", "purpose", "sport", "http", "club", "clubs", "study", "results"]

# Merge the custom words into spaCy's English stop-word set (the set is mutated in place)
STOP_WORDS.update(custom_stop_words)

# Load the model again now that the stop-word set has been extended
nlp = spacy.load("en_core_web_sm")

In [4]:
# Load the source data from the Excel workbook (absolute local path; adjust per machine)
df_kr = pd.read_excel(r"C:\Users\user\Desktop\토픽모델링_교수님\Data\DataFrame_kr_v1.xlsx")

In [5]:
# Working copy of the loaded data (no concatenation happens here; only df_kr is used)
df = df_kr.copy()
# Number of rows
len(df)

312

In [6]:
# "Abstract" 만 따로 추출
# df_all is the frame that later receives the cleaned/tokenized columns
df_all = df.copy()
# Single-column DataFrame holding only the abstracts, independent of df_all
df_all_abstract = df_all[["Abstract"]].copy()
len(df_all_abstract)

312

In [7]:
# Special-character removal helper
def remove_special_chars(text):
    """Replace every character that is not a letter or whitespace with a space.

    ASCII letters (a-z, A-Z), Hangul syllables (가-힣) and whitespace are kept.
    Every other character (digits, punctuation, symbols) is replaced by a
    single space, so a string input keeps its original length.

    Args:
        text: The value to clean. ``None`` and float NaN (how pandas represents
            an empty cell) are treated as empty text; any other non-string
            value is converted with ``str()`` before cleaning.

    Returns:
        The cleaned string (``''`` for missing input).
    """
    # Missing values: None, or float NaN (NaN is the only float not equal to itself)
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)
    # Anything outside the allowed character classes becomes one space
    return re.sub(r'[^a-zA-Z가-힣\s]', ' ', text)

In [8]:
# Apply the special-character cleanup to every abstract and store it in a new column
df_all_abstract["Cleaned_Abstract"] = df_all_abstract["Abstract"].apply(remove_special_chars)
# Display the frame so raw and cleaned text can be compared
df_all_abstract

Unnamed: 0,Abstract,Cleaned_Abstract
0,The Relationship between the Subjective Wellbe...,The Relationship between the Subjective Wellbe...
1,The purpose this study was to find out the bui...,The purpose this study was to find out the bui...
2,The purpose of this study was to explore the v...,The purpose of this study was to explore the v...
3,The purpose of this study is to analyse presen...,The purpose of this study is to analyse presen...
4,This study has the objective of retrospectivel...,This study has the objective of retrospectivel...
...,...,...
307,Pupose : The purpose of this study was to exam...,Pupose The purpose of this study was to exam...
308,The french administrative justice system ensur...,The french administrative justice system ensur...
309,A study on training/certification program and ...,A study on training certification program and ...
310,This study has compared and analyzed the prese...,This study has compared and analyzed the prese...


In [9]:
# Noun extraction helper (tokens of length 3 or more)
def extract_nouns_and_remove_stopwords(text):
    """Return the nouns in ``text``, dropping short tokens and stop words.

    The text is processed with the module-level spaCy pipeline ``nlp``. A token
    is kept when its coarse POS tag is ``NOUN``, its text is at least three
    characters long, and its lower-cased text is not in ``STOP_WORDS``.

    Args:
        text: The string to analyse.

    Returns:
        A list of token texts (original casing) in document order.
    """
    kept = []
    for token in nlp(text):
        surface = token.text
        if token.pos_ != 'NOUN':
            continue
        if len(surface) < 3:
            continue
        # Stop-word membership is checked case-insensitively
        if surface.lower() in STOP_WORDS:
            continue
        kept.append(surface)
    return kept

In [10]:
# Extract the noun list for every cleaned abstract (one list per row, in row order)
abstract_noun = [
    extract_nouns_and_remove_stopwords(cleaned)
    for cleaned in df_all_abstract["Cleaned_Abstract"]
]

len(abstract_noun)

312

In [11]:
# Attach the cleaned text and the extracted noun lists to the main DataFrame
df_all["cleaned_abstract"] = df_all_abstract["Cleaned_Abstract"]
df_all["tokenized_abstract"] = abstract_noun

In [12]:
# Summarize columns, non-null counts and dtypes after adding the new columns
df_all.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 312 entries, 0 to 311
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   Country             312 non-null    object
 1   Title               312 non-null    object
 2   Journal             312 non-null    object
 3   Publised            312 non-null    int64 
 4   Abstract            312 non-null    object
 5   cleaned_abstract    312 non-null    object
 6   tokenized_abstract  312 non-null    object
dtypes: int64(1), object(6)
memory usage: 17.2+ KB


In [13]:
# Save the DataFrame, now including the cleaned and tokenized abstract columns, to Excel
df_all.to_excel(r"C:/Users/user/Desktop/토픽모델링_교수님/Data/df_kr_final_stopwords.xlsx")

## 단어별 빈도 확인

In [14]:
# Build the document-term matrix of extracted nouns.
# min_df=5 keeps only terms that appear in at least 5 abstracts
# (document frequency, not total occurrences).
# token_pattern=None: the default regex is never used when a custom tokenizer
# is supplied, and passing None avoids scikit-learn's "token_pattern will not
# be used" warning.
# NOTE(review): with the default lowercase=True, scikit-learn lowercases each
# document before it reaches the spaCy-based tokenizer, so the terms counted
# here may differ from the case-preserving "tokenized_abstract" column.
# Confirm this is intended.
cv = CountVectorizer(
    tokenizer=extract_nouns_and_remove_stopwords,
    min_df=5,
    token_pattern=None,
)
dtm = cv.fit_transform(df_all["cleaned_abstract"])



In [15]:
# List the vocabulary terms kept by the vectorizer
cv.get_feature_names_out()

array(['ability', 'access', 'accordance', 'achievement', 'acquisition',
       'act', 'activation', 'activities', 'activity', 'addition',
       'adjustment', 'administration', 'adult', 'adults', 'advantage',
       'age', 'agencies', 'aim', 'alternatives', 'analyses', 'analysis',
       'application', 'approach', 'area', 'areas', 'art', 'article',
       'arts', 'aspect', 'aspects', 'association', 'associations',
       'athlete', 'athletes', 'atmosphere', 'attachment', 'attention',
       'attitude', 'attributes', 'author', 'authority', 'awareness',
       'background', 'badminton', 'base', 'baseball', 'basis', 'behavior',
       'behaviors', 'benefit', 'benefits', 'birth', 'board', 'bodies',
       'body', 'bowling', 'brand', 'budget', 'building', 'burden',
       'business', 'campaign', 'capacity', 'capital', 'care', 'career',
       'case', 'cases', 'categories', 'category', 'center', 'centers',
       'century', 'certification', 'challenges', 'change', 'changes',
       'characte

In [16]:
# Word-frequency table: one row per vocabulary term with its total count,
# obtained by summing the document-term matrix over all documents (axis 0)
word_count = pd.DataFrame(
    dict(
        word=cv.get_feature_names_out(),
        count=dtm.sum(axis=0).flat,
    )
)

# Most frequent terms first; show the top 15
word_count_desc = word_count.sort_values(by='count', ascending=False)
word_count_desc.head(15)

Unnamed: 0,word,count
509,school,546
20,analysis,361
182,education,230
151,data,229
7,activities,226
218,facilities,224
404,participation,204
98,community,201
569,system,198
555,students,197


In [17]:
# Save the word-frequency table (sorted by count, descending) to Excel
word_count_desc.to_excel(r"C:/Users/user/Desktop/토픽모델링_교수님/Data/kr_extract_nouns_Frequency_stopwords.xlsx")

In [18]:
# Convert the frequency table into a list of plain (word, count) tuples
word_list = list(
    word_count_desc[['word', 'count']].itertuples(index=False, name=None)
)

In [19]:
# Flatten the per-abstract noun lists into one list of all extracted tokens
all_words = [
    token
    for tokens in df_all["tokenized_abstract"]
    for token in tokens
]

# One token per row, duplicates preserved
all_words_df = pd.DataFrame(all_words, columns=["words"])

In [20]:
# Save every extracted noun, duplicates included, to Excel (intended for a word cloud)
all_words_df.to_excel(r"C:/Users/user/Desktop/토픽모델링_교수님/Data/kr_extract_nouns_stopwords.xlsx")