In [2]:
from collections import Counter

# Sample corpus: three short Korean sentences
corpus = [
    "자연어 처리는 재미있다.",
    "자연어 처리는 컴퓨터 과학의 한 분야다.",
    "데이터 분석은 자연어 처리와 밀접하다.",
]

# Tokenize every sentence on whitespace; punctuation stays attached to its word
tokenized_corpus = list(map(str.split, corpus))

# Flatten the per-sentence token lists into one list, preserving order
flat_tokens = []
for sentence_tokens in tokenized_corpus:
    flat_tokens += sentence_tokens
print(flat_tokens)

# Word-frequency analysis: count how often each token occurs
word_freq = Counter(flat_tokens)
print("단어 빈도 분석 결과:", word_freq)

['자연어', '처리는', '재미있다.', '자연어', '처리는', '컴퓨터', '과학의', '한', '분야다.', '데이터', '분석은', '자연어', '처리와', '밀접하다.']
단어 빈도 분석 결과: Counter({'자연어': 3, '처리는': 2, '재미있다.': 1, '컴퓨터': 1, '과학의': 1, '한': 1, '분야다.': 1, '데이터': 1, '분석은': 1, '처리와': 1, '밀접하다.': 1})


In [3]:
from collections import Counter

corpus = ["""희영님 마파두부
도영이 차돌짬뽕면
헌상이 짜장면
양창일 차돌짬뽕밥
다빈이(따라쟁이) 차돌짬뽕밥
민채 간짜장 
민정 짬뽕
승원이(따라쟁이2) 차돌짬뽕밥"""]


def extract_menus(order_lines):
    """Return the menu item ordered on each "<name> <menu>" line.

    The first whitespace-delimited token of a line is treated as the
    person's name and dropped; everything after it is one menu item, so a
    multi-word menu name is counted as a single order instead of being
    split into separate words.

    Args:
        order_lines: iterable of strings, one order per line.

    Returns:
        A list of menu strings in input order. Lines that are empty or
        contain only a name contribute nothing.
    """
    menus = []
    for line in order_lines:
        tokens = line.split()  # split() also discards stray trailing spaces
        if len(tokens) > 1:
            # Re-join so a multi-word menu stays a single item.
            menus.append(" ".join(tokens[1:]))
    return menus


# Split into lines, covering every corpus entry rather than only the first
lines = [line for text in corpus for line in text.splitlines()]

# Drop the name (first word) from each line, keeping one menu per order
menus = extract_menus(lines)

# Frequency analysis
word_freq = Counter(menus)
print(word_freq)

Counter({'차돌짬뽕밥': 3, '마파두부': 1, '차돌짬뽕면': 1, '짜장면': 1, '간짜장': 1, '짬뽕': 1})


In [4]:
from nltk.corpus import wordnet

# Synset usage example: look up every WordNet synset for a word and print
# the definition of each one.
word = "bank"
synsets = wordnet.synsets(word)
for syn in synsets:
    # One output line per synset; the "뜻" label means "meaning".
    print(f"뜻: {syn.definition()}")

뜻: sloping land (especially the slope beside a body of water)
뜻: a financial institution that accepts deposits and channels the money into lending activities
뜻: a long ridge or pile
뜻: an arrangement of similar objects in a row or in tiers
뜻: a supply or stock held in reserve for future use (especially in emergencies)
뜻: the funds held by a gambling house or the dealer in some gambling games
뜻: a slope in the turn of a road or track; the outside is higher than the inside in order to reduce the effects of centrifugal force
뜻: a container (usually with a slot in the top) for keeping money at home
뜻: a building in which the business of banking transacted
뜻: a flight maneuver; aircraft tips laterally about its longitudinal axis (especially in turning)
뜻: tip laterally
뜻: enclose with a bank
뜻: do business with a bank or keep an account at a bank
뜻: act as the banker in a game or in gambling
뜻: be in the banking business
뜻: put into a bank account
뜻: cover with ashes so to control the rate of burning
뜻: have confidence or faith in

In [5]:
# pip install spacy
# python -m spacy download en_core_web_sm

import spacy
# NOTE(review): the return value of this call is neither stored nor displayed,
# and the recorded cell output shows nothing from it — confirm it is still needed.
spacy.info()


# Load the English model (includes tokenization, POS, NER, dependency parser)
nlp = spacy.load("en_core_web_sm")

text = "Apple is looking at buying a startup in the UK for $1 billion."
doc = nlp(text)  # this single call is the part that runs everything "in one go"

print("=== TOKENS / POS / DEP ===")
for token in doc:
    # token.text: the token itself
    # token.pos_: part of speech
    # token.dep_: dependency relation (grammatical role)
    # token.head.text: the head word this token depends on
    print(f"{token.text:>10} | POS={token.pos_:<6} | DEP={token.dep_:<10} | HEAD={token.head.text}")

print("\n=== NAMED ENTITIES (NER) ===")
for ent in doc.ents:
    # ent.text: the named entity
    # ent.label_: entity type (ORG, GPE, MONEY, etc.)
    print(f"{ent.text:<20} -> {ent.label_}")


=== TOKENS / POS / DEP ===
     Apple | POS=PROPN  | DEP=nsubj      | HEAD=looking
        is | POS=AUX    | DEP=aux        | HEAD=looking
   looking | POS=VERB   | DEP=ROOT       | HEAD=looking
        at | POS=ADP    | DEP=prep       | HEAD=looking
    buying | POS=VERB   | DEP=pcomp      | HEAD=at
         a | POS=DET    | DEP=det        | HEAD=startup
   startup | POS=NOUN   | DEP=dobj       | HEAD=buying
        in | POS=ADP    | DEP=prep       | HEAD=buying
       the | POS=DET    | DEP=det        | HEAD=UK
        UK | POS=PROPN  | DEP=pobj       | HEAD=in
       for | POS=ADP    | DEP=prep       | HEAD=buying
         $ | POS=SYM    | DEP=quantmod   | HEAD=billion
         1 | POS=NUM    | DEP=compound   | HEAD=billion
   billion | POS=NUM    | DEP=pobj       | HEAD=for
         . | POS=PUNCT  | DEP=punct      | HEAD=looking

=== NAMED ENTITIES (NER) ===
Apple                -> ORG
UK                   -> GPE
$1 billion           -> MONEY


**서브워드(subword)** 토큰화는 단어를 더 작은 조각(부분 단위)으로 쪼개서 토큰화하는 방식이야.  
**예시**  
“삼성갤럭시S24”  
삼성 + 갤럭시 + S + 24 같은 식으로 나뉠 수 있음  