# 9.1 자연어 처리란

## 9.1.1 자연어 처리 용어 및 과정

In [1]:
# corpus
# token, tokenization
# stop words: 문장의 의미와 관계가 없으며 'a', 'the', 'she', 'he' 등등
# stemming: 단어를 기본 형태로 만드는 작업 ex) consigned, consigning, consignment -> consign
# part-of-speech tagging: 주어진 문장에서 품사를 식별하기 위해 붙여주는 태그 ex) Det, Noun, Verb, Prep, ...

In [2]:
!pip install nltk



In [3]:
# 문장 토큰화

import nltk

nltk.download('punkt')
text = nltk.word_tokenize('Is it possible distinguishing cats and dogs')
text

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


['Is', 'it', 'possible', 'distinguishing', 'cats', 'and', 'dogs']

In [4]:
# 태깅에 필요한 자원 내려받기

nltk.download('averaged_perception_tagger')

[nltk_data] Error loading averaged_perception_tagger: Package
[nltk_data]     'averaged_perception_tagger' not found in index


False

In [5]:
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [6]:
# 품사 태깅

nltk.pos_tag(text)

[('Is', 'VBZ'),
 ('it', 'PRP'),
 ('possible', 'JJ'),
 ('distinguishing', 'VBG'),
 ('cats', 'NNS'),
 ('and', 'CC'),
 ('dogs', 'NNS')]

## 9.1.2 자연어 처리를 위한 라이브러리

In [7]:
# nltk 라이브러리 호출 및 문장 정의

import nltk

# nltk.download('punkt')
string1 = "my favorite subject is math"
string2 = "my favorite subject is math, english, economic and computer science"
nltk.word_tokenize(string1)

['my', 'favorite', 'subject', 'is', 'math']

In [8]:
nltk.word_tokenize(string2)

['my',
 'favorite',
 'subject',
 'is',
 'math',
 ',',
 'english',
 ',',
 'economic',
 'and',
 'computer',
 'science']

In [9]:
!python3 -m pip install --upgrade pip

Collecting pip
  Downloading pip-22.1-py3-none-any.whl (2.1 MB)
[K     |████████████████████████████████| 2.1 MB 4.9 MB/s 
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 21.1.3
    Uninstalling pip-21.1.3:
      Successfully uninstalled pip-21.1.3
Successfully installed pip-22.1


In [10]:
!python3 -m pip install konlpy

Collecting konlpy
  Downloading konlpy-0.6.0-py2.py3-none-any.whl (19.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.4/19.4 MB[0m [31m66.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting JPype1>=0.7.0
  Downloading JPype1-1.3.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl (448 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m449.0/449.0 kB[0m [31m44.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: JPype1, konlpy
Successfully installed JPype1-1.3.0 konlpy-0.6.0
[0m

In [11]:
# 라이브러리 호출 및 문장을 형태로 변환

from konlpy.tag import Komoran

komoran = Komoran()
print(komoran.morphs('딥러닝이 쉽나요? 어렵나요?'))

['딥러닝이', '쉽', '나요', '?', '어렵', '나요', '?']


In [12]:
# 품사 태깅

print(komoran.pos('소파 위에 있는 것이 고양이인가요? 강아지인가요?'))

[('소파', 'NNP'), ('위', 'NNG'), ('에', 'JKB'), ('있', 'VV'), ('는', 'ETM'), ('것', 'NNB'), ('이', 'JKS'), ('고양이', 'NNG'), ('이', 'VCP'), ('ㄴ가요', 'EF'), ('?', 'SF'), ('강아지', 'NNG'), ('이', 'VCP'), ('ㄴ가요', 'EF'), ('?', 'SF')]


In [13]:
!pip install -U gensim

Collecting gensim
  Downloading gensim-4.2.0-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (24.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m59.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: gensim
  Attempting uninstall: gensim
    Found existing installation: gensim 3.6.0
    Uninstalling gensim-3.6.0:
      Successfully uninstalled gensim-3.6.0
Successfully installed gensim-4.2.0
[0m

# 9.2 전처리

In [14]:
# 문장 -> 결측치 확인 토큰화 -> 단어 색인 -> 불용어 제거 -> 축소된 단어 색인 -> 어간 추출

## 9.2.1 결측치 확인

In [16]:
# 결측치를 확인할 데이터 호출

import pandas as pd
df = pd.read_csv('/content/class2.csv')
df

Unnamed: 0.1,Unnamed: 0,id,tissue,class,class2,x,y,r
0,0,mdb000,C,CIRC,N,535.0,475.0,192.0
1,1,mdb001,A,CIRA,N,433.0,268.0,58.0
2,2,mdb002,A,CIRA,I,,,
3,3,mdb003,C,CIRC,B,,,
4,4,mdb004,F,CIRF,I,488.0,145.0,29.0
5,5,mdb005,F,CIRF,B,544.0,178.0,26.0


In [17]:
# 결측치 개수 확인

df.isnull().sum()

Unnamed: 0    0
id            0
tissue        0
class         0
class2        0
x             2
y             2
r             2
dtype: int64

In [18]:
# 결측치 비율

df.isnull().sum() / len(df)

Unnamed: 0    0.000000
id            0.000000
tissue        0.000000
class         0.000000
class2        0.000000
x             0.333333
y             0.333333
r             0.333333
dtype: float64

In [19]:
# 결측치 삭제 처리

df = df.dropna(how='all')
print(df)

   Unnamed: 0      id tissue class class2      x      y      r
0           0  mdb000      C  CIRC      N  535.0  475.0  192.0
1           1  mdb001      A  CIRA      N  433.0  268.0   58.0
2           2  mdb002      A  CIRA      I    NaN    NaN    NaN
3           3  mdb003      C  CIRC      B    NaN    NaN    NaN
4           4  mdb004      F  CIRF      I  488.0  145.0   29.0
5           5  mdb005      F  CIRF      B  544.0  178.0   26.0


In [20]:
# 결측치 삭제 처리

df1 = df.dropna()
print(df1)

   Unnamed: 0      id tissue class class2      x      y      r
0           0  mdb000      C  CIRC      N  535.0  475.0  192.0
1           1  mdb001      A  CIRA      N  433.0  268.0   58.0
4           4  mdb004      F  CIRF      I  488.0  145.0   29.0
5           5  mdb005      F  CIRF      B  544.0  178.0   26.0


In [21]:
# 결측치를 0으로 채우기

df2 = df.fillna(0)
print(df2)

   Unnamed: 0      id tissue class class2      x      y      r
0           0  mdb000      C  CIRC      N  535.0  475.0  192.0
1           1  mdb001      A  CIRA      N  433.0  268.0   58.0
2           2  mdb002      A  CIRA      I    0.0    0.0    0.0
3           3  mdb003      C  CIRC      B    0.0    0.0    0.0
4           4  mdb004      F  CIRF      I  488.0  145.0   29.0
5           5  mdb005      F  CIRF      B  544.0  178.0   26.0


In [22]:
# 결측치를 평균으로 채우기

df['x'].fillna(df['x'].mean(), inplace=True)
print(df)

   Unnamed: 0      id tissue class class2      x      y      r
0           0  mdb000      C  CIRC      N  535.0  475.0  192.0
1           1  mdb001      A  CIRA      N  433.0  268.0   58.0
2           2  mdb002      A  CIRA      I  500.0    NaN    NaN
3           3  mdb003      C  CIRC      B  500.0    NaN    NaN
4           4  mdb004      F  CIRF      I  488.0  145.0   29.0
5           5  mdb005      F  CIRF      B  544.0  178.0   26.0


## 9.2.2 토큰화

In [23]:
# 문장 토큰화

from nltk import sent_tokenize
text_sample = """
Natural Language Processing, or NLP, is the process of extracting the meaning, or intent, behind human language.
In the field of Conversational artificial intelligence (AI), NLP allows machines and applications to understand the intent of human language inputs, and then generate appropriate responses, resulting in a natural conversation flow.
"""

tokenized_sentences = sent_tokenize(text_sample)
print(len(tokenized_sentences))

2


In [24]:
# 단어 토큰화

from nltk import word_tokenize
sentence = "This book is for deep learning learners"
words = word_tokenize(sentence)
print(words)

['This', 'book', 'is', 'for', 'deep', 'learning', 'learners']


In [25]:
# 아포스트로피가 포함된 문장에서 단어 토큰화

from nltk.tokenize import WordPunctTokenizer
sentence = "it's nothing that you don't already know except most people aren't aware of how their inner world works."
words = WordPunctTokenizer().tokenize(sentence)
print(words)

['it', "'", 's', 'nothing', 'that', 'you', 'don', "'", 't', 'already', 'know', 'except', 'most', 'people', 'aren', "'", 't', 'aware', 'of', 'how', 'their', 'inner', 'world', 'works', '.']


In [26]:
# 케라스를 이용한 단어 토큰화

from tensorflow.keras.preprocessing.text import text_to_word_sequence
sentence = "it's nothing that you don't already know except most people aren't aware of how their inner world works."
words = text_to_word_sequence(sentence)
print(words)

["it's", 'nothing', 'that', 'you', "don't", 'already', 'know', 'except', 'most', 'people', "aren't", 'aware', 'of', 'how', 'their', 'inner', 'world', 'works']


In [27]:
# 라이브러리 호출 및 데이터셋 준비

import csv
from konlpy.tag import Okt
from gensim.models import word2vec

f = open('/content/ratings_train.txt', 'r', encoding='utf-8')
rdr = csv.reader(f, delimiter='\t')
rdw = list(rdr)
f.close()

In [None]:
# 오픈 소스 한글 형태소 분석기 호출

twitter = Okt()

result = []
for line in rdw:
  malist = twitter.pos(line[1], norm=True, stem=True)
  r = []
  for word in malist:
    if not word[1] in ["Josa", "Eomi", "Punctuation"]:
      r.append(word[0])
    
  rl = (" ".join(r)).strip()
  result.append(rl)

In [None]:
# 형태소 저장

with open("NaverMovie.nlp", 'w', encoding='utf-8') as fp:
  fp.write("\n".join(result))

In [None]:
# Word2Vec 모델 생성

mData = word2vec.LineSentence("NaverMovie.nlp")
mModel = word2vec.Word2Vec(mData, size=200, window=10, hs=1, min_count=2, sg=1)
mModel.save("NaverMovie.model")

## 9.2.3 Stop words 제거

In [28]:
# 불용어 제거

import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('punkt')
from nltk.tokenize import word_tokenize

sample_text = "One of the first things that we ask ourselves is what are the pros and cons of any task we perform."
text_tokens = word_tokenize(sample_text)

tokens_without_sw = [word for word in text_tokens if not word in stopwords.words('english')]
print("불용어 제거 미적용:", text_tokens, '\n')
print("불용어 제거 적용:", tokens_without_sw)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
불용어 제거 미적용: ['One', 'of', 'the', 'first', 'things', 'that', 'we', 'ask', 'ourselves', 'is', 'what', 'are', 'the', 'pros', 'and', 'cons', 'of', 'any', 'task', 'we', 'perform', '.'] 

불용어 제거 적용: ['One', 'first', 'things', 'ask', 'pros', 'cons', 'task', 'perform', '.']


## 9.2.4 어간 추출

In [29]:
# 포터 알고리즘

from nltk.stem import PorterStemmer
stemmer = PorterStemmer()

print(stemmer.stem('obsess'), stemmer.stem('obsessed'))
print(stemmer.stem('standardizeds'), stemmer.stem('standardization'))
print(stemmer.stem('national'), stemmer.stem('nation'))
print(stemmer.stem('absentness'), stemmer.stem('absently'))
print(stemmer.stem('tribalical'), stemmer.stem('tribalicalized'))

obsess obsess
standard standard
nation nation
absent absent
tribal tribalic


In [30]:
# 랭커스터 알고리즘

from nltk.stem import LancasterStemmer
stemmer = LancasterStemmer()

print(stemmer.stem('obsess'), stemmer.stem('obsessed'))
print(stemmer.stem('standardizeds'), stemmer.stem('standardization'))
print(stemmer.stem('national'), stemmer.stem('nation'))
print(stemmer.stem('absentness'), stemmer.stem('absently'))
print(stemmer.stem('tribalical'), stemmer.stem('tribalicalized'))

obsess obsess
standard standard
nat nat
abs abs
trib trib


In [31]:
# 표제어 추출

import nltk
nltk.download('wordnet')

from nltk.stem import WordNetLemmatizer
lemma = WordNetLemmatizer()

print(lemma.lemmatize('obsess'), lemma.lemmatize('obsessed'))
print(lemma.lemmatize('standardizeds'), lemma.lemmatize('standardization'))
print(lemma.lemmatize('national'), lemma.lemmatize('nation'))
print(lemma.lemmatize('absentness'), lemma.lemmatize('absently'))
print(lemma.lemmatize('tribalical'), lemma.lemmatize('tribalicalized'))

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
ob obsessed
standardizeds standardization
national nation
absentness absently
tribalical tribalicalized


In [32]:
print(lemma.lemmatize('obsess', 'v'), lemma.lemmatize('obsessed', 'a'))
print(lemma.lemmatize('standardizeds', 'v'), lemma.lemmatize('standardization', 'n'))
print(lemma.lemmatize('national', 'a'), lemma.lemmatize('nation', 'n'))
print(lemma.lemmatize('absentness', 'n'), lemma.lemmatize('absently', 'r'))
print(lemma.lemmatize('tribalical', 'a'), lemma.lemmatize('tribalicalized', 'v'))

obsess obsessed
standardize standardization
national nation
absentness absently
tribalical tribalicalized


## 9.2.5 정규화

In [33]:
# 라이브러리 호출

import pandas as pd
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.python.data import Dataset
from tensorflow.keras.utils import to_categorical
from tensorflow.keras import models
from tensorflow.keras import layers

In [38]:
# 데이터셋 로딩 및 모델 훈련

df = pd.read_csv('/content/covtype.csv')

x = df[df.columns[:54]]
y = df.Cover_Type

x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.7, random_state=90)

model = tf.keras.Sequential([
    tf.keras.layers.Dense(64, activation='relu', input_shape=(x_train.shape[1],)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(8, activation='softmax')                        
])

model.compile(optimizer=tf.keras.optimizers.Adam(0.001),
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

history1 = model.fit(
    x_train, y_train,
    epochs=26, batch_size=60,
    validation_data=(x_test, y_test)
)

Epoch 1/26
Epoch 2/26
Epoch 3/26
Epoch 4/26
Epoch 5/26
Epoch 6/26
Epoch 7/26
Epoch 8/26
Epoch 9/26
Epoch 10/26
Epoch 11/26
Epoch 12/26
Epoch 13/26
Epoch 14/26
Epoch 15/26
Epoch 16/26
Epoch 17/26
Epoch 18/26
Epoch 19/26
Epoch 20/26
Epoch 21/26
Epoch 22/26
Epoch 23/26
Epoch 24/26
Epoch 25/26
Epoch 26/26


In [39]:
# 데이터 정규화

from sklearn import preprocessing
df = pd.read_csv('/content/covtype.csv')
x = df[df.columns[:55]]
y = df.Cover_Type
x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.7, random_state=90)

train_norm = x_train[x_train.columns[0:10]]
test_norm = x_test[x_test.columns[0:10]]

std_scale = preprocessing.StandardScaler().fit(train_norm)
x_train_norm = std_scale.transform(train_norm)

training_norm_col = pd.DataFrame(x_train_norm, index=train_norm.index, columns=train_norm.columns)
x_train.update(training_norm_col)
print(x_train.head())

x_test_norm = std_scale.transform(test_norm)
testing_norm_col = pd.DataFrame(x_test_norm, index=test_norm.index, columns=test_norm.columns)
x_test.update(testing_norm_col)
print(x_test.head())

        Elevation    Aspect     Slope  Horizontal_Distance_To_Hydrology  \
152044   0.222366 -0.228639 -0.412503                          0.148486   
363373   1.980490 -0.469989  0.255453                          3.018822   
372733  -1.081933  0.271939  0.389044                         -0.867895   
572846  -1.164122 -0.157128 -0.278912                         -1.267860   
114145  -0.052787  0.861906  0.255453                         -0.279711   

        Vertical_Distance_To_Hydrology  Horizontal_Distance_To_Roadways  \
152044                        0.149095                         1.336119   
363373                        4.443372                         0.168073   
372733                       -0.160093                        -0.241801   
572846                       -0.795646                        -0.461170   
114145                       -0.125739                         1.811419   

        Hillshade_9am  Hillshade_Noon  Hillshade_3pm  \
152044       1.002687        0.539776     

In [40]:
model = tf.keras.Sequential([
    tf.keras.layers.Dense(64, activation='relu', input_shape=(x_train.shape[1],)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(8, activation='softmax')                        
])

model.compile(optimizer=tf.keras.optimizers.Adam(0.001),
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

history2 = model.fit(
    x_train, y_train,
    epochs=26, batch_size=60,
    validation_data=(x_test, y_test)
)

Epoch 1/26
Epoch 2/26
Epoch 3/26
Epoch 4/26
Epoch 5/26
Epoch 6/26
Epoch 7/26
Epoch 8/26
Epoch 9/26
Epoch 10/26
Epoch 11/26
Epoch 12/26
Epoch 13/26
Epoch 14/26
Epoch 15/26
Epoch 16/26
Epoch 17/26
Epoch 18/26
Epoch 19/26
Epoch 20/26
Epoch 21/26
Epoch 22/26
Epoch 23/26
Epoch 24/26
Epoch 25/26
Epoch 26/26
