In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential, Model, load_model
from tensorflow.keras.layers import Dense, Embedding, LSTM, GRU, Bidirectional, Dropout
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
import re

In [None]:
from google.colab import drive

# Colab-only step: attach Google Drive at the location the data paths are built from.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
import os

In [None]:
# Directories on the mounted Drive; both currently point at the same folder.
my_path = '/content/drive/MyDrive/쿠다/쿠다 3기' # personal working directory
project_path = '/content/drive/MyDrive/쿠다/쿠다 3기' # shared project directory where result files are saved

def load_data(dir, file_name):
    """Read the CSV file ``file_name`` located in directory ``dir``.

    Args:
        dir: Directory that contains the file.
        file_name: Name of the CSV file inside ``dir``.

    Returns:
        The parsed contents as a pandas DataFrame.
    """
    csv_path = os.path.join(dir, file_name)
    return pd.read_csv(csv_path)

# Raw table exactly as read from result.csv.
train_raw = load_data(my_path, 'result.csv')
# Working copy used for training; later cells modify this one, not train_raw.
train_df = train_raw.copy()

print(train_raw.head())
print(train_df['Target'].value_counts())

                                           Utterance    Target
0  also I was the point person on my company s tr...   neutral
1                   You must ve had your hands full    neutral
2                            That I did  That I did    neutral
3      So let s talk a little bit about your duties    neutral
4                             My duties   All right   surprise
worry       9299
sadness     6779
neutral     4710
anger       4040
fear        2880
joy         2425
surprise    2065
Name: Target, dtype: int64


In [None]:
# Missing values per column, before cleaning.
train_df.isna().sum()

Utterance    11
Target        0
dtype: int64

In [None]:
# Discard every row that has a missing value in any column.
train_df = train_df.dropna(axis=0, how='any')

In [None]:
# Missing values per column, after cleaning (every count should now be 0).
train_df.isna().sum()

Utterance    0
Target       0
dtype: int64

Target 값들을 정수 레이블로 바꿔보겠습니다.
worry: 0 / sadness: 1 / anger: 2 / neutral: 3 / fear: 4 / joy: 5 / surprise: 6

In [None]:
# Integer class id assigned to each emotion name.
EMOTION_TO_ID = {
    'worry': 0,
    'sadness': 1,
    'anger': 2,
    'neutral': 3,
    'fear': 4,
    'joy': 5,
    'surprise': 6,
}


def _encode_emotion(label):
    """Return the class id for an emotion name; values that are already ids pass through.

    Raises:
        KeyError: if ``label`` is a string that is not one of the seven known emotions.
    """
    if isinstance(label, str):
        return EMOTION_TO_ID[label.strip()]
    # Already encoded (e.g. this cell was run before): leave untouched so a re-run is harmless.
    return label


# One exact-match pass instead of seven regex substring searches; the final cast gives the
# column a real integer dtype instead of leaving Python ints in an object column.
train_df['Target'] = train_df['Target'].map(_encode_emotion).astype(int)

In [None]:
# First five rows, to confirm Target now holds class ids.
train_df.head(5)

Unnamed: 0,Utterance,Target
0,also I was the point person on my company s tr...,3
1,You must ve had your hands full,3
2,That I did That I did,3
3,So let s talk a little bit about your duties,3
4,My duties All right,6


In [None]:
# column engineering
# Rename first, then select: rename() silently skips a missing 'Utterance' key, so this cell
# can be re-run after the column has already been renamed. Building a new frame (instead of
# an in-place rename on a slice) also avoids pandas' SettingWithCopyWarning.
train_df = train_df.rename(columns={'Utterance': 'document'})[['document', 'Target']]

# check data
display(train_df.head(10))

Unnamed: 0,document,Target
0,also I was the point person on my company s tr...,3
1,You must ve had your hands full,3
2,That I did That I did,3
3,So let s talk a little bit about your duties,3
4,My duties All right,6
5,Now you ll be heading a whole division so you...,3
6,I see,3
7,But there ll be perhaps people under you so...,3
8,Good to know,3
9,We can go into detail,3


In [None]:
! install pandas --upgrade
! install numpy --upgrade

install: unrecognized option '--upgrade'
Try 'install --help' for more information.
install: unrecognized option '--upgrade'
Try 'install --help' for more information.


In [None]:
 # Rebuild a contiguous 0..n-1 index; the earlier dropna() removed rows and left gaps in the labels.
 train_df = train_df.reset_index(drop=True)

In [None]:
# Split into train/test (80/20), keeping the class proportions of Target in both parts.
splitter = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

# n_splits=1, so take the single (train, test) pair directly instead of looping.
train_index, test_index = next(splitter.split(train_df, train_df["Target"]))

# The splitter yields row *positions*, so select with .iloc; .loc would only be correct
# while the frame happens to carry a 0..n-1 index.
# reset_index(drop=True) renumbers each part from 0 and returns an independent frame.
train_data = train_df.iloc[train_index].reset_index(drop=True)
test_data = train_df.iloc[test_index].reset_index(drop=True)

In [None]:
# Class proportions of the full set, the train split and the test split, in that order.
for frame in (train_df, train_data, test_data):
    print(frame.Target.value_counts()/len(frame))

# Inspect both splits: row count followed by the first ten rows.
print(f"train data의 개수 : {len(train_data)}")
display(train_data.head(10))
print(f"test data의 개수 : {len(test_data)}")
display(test_data.head(10))

0    0.288750
1    0.210613
3    0.146239
2    0.125454
4    0.089477
5    0.075310
6    0.064156
Name: Target, dtype: float64
0    0.288749
1    0.210610
3    0.146258
2    0.125442
4    0.089479
5    0.075304
6    0.064158
Name: Target, dtype: float64
0    0.288754
1    0.210624
3    0.146163
2    0.125505
4    0.089469
5    0.075334
6    0.064150
Name: Target, dtype: float64
train data의 개수 : 25749


Unnamed: 0,document,Target
0,where is my melly belly when i need her,0
1,Pitching my voice to a tone of mild puzzlement...,6
2,I don t know I noticed that happens to me so...,0
3,Fine,2
4,i soon realized that an initial attraction to ...,5
5,i feel rather pissed off,2
6,Oooommmmggg you probs will too busy to slay...,0
7,Get off,2
8,Her Sweet n Los,5
9,i appreciate how clean their lifestyles are ev...,1


test data의 개수 : 6438


Unnamed: 0,document,Target
0,That little naked guy would be me,3
1,My throat hurts and I can t sleep,0
2,Really,6
3,Oh look at the little cat,6
4,when i learned that my former boyfriend had be...,1
5,Yeah,3
6,i feel awkward and so i start acting awkward lol,1
7,They paid you to go Think before voting eac...,0
8,Oh my God Those are my bedroom eyes Why did...,6
9,To cap things off the lower part of my back r...,0


In [None]:
# Number of emotion classes (ids 0..6).
NUM_CLASSES = 7

# Extract the labels as plain 1-D integer arrays. Target can be an object-dtype column
# (Python ints written into a former string column), which cannot safely be used as an
# index array when building the one-hot matrix.
train_labels = train_data["Target"].astype(int).to_numpy()
test_labels = test_data["Target"].astype(int).to_numpy()

# One-hot encoding: one row per sample, one column per class.
y_train = to_categorical(train_labels, NUM_CLASSES)
y_test = to_categorical(test_labels, NUM_CLASSES)

In [None]:
import nltk
nltk.download('stopwords')

# Load the English stopword list once and report its size and first 40 entries.
english_stopwords = nltk.corpus.stopwords.words('english')
print('영어 불용어 갯수:', len(english_stopwords))
print(english_stopwords[:40])

영어 불용어 갯수: 179
['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this']


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
import nltk
## Run once, then comment out ##
# Fetch only the named corpora instead of the whole 'all' collection, which is a very
# large download and floods the cell output; quiet=True suppresses the per-package log.
for resource in ('wordnet', 'stopwords'):
    nltk.download(resource, quiet=True)
from nltk.corpus import stopwords

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/abc.zip.
[nltk_data]    | Downloading package alpino to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/alpino.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping
[nltk_data]    |       taggers/averaged_perceptron_tagger_ru.zip.
[nltk_data]    | Downloading package basque_grammars to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping grammars/basque_grammars.zip.
[nltk_data]    | Downloading package bcp47 to /root/nltk_data...
[nltk_data]    | Downloading package biocreative_ppi to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   U

In [None]:
_ENGLISH_STOPWORDS = None  # cache, filled on first use by _english_stopwords()


def _english_stopwords():
  """Return the English stopword list as a set, building it only on the first call."""
  global _ENGLISH_STOPWORDS
  if _ENGLISH_STOPWORDS is None:
    _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
  return _ENGLISH_STOPWORDS


def preprocessing(text, remove_stopwords=False):
  """Normalize one utterance to lowercase alphabetic words separated by single spaces.

  Args:
    text: Raw utterance. Anything that is not a ``str`` (e.g. ``None`` or a NaN
      from a missing cell) is treated as empty text.
    remove_stopwords: When True, also drop English stopwords.

  Returns:
    The cleaned text; an empty string if no words remain.
  """
  if not isinstance(text, str):
    return ''

  # Replace every non-letter character with a space, then lowercase and
  # split on whitespace to get the word list.
  words = re.sub("[^a-zA-Z]", " ", text).lower().split()

  if remove_stopwords:
    # The stopword set is cached: this function runs once per row, and rebuilding
    # the set on each call was pure repeated work.
    stops = _english_stopwords()
    words = [w for w in words if w not in stops]

  # Join the remaining words back into a single string.
  return ' '.join(words)

In [None]:
# Add a stopword-free, normalized 'clean_text' column to each split (train first, then test)
# and print the resulting column.
for split_frame in (train_data, test_data):
    split_frame['clean_text'] = split_frame['document'].apply(preprocessing, remove_stopwords=True)
    print(split_frame['clean_text'])

0                                         melly belly need
1        pitching voice tone mild puzzlement called rat...
2                           know noticed happens sometimes
3                                                     fine
4        soon realized initial attraction activity feel...
                               ...                        
25744                       feel useful valued fundamental
25745                                          love really
25746    think would agree feeling toes fingers go numb...
25747                           must feeling little cranky
25748    mustered energy feel christmassy remember feel...
Name: clean_text, Length: 25749, dtype: object
0                                  little naked guy would
1                                      throat hurts sleep
2                                                  really
3                                      oh look little cat
4       learned former boyfriend become engaged althou...
              

토큰화

In [None]:
# The vocabulary is learned from the training text only; the same fitted tokenizer
# then converts both splits into sequences of word indices.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_data['clean_text'])
train_sequences, test_sequences = (
    tokenizer.texts_to_sequences(split_frame['clean_text'])
    for split_frame in (train_data, test_data)
)

In [None]:
# Show only the first few sequences; printing the whole list dumps every test row
# into the cell output.
print(test_sequences[:5])

[[17, 1198, 190, 19], [685, 292, 87], [7], [8, 40, 17, 711], [970, 4789, 803, 465, 3124, 736, 474, 229, 170], [24], [1, 513, 161, 2205, 513, 122], [1705, 9, 11, 4716, 1613, 7442], [8, 64, 3020, 249, 96, 87], [8639, 43, 2509, 256, 23, 7, 292], [3415], [27], [537, 88, 85, 97, 40, 457, 2152, 1041], [2, 66, 221, 219, 142, 14732, 4388, 259, 2892, 230, 1529, 72, 1809, 687, 233, 178, 2102], [21, 23, 761, 120, 272, 9, 23, 74, 10, 119, 1163, 203, 593, 3117], [34, 101], [5, 1948, 11, 834, 7202, 188, 519, 2088, 1635, 1246, 2276, 3, 188, 1712, 1, 176], [60, 244, 2, 558, 108, 72], [87, 732, 1564, 2, 925, 115, 127, 9744, 3526, 4477, 115, 963], [350, 671, 2971, 182, 7, 777], [], [41, 74, 26, 2786, 2083, 787, 1, 160, 132], [34, 201, 1636, 62, 27], [280, 13, 2363, 637, 204, 231, 27], [4, 7, 825, 1730, 4, 7, 82, 137, 1138, 536, 2094, 4, 7, 2, 745, 18], [1090, 2563, 4597, 599, 46], [310, 94, 348, 94, 1277, 829, 407], [8, 8, 8], [2, 666, 288, 3993], [237, 17891, 4383, 5432, 42, 13, 6, 2502, 230, 1477, 42,

In [None]:
# word -> integer index mapping learned by the tokenizer.
word_vocab = tokenizer.word_index
# Preview the first 20 entries instead of printing the entire dictionary.
print(list(word_vocab.items())[:20])
print("전체 단어 개수:", len(word_vocab))

전체 단어 개수: 19461


In [None]:
# Vocabulary metadata. vocab_size is one more than the number of words.
data_configs = {
    'vocab': word_vocab,
    'vocab_size': len(word_vocab) + 1,
}

패딩

In [None]:
# Maximum sequence length, in tokens.
MAX_SEQUENCE_LENGTH = 64

# Pad at the end of each sequence ('post') so every row has MAX_SEQUENCE_LENGTH entries.
train_inputs = pad_sequences(train_sequences, maxlen=MAX_SEQUENCE_LENGTH, padding='post')
print('Shape of train data: ', train_inputs.shape)

test_inputs = pad_sequences(test_sequences, maxlen=MAX_SEQUENCE_LENGTH, padding='post')
# This line previously printed the test shape under the label "train data".
print('Shape of test data: ', test_inputs.shape)

Shape of train data:  (25749, 64)
Shape of train data:  (6438, 64)


패키지 준비


In [None]:
from tensorflow.keras.layers import Embedding, Dense, LSTM
from tensorflow.keras.models import Sequential
from tensorflow.keras.models import load_model
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

In [None]:
# Sequence classifier: token ids -> embeddings -> LSTM summary vector -> 7-way softmax.
model = Sequential()
# input_dim must cover every index the tokenizer can emit. The hard-coded 19000 was
# smaller than the vocabulary, so the highest word indices fell outside the table.
model.add(Embedding(data_configs['vocab_size'], 100))
# Collapse the (timesteps, features) sequence into one vector per sample. Without a
# sequence-reducing layer the Dense output is (batch, 64, 7), which cannot be compared
# with the (batch, 7) one-hot labels and makes fit() raise a ValueError.
# NOTE(review): padding index 0 is not masked here; consider masking if that hurts accuracy.
model.add(LSTM(128))
model.add(Dense(7, activation='softmax'))

In [None]:
# Stop training once validation loss has not improved for 4 consecutive epochs.
es = EarlyStopping(
    monitor='val_loss',
    mode='min',
    patience=4,
    verbose=1,
)
# Write the model to disk only when validation accuracy improves on the best seen so far.
mc = ModelCheckpoint(
    filepath='best_model.h5',
    monitor='val_acc',
    save_best_only=True,
    verbose=1,
)

In [None]:
# The metric is registered under the name 'acc', which is what the 'val_acc' monitor refers to.
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])

# Hold out 20% of the training inputs for validation; early stopping and checkpointing are
# driven by the two callbacks.
fit_options = dict(epochs=15, batch_size=30, validation_split=0.2, callbacks=[es, mc])
history = model.fit(train_inputs, y_train, **fit_options)

Epoch 1/15


ValueError: ignored

In [None]:
# evaluate() returns the loss first, then the compiled metric (accuracy).
score = model.evaluate(test_inputs, y_test)
for label, value in zip(('Test loss:', 'Test accuracy:'), score):
    print(label, value)