In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import os

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from tensorflow.keras import Input, Sequential
from tensorflow.keras.initializers import Constant
from tensorflow.keras.layers import Bidirectional, Dense, Embedding, Flatten, GRU, LSTM, SpatialDropout1D

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
# Lazily build the full path of every file below the Kaggle input directory,
# then print them one per line (same order as os.walk yields them).
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _subdirs, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for file_path in input_file_paths:
    print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Training set: a tab-separated file inside a zip archive
# (pandas infers the compression from the file extension).
train = pd.read_csv(
    '/kaggle/input/sentiment-analysis-on-movie-reviews/train.tsv.zip',
    sep='\t',
)
train

In [None]:
# Test set: same tab-separated layout as the training file, read from its zip archive.
test = pd.read_csv(
    '/kaggle/input/sentiment-analysis-on-movie-reviews/test.tsv.zip',
    sep='\t',
)
test

In [None]:
# Stack train on top of test (row-wise, original indices kept) so that the
# text preprocessing below sees every phrase of both splits.
all_data = pd.concat(objs=[train, test], axis=0, ignore_index=False)
all_data

In [None]:
# Peek at a single cell: first row, third column (positional scalar access).
all_data.iat[0, 2]

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer

# Fit one tokenizer on the phrases of train and test together, so both
# splits are encoded with the same word -> index table.
tk = Tokenizer()
tk.fit_on_texts(texts=all_data['Phrase'])
# Display the learned word -> integer index mapping.
tk.word_index

In [None]:
# Encode every phrase as the list of its word indices, then print the
# first encoded phrase as a quick sanity check.
text = tk.texts_to_sequences(texts=all_data['Phrase'])
print(text[0])

In [None]:
# Number of tokens in each encoded phrase.
pd.Series(text).map(len)

In [None]:
import seaborn as sns

# Plot the distribution of token counts per encoded phrase.
# `sns.distplot` is deprecated, so the same picture (normalised histogram with a
# KDE overlay) is drawn with `histplot`, following seaborn's migration advice.
sns.histplot(pd.Series(text).apply(len), kde=True, stat='density', kde_kws=dict(cut=3))

In [None]:
# Make every sequence the same length (padding).
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Sequences shorter than 30 tokens are zero-padded at the front and longer
# ones are cut at the front; both are the pad_sequences defaults, spelled out here.
pad_text = pad_sequences(text, maxlen=30, padding='pre', truncating='pre')
pad_text.shape

In [None]:
# all_data was built as [train, test], so the first len(train) rows of the
# padded matrix belong to the training set and the remainder to the test set.
n_train_rows = len(train)
train_2, test_2 = pad_text[:n_train_rows], pad_text[n_train_rows:]

In [None]:
def load_embedding(path):
    """Load a text-format word-embedding file into a ``{word: vector}`` dict.

    Each data line is expected to look like ``word v1 v2 ... vN`` with single
    spaces as separators (the fastText ``.vec`` / GloVe text layout).

    Args:
        path: Path of the embedding text file.

    Returns:
        dict mapping each word (str) to a 1-D ``np.float32`` array.

    Notes:
        * The file is read as UTF-8; undecodable bytes are dropped rather than
          aborting the whole load.
        * A leading fastText header line (``"<n_words> <dim>"``, two integers)
          is skipped instead of being stored as a bogus one-element vector.
          A file whose genuine first entry is an integer word with a single
          integer component would be mis-detected as a header.
        * Lines are split on the ASCII space only, so words containing other
          Unicode whitespace (e.g. a non-breaking space) stay intact.
    """
    embeddings = {}
    # newline='\n' disables universal-newline translation, so a stray '\r'
    # inside a token cannot break a record in two.
    with open(path, encoding='utf-8', newline='\n', errors='ignore') as f:
        for line_no, line in enumerate(f):
            # rstrip() removes the trailing space/newline that ends each line.
            values = line.rstrip().split(' ')
            if line_no == 0 and len(values) == 2 and all(v.isdigit() for v in values):
                continue  # fastText header: "<number of words> <dimension>"
            if len(values) < 2:
                continue  # blank line or a word without any vector component
            word = values[0]
            # float32 instead of the default float64 halves the memory footprint.
            embeddings[word] = np.asarray(values[1:], dtype=np.float32)
    return embeddings

# Parse the fastText crawl vectors into an in-memory {word: vector} dict.
embeddings = load_embedding(path='/kaggle/input/fasttext-crawl-300d-2m/crawl-300d-2M.vec')

In [None]:
def filter_embedding(embeddings,word_index,vocab_size,dim):
    """Build the embedding matrix restricted to the words of our own vocabulary.

    Args:
        embeddings: Mapping word -> 1-D vector (e.g. the dict returned by
            ``load_embedding``).
        word_index: Mapping word -> integer row index (e.g. ``tk.word_index``).
        vocab_size: Number of rows of the returned matrix.
        dim: Number of columns, i.e. the expected vector length.

    Returns:
        ``np.ndarray`` of shape ``(vocab_size, dim)``. Row ``i`` holds the
        vector of the word whose index is ``i``. A row stays all-zero when the
        word has no pretrained vector or its vector is not of length ``dim``.
        Words whose index falls outside ``[0, vocab_size)`` are ignored.
    """
    # Matrix sized to the vocabulary of our dataset, zero-initialised.
    embedding_matrix = np.zeros([vocab_size,dim])
    for word,i in word_index.items():
        if not 0 <= i < vocab_size:
            # Index beyond the matrix (e.g. a capped vocabulary): skip instead
            # of raising IndexError or wrapping around via negative indexing.
            continue
        vector = embeddings.get(word)   # .get() yields None for unknown words
        if vector is None:
            continue
        vector = np.asarray(vector)
        if vector.shape != (dim,):
            # A wrong-length vector would otherwise be broadcast over the row.
            continue
        embedding_matrix[i] = vector
    return embedding_matrix

# One row per tokenizer index, plus one extra row because the indices in
# tk.word_index are used directly as row numbers; 300 columns because the
# crawl vectors are 300-dimensional.
embedding_matrix = filter_embedding(
    embeddings,
    tk.word_index,
    vocab_size=len(tk.word_index)+1,
    dim=300,
)
print(embedding_matrix[1])   # vector of the word with index 1 (the original note says it is "the")

In [None]:
# Coverage check: how many words of our vocabulary have no pretrained vector.
sum(1 for word in tk.word_index if word not in embeddings)
# To list the missing words themselves: set(tk.word_index) - set(embeddings)

In [None]:
# Bidirectional-LSTM classifier on top of frozen pretrained embeddings.
# Sizes are read from the arrays built in earlier cells instead of repeating
# the literals 30 / 300, so this cell cannot drift out of sync with the
# padding and embedding steps.
vocab_size, embedding_dim = embedding_matrix.shape   # rows = vocabulary size, columns = vector length
sequence_length = train_2.shape[1]                   # padded sequence length

model = Sequential()
model.add(Input(shape=(sequence_length,), dtype='int32'))
# The pretrained vectors are injected through a Constant initializer: the
# `weights=` / `input_length=` constructor arguments are deprecated and are
# not accepted by every Keras release.
# trainable=False --> the embeddings are already trained, so they stay frozen.
model.add(Embedding(vocab_size, embedding_dim,
                    embeddings_initializer=Constant(embedding_matrix),
                    trainable=False))
# Dropout is applied right after the embedding layer. SpatialDropout1D drops
# whole embedding channels (rate 0.25) rather than individual values.
model.add(SpatialDropout1D(0.25))
# return_sequences=True --> the recurrent layer emits one output vector per
# timestep instead of only the last one, so the output keeps a time dimension.
model.add(Bidirectional(LSTM(32,return_sequences=True)))
# Flatten collapses (timesteps, features) into one vector per sample for the Dense head.
model.add(Flatten())
model.add(Dense(32,activation='relu'))
model.add(Dense(5,activation='softmax'))
# "sparse" loss --> the integer labels are treated as class ids (classification, not regression).
model.compile(metrics=['acc'],loss='sparse_categorical_crossentropy',optimizer='adam')
model.fit(train_2,train['Sentiment'],epochs=10)

In [None]:
# Second ensemble member: same architecture as the LSTM model, with a
# bidirectional GRU as the recurrent layer.
# Sizes are read from the arrays built in earlier cells instead of repeating
# the literals 30 / 300.
vocab_size, embedding_dim = embedding_matrix.shape   # rows = vocabulary size, columns = vector length
sequence_length = train_2.shape[1]                   # padded sequence length

model_2 = Sequential()
model_2.add(Input(shape=(sequence_length,), dtype='int32'))
# Pretrained vectors go in through a Constant initializer: the `weights=` /
# `input_length=` constructor arguments are deprecated and are not accepted
# by every Keras release. The embeddings stay frozen (trainable=False).
model_2.add(Embedding(vocab_size, embedding_dim,
                      embeddings_initializer=Constant(embedding_matrix),
                      trainable=False))
model_2.add(SpatialDropout1D(0.25))
# One output vector per timestep, flattened afterwards for the Dense head.
model_2.add(Bidirectional(GRU(32,return_sequences=True)))
model_2.add(Flatten())
model_2.add(Dense(32,activation='relu'))
model_2.add(Dense(5,activation='softmax'))
model_2.compile(metrics=['acc'],loss='sparse_categorical_crossentropy',optimizer='adam')
model_2.fit(train_2,train['Sentiment'],epochs=10)

In [None]:
# Ensemble: predict class probabilities with both networks (LSTM model first,
# GRU model second) and blend them with equal weights.
result_1, result_2 = (net.predict(test_2) for net in (model, model_2))

result = 0.5 * result_1 + 0.5 * result_2

In [None]:
sub = pd.read_csv('/kaggle/input/sentiment-analysis-on-movie-reviews/sampleSubmission.csv')

# Start from the most probable class of the blended prediction for each test row.
result_class = result.argmax(axis=1)

# Trick: when a test Phrase also occurs verbatim in train, replace the model's
# prediction with the label recorded in train.
# Each train row unpacks into four fields; the third is the phrase and the
# fourth its sentiment. For repeated phrases the last occurrence wins.
mapping = {
    phrase_text: label
    for _first, _second, phrase_text, label in train.values
}

for row_pos, phrase in enumerate(test['Phrase']):
    known_label = mapping.get(phrase)
    if known_label is not None:
        result_class[row_pos] = known_label

In [None]:
# Put the final class per test row into the submission frame and write it
# out without the index column.
sub = sub.assign(Sentiment=result_class)
sub.to_csv('result.csv', index=False)

In [None]:
# Next steps: extend the ensemble with an additional GRU model, or add K-fold cross-validation.