In [8]:
import pandas as pd
import matplotlib.pyplot as plt
plt.style.use('ggplot')
import seaborn as sns
import numpy as np
from collections import defaultdict
from collections import  Counter
from tqdm import tqdm
import re

from nltk.corpus import stopwords, wordnet
from nltk.util import ngrams
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
# English stop words held in a set; tokens are later tested against it with `in`
stop = set(stopwords.words('english'))
import gensim
import string

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding,LSTM,Dropout,Dense,SpatialDropout1D, Bidirectional,GlobalMaxPool1D, CuDNNLSTM
from keras.initializers import Constant
from keras.optimizers import Adam

# Load data

In [2]:
# Directory holding the competition files. It is defined once so the notebook
# can be pointed at another location by editing a single line.
DATA_DIR = 'C:/Users/ASUS/Documents/kaggle data/Real or Not NLP with Disaster Tweets/nlp-getting-started'

# Read the train and test splits into DataFrames
train = pd.read_csv(DATA_DIR + '/train.csv')
test = pd.read_csv(DATA_DIR + '/test.csv')

# Data cleaning

In [3]:
# Remove URLs, HTML tags and punctuation
# The patterns and the translation table never change, so they are built once
# here instead of on every call (the function is applied to every tweet).
_URL_RE = re.compile(r'https?://\S+|www\.\S+')
_HTML_TAG_RE = re.compile(r'<.*?>')
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)


def remove_URL_HTML_punc(text):
    """Return `text` with URLs, HTML tags and ASCII punctuation deleted.

    Processing order matters: URLs and tags are removed first, while their
    delimiters ('/', '.', '<', '>') are still present, and only then is every
    character of ``string.punctuation`` dropped. Matches are deleted, not
    replaced by a space, so surrounding whitespace is left as it was.

    Parameters
    ----------
    text : str
        Raw text to clean.

    Returns
    -------
    str
        The cleaned text.
    """
    text = _URL_RE.sub("", text)
    text = _HTML_TAG_RE.sub("", text)
    return text.translate(_PUNCT_TABLE)

# Clean the text column of both splits with the same function
train['text'] = train['text'].apply(remove_URL_HTML_punc)
test['text'] = test['text'].apply(remove_URL_HTML_punc)

In [9]:
# Lemmatization (only verbs are converted to their base form)
wnl = WordNetLemmatizer()

def lemmatize_sentence(sentence):
    """Lower-case every word of `sentence` and lemmatize it as a verb.

    The sentence is split on single spaces, '#' characters are removed from
    each piece, and each piece is lower-cased and passed to the WordNet
    lemmatizer with the verb part-of-speech tag. The pieces are re-joined
    with single spaces and the result is stripped of surrounding whitespace.
    """
    lemmas = [
        wnl.lemmatize(token.replace('#', '').lower(), wordnet.VERB)
        for token in sentence.split(' ')
    ]
    return ' '.join(lemmas).strip()

# Lemmatize the text column of both splits
train['text'] = train['text'].apply(lemmatize_sentence)
test['text'] = test['text'].apply(lemmatize_sentence)

In [10]:
# Remove stop words and build the corpus
def remove_stop(df):
    """Tokenize each entry of ``df['text']`` and drop tokens found in `stop`.

    Returns a list containing one list of kept tokens per row, in row order.
    """
    return [
        [token for token in word_tokenize(tweet) if token not in stop]
        for tweet in df['text']
    ]

# Stack train on top of test so one corpus (and later one tokenizer) covers both.
# sort=False is passed explicitly: without it this call emits pandas'
# FutureWarning about the default column-sorting behaviour changing, which
# indicates the two frames' columns are not aligned. Only the 'text' column is
# read afterwards, so column order does not matter here.
df = pd.concat([train, test], sort=False)
corpus = remove_stop(df)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  # Remove the CWD from sys.path while we load stuff.


# Vectorization using GloVe

In [11]:
# Load the pretrained embeddings
embedding_dict = {}
with open('C:/Users/ASUS/Documents/kaggle data/Real or Not NLP with Disaster Tweets/nlp-getting-started/glove.6B.100d.txt', 'r',encoding='UTF8') as f:
    for line in f:
        # First whitespace-separated field is the token; the rest are parsed as the float32 vector
        values = line.split()
        word = values[0]
        vectors = np.asarray(values[1:], 'float32')
        embedding_dict[word] = vectors
# No explicit f.close() is needed: leaving the `with` block already closed the file.

In [12]:
# Build the model input
MAX_LEN = 50

tokenizer_obj = Tokenizer()
tokenizer_obj.fit_on_texts(corpus)

# Represent every word by its integer index
sequences = tokenizer_obj.texts_to_sequences(corpus)

# Zero-pad (and cut) at the end so every sentence has the same length,
# because the sequences are fed to the LSTM as fixed-size input
tweet_pad = pad_sequences(
    sequences,
    maxlen=MAX_LEN,
    padding='post',
    truncating='post',
)

In [13]:
# Assign a number to each word: the tokenizer's word -> integer index mapping
word_index = tokenizer_obj.word_index
print('Number of uniqye words : ', len(word_index))

Number of uniqye words :  20483


In [14]:
# Assign a GloVe embedding vector to every word
# One row per word index, plus one extra row so the highest index len(word_index) is valid
num_words = len(word_index) + 1
embedding_matrix = np.zeros((num_words,100))

for word,i in tqdm(word_index.items()):
    # Valid rows are 0 .. num_words-1, so an index equal to num_words must be
    # skipped as well (the original `>` test would have let it through and
    # raised IndexError on assignment).
    if i >= num_words:
        continue

    emb_vec = embedding_dict.get(word)
    # Words without a pretrained vector keep their all-zero row
    if emb_vec is not None:
        embedding_matrix[i] = emb_vec

100%|████████████████████████████████████████████████████████████████████████| 20483/20483 [00:00<00:00, 662532.61it/s]


# Make datasets

In [15]:
# tweet_pad was built from train followed by test, so the first
# train.shape[0] rows are the training tweets and the rest are the test tweets.
# NOTE: the name `trian_set` is misspelled but kept as-is so that any other
# cell referring to it keeps working.
trian_set = tweet_pad[:train.shape[0]]
test_set = tweet_pad[train.shape[0]:]

# random_state pins the shuffle so the train/validation split (and therefore
# the reported validation metrics) is the same on every run.
X_train, X_val, y_train, y_val = train_test_split(
    trian_set,
    train['target'].values,
    test_size = 0.15,
    random_state = 42,
)
print('Shape of train', X_train.shape)
print("Shape if Validation ", X_val.shape)

Shape of train (6471, 50)
Shape if Validation  (1142, 50)


# Modeling

In [16]:
# Embedding layer initialised from the GloVe matrix and excluded from training
embedding = Embedding(
    num_words,
    100,
    embeddings_initializer=Constant(embedding_matrix),
    input_length=MAX_LEN,
    trainable=False,
)

# Two stacked bidirectional LSTMs that both return the full sequence,
# max-pooled over time, followed by a dense head with a sigmoid output
model = Sequential([
    embedding,
    SpatialDropout1D(0.2),
    Bidirectional(CuDNNLSTM(64, return_sequences=True)),
    Dropout(0.2),
    Bidirectional(CuDNNLSTM(64, return_sequences=True)),
    Dropout(0.2),
    GlobalMaxPool1D(),
    Dense(100, activation='relu'),
    Dropout(0.25),
    Dense(1, activation='sigmoid'),
])

optimizer = Adam(lr=1e-5)

model.compile(
    loss='binary_crossentropy',
    optimizer=optimizer,
    metrics=['accuracy'],
)

model.summary()




Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.



Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 50, 100)           2048400   
_________________________________________________________________
spatial_dropout1d_1 (Spatial (None, 50, 100)           0         
_________________________________________________________________
bidirectional_1 (Bidirection (None, 50, 128)           84992     
_________________________________________________________________
dropout_1 (Dropout)          (None, 50, 128)           0         
_________________________________________________________________
bidirectional_2 (Bidirection (None, 50, 128)           99328     
___________________________________________________________

In [17]:
# Train the model, evaluating on the validation split after each epoch
history = model.fit(
    X_train,
    y_train,
    batch_size=8,
    epochs=30,
    validation_data=(X_val, y_val),
    verbose=2,
)

Train on 6471 samples, validate on 1142 samples
Epoch 1/30
 - 17s - loss: 0.6946 - acc: 0.5052 - val_loss: 0.6775 - val_acc: 0.6804
Epoch 2/30
 - 12s - loss: 0.6614 - acc: 0.6888 - val_loss: 0.6458 - val_acc: 0.7259
Epoch 3/30
 - 12s - loss: 0.6128 - acc: 0.7393 - val_loss: 0.5841 - val_acc: 0.7653
Epoch 4/30
 - 12s - loss: 0.5468 - acc: 0.7667 - val_loss: 0.5292 - val_acc: 0.7680
Epoch 5/30
 - 12s - loss: 0.5125 - acc: 0.7701 - val_loss: 0.5074 - val_acc: 0.7723
Epoch 6/30
 - 12s - loss: 0.4975 - acc: 0.7762 - val_loss: 0.4980 - val_acc: 0.7758
Epoch 7/30
 - 12s - loss: 0.4878 - acc: 0.7795 - val_loss: 0.4905 - val_acc: 0.7793
Epoch 8/30
 - 12s - loss: 0.4828 - acc: 0.7806 - val_loss: 0.4857 - val_acc: 0.7837
Epoch 9/30
 - 12s - loss: 0.4790 - acc: 0.7864 - val_loss: 0.4822 - val_acc: 0.7793
Epoch 10/30
 - 12s - loss: 0.4677 - acc: 0.7895 - val_loss: 0.4781 - val_acc: 0.7793
Epoch 11/30
 - 12s - loss: 0.4695 - acc: 0.7901 - val_loss: 0.4756 - val_acc: 0.7785
Epoch 12/30
 - 12s - loss:

# Submission

In [29]:
# Predict on the test tweets; the model's last layer is a sigmoid, so rounding
# turns each output into a 0/1 label.
y_pred = model.predict(test_set)
# reshape(-1) flattens to 1-D whatever the number of test rows is, instead of
# hard-coding the row count.
y_pred = np.round(y_pred).astype(int).reshape(-1)
submission = pd.read_csv('C:/Users/ASUS/Documents/kaggle data/Real or Not NLP with Disaster Tweets/nlp-getting-started/sample_submission.csv')
# Bracket assignment always writes the column; attribute assignment would only
# set a plain Python attribute if the column were missing.
submission['target'] = y_pred
submission.to_csv('C:/Users/ASUS/Documents/kaggle data/Real or Not NLP with Disaster Tweets/nlp-getting-started/submission.csv', index = False)