In [14]:
%run package_import.ipynb

In [20]:
from nltk.corpus import stopwords
from nltk.util import ngrams
from sklearn.feature_extraction.text import CountVectorizer
stop=set(stopwords.words('english'))
from nltk.tokenize import word_tokenize
import gensim
import string
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from tqdm import tqdm
from keras.models import Sequential
from keras.layers import Embedding,LSTM,Dense,SpatialDropout1D
from keras.initializers import Constant
from sklearn.model_selection import train_test_split
from keras.optimizers import Adam

Using TensorFlow backend.


In [12]:
from sqlalchemy import create_engine
import pymysql

sqlEngine = create_engine('mysql+pymysql://root:@127.0.0.1/nlp_with_disaster_tweets', pool_recycle=3600)
dbConnection = sqlEngine.connect()

### Read combined data

In [58]:
df_combined = pd.read_sql_query(
'''
select *
from transformed_combination
'''
,dbConnection)

df_train = df_combined[df_combined['source_file']=='train']
df_test = df_combined[df_combined['source_file']=='test']

In [25]:
def create_corpus(df: pd.DataFrame) ->list:
    '''
    The function is to convert column "text" into a list of words at each row

    input:
    df: dataframe where text whose words are splited

    output:
    return a list of words for each row
    '''
    corpus = []
    
    for tweet in tqdm(df['text']):
        words = [word.lower() for word in word_tokenize(tweet) if (word.isalpha() == 1) & (word not in stop)]

        corpus.append(words)

    return corpus


In [27]:
corpus = create_corpus(df_combined)

100%|██████████| 10876/10876 [00:02<00:00, 5332.82it/s]


In [48]:
embedding_dict={}
with open('../../Project_data/nlp_with_disaster_tweets/glove.6B.100d.txt','r') as f:
    for line in tqdm(f):
        values=line.split()
        word=values[0]
        vectors=np.asarray(values[1:],'float32')
        embedding_dict[word]=vectors
f.close()

400000it [00:07, 52672.86it/s]


<font color='red'> ** Need to understand the following cells ** </font>

In [49]:
MAX_LEN=50
tokenizer_obj=Tokenizer()
tokenizer_obj.fit_on_texts(corpus)
sequences=tokenizer_obj.texts_to_sequences(corpus)

tweet_pad=pad_sequences(sequences,maxlen=MAX_LEN,truncating='post',padding='post')

In [51]:
word_index=tokenizer_obj.word_index
print('Number of unique words:',len(word_index))

Number of unique words: 18736


In [52]:
num_words=len(word_index)+1
embedding_matrix=np.zeros((num_words,100))

for word,i in tqdm(word_index.items()):
    if i > num_words:
        continue
    
    emb_vec=embedding_dict.get(word)
    if emb_vec is not None:
        embedding_matrix[i]=emb_vec

100%|██████████| 18736/18736 [00:00<00:00, 449930.89it/s]


In [54]:
model=Sequential()

embedding=Embedding(num_words,100,embeddings_initializer=Constant(embedding_matrix),
                   input_length=MAX_LEN,trainable=False)

model.add(embedding)
model.add(SpatialDropout1D(0.2))
model.add(LSTM(64, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))


optimzer=Adam(lr=1e-5)

model.compile(loss='binary_crossentropy',optimizer=optimzer,metrics=['accuracy'])

In [55]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 50, 100)           1873700   
_________________________________________________________________
spatial_dropout1d_2 (Spatial (None, 50, 100)           0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 64)                42240     
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 65        
Total params: 1,916,005
Trainable params: 42,305
Non-trainable params: 1,873,700
_________________________________________________________________


In [65]:
len(tweet_pad)

10876

In [66]:
train=tweet_pad[:df_train.shape[0]]
test=tweet_pad[df_test.shape[0]:]

In [68]:
X_train,X_test,y_train,y_test=train_test_split(train,df_train['target'].values,test_size=0.15)
print('Shape of train',X_train.shape)
print("Shape of Validation ",X_test.shape)

Shape of train (6471, 50)
Shape of Validation  (1142, 50)


In [69]:
history=model.fit(X_train,y_train,batch_size=4,epochs=15,validation_data=(X_test,y_test),verbose=2)

Train on 6471 samples, validate on 1142 samples
Epoch 1/15
 - 94s - loss: 0.6916 - acc: 0.5679 - val_loss: 0.6889 - val_acc: 0.5832
Epoch 2/15
 - 92s - loss: 0.6868 - acc: 0.5681 - val_loss: 0.6773 - val_acc: 0.5832
Epoch 3/15
 - 93s - loss: 0.6219 - acc: 0.6592 - val_loss: 0.5397 - val_acc: 0.7592
Epoch 4/15
 - 93s - loss: 0.5875 - acc: 0.7104 - val_loss: 0.5173 - val_acc: 0.7715
Epoch 5/15
 - 93s - loss: 0.5737 - acc: 0.7255 - val_loss: 0.5067 - val_acc: 0.7837
Epoch 6/15
 - 93s - loss: 0.5664 - acc: 0.7333 - val_loss: 0.4987 - val_acc: 0.7811
Epoch 7/15
 - 92s - loss: 0.5539 - acc: 0.7461 - val_loss: 0.4901 - val_acc: 0.7846
Epoch 8/15
 - 92s - loss: 0.5510 - acc: 0.7493 - val_loss: 0.4862 - val_acc: 0.7837
Epoch 9/15
 - 92s - loss: 0.5553 - acc: 0.7433 - val_loss: 0.4828 - val_acc: 0.7951
Epoch 10/15
 - 92s - loss: 0.5475 - acc: 0.7475 - val_loss: 0.4794 - val_acc: 0.7907
Epoch 11/15
 - 92s - loss: 0.5475 - acc: 0.7487 - val_loss: 0.4786 - val_acc: 0.7942
Epoch 12/15
 - 93s - loss: