In [14]:
%run package_import.ipynb

In [20]:
from nltk.corpus import stopwords
from nltk.util import ngrams
from sklearn.feature_extraction.text import CountVectorizer
stop=set(stopwords.words('english'))
from nltk.tokenize import word_tokenize
import gensim
import string
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from tqdm import tqdm
from keras.models import Sequential
from keras.layers import Embedding,LSTM,Dense,SpatialDropout1D
from keras.initializers import Constant
from sklearn.model_selection import train_test_split
from keras.optimizers import Adam

Using TensorFlow backend.


In [12]:
from sqlalchemy import create_engine
import pymysql

sqlEngine = create_engine('mysql+pymysql://root:@127.0.0.1/nlp_with_disaster_tweets', pool_recycle=3600)
dbConnection = sqlEngine.connect()

### Read combined data

In [58]:
df_combined = pd.read_sql_query(
'''
select *
from transformed_combination
'''
,dbConnection)

df_train = df_combined[df_combined['source_file']=='train']
df_test = df_combined[df_combined['source_file']=='test']

In [25]:
def create_corpus(df: pd.DataFrame) ->list:
    '''
    The function is to convert column "text" into a list of words at each row

    input:
    df: dataframe where text whose words are splited

    output:
    return a list of words for each row
    '''
    corpus = []
    
    for tweet in tqdm(df['text']):
        words = [word.lower() for word in word_tokenize(tweet) if (word.isalpha() == 1) & (word not in stop)]

        corpus.append(words)

    return corpus


In [27]:
corpus = create_corpus(df_combined)

100%|██████████| 10876/10876 [00:02<00:00, 5332.82it/s]


#### Convert glove.6B.100d.txt to a dictionary

In [48]:
embedding_dict={}
with open('../../Project_data/nlp_with_disaster_tweets/glove.6B.100d.txt','r') as f:
    for line in tqdm(f):
        values=line.split()
        word=values[0]
        vectors=np.asarray(values[1:],'float32')
        embedding_dict[word]=vectors
f.close()

400000it [00:07, 52672.86it/s]


<font color='red'> ** Need to understand the following cells ** </font>

<a id="Summary"></a>
<div class="alert alert-block alert-success">
    <font color = 'black'>
        <h2>Tokenization</h2>
        <h3>What is Tokenization?</h3>
    </font>
</div>

In [49]:
MAX_LEN=50
tokenizer_obj=Tokenizer()
tokenizer_obj.fit_on_texts(corpus)
sequences=tokenizer_obj.texts_to_sequences(corpus)

tweet_pad=pad_sequences(sequences,maxlen=MAX_LEN,truncating='post',padding='post')

In [109]:
tokenizer_obj=Tokenizer()
tokenizer_obj.fit_on_texts(corpus)
sequences=tokenizer_obj.texts_to_sequences(corpus)
sequences

[[653, 5321, 733, 194, 87, 1777, 3465, 28],
 [100, 8, 143, 838, 7717, 7718, 1172],
 [125, 1442, 1359, 627, 7719, 1642, 78, 200, 2303, 627, 1527, 1124],
 [13, 4153, 1222, 200, 1527, 41],
 [136, 27, 1360, 224, 5322, 1922, 206, 1222, 7720, 137],
 [2940, 336, 41, 1923, 776, 7721, 318, 1125, 319, 8, 5323, 1222],
 [139, 19, 805, 240, 976, 777, 225, 1643, 7722, 977, 2565, 1644],
 [2, 155, 1924, 2, 43, 8, 4154],
 [147, 18, 200, 1126, 480, 778, 580],
 [2, 2078, 518, 178, 368],
 [440, 13, 428, 209, 356, 481],
 [839,
  519,
  2566,
  171,
  2567,
  581,
  7,
  675,
  2,
  129,
  15,
  519,
  2566,
  79,
  690,
  2,
  179,
  91,
  81,
  79,
  690,
  2,
  179,
  91,
  81,
  7723,
  225],
 [3466, 225, 1778, 7724, 2566, 564, 2, 691, 3467],
 [139, 5324, 867, 58, 1528, 5324],
 [176, 137, 384, 4155, 67, 34, 289],
 [79, 49],
 [2, 53, 5325],
 [184, 1529],
 [29, 67, 760],
 [79, 7725],
 [3468],
 [1075, 542],
 [53, 7726],
 [79, 2304, 51],
 [7727],
 [78, 92, 2, 47, 1127, 156],
 [779, 1288, 69, 277],
 [53, 256

In [51]:
word_index=tokenizer_obj.word_index
print('Number of unique words:',len(word_index))

Number of unique words: 18736


In [52]:
num_words=len(word_index)+1
embedding_matrix=np.zeros((num_words,100))

for word,i in tqdm(word_index.items()):
    if i > num_words:
        continue
    
    emb_vec=embedding_dict.get(word)
    if emb_vec is not None:
        embedding_matrix[i]=emb_vec

100%|██████████| 18736/18736 [00:00<00:00, 449930.89it/s]


In [54]:
model=Sequential()

embedding=Embedding(num_words,100,embeddings_initializer=Constant(embedding_matrix),
                   input_length=MAX_LEN,trainable=False)

model.add(embedding)
model.add(SpatialDropout1D(0.2))
model.add(LSTM(64, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))


optimzer=Adam(lr=1e-5)

model.compile(loss='binary_crossentropy',optimizer=optimzer,metrics=['accuracy'])

In [55]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 50, 100)           1873700   
_________________________________________________________________
spatial_dropout1d_2 (Spatial (None, 50, 100)           0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 64)                42240     
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 65        
Total params: 1,916,005
Trainable params: 42,305
Non-trainable params: 1,873,700
_________________________________________________________________


In [97]:
train=tweet_pad[:df_train.shape[0]]
test=tweet_pad[df_train.shape[0]:]

In [98]:
X_train,X_test,y_train,y_test=train_test_split(train,df_train['target'].values,test_size=0.15)
print('Shape of train',X_train.shape)
print("Shape of Validation ",X_test.shape)

Shape of train (6471, 50)
Shape of Validation  (1142, 50)


In [99]:
history=model.fit(X_train,y_train,batch_size=4,epochs=15,validation_data=(X_test,y_test),verbose=2)

Train on 6471 samples, validate on 1142 samples
Epoch 1/15
 - 94s - loss: 0.5334 - acc: 0.7594 - val_loss: 0.4858 - val_acc: 0.7820
Epoch 2/15
 - 94s - loss: 0.5381 - acc: 0.7578 - val_loss: 0.4857 - val_acc: 0.7820
Epoch 3/15
 - 93s - loss: 0.5328 - acc: 0.7643 - val_loss: 0.4876 - val_acc: 0.7820
Epoch 4/15
 - 95s - loss: 0.5304 - acc: 0.7606 - val_loss: 0.4871 - val_acc: 0.7820
Epoch 5/15
 - 94s - loss: 0.5224 - acc: 0.7691 - val_loss: 0.4861 - val_acc: 0.7828
Epoch 6/15
 - 93s - loss: 0.5290 - acc: 0.7665 - val_loss: 0.4867 - val_acc: 0.7846
Epoch 7/15
 - 92s - loss: 0.5271 - acc: 0.7668 - val_loss: 0.4862 - val_acc: 0.7846
Epoch 8/15
 - 93s - loss: 0.5247 - acc: 0.7639 - val_loss: 0.4864 - val_acc: 0.7855
Epoch 9/15
 - 92s - loss: 0.5184 - acc: 0.7711 - val_loss: 0.4868 - val_acc: 0.7820
Epoch 10/15
 - 94s - loss: 0.5226 - acc: 0.7659 - val_loss: 0.4859 - val_acc: 0.7820
Epoch 11/15
 - 94s - loss: 0.5316 - acc: 0.7583 - val_loss: 0.4852 - val_acc: 0.7855
Epoch 12/15
 - 93s - loss:

In [100]:
sample_sub = pd.read_csv('../../Project_data/nlp_with_disaster_tweets/sample_submission.csv')

In [106]:
y_pre=model.predict(test)
y_pre=np.round(y_pre).astype(int).reshape(3263)
sub=pd.DataFrame({'id':sample_sub['id'].values.tolist(),'target':y_pre})
sub.to_csv('./submit/submission.csv',index=False)