In [1]:
# What version of Python do you have?
import sys

import tensorflow.keras
import pandas as pd
import sklearn as sk
import tensorflow as tf

print(f"Tensor Flow Version: {tf.__version__}")
print(f"Keras Version: {tensorflow.keras.__version__}")
print()
print(f"Python {sys.version}")
print(f"Pandas {pd.__version__}")
print(f"Scikit-Learn {sk.__version__}")
gpu = len(tf.config.list_physical_devices('GPU'))>0
print("GPU is", "available" if gpu else "NOT AVAILABLE")

Tensor Flow Version: 2.6.0
Keras Version: 2.6.0

Python 3.9.7 | packaged by conda-forge | (default, Sep 29 2021, 19:24:02) 
[Clang 11.1.0 ]
Pandas 1.3.4
Scikit-Learn 1.0.1
GPU is available


In [2]:
from nltk.corpus import stopwords
from nltk.util import ngrams
from sklearn.feature_extraction.text import CountVectorizer
stop=set(stopwords.words('english'))
from nltk.tokenize import word_tokenize
# import gensim
import string
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tqdm import tqdm
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding,LSTM,Dense,SpatialDropout1D
from tensorflow.keras.initializers import Constant
from sklearn.model_selection import train_test_split
# from tensorflow.keras.optimizers import Adam
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.optimizers import SGD

In [3]:
import pandas as pd
import numpy as np

In [4]:
from sqlalchemy import create_engine
import pymysql

sqlEngine = create_engine('mysql+pymysql://root:@127.0.0.1/nlp_with_disaster_tweets', pool_recycle=3600)
dbConnection = sqlEngine.connect()

### Read combined data

In [5]:
df_combined = pd.read_sql_query(
'''
select *
from transformed_combination
'''
,dbConnection)

df_train = df_combined[df_combined['source_file']=='train']
df_test = df_combined[df_combined['source_file']=='test']

In [6]:
def create_corpus(df: pd.DataFrame) ->list:
    '''
    The function is to convert column "text" into a list of words at each row

    input:
    df: dataframe where text whose words are splited

    output:
    return a list of words for each row
    '''
    corpus = []
    
    for tweet in tqdm(df['text']):
        words = [word.lower() for word in word_tokenize(tweet) if (word.isalpha() == 1) & (word not in stop)]

        corpus.append(words)

    return corpus


In [7]:
corpus = create_corpus(df_combined)

100%|██████████| 10876/10876 [00:01<00:00, 10675.91it/s]


#### Convert glove.6B.100d.txt to a dictionary

In [8]:
embedding_dict={}
with open('../../Project_data/nlp_with_disaster_tweets/glove.6B.100d.txt','r') as f:
    for line in f:
        values=line.split()
        word=values[0]
        vectors=np.asarray(values[1:],'float32')
        embedding_dict[word]=vectors
f.close()

<font color='red'> ** Need to understand the following cells ** </font>

<a id="Summary"></a>
<div class="alert alert-block alert-success">
    <font color = 'black'>
        <h2>Tokenization</h2>
        <h3>What is Tokenization?</h3>
    </font>
</div>

In [9]:
MAX_LEN=50
tokenizer_obj=Tokenizer()
# Updates internal vocabulary based on a list of texts
# fit_on_text: the function is to return the index of each word
# by following instructions:
# 1. get frequency for each word for the whole list (e.g. corpus)
# 2. sort the word by frequency of words by descending
# 3. create index by the order of the words' frequency from step 2.
# For example, the most frequency of a word is 1 
tokenizer_obj.fit_on_texts(corpus)
# texts_to_sequences: use the index from the function fit_on_text
# to map each word to index
sequences=tokenizer_obj.texts_to_sequences(corpus)
# make each list extend to max_len. If the number of items is lower than max_len,
# append 0 ; else truncating the bottom items until the length of the list same as max_len
tweet_pad=pad_sequences(sequences,maxlen=MAX_LEN,truncating='post',padding='post')

The following scripts is a simple versioin of function fit_on_text<br>
The original one is in the following linke <https://github.com/keras-team/keras-preprocessing/blob/1.1.2/keras_preprocessing/text.py#L199-L251>

In [10]:
# document_count = 0
# word_counts, word_docs = {}, {}

# for text in corpus:
#     document_count+=1
#     text = [text_elem.lower() for text_elem in text]
#     seq = text 

#     for w in seq:
#         if w in word_counts:
#             word_counts[w] += 1
#         else:
#             word_counts[w] = 1

#     for w in set(seq):
#         if w in word_docs:
#             word_docs[w] += 1
#         else:
#             word_docs[w] = 1
# wcounts = list(word_counts.items())
# wcounts.sort(key=lambda x: x[1], reverse=True)

# sorted_voc = []
# sorted_voc.extend(wc[0] for wc in wcounts)

# word_index = dict(
#             zip(sorted_voc, list(range(1, len(sorted_voc) + 1))))

# index_word = {c: w for w, c in word_index.items()}
# index_docs={}
# for w, c in list(word_docs.items()):
#     index_docs[word_index[w]] = c

In [11]:
word_index=tokenizer_obj.word_index
print('Number of unique words:',len(word_index))

Number of unique words: 18736


In [12]:
num_words=len(word_index)+1
# create a new array with num_words sub-list and 100 items in the sub-list filled with zero
embedding_matrix=np.zeros((num_words,100))

# the loop is to fill glove.6B.100d.txt for each word
for word,i in tqdm(word_index.items()):
    if i > num_words:
        continue
    
    emb_vec=embedding_dict.get(word)
    if emb_vec is not None:
        embedding_matrix[i]=emb_vec

100%|██████████| 18736/18736 [00:00<00:00, 881822.34it/s]


<font color='red'> to be investigated!!!!</font>

In [13]:
model=Sequential()

embedding=Embedding(num_words,100,embeddings_initializer=Constant(embedding_matrix),
                   input_length=MAX_LEN,trainable=False)

model.add(embedding)
model.add(SpatialDropout1D(0.2))
# model.add(LSTM(64, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))


optimzer=Adam(learning_rate=1e-5)

model.compile(loss='binary_crossentropy',optimizer=optimzer,metrics=['accuracy'])

Metal device set to: Apple M1 Max

systemMemory: 32.00 GB
maxCacheSize: 10.67 GB



2021-11-27 22:11:55.888790: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2021-11-27 22:11:55.888954: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [14]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 50, 100)           1873700   
_________________________________________________________________
spatial_dropout1d (SpatialDr (None, 50, 100)           0         
_________________________________________________________________
dense (Dense)                (None, 50, 1)             101       
Total params: 1,873,801
Trainable params: 101
Non-trainable params: 1,873,700
_________________________________________________________________


In [15]:
train=tweet_pad[:df_train.shape[0]]
test=tweet_pad[df_train.shape[0]:]

In [16]:
X_train,X_test,y_train,y_test=train_test_split(train,df_train['target'].values,test_size=0.15)
print('Shape of train',X_train.shape)
print("Shape of Validation ",X_test.shape)

Shape of train (6471, 50)
Shape of Validation  (1142, 50)


In [17]:
history=model.fit(X_train,y_train,batch_size=4,epochs=15,validation_data=(X_test,y_test),verbose=2)

2021-11-27 22:12:10.830362: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)
2021-11-27 22:12:10.830575: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz
2021-11-27 22:12:10.962704: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.


Epoch 1/15


2021-11-27 22:12:18.656721: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.


1618/1618 - 9s - loss: 0.7102 - accuracy: 0.5611 - val_loss: 0.7046 - val_accuracy: 0.5749
Epoch 2/15
1618/1618 - 8s - loss: 0.7076 - accuracy: 0.5653 - val_loss: 0.7019 - val_accuracy: 0.5752
Epoch 3/15
1618/1618 - 8s - loss: 0.7054 - accuracy: 0.5659 - val_loss: 0.6996 - val_accuracy: 0.5758
Epoch 4/15
1618/1618 - 8s - loss: 0.7027 - accuracy: 0.5667 - val_loss: 0.6976 - val_accuracy: 0.5776
Epoch 5/15
1618/1618 - 8s - loss: 0.7004 - accuracy: 0.5678 - val_loss: 0.6959 - val_accuracy: 0.5783
Epoch 6/15
1618/1618 - 8s - loss: 0.6988 - accuracy: 0.5686 - val_loss: 0.6944 - val_accuracy: 0.5793
Epoch 7/15
1618/1618 - 8s - loss: 0.6974 - accuracy: 0.5693 - val_loss: 0.6930 - val_accuracy: 0.5795
Epoch 8/15
1618/1618 - 8s - loss: 0.6960 - accuracy: 0.5702 - val_loss: 0.6918 - val_accuracy: 0.5809
Epoch 9/15
1618/1618 - 9s - loss: 0.6952 - accuracy: 0.5703 - val_loss: 0.6907 - val_accuracy: 0.5819
Epoch 10/15
1618/1618 - 8s - loss: 0.6940 - accuracy: 0.5711 - val_loss: 0.6897 - val_accurac

In [18]:
sample_sub = pd.read_csv('../../Project_data/nlp_with_disaster_tweets/sample_submission.csv')

In [20]:
y_pre=model.predict(test)
y_pre=np.round(y_pre).astype(int).reshape(3263)
sub=pd.DataFrame({'id':sample_sub['id'].values.tolist(),'target':y_pre})
sub.to_csv('./submit/submission.csv',index=False)

ValueError: cannot reshape array of size 163150 into shape (3263,)