In [0]:
import glob
import random
from random import randint

import numpy as np
import pandas as pd

from keras import Sequential
from keras.models import Model
from keras.utils import to_categorical
from keras.layers import Input,Dropout, Dense, Embedding, TimeDistributed, Bidirectional, GlobalMaxPooling1D, Conv1D, Flatten, MaxPooling1D, LSTM
from keras.layers.merge import concatenate
from keras.callbacks import EarlyStopping
from keras.layers import Reshape

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import preprocessing
from sklearn.preprocessing import scale
from sklearn.preprocessing import LabelBinarizer
from sklearn.utils import shuffle
from sklearn.metrics import f1_score,precision_score, recall_score, accuracy_score

import gensim
from gensim.models.doc2vec import TaggedDocument
from gensim.models.word2vec import Word2Vec 

from nltk.tokenize import TweetTokenizer 

from tqdm import tqdm

tokenizer = TweetTokenizer()


# Preprocessing Steps:


1.   Load Dataset (typhoon tweets, sentiment140)
2.   Build Word Embedding Model or Load Pre-trained Model
3.   Create Feature Extractor Model (Bi-LSTM)










## Helper Functions:

In [0]:
def ingest(path='./tweets.csv', random_state=None):
    """Load the labelled tweets CSV and return it shuffled with binary integer labels.

    The CSV is expected to have the columns ``ItemID``, ``Sentiment``,
    ``SentimentSource`` and ``SentimentText``; the two bookkeeping columns are dropped.

    Parameters
    ----------
    path : str
        Location of the CSV file (defaults to the path the notebook always used).
    random_state : int or None
        Seed for the row shuffle. ``None`` keeps the original unseeded behaviour;
        pass an int to make the notebook reproducible.

    Returns
    -------
    pandas.DataFrame
        Columns ``Sentiment`` (int, 0 = negative, 1 = positive) and ``SentimentText``,
        rows in random order.
    """
    # 4 is mapped to 1 as before. 1 is accepted as well so that a file which
    # already uses 0/1 labels is not wiped out by the mapping.
    # NOTE(review): confirm which label encoding ./tweets.csv really uses.
    label_map = {0: 0, 1: 1, 4: 1}

    raw = pd.read_csv(path, encoding="ISO-8859-1")
    data = (
        raw.drop(columns=['ItemID', 'SentimentSource'])
        .dropna(subset=['Sentiment', 'SentimentText'])
        .assign(Sentiment=lambda frame: frame['Sentiment'].map(label_map))
        # Labels outside the map come back as NaN: drop them and restore an
        # integer dtype, because the labels are later used as array indices.
        .dropna(subset=['Sentiment'])
        .astype({'Sentiment': int})
        .reset_index(drop=True)
        .sample(frac=1, random_state=random_state)  # randomize sequence of data
    )
    print(('dataset loaded with shape', data.shape))

    return data

def postprocess(data, n=1600000):
    """Tokenize the first ``n`` tweets and drop the ones that could not be tokenized.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``SentimentText`` column.
    n : int
        Maximum number of rows to keep (default: 1.6 million tweets).

    Returns
    -------
    pandas.DataFrame
        A new frame with an added ``tokens`` column (list of words per tweet) and a
        fresh 0..k-1 index. The input frame is left untouched.
    """
    # Work on an explicit copy: assigning a column to the result of head()
    # triggers pandas' SettingWithCopyWarning.
    subset = data.head(n).copy()
    subset['tokens'] = subset['SentimentText'].apply(tokenize)

    # tokenize() returns the sentinel string 'NC' when a tweet cannot be processed.
    subset = subset[subset['tokens'] != 'NC']
    return subset.reset_index(drop=True)

def tokenize(tweet):
    """Turn one tweet into a list of lower-cased, purely alphabetic words.

    Mentions (``@...``), hashtags (``#...``) and tokens starting with ``http``
    (which also covers ``https``) are discarded, as is anything that is not
    alphabetic (numbers, punctuation, emoticons). Stop words are NOT removed.

    Parameters
    ----------
    tweet : str
        Raw tweet text.

    Returns
    -------
    list of str, or the string ``'NC'``
        The cleaned tokens, or the sentinel ``'NC'`` when the tweet cannot be
        tokenized (for example when it is not a string).
    """
    try:
        tokens = tokenizer.tokenize(tweet.lower())
    except Exception:
        # Exception (not a bare except) so that Ctrl-C during a long run over
        # millions of tweets is not silently turned into an 'NC' row.
        return 'NC'

    dropped_prefixes = ('@', '#', 'http')
    # keep only text tokens, ignore numbers
    return [t for t in tokens if t.isalpha() and not t.startswith(dropped_prefixes)]

def labelizeTweets(tweets, label_type):
    """Wrap each tokenized tweet in a gensim ``TaggedDocument``.

    Parameters
    ----------
    tweets : iterable of list of str
        Tokenized tweets.
    label_type : str
        Prefix for the tags; tweet number ``i`` is tagged ``'<label_type>_<i>'``.

    Returns
    -------
    list of TaggedDocument
        One document per tweet, in input order. (The list was previously built
        but never returned, so callers received ``None``.)
    """
    return [
        TaggedDocument(tokens, ['%s_%s' % (label_type, position)])
        for position, tokens in enumerate(tweets)
    ]

## Load Sentiment140 Dataset:


In [0]:
# Load the labelled tweets, then clean and tokenize them in one pass.
sentiment140 = postprocess(ingest())

# Separate the training features (token lists) from the labels, capped at n rows.
n = 1600000  # data size: 1.6 million tweets
sentiment_subset = sentiment140.head(n)
x_sentiment = np.array(sentiment_subset.tokens)
y_sentiment = np.array(sentiment_subset.Sentiment)

## Load Typhoon Tweets:

In [0]:
path = './TED Dataset/Typhoons_tweets'  # Use your path

# glob returns files in arbitrary, filesystem-dependent order; sort so the tweet
# order (and everything derived from it) is the same on every run.
allFiles = sorted(glob.glob(path + "/*.csv"))
if not allFiles:
    # pd.concat([]) would otherwise fail with an unhelpful "No objects to concatenate".
    raise FileNotFoundError("No typhoon tweet CSV files found under %r" % path)

# Merge all typhoon tweet files into one frame.
typhoon_df = pd.concat(
    [pd.read_csv(csv_file, index_col=None, header=0) for csv_file in allFiles]
)
typhoon_tweets = typhoon_df['text'].tolist()

# Tweets preprocessing: clean and tokenize every tweet.
# str() is applied first, so a missing text cell (NaN) is tokenized as the word "nan".
tweetsTokens = [tokenize(str(tweet)) for tweet in tqdm(typhoon_tweets)]

typhoon_tweets_tokens = tweetsTokens  # tokenized tweets.

## Build Word Embedding Model:

**To train a word embedding model from scratch, run the first cell below; otherwise, run the second cell to load a pre-trained model directly.**

In [0]:
# ---------------------------- Build A Word Embedding Model (Word2vec) ----------------------- #

EMBEDDING_DIM = 200  # word2vec dimension

# Training corpus: every tokenized tweet (labelled sentiment tweets + typhoon tweets).
all_tweet_tokens = sentiment140['tokens'].tolist() + typhoon_tweets_tokens

# Tagged view of the same corpus. Word2Vec only needs the token lists, and the
# `.words` of each tagged document is exactly the corresponding entry of
# all_tweet_tokens, so training below uses the token lists directly.
allTweets = labelizeTweets(all_tweet_tokens, 'AllTWEETS')

# The vector size must come from EMBEDDING_DIM (it was previously read from an
# undefined name). sg=1 selects skip-gram.
word_emb = Word2Vec(size=EMBEDDING_DIM, min_count=10, sg=1)
word_emb.build_vocab(all_tweet_tokens)
word_emb.train(all_tweet_tokens, total_examples=len(all_tweet_tokens), epochs=10)


In [0]:
# ----------------------- Load a Pre-trained Word Embedding Model -----------------#

# NOTE(review): presumably this has to equal the vector size of the saved model — confirm.
EMBEDDING_DIM = 200

# Restore the word embedding model that was saved to disk earlier.
word_embedding_path = './wordEmbedding.mdl'
word_emb = Word2Vec.load(word_embedding_path)

## Enriching Word Embedding Model with Semantics Vectors from ConceptNet KG:
* Extract Entities Using Spacy
* Get Entities Vectors From ConceptNet
* Build Features Matrix Of Feature Extractor Model

### Extract Entities Using Spacy:

In [0]:
# Extract named entities from the raw typhoon tweets.
# NOTE(review): `nlp` is not defined in the visible notebook; presumably it is a
# loaded spaCy pipeline — confirm it is created before this cell runs.

tweets_entities = set()  # unique surface forms of the recognized entities

for raw_tweet in tqdm(typhoon_tweets):
    parsed_tweet = nlp(str(raw_tweet))
    tweets_entities.update(entity.text for entity in parsed_tweet.ents)

### Get Entities Vectors From ConceptNet:

In [0]:
# Get Semantic Vectors From Knowledge Graph (ConceptNet)

# Load ConceptNet Embedding Model
conceptnet_model = gensim.models.KeyedVectors.load_word2vec_format('./numberbatch-en-17.06.txt.gz')  # use your own path.

# Words known to ConceptNet and their semantic vectors, in matching order.
conceptnet_model_words = list(conceptnet_model.wv.vocab)
conceptnet_model_wordVectors = conceptnet_model[conceptnet_model_words]

# word -> ConceptNet vector
conceptnet_vectors = dict(zip(conceptnet_model_words, conceptnet_model_wordVectors))

# Vocabulary and vectors of the tweet word embedding model.
# `word_emb` is the tweet embedding model built or loaded earlier in the notebook;
# the alias keeps the `tweets_model` name used by the rest of this cell.
tweets_model = word_emb
tweets_model_words = list(tweets_model.wv.vocab)
tweets_model_wordVectors = tweets_model.wv[tweets_model_words]

# tokenize() lower-cases tweets, so the embedding vocabulary is lower-case when
# the model was trained in this notebook, while entity strings keep their
# original casing. Match on both forms so that cased entities are still found.
entity_lookup = tweets_entities | {entity.lower() for entity in tweets_entities}

# Dictionary of words with their corresponding vectors.
# Entities are represented by their semantic vectors from the ConceptNet
# knowledge graph; every other word keeps its word2vec vector.
# NOTE(review): the ConceptNet vectors and the word2vec vectors may not have the
# same dimensionality — confirm that whatever consumes this dict handles it.
word_vectors_dict = {}

for word, vector in tqdm(zip(tweets_model_words, tweets_model_wordVectors),
                         total=len(tweets_model_words)):
    if word in entity_lookup and word in conceptnet_vectors:
        word_vectors_dict[word] = conceptnet_vectors[word]
    else:
        # Keyed by `word`: keying by a leftover variable from an earlier loop
        # would overwrite one single entry over and over.
        word_vectors_dict[word] = vector

#------------ Building words indices ----------#
# word -> integer index, in dictionary insertion order, starting at 0.
words_indices = {w: index for index, w in enumerate(word_vectors_dict)}

### Build Features Matrix Of Feature Extractor Model:

In [0]:
maxSeqLength = 20  # based on average count of words per tweets in training dataset

# One row per tweet, one column per word position; each cell holds the index of
# the word at that position. Positions past the end of a tweet and words without
# an index stay 0.
# NOTE(review): words_indices also assigns index 0 to a real word, so that word
# cannot be told apart from padding — confirm this is acceptable downstream.
sentiment_features = np.zeros((len(x_sentiment), maxSeqLength), dtype=int)

# The row counter comes from enumerate: a counter left over from another cell
# would send every tweet to the same row.
for row, instance in enumerate(tqdm(x_sentiment)):
    for position, word in enumerate(instance[:maxSeqLength]):
        word_index = words_indices.get(word)
        if word_index is not None:
            sentiment_features[row, position] = word_index

## Encode tweets labels into one-hot vectors:



In [0]:
# One-hot encode the binary sentiment labels: row k of the 2x2 identity matrix
# is the one-hot vector for class k.
y_sentiment_ = np.array(y_sentiment).reshape(-1)  # flat copy of the labels
class_vectors = np.identity(2)
one_hot_targets = class_vectors[y_sentiment_]

## Split data into train-test sets:

In [0]:
# Positional 80/20 split (no shuffling at this point): the first split_idx rows
# are used for training, the remainder for validation.
split_frac = 0.8
split_idx = int(len(sentiment_features) * split_frac)

senti_train_x, senti_val_x = np.split(sentiment_features, [split_idx])
senti_train_y, senti_val_y = np.split(one_hot_targets, [split_idx])

# Baseline Models:

## Load TED dataset for the baseline models:

In [0]:
# SECURITY NOTE: allow_pickle=True unpickles Python objects and can execute
# arbitrary code; only load a TED.npy file from a trusted source.
TED_dataset = np.load('TED.npy', allow_pickle=True)

# Column 0 holds one record per sample: the first element is used as the class
# label, the last element is dropped, and everything in between is a feature.
Typhoon_Env = TED_dataset[:, 0]
Typhoon_labels = [record[0] for record in Typhoon_Env]
Typhoon_features = np.asarray([record[1:-1] for record in Typhoon_Env])

# Encode typhoon labels as integers, then as one-hot rows.
label_encoder = LabelEncoder()
Typhoon_labels_encoded = np.asarray(
    to_categorical(label_encoder.fit_transform(Typhoon_labels), dtype='int')
)

# Hold out 20% of the samples for testing (fixed seed for repeatability).
X_train, X_test, y_train, y_test = train_test_split(
    Typhoon_features,
    Typhoon_labels_encoded,
    test_size=0.20,
    random_state=42,
)

## The Baseline Model (CNN):

In [0]:
# Baseline CNN: two Conv1D + Dropout stages, max pooling, then a 4-way softmax.
# No input shape is declared, so the model is built on the first batch it sees.
CNNmodel = Sequential([
    Conv1D(32, kernel_size=3, activation='relu'),
    Dropout(0.3),
    Conv1D(16, kernel_size=3, activation='relu'),
    Dropout(0.2),
    MaxPooling1D(pool_size=8),
    Flatten(),
    Dense(4, activation='softmax'),
])

CNNmodel.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

### Train and evaluate the CNN Model:

In [0]:
## train the CNN model ##
# Conv1D expects (samples, steps, channels); use the same 3-D layout for
# training that is used for the test data below.
X_train_reshaped = X_train.reshape(-1, 12, 1)
CNNmodel.fit(X_train_reshaped, y_train, epochs=30, batch_size=64)

## test the CNN model ##
X_test_reshaped = X_test.reshape(-1, 12, 1)
# Evaluate on the held-out split; evaluating on the training data would only
# report how well the model fits what it has already seen.
eval_model = CNNmodel.evaluate(X_test_reshaped, y_test)

y_pred = CNNmodel.predict(X_test_reshaped)
# NOTE(review): with a softmax output, a sample whose highest probability is
# <= 0.5 gets no class at all under this threshold — confirm this is intended
# rather than taking the argmax.
y_pred_binarized = (y_pred > 0.5)
print ('F1-score ',f1_score(y_true=y_test, y_pred=y_pred_binarized, average='micro'))
print ('Precision score', precision_score(y_true=y_test,y_pred=y_pred_binarized,average='micro'))
print ('Recall score', recall_score(y_true=y_test,y_pred=y_pred_binarized,average='micro'))
print ('Accuracy score', accuracy_score(y_true=y_test, y_pred=y_pred_binarized))

## The Baseline Model (BiLSTM):

In [0]:
n_timesteps = 10  # not referenced anywhere else in the visible notebook

# Baseline BiLSTM: a bidirectional LSTM over inputs of shape (12, 1), max pooling
# over the sequence axis, a small dense layer, then a 4-way softmax.
BiLSTM_model = Sequential([
    Bidirectional(LSTM(64, return_sequences=True), input_shape=(12, 1)),
    GlobalMaxPooling1D(),
    Dense(32, activation="relu"),
    Dropout(0.25),
    Dense(4, activation="softmax"),
])

BiLSTM_model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

### Evaluate the Bi-LSTM Model:

In [0]:
## train the BiLSTM model ##
X_train_reshaped = X_train.reshape(-1, 12, 1)  # (samples, steps, channels)

BiLSTM_model.fit(X_train_reshaped, y_train, epochs=50, batch_size=64)

## test the BiLSTM model ##
X_test_reshaped = X_test.reshape(-1, 12, 1)
# Evaluate on the held-out split, not on the data the model was trained on.
eval_model = BiLSTM_model.evaluate(X_test_reshaped, y_test)
y_pred = BiLSTM_model.predict(X_test_reshaped)

# NOTE(review): with a softmax output, a sample whose highest probability is
# <= 0.5 gets no class at all under this threshold — confirm this is intended.
y_pred_binarized = (y_pred > 0.5)

print ('F1-score ',f1_score(y_true=y_test, y_pred=y_pred_binarized, average='micro'))
print ('Precision score', precision_score(y_true=y_test,y_pred=y_pred_binarized,average='micro'))
print ('Recall score', recall_score(y_true=y_test,y_pred=y_pred_binarized,average='micro'))
print ('Accuracy score', accuracy_score(y_true=y_test, y_pred=y_pred_binarized))

# The JointModel (BiLSTM+CNN):

## Load TED dataset (typhoon environmental data and tweets)

In [0]:
# SECURITY NOTE: allow_pickle=True unpickles Python objects and can execute
# arbitrary code; only load a TED.npy file from a trusted source.
TED_dataset = np.load('TED.npy', allow_pickle=True)

# split the dataset into typhoon environmental data and typhoon tweets
Typhoon_Env = TED_dataset[:, 0]
Typhoon_tweets = TED_dataset[:, 1]

# The typhoon label sits at index 0 of every environmental record.
Typhoon_groundTruth = [env_record[0] for env_record in Typhoon_Env]

# Features = the record without its label (index 0) and timestamp (last index),
# followed by the number of tweets attached to the record.
# list(...) makes `+` append the count even when a record is a NumPy array, where
# `+` would otherwise add the count to every feature.
Typhoon_features = np.array(
    [
        list(env_record[1:-1]) + [len(record_tweets)]
        for env_record, record_tweets in zip(Typhoon_Env, Typhoon_tweets)
    ],
    dtype='float32',
)

In [0]:
MAX_TWEETS_PER_RECORD = 100  # tweets kept per sample; shorter samples are zero-padded

# Always read the raw tweets from the dataset itself, so that re-running this
# cell does not read back the padded array it produced on a previous run.
raw_typhoon_tweets = TED_dataset[:, 1]

padded_typhoon_tweets = np.zeros(
    (len(raw_typhoon_tweets), MAX_TWEETS_PER_RECORD, maxSeqLength), dtype=int
)

for row, record_tweets in enumerate(raw_typhoon_tweets):
    typhoon_twt = np.array(record_tweets)
    # In-place ndarray.resize works on the flattened data: it truncates or
    # zero-pads to the requested shape.
    # NOTE(review): assumes every tweet is already a sequence of maxSeqLength
    # word indices; otherwise rows would be misaligned after the resize — confirm.
    typhoon_twt.resize(MAX_TWEETS_PER_RECORD, maxSeqLength)
    padded_typhoon_tweets[row] = typhoon_twt

# The following cells slice `Typhoon_tweets`, so publish the padded array under that name.
Typhoon_tweets = padded_typhoon_tweets

### Encode Typhoon Classes into One-hot:

In [0]:
# Map each typhoon class to an integer id, then expand the ids to one-hot rows.
label_encoder = LabelEncoder()
typhoon_class_ids = label_encoder.fit_transform(Typhoon_groundTruth)
Typhoon_labels_encoded = to_categorical(typhoon_class_ids, dtype='int')

### Split TED data into train, test splits:

In [0]:
# Positional 80/20 split (no shuffling): the first split_idx samples are used
# for training, the remainder for validation.
split_frac = 0.8
split_idx = int(len(Typhoon_features) * split_frac)

Typhoon_train_x = Typhoon_features[:split_idx]
Typhoon_val_x = Typhoon_features[split_idx:]

TyphoonLabels_train_y = Typhoon_labels_encoded[:split_idx]
TyphoonLabels_val_y = Typhoon_labels_encoded[split_idx:]

In [0]:
# Cut the tweets at the same position as the environmental features:
# *_x is the training part, *_y the validation part (not labels).
Typhoon_tweets_x, Typhoon_tweets_y = np.split(Typhoon_tweets, [split_idx])

## The JointModel (BiLSTM+CNN):

In [0]:
# Rebuild the feature-extractor architecture, then restore its pre-trained weights.
# NOTE(review): create_featExtractor_Model is not defined in the visible part of
# the notebook — confirm it is available before this cell runs.
feat_extractor_weights_path = './feat_Extractor_weights_.h5'
featExtractor_model = create_featExtractor_Model()
featExtractor_model.load_weights(feat_extractor_weights_path)

In [0]:
def create_jointModel():
    """Build and compile the joint model (pre-trained feature extractor + CNN).

    The output of the tweet feature extractor is concatenated with the typhoon
    environmental features and classified by a small 1-D CNN.

    Returns
    -------
    keras.models.Model
        Compiled model.
        Inputs: the feature extractor's own input(s), followed by the
        environmental feature vector of shape (13,).
        Outputs: ``[typhoon_predictions, feature extractor output]``, each trained
        with categorical cross-entropy and equal loss weight.
    """
    # load pretrained weights
    featExtractor_model = create_featExtractor_Model()
    featExtractor_model.load_weights('./feat_Extractor_weights_.h5')

    typhoon_env = Input(shape=(13,))  # 13 : features of typhoon environmental data
    concatenated_feat = concatenate([featExtractor_model.output, typhoon_env], axis=-1)

    # 15: final dimension of 13 (env) + 2 (feature extractor output); Conv1D needs a channel axis.
    # NOTE(review): relies on the feature extractor emitting exactly 2 values — confirm.
    concatenated_feat = Reshape((15, 1))(concatenated_feat)

    typhoon_Conv1D1 = Conv1D(32, kernel_size=3, activation='relu')(concatenated_feat)
    dropout_layer1 = Dropout(0.3)(typhoon_Conv1D1)

    typhoon_Conv1D2 = Conv1D(16, kernel_size=3, activation='relu')(dropout_layer1)
    dropout_layer2 = Dropout(0.2)(typhoon_Conv1D2)

    maxPooling1D_layer = MaxPooling1D(pool_size=8)(dropout_layer2)
    flatted_layer = Flatten()(maxPooling1D_layer)
    typhoon_predictions = Dense(4, activation='softmax')(flatted_layer)

    # The graph is fed by the feature extractor's input(s) and by typhoon_env, so
    # both must be declared as model inputs, tweets first.
    jointModel = Model(
        inputs=list(featExtractor_model.inputs) + [typhoon_env],
        outputs=[typhoon_predictions, featExtractor_model.output],
    )

    # Keep the feature extractor trainable so it is fine-tuned by the classifier loss.
    featExtractor_model.trainable = True

    jointModel.compile(
        optimizer='adam',
        loss=['categorical_crossentropy', 'categorical_crossentropy'],
        loss_weights=[1, 1],
        metrics=['accuracy'],
    )

    return jointModel

### Train and evaluate the JointModel:

In [0]:
jointModel = create_jointModel()

## train the jointModel (BiLSTM+CNN) ##
# Train on the TED training split: tweets for the feature extractor input and
# the 13 environmental features for the CNN input (not the baseline arrays,
# which carry no tweets and fewer features).
# NOTE(review): the model has two outputs; as before, the typhoon labels are
# passed as the target for both. The second target has to match the shape of
# the feature extractor output — confirm what it should be.
jointModel.fit(
    [Typhoon_tweets_x, Typhoon_train_x],
    [TyphoonLabels_train_y, TyphoonLabels_train_y],
    epochs=30,
    batch_size=64,
)

## test the jointModel ##
# predict() returns one array per model output; the typhoon class probabilities
# are the first of the two.
typhoon_pred, _ = jointModel.predict([Typhoon_tweets_y, Typhoon_val_x])
y_pred = typhoon_pred
y_pred_binarized = (y_pred > 0.5)

print ('F1-score ',f1_score(y_true=TyphoonLabels_val_y, y_pred=y_pred_binarized, average='micro'))
print ('Precision score', precision_score(y_true=TyphoonLabels_val_y,y_pred=y_pred_binarized,average='micro'))
print ('Recall score', recall_score(y_true=TyphoonLabels_val_y,y_pred=y_pred_binarized,average='micro'))
print ('Accuracy score', accuracy_score(y_true=TyphoonLabels_val_y, y_pred=y_pred_binarized))