In [42]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential ,model_from_json
from tensorflow.keras.layers import Embedding,Dense,Dropout ,GlobalMaxPool1D

# Pre-Processing Data

In [43]:
from sklearn.base import TransformerMixin ,BaseEstimator
class Extractor(BaseEstimator,TransformerMixin):
    """Pipeline source step that loads the raw dataset from a CSV file.

    Parameters
    ----------
    path : str, default 'combined.csv'
        Location of the CSV file. Its first column is used as the index.
    """

    def __init__(self, path='combined.csv'):
        self.path = path

    def fit(self, X=None, y=None):
        """Nothing to learn; returns ``self`` so the step follows the estimator API."""
        return self

    def transform(self,X,y=None):
        """Read the CSV at ``self.path``; the incoming ``X`` and ``y`` are ignored."""
        return pd.read_csv(self.path, index_col=0)

class Cleaner(BaseEstimator,TransformerMixin):
    """Normalises column names and tweet text.

    The incoming frame is left untouched; a cleaned copy is returned.
    """

    def fit(self, X=None, y=None):
        """Nothing to learn; returns ``self`` so the step follows the estimator API."""
        return self

    def transform(self,X,y=None):
        """Return a cleaned copy of ``X``.

        Steps: strip whitespace from column names, drop the ``'tweet id'``
        column, drop rows containing missing values, then in ``'tweet'`` remove
        ``@`` and ``#`` characters and ``http...`` links, trim surrounding
        whitespace and lower-case the text.
        """
        # rename() builds a new frame, so the caller's column labels are not altered.
        cleaned = X.rename(columns=str.strip)
        cleaned = cleaned.drop('tweet id', axis=1).dropna()

        tweets = (
            cleaned['tweet']
            # '@' and '#' are literal characters, so say so explicitly instead
            # of relying on the pandas-version-dependent regex default.
            .str.replace('@', '', regex=False)
            .str.replace('#', '', regex=False)
            # Raw string: '\S' in a normal string literal is an invalid escape.
            .str.replace(r'http\S+', '', regex=True)
            .str.strip()
            .str.lower()
        )
        return cleaned.assign(tweet=tweets)

  

In [44]:
class Sampler(BaseEstimator,TransformerMixin):
    """Shuffles the data, rewrites labels and caps the number of off-topic rows.

    Parameters
    ----------
    unrelated_size : int or None, default None
        Maximum number of off-topic ("unrelated") rows to keep. ``None`` means
        "the mean number of rows per on-topic label" (truncated to an int).
    unrelated_ignore : bool, default True
        If True every off-topic row gets the single label ``'unrelated'``;
        otherwise it gets ``'unrelated_<category>'``.
    random_state : int or None, default None
        Seed forwarded to ``DataFrame.sample`` for a reproducible shuffle.
    """

    def __init__(self, unrelated_size=None ,unrelated_ignore=True, random_state=None):
        self.unrelated_size = unrelated_size
        self.unrelated_ignore = unrelated_ignore
        self.random_state = random_state

    def fit(self, X=None, y=None):
        """Nothing to learn; returns ``self`` so the step follows the estimator API."""
        return self

    def transform(self,X,y=None):
        """Return the relabelled, rebalanced frame without the ``'category'`` column.

        On-topic rows (``'label'`` contains ``'on-topic'``) take their
        ``'category'`` value as the new label. Related rows come first in the
        result, followed by the retained unrelated rows.
        """
        shuffled = X.sample(frac=1, random_state=self.random_state).reset_index(drop=True)

        on_topic = shuffled['label'].str.contains('on-topic', regex=False, na=False)
        if self.unrelated_ignore:
            off_topic_label = 'unrelated'
        else:
            off_topic_label = 'unrelated_' + shuffled['category']
        # Vectorised replacement for the row-wise apply: keep the category
        # where the row is on-topic, otherwise use the off-topic label.
        shuffled['label'] = shuffled['category'].where(on_topic, off_topic_label)

        related, unrelated = self.equal_split(shuffled)
        # DataFrame.append was removed in pandas 2.0; concat keeps the same
        # row order and index values.
        merged = pd.concat([related, unrelated])
        return merged.drop('category', axis=1)

    def equal_split(self,X):
        """Split ``X`` into (related, unrelated) rows and cap the unrelated part.

        A row is unrelated when its ``'label'`` contains ``'unrelated'``. The
        cap is ``self.unrelated_size`` or, when that is None, the mean row
        count per related label. The hyper-parameter itself is never modified.
        """
        is_unrelated = X['label'].str.contains('unrelated', regex=False, na=False)
        related = X[~is_unrelated]
        unrelated = X[is_unrelated]

        limit = self.unrelated_size
        if limit is None:
            # Counting only the related rows avoids the KeyError the old
            # ``drop('unrelated')`` raised when that exact label was absent
            # (e.g. with unrelated_ignore=False).
            per_label = related['label'].value_counts()
            if per_label.empty:
                # No related labels to balance against: keep everything.
                return related, unrelated
            limit = int(per_label.mean())

        if limit < unrelated.shape[0]:
            unrelated = unrelated.iloc[:limit]

        return related, unrelated


In [45]:
class TextTokenizer(BaseEstimator,TransformerMixin):
    """Encodes the ``'tweet'`` column as fixed-length integer sequences.

    Parameters
    ----------
    pad_sequences : callable
        Padding function, called as
        ``pad_sequences(sequences, maxlen=..., padding='post')``.
    num_words : int, default 10000
        Forwarded to the Keras ``Tokenizer``.
    max_length : int, default 100
        Stored only; padding uses ``max_pad_length``.
    max_pad_length : int, default 100
        Length every encoded tweet is padded/truncated to.

    Attributes
    ----------
    tokenizer : fitted Keras ``Tokenizer`` (None until fitted)
    vocab_size : ``len(tokenizer.word_index) + 1`` (None until fitted)
    """

    def __init__(self,pad_sequences,num_words=10000,max_length=100,max_pad_length=100 ):
        # Constructor arguments are stored under their own names so that
        # BaseEstimator.get_params / repr / clone can find them.
        self.pad_sequences=pad_sequences
        self.num_words=num_words
        self.max_length=max_length
        self.max_pad_length=max_pad_length
        self.vocab_size=None
        self.tokenizer=None

    def fit(self,X,y=None):
        """Fit the tokenizer vocabulary on ``X['tweet']`` and return ``self``."""
        self.tokenizer,self.vocab_size=self._get_tokenizer(X['tweet'])
        return self

    def transform(self,X,y=None):
        """Return a copy of ``X`` with a ``'tweet_encoded'`` column added.

        If the tokenizer has not been fitted yet it is fitted on ``X`` first,
        so calling ``transform`` alone still works. Later calls reuse the
        existing vocabulary instead of silently refitting it.
        """
        if self.tokenizer is None:
            self.fit(X)

        encoded=X.copy()  # do not add columns to the caller's frame
        sequences=self.tokenizer.texts_to_sequences(encoded['tweet'])
        # Pad the whole batch in one call rather than once per row.
        padded=self.pad_sequences(sequences,maxlen=self.max_pad_length,padding='post')
        encoded['tweet_encoded']=list(padded)
        return encoded

    def _get_tokenizer(self,X):
        """Fit a new Keras tokenizer on the texts ``X``; return (tokenizer, vocab_size)."""
        tokenizer=tf.keras.preprocessing.text.Tokenizer(num_words=self.num_words)
        tokenizer.fit_on_texts(X)
        # +1 because Keras word indices start at 1.
        vocab_size=len(tokenizer.word_index)+1
        return tokenizer,vocab_size

In [46]:
class LabelOneHotEncoder(BaseEstimator,TransformerMixin):
    """Adds integer-encoded and one-hot-encoded versions of the ``'label'`` column.

    Attributes
    ----------
    label_encoder : fitted ``LabelEncoder`` (None until fitted)
    one_hot : the ``to_categorical`` function (None until fitted)
    """

    def __init__(self):
        self.label_encoder=None
        self.one_hot=None

    def fit(self,X,y=None):
        """Learn the label -> integer mapping from ``X['label']`` and return ``self``."""
        self.label_encoder=LabelEncoder().fit(X['label'])
        self.one_hot=to_categorical
        return self

    def transform(self,X,y=None):
        """Return a copy of ``X`` with ``'label_encoded'`` and ``'label_one_hot'`` added.

        If the encoder has not been fitted yet it is fitted on ``X`` first.
        Later calls reuse the learned mapping, so the class indices stay
        stable between calls.
        """
        if self.label_encoder is None:
            self.fit(X)

        encoded=X.copy()  # do not add columns to the caller's frame
        num_classes=len(self.label_encoder.classes_)
        codes=self.label_encoder.transform(encoded['label'].values)
        encoded['label_encoded']=codes
        # One batched call instead of one to_categorical call per row; the
        # empty case is handled separately because there is nothing to encode.
        if len(codes):
            encoded['label_one_hot']=list(self.one_hot(codes,num_classes=num_classes))
        else:
            encoded['label_one_hot']=[]
        return encoded
      

In [47]:
class PassThrough(BaseEstimator,TransformerMixin):
    """No-op step: ``transform`` returns its input unchanged."""

    def fit(self,X,y=None):
        """Nothing to learn.

        Returns ``self`` (not ``X``): ``fit_transform`` chains
        ``fit(...).transform(...)``, so returning the data here would call
        ``transform`` on the data instead of on this step.
        """
        return self

    def transform(self,X,y=None):
        """Return ``X`` as-is."""
        return X

In [48]:
# Keras padding function handed to the tokenizer step.
padded = tf.keras.preprocessing.sequence.pad_sequences

# Steps run in the order listed; the step names are the keys used later with
# pipeline.named_steps[...].
pipeline = Pipeline([
    ('extractor', Extractor()),
    ('cleaner', Cleaner()),
    ('distribution-validator', Sampler(unrelated_size=None, unrelated_ignore=True)),
    ('tokenizer', TextTokenizer(padded)),
    ('one-hot-encoder', LabelOneHotEncoder()),
    ('pass-through', PassThrough()),
])

# The first step loads the data itself, so no input frame is passed in.
processed_output = pipeline.transform(None)
processed_output.head()

Unnamed: 0,tweet,label,tweet_encoded,label_encoded,label_one_hot
0,"mt newstalk770: all but 3 in hillhurst, all br...",floods,"[520, 4385, 27, 70, 130, 3, 9145, 27, 5482, 20...",3,"[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0]"
6,bostonmarathon joeymcintyre this is by no mean...,bombing,"[149, 7761, 16, 10, 29, 74, 1376, 6, 158, 171,...",0,"[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]"
7,dominicans r like hurricane?! we swim in river...,hurricane,"[266, 73, 12, 41, 3229, 3, 1137, 8, 806, 83, 2...",4,"[0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0]"
10,“michaelskolnik: oklahoma senator tom coburn s...,tornado,"[24, 2111, 2026, 3511, 145, 3703, 1894, 565, 3...",5,"[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0]"
11,"okang readying the vehicles, and troops to sen...",tornado,"[1, 2540, 7, 1128, 4, 323, 4, 110, 3, 6, 898, ...",5,"[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0]"


# Model Training

In [50]:
# Fixed seed so the train/test split, and therefore the reported accuracy,
# can be reproduced on a fresh kernel.
RANDOM_STATE = 42

# 70/30 split, stratified on the integer label so both parts keep the same
# class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    processed_output['tweet_encoded'],
    processed_output['label_one_hot'],
    test_size=0.3,
    stratify=processed_output['label_encoded'],
    random_state=RANDOM_STATE,
)

# Each cell of these columns holds one array; stack them into 2-D arrays for Keras.
X_train, y_train = np.array(X_train.values.tolist()), np.array(y_train.values.tolist())
X_test, y_test = np.array(X_test.values.tolist()), np.array(y_test.values.tolist())

In [59]:
tokenizer = pipeline.named_steps['tokenizer']
vocab_size = tokenizer.vocab_size
# Take the sequence length from the encoded data itself. The tokenizer step
# pads with its max_pad_length setting, not max_length, so reading max_length
# here would describe the wrong width whenever the two differ.
max_length = X_train.shape[1]
embedding_dim = 50
# Width of the one-hot label vectors, i.e. the number of classes.
num_classes = y_train.shape[1]

In [52]:
# Embedding -> global max pooling -> small dense classifier with a softmax output.
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_length))
model.add(GlobalMaxPool1D())
model.add(Dropout(0.2))
model.add(Dense(10, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(num_classes, activation='softmax'))

# Labels are one-hot vectors, hence categorical cross-entropy.
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 100, 50)           2297750   
                                                                 
 global_max_pooling1d (Globa  (None, 50)               0         
 lMaxPooling1D)                                                  
                                                                 
 dropout (Dropout)           (None, 50)                0         
                                                                 
 dense (Dense)               (None, 10)                510       
                                                                 
 dropout_1 (Dropout)         (None, 10)                0         
                                                                 
 dense_1 (Dense)             (None, 7)                 77        
                                                        

In [53]:
# Train for 4 epochs in batches of 10, holding out 20% of the training data
# for validation.
model.fit(
    X_train,
    y_train,
    batch_size=10,
    epochs=4,
    validation_split=0.2,
)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<keras.callbacks.History at 0x7f5c54b66790>

In [54]:
# Persist the architecture as JSON and the weights as HDF5, in two files.
with open('tweets_model','w+') as architecture_file:
    architecture_file.write(model.to_json())

model.save_weights('tweets_model.h5')

# Loading Model and Predictions

In [55]:
# Rebuild the model from the saved architecture. The context manager closes
# the file even if parsing the JSON fails.
with open('tweets_model', 'r') as f:
    model = model_from_json(f.read())
model.load_weights('tweets_model.h5')

# A model restored from JSON has to be compiled again before evaluate().
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
score = model.evaluate(X_test, y_test, verbose=0)
print("%s: %.2f%%" % (model.metrics_names[1], score[1]*100))

accuracy: 92.86%


# Getting Twitter data

In [56]:
import tweepy


# SECURITY: credentials are read from the environment so they are never stored
# in the notebook. Any key that was previously committed in this cell must be
# treated as leaked and revoked/regenerated.
_TWITTER_ENV_VARS = (
    "TWITTER_API_KEY",
    "TWITTER_API_KEY_SECRET",
    "TWITTER_ACCESS_TOKEN",
    "TWITTER_ACCESS_TOKEN_SECRET",
)
_missing_vars = [name for name in _TWITTER_ENV_VARS if not os.environ.get(name)]
if _missing_vars:
    raise RuntimeError(
        "Missing Twitter credentials; set environment variable(s): "
        + ", ".join(_missing_vars)
    )

api_key = os.environ["TWITTER_API_KEY"]
api_key_secret = os.environ["TWITTER_API_KEY_SECRET"]

access_token = os.environ["TWITTER_ACCESS_TOKEN"]
access_token_secret = os.environ["TWITTER_ACCESS_TOKEN_SECRET"]

# authenticate
auth = tweepy.OAuthHandler(api_key, api_key_secret)
auth.set_access_token(access_token, access_token_secret)

api = tweepy.API(auth)

In [57]:
# One search per keyword; up to 50 English tweets are collected for each.
search_query = ["flood", "earthquake", "hurricane", "tornado", "explosion", "bombing", "wildfire" ]

tweets_copy = []
for query in search_query:
  tweets = tweepy.Cursor(api.search, q=query, lang="en", since="2020-09-16").items(50)
  # Drain the cursor straight into the combined list.
  tweets_copy.extend(tweets)

print("Total Tweets fetched:", len(tweets_copy))

Total Tweets fetched: 350


# Tweets Prediction

In [63]:
# Pull the fitted preprocessing objects out of the pipeline for use on new tweets.
tokenizer_step=pipeline.named_steps['tokenizer']
vocab_size=tokenizer_step.vocab_size
tokenizer=tokenizer_step.tokenizer
label_encoder=pipeline.named_steps['one-hot-encoder'].label_encoder
# New tweets must be padded to the width of the training data. That width comes
# from the tokenizer step's pad length rather than its max_length attribute, so
# it is read from the data.
max_length=X_train.shape[1]

In [65]:
# Classify the first fetched tweets. For each one, print the predicted class
# index, its label and the winning probability.
PREVIEW_COUNT = 20

# Slicing, unlike range(20) indexing, is safe when fewer tweets were fetched.
preview_texts = [tweet._json['text'] for tweet in tweets_copy[:PREVIEW_COUNT]]

# NOTE(review): the training text went through Cleaner (links removed, etc.)
# but these raw tweets do not. Consider applying the same cleaning here.
if preview_texts:
  preview_sequences = tokenizer.texts_to_sequences(preview_texts)
  # Pad to max_length so the batch matches the model's input width
  # (replaces the old hard-coded reshape(1, 100)).
  preview_batch = tf.keras.preprocessing.sequence.pad_sequences(preview_sequences, maxlen=max_length, padding='post')
  # One batched forward pass instead of two predict() calls per tweet.
  probabilities = model.predict(preview_batch, verbose=0)

  for class_probabilities in probabilities:
    predicted_index = int(np.argmax(class_probabilities))
    print(predicted_index)
    print(label_encoder.inverse_transform([predicted_index]))
    print('score', class_probabilities[predicted_index])

3
['floods']
score 0.9991104
3
['floods']
score 0.9995461
3
['floods']
score 0.99898225
3
['floods']
score 0.9992698
6
['unrelated']
score 0.8477903
3
['floods']
score 0.9993691
3
['floods']
score 0.999713
3
['floods']
score 0.9995461
6
['unrelated']
score 0.94782984
3
['floods']
score 0.9988261
3
['floods']
score 0.99898225
3
['floods']
score 0.9993316
3
['floods']
score 0.9995461
3
['floods']
score 0.99898225
6
['unrelated']
score 0.92994875
3
['floods']
score 0.99901927
3
['floods']
score 0.99898225
3
['floods']
score 0.99957865
3
['floods']
score 0.99908805
3
['floods']
score 0.9995461
