In [None]:
from __future__ import division
import numpy as np
import pandas as pd
from sklearn.feature_selection import f_classif,SelectKBest
import emoji
from numpy import newaxis
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,precision_score,recall_score,f1_score
from sklearn.preprocessing import LabelEncoder
from nltk.tokenize import TweetTokenizer,word_tokenize
from nltk.corpus import stopwords
import tensorflow
import keras
from keras.models import load_model
import re
# Deduplicated, lower-cased English stop words; preprocessing() filters tokens against this list.
stop_words=[entry.lower() for entry in set(stopwords.words("english"))]
# Shared tweet tokenizer; get_num_words_per_sample() uses it to count tokens.
tokenizer=TweetTokenizer()

In [None]:
def preprocessing(sent):
    """Normalise one raw text into a space-separated string of kept tokens.

    Steps, in order: the text goes through ``emoji.demojize``, is lower-cased,
    every character outside ``[a-zA-Z0-9]`` is replaced by a space, whitespace
    runs are collapsed, the result is split with ``word_tokenize`` and tokens
    present in the module-level ``stop_words`` list are dropped.

    Parameters
    ----------
    sent : str
        Raw text. ``None`` and float NaN (what pandas yields for an empty CSV
        cell) are treated as empty text; any other non-string value is
        converted with ``str`` first.

    Returns
    -------
    str
        The surviving tokens joined by single spaces, or ``''`` when nothing
        is left.
    """
    # NaN is the only float that is not equal to itself.
    if sent is None or (isinstance(sent, float) and sent != sent):
        return ''
    if not isinstance(sent, str):
        sent=str(sent)
    sent=emoji.demojize(sent)
    sent=sent.lower()
    sent=re.sub(r'[^a-zA-Z0-9]',' ',sent)
    sent=re.sub(r'[\s]+',' ',sent)
    kept=[word for word in word_tokenize(sent) if word not in stop_words]
    return ' '.join(kept)

In [None]:
def read_file(file):
    """Load a labelled CSV and clean its text column.

    Parameters
    ----------
    file : path or buffer accepted by ``pd.read_csv``; must provide the
        columns ``text`` and ``target``.

    Returns
    -------
    tuple
        ``(df, sentences, num_classes)``: the frame restricted to
        ``text``/``target``, the list of ``preprocessing`` results for every
        ``text`` entry (same order), and the number of distinct ``target``
        values.
    """
    frame=pd.read_csv(file)[['text','target']]
    cleaned=[preprocessing(raw) for raw in frame['text']]
    # dropna=False keeps the count identical to len(Series.unique()).
    class_count=frame['target'].nunique(dropna=False)
    return frame,cleaned,class_count

In [None]:
def get_num_words_per_sample(text):
    """Return the median number of distinct tokens per sample.

    Every sample in ``text`` is split with the module-level ``tokenizer``;
    the tokens are collected in a set, so a token repeated inside one sample
    is counted once.
    """
    distinct_counts=[]
    for sample in text:
        distinct_counts.append(len(set(tokenizer.tokenize(sample))))
    return np.median(distinct_counts)

In [None]:
def choose_model(sentences):
    """Print a model-family suggestion for the given corpus.

    The floor-divided ratio of sample count to the value returned by
    ``get_num_words_per_sample`` is compared against 1500: below it the MLP
    message is printed, otherwise the deep-learning message. Returns ``None``.
    """
    sample_count=len(sentences)
    median_tokens=get_num_words_per_sample(sentences)
    ratio=sample_count//median_tokens
    verdict="MLP is suitable" if ratio<1500 else "Deep learning is suitable"
    print(verdict)

In [None]:
def split_dataset(sentences,df):
    """Split texts and ``df['target']`` into train, dev and test parts.

    Two consecutive shuffled splits with ``test_size=0.2`` and
    ``random_state=42`` are applied: first 20% of everything becomes the test
    part, then 20% of the remainder becomes the dev part.

    Returns
    -------
    tuple
        ``(X_train, Y_train, X_dev, Y_dev, X_test, Y_test)``.
    """
    split_options=dict(test_size=0.2,shuffle=True,random_state=42)
    X_rest,X_test,Y_rest,Y_test=train_test_split(sentences,df['target'],**split_options)
    X_train,X_dev,Y_train,Y_dev=train_test_split(X_rest,Y_rest,**split_options)
    return X_train,Y_train,X_dev,Y_dev,X_test,Y_test

In [None]:
def encoding_label(train_labels,dev_labels,test_labels):
    """Encode the three label sets with one ``LabelEncoder``.

    The encoder is fitted on ``train_labels`` only and then applied to the
    train, dev and test labels.

    Returns
    -------
    tuple
        ``(train_labels, dev_labels, test_labels)`` after transformation.
    """
    encoder=LabelEncoder().fit(train_labels)
    return tuple(encoder.transform(part) for part in (train_labels,dev_labels,test_labels))

In [None]:
# Vectorisation settings (module-level; read by vectorize_ngram).
ngram_range = (1, 2)            # passed as TfidfVectorizer's ngram_range
top_k = 20000                   # upper bound on features kept by SelectKBest
token_mode = 'word'             # passed as TfidfVectorizer's analyzer
min_document_frequency = 2      # passed as TfidfVectorizer's min_df
max_sequence_length = 500       # NOTE(review): not referenced by any code visible in this notebook
def vectorize_ngram(train_data,train_labels,dev_data,test_data):
    """Turn texts into TF-IDF n-gram features and keep the best-scoring ones.

    A ``TfidfVectorizer`` configured from the module-level settings
    (``ngram_range``, ``token_mode``, ``min_document_frequency``) is fitted on
    ``train_data`` and applied to the dev and test texts. ``SelectKBest`` with
    ``f_classif`` is then fitted on the training features and
    ``train_labels`` and keeps at most ``top_k`` columns.

    Parameters
    ----------
    train_data, dev_data, test_data : iterables of str
        Texts for each split.
    train_labels : array-like
        Labels aligned with ``train_data``; used only for feature scoring.

    Returns
    -------
    tuple
        ``(train_data, dev_data, test_data, selector, vectorizer)`` where the
        three feature matrices are ``float32`` and the fitted selector and
        vectorizer can be reused on new text.
    """
    # TF-IDF weights are fractional, so a float dtype is required; the
    # original 'int32' is not a supported dtype for this vectorizer. The type
    # object (not the string 'float32') is what scikit-learn's dtype check
    # recognises.
    vectorizer=TfidfVectorizer(
        ngram_range=ngram_range,
        dtype=np.float32,
        strip_accents='unicode',
        decode_error='replace',
        analyzer=token_mode,
        min_df=min_document_frequency,
    )
    train_data=vectorizer.fit_transform(train_data)
    dev_data=vectorizer.transform(dev_data)
    test_data=vectorizer.transform(test_data)

    # Never ask for more features than the fitted vocabulary provides.
    selector=SelectKBest(f_classif,k=min(top_k,train_data.shape[1]))
    selector.fit(train_data,train_labels)

    # The explicit cast keeps the float32 output contract regardless of the
    # dtype the installed scikit-learn version hands back.
    train_data=selector.transform(train_data).astype('float32')
    dev_data=selector.transform(dev_data).astype('float32')
    test_data=selector.transform(test_data).astype('float32')
    return train_data,dev_data,test_data,selector,vectorizer

In [None]:
def convert_non_numeric_to_numeric(X_train,Y_train,X_dev,Y_dev,X_test,Y_test):
    """Encode the labels, then vectorise the texts of all three splits.

    Labels go through ``encoding_label`` first because ``vectorize_ngram``
    receives the encoded training labels.

    Returns
    -------
    tuple
        ``(X_train, Y_train, X_dev, Y_dev, X_test, Y_test, selector, vectorizer)``.
    """
    encoded_labels=encoding_label(Y_train,Y_dev,Y_test)
    Y_train,Y_dev,Y_test=encoded_labels
    vectorised=vectorize_ngram(X_train,Y_train,X_dev,X_test)
    X_train,X_dev,X_test,selector,vectorizer=vectorised
    return X_train,Y_train,X_dev,Y_dev,X_test,Y_test,selector,vectorizer

In [None]:
def get_last_layer_units_activation(num_classes):
    """Return ``(activation, units)`` for the output layer.

    Exactly two classes give a single sigmoid unit; any other count gives a
    softmax layer with ``num_classes`` units.
    """
    if num_classes==2:
        return 'sigmoid',1
    return 'softmax',num_classes
def mlp_model(layers,units,dropout_rate,input_shape,num_classes):
    """Build (without compiling) a dropout-regularised MLP.

    The stack is: an input ``Dropout``; then ``layers-1`` repetitions of a
    ReLU ``Dense`` layer with ``units`` units followed by ``Dropout``; then
    the output ``Dense`` layer whose size and activation come from
    ``get_last_layer_units_activation(num_classes)``.

    Returns
    -------
    keras.models.Sequential
    """
    out_activation,out_units=get_last_layer_units_activation(num_classes)

    # Layers are created in the same order in which they are added.
    stack=[keras.layers.Dropout(rate=dropout_rate,input_shape=input_shape)]
    for _ in range(layers-1):
        stack.append(keras.layers.Dense(units=units,activation='relu'))
        stack.append(keras.layers.Dropout(rate=dropout_rate))
    stack.append(keras.layers.Dense(units=out_units,activation=out_activation))

    network=keras.models.Sequential()
    for layer in stack:
        network.add(layer)
    return network

In [None]:
def train_model(train_text,train_label,dev_text,dev_label,layers=2,units=32,dropout_rate=0.2,epochs=150,learning_rate=0.00001,num_classes=2,batch_size=4):
    """Train the MLP from ``mlp_model`` and save it to ``disaster_tweets.h5``.

    Parameters
    ----------
    train_text, train_label : training features and labels.
    dev_text, dev_label : validation features and labels, passed to ``fit``
        as ``validation_data``.
    layers, units, dropout_rate : forwarded to ``mlp_model``.
    epochs, batch_size : forwarded to ``model.fit``.
    learning_rate : step size for the Adam optimizer.
    num_classes : 2 selects ``binary_crossentropy``; anything else selects
        ``sparse_categorical_crossentropy``.

    Returns
    -------
    tuple
        ``(val_acc, val_loss)`` recorded for the last epoch.

    Side effects
    ------------
    Prints the final validation metrics and writes the trained model to
    ``disaster_tweets.h5`` in the current working directory.
    """
    model=mlp_model(layers=layers,units=units,dropout_rate=dropout_rate,input_shape=train_text.shape[1:],num_classes=num_classes)
    loss='binary_crossentropy' if num_classes==2 else 'sparse_categorical_crossentropy'
    # `learning_rate` is the documented keyword; `lr` is a deprecated alias
    # that newer Keras releases no longer accept.
    optimizer=keras.optimizers.Adam(learning_rate=learning_rate)
    model.compile(optimizer=optimizer,loss=loss,metrics=['acc'])
    history=model.fit(train_text,train_label,epochs=epochs,validation_data=(dev_text,dev_label),batch_size=batch_size,verbose=2)
    history=history.history
    final_acc=history['val_acc'][-1]
    final_loss=history['val_loss'][-1]
    print('Validation accuracy: {acc}, loss: {loss}'.format(acc=final_acc, loss=final_loss))
    model.save('disaster_tweets.h5')
    return final_acc,final_loss

In [None]:
def evaluating_algorithm(Y,Y_pred):
    """Score predictions against the true labels.

    All four scikit-learn metrics are called with their default settings.

    Returns
    -------
    tuple
        ``(confusion_matrix, precision, recall, f1)``.
    """
    matrix=confusion_matrix(Y,Y_pred)
    precision,recall,f1=(score(Y,Y_pred) for score in (precision_score,recall_score,f1_score))
    return matrix,precision,recall,f1

In [None]:
def classify_disaster_tweets(file):
    """Run the whole pipeline on a labelled CSV and report test metrics.

    Reads and cleans ``file``, prints the ``choose_model`` suggestion, splits
    the data, encodes labels and vectorises text, trains the MLP, reloads the
    model that ``train_model`` saved to ``disaster_tweets.h5``, and prints
    confusion matrix, precision, recall and F1 on the test split.

    Parameters
    ----------
    file : CSV path accepted by ``read_file``.

    Returns
    -------
    tuple
        ``(model, selector, vectorizer)`` for use on unseen data.
    """
    df,sentences,no_of_classes=read_file(file)
    choose_model(sentences)
    X_train,Y_train,X_dev,Y_dev,X_test,Y_test=split_dataset(sentences,df)
    X_train,Y_train,X_dev,Y_dev,X_test,Y_test,selector,vectorizer=convert_non_numeric_to_numeric(X_train,Y_train,X_dev,Y_dev,X_test,Y_test)
    # Pass the class count found in the data; previously it was computed and
    # dropped, so the network was always built for two classes.
    train_model(X_train,Y_train,X_dev,Y_dev,num_classes=no_of_classes)
    model=load_model('disaster_tweets.h5')
    # The return value is not used here.
    model.evaluate(X_test,Y_test)
    # Sequential.predict_classes is gone from current Keras; derive the class
    # labels from predict() the same way it did.
    probabilities=model.predict(X_test)
    if probabilities.shape[-1]>1:
        Y_pred=probabilities.argmax(axis=-1)
    else:
        Y_pred=(probabilities>0.5).astype('int32').ravel()
    matrix,p,r,f1=evaluating_algorithm(Y_test,Y_pred)
    print("Confusion Matrix:-\n",matrix)
    print("Precision:-\n",p)
    print("Recall:-\n",r)
    print("F1 score:-\n",f1)
    return model,selector,vectorizer

In [None]:
# Run the full pipeline on 'train.csv' and keep the returned model, selector and
# vectorizer as notebook-level names; the submission cell passes them on.
model,selector,vectorizer=classify_disaster_tweets('train.csv')

In [None]:
def test_unknown_dataset(file,vectorizer,selector,model):
    """Predict a class for every row of an unlabelled CSV.

    Parameters
    ----------
    file : CSV path accepted by ``pd.read_csv``; must provide a ``text`` column.
    vectorizer, selector : fitted objects exposing ``transform``; applied in
        that order to the cleaned text.
    model : trained model exposing ``predict``.

    Returns
    -------
    dict
        ``{'text': [...], 'target': [...]}`` with the original texts and one
        predicted class per row, in file order.
    """
    df_test=pd.read_csv(file)
    cleaned=[preprocessing(sent) for sent in df_test['text']]
    features=selector.transform(vectorizer.transform(cleaned)).astype('float32')
    # Sequential.predict_classes is gone from current Keras; derive labels
    # from predict() instead. Both branches yield a 1-D array, so this works
    # for a single sigmoid unit as well as a softmax output.
    probabilities=model.predict(features)
    if probabilities.shape[-1]>1:
        labels=probabilities.argmax(axis=-1)
    else:
        labels=(probabilities>0.5).astype('int32').ravel()
    # Positional export, so the result does not depend on the frame's index labels.
    submission={
    'text':df_test['text'].tolist(),
    'target':list(labels)
    }
    return submission

In [None]:
# Label the rows of 'test.csv' with the model, selector and vectorizer produced by the training cell.
submission=test_unknown_dataset('test.csv',vectorizer,selector,model)