In [1]:
# to prevent to restart kernel when any changes are made to any imported file
%reload_ext autoreload
%autoreload 2

# to import any file from some other directory

# to stop printing warnings
import warnings
warnings.filterwarnings('ignore')
def warn(*args, **kwargs):
    pass
warnings.warn = warn
    
import pandas as pd
import numpy as np
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 20000)
pd.set_option('display.max_colwidth', -1)

# to increase cells width
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:80% !important; }</style>"))

# to enable collapsible Headings and Functions
# !pip install jupyter_contrib_nbextensions
# !jupyter contrib nbextension install --user
# !jupyter nbextensions_configurator enable --user
# !jupyter nbextension enable codefolding/main
# search collapsible to enable

# enable dark theme
# !pip install jupyterthemes
# !jt -t monokai

# monokai
# solarizedd
# !jt -r
import numpy as np
import pandas as pd
import sys, os
from datetime import datetime, timedelta
import requests
import json
import timeit
from datetime import datetime
import os.path, time
from datetime import datetime, timedelta
from threading import Timer
import pdb, random

from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
import fasttext
import string








In [2]:
!pwd

PATH="/home/cloud_user/faisal/experiments/real_not_real_kaggle/data1/"

/home/cloud_user/faisal/experiments/100_day_ml


# Get Data

In [3]:
train=pd.read_csv(f"{PATH}train.csv")
test=pd.read_csv(f"{PATH}test.csv")
train.shape, test.shape

((7613, 5), (3263, 4))

In [4]:
df=train.copy()
df.shape

(7613, 5)

In [5]:
df.head(1)

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all,1


In [6]:
def preprocess(mydf, Xcol, ycol):
    mydf[Xcol] = mydf[Xcol].apply(lambda x: ((x.encode("unicode_escape").decode("utf-8"))))
    mydf[Xcol] = mydf[Xcol].apply(lambda x: x.translate(str.maketrans('','',string.punctuation)))
    mydf[Xcol] = mydf[Xcol].str.lower().str.replace('\n',' ')
    mydf[ycol] = mydf[ycol].astype(str)
    return mydf

In [7]:
def save_file(df, fname, col):
    f = open(fname,'w')
          
    f.write('\n'.join(df[col].tolist()))
    f.close()
    
    return True

In [8]:
def to_fasttext_syntax(mydf, Xcol, ycol, fasttext_txt):
    mydf[fasttext_txt] = '__label__'+mydf[ycol]+' '+mydf[Xcol]
    return mydf

In [9]:
def split_random(mydf, Xcol, ycol, fasttext_txt):
    df=preprocess(mydf, Xcol, ycol)
    trn_df, vld_df, ts_df = np.split(df.sample(frac=1), [int(.6*len(df)), int(.8*len(df))])
    trn_df, vld_df=to_fasttext_syntax(trn_df, Xcol, ycol, fasttext_txt), to_fasttext_syntax(vld_df, Xcol, ycol, fasttext_txt)
    print("Train: ", trn_df.shape); print("Valid: ", vld_df.shape); print("Test: ", ts_df.shape)
    return trn_df, vld_df, ts_df


def split_dt(mydf, Xcol, ycol, fasttext_txt, trn_split, vld_split):
    df=preprocess(mydf, Xcol, ycol)
    trn_df = df.loc[df.data<=trn_split]
    vld_df = df.loc[df.data<=vld_split]
    ts_df  = df.loc[df.data>vld_split]
    trn_df, vld_df=to_fasttext_syntax(mydf, Xcol, ycol, fasttext_txt), to_fasttext_syntax(mydf, Xcol, ycol, fasttext_txt)
    print("Train: ", trn_df.shape); print("Valid: ", vld_df.shape); print("Test: ", ts_df.shape)
    return trn_df, vld_df, ts_df


In [10]:
Xcol="text"
ycol="target"
fasttext_txt="ft_txt"

# random split
trn_df, vld_df, ts_df=split_random(df, Xcol, ycol, fasttext_txt)

# split on date
# trn_split="2019-08-31";vld_split="2019-10-31"
# trn_df, vld_df, ts_df=split_dt(mydf, Xcol, ycol, fasttext_txt, trn_split, vld_split)

Train:  (4567, 6)
Valid:  (1523, 6)
Test:  (1523, 5)


In [58]:
def get_results(test_df, classifier, Xcol, ycol):
    y_pred=[]
    y_true=[]
    for txt, lbl in zip(test_df[Xcol], test_df[ycol]):
        y_pred.append(classifier.predict([txt])[0][0][0][9:])
        y_true.append(lbl)

    y_pred = np.array(y_pred, dtype='O')
    y_true = np.array(y_true, dtype='O')
    conf_mat=confusion_matrix(y_true, y_pred)
    cls_rpt=classification_report(y_true, y_pred)
    print(conf_mat)
    print(cls_rpt)
    print(f"\n =================== \n")
    return conf_mat, cls_rpt
    

In [59]:
def classify_fasttext(trn_df, vld_df, ts_df, fasttext_txt, autotune_label):
    results=[]
    
    trn_fn, vld_fn = "df.train", "df.valid"
    
    _, _=save_file(trn_df, trn_fn, fasttext_txt), save_file(vld_df, vld_fn, fasttext_txt)
    
    classifier1 = fasttext.train_supervised(trn_fn, loss='softmax')
    print("\nclassifier1: ")
    conf_mat1, cls_rpt1 = get_results(ts_df, classifier1, Xcol, ycol)
    results.append((conf_mat1, cls_rpt1))
    
    classifier2 = fasttext.train_supervised(input=trn_fn, autotuneValidationFile=vld_fn, autotuneMetric=f"f1:__label__{autotune_label}")
    print("classifier2: ")
    conf_mat2, cls_rpt2 = get_results(ts_df, classifier2, Xcol, ycol)
    results.append((conf_mat2, cls_rpt2))

    classifier3 = fasttext.train_supervised(input=trn_fn, autotuneValidationFile=vld_fn, autotuneDuration=60)
    print("classifier3: ")
    conf_mat3, cls_rpt3 = get_results(ts_df, classifier3, Xcol, ycol)
    results.append((conf_mat3, cls_rpt3))

    
    return results


In [None]:
results = classify_fasttext(trn_df, vld_df, ts_df, fasttext_txt, "1")


classifier1: 
[[745 114]
 [195 469]]
              precision    recall  f1-score   support

           0       0.79      0.87      0.83       859
           1       0.80      0.71      0.75       664

   micro avg       0.80      0.80      0.80      1523
   macro avg       0.80      0.79      0.79      1523
weighted avg       0.80      0.80      0.80      1523



