In [9]:
# to prevent to restart kernel when any changes are made to any imported file
%matplotlib inline
%reload_ext autoreload
%autoreload 2

# to import any file from some other directory

# to stop printing warnings
import warnings
warnings.filterwarnings('ignore')
def warn(*args, **kwargs):
    pass
warnings.warn = warn
    
import pandas as pd
import numpy as np
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 20000)
pd.set_option('display.max_colwidth', -1)

# to increase cells width
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:80% !important; }</style>"))

# to enable collapsible Headings and Functions
# !pip install jupyter_contrib_nbextensions
# !jupyter contrib nbextension install --user
# !jupyter nbextensions_configurator enable --user
# !jupyter nbextension enable codefolding/main
# search collapsible to enable

# enable dark theme
# !pip install jupyterthemes
# !jt -t monokai

# monokai
# solarizedd
# !jt -r
import numpy as np
import pandas as pd
import sys, os
from datetime import datetime, timedelta
import requests
import json
import timeit
from datetime import datetime
import os.path, time
from datetime import datetime, timedelta
from threading import Timer
import pdb, random

from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
import fasttext
import string
from scipy.stats import hmean
from sklearn.model_selection import TimeSeriesSplit
from random import seed
from random import random
from random import randrange
from sklearn.model_selection import train_test_split

In [10]:
!pwd

PATH="/home/cloud_user/faisal/experiments/tweets_sentiments/"

/home/cloud_user/faisal/experiments/100_day_ml


In [11]:
!ls {PATH}

Tweets.csv


# Get Data

In [12]:
data=pd.read_csv(f"{PATH}Tweets.csv")
print(data.shape)

data.head(2)

(14640, 15)


Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,570306133677760513,neutral,1.0,,,Virgin America,,cairdin,,0,@VirginAmerica What @dhepburn said.,,2015-02-24 11:35:52 -0800,,Eastern Time (US & Canada)
1,570301130888122368,positive,0.3486,,0.0,Virgin America,,jnardino,,0,@VirginAmerica plus you've added commercials to the experience... tacky.,,2015-02-24 11:15:59 -0800,,Pacific Time (US & Canada)


In [13]:
df=data.reset_index(drop=True).copy()
df.tweet_created=pd.to_datetime(df.tweet_created)
df=df.rename(columns={"airline_sentiment": "target", "tweet_created": "date"})

df.shape

(14640, 15)

# Util Functions

In [14]:
def preprocess(mydf, Xcol, ycol):
    mydf[Xcol] = mydf[Xcol].apply(lambda x: ((x.encode("unicode_escape").decode("utf-8"))))
    mydf[Xcol] = mydf[Xcol].apply(lambda x: x.translate(str.maketrans('','',string.punctuation)))
    mydf[Xcol] = mydf[Xcol].str.lower().str.replace('\n',' ')
    mydf[ycol] = mydf[ycol].astype(str)
    return mydf

In [15]:
def save_file(txts, fname):
    f = open(fname,'w')
    f.write('\n'.join(txts))
    f.close()
    
    return True

In [16]:
def to_fasttext_syntax(mydf, Xcol, ycol, fasttext_txt):
    mydf[fasttext_txt] = '__label__'+mydf[ycol]+' '+mydf[Xcol]
    return mydf

In [17]:
def split_random(mydf, Xcol, ycol, fasttext_txt):
    df=preprocess(mydf, Xcol, ycol)
    trn_df, vld_df, ts_df = np.split(df.sample(frac=1), [int(.6*len(df)), int(.8*len(df))])
    trn_df, vld_df=to_fasttext_syntax(trn_df, Xcol, ycol, fasttext_txt), to_fasttext_syntax(vld_df, Xcol, ycol, fasttext_txt)
    print("Train: ", trn_df.shape); print("Valid: ", vld_df.shape); print("Test: ", ts_df.shape)
    return trn_df, vld_df, ts_df




In [18]:
def split_dt(mydf, Xcol, ycol, fasttext_txt, vld_split, ts_split=None):
    df=preprocess(mydf, Xcol, ycol)
    
    if ts_split:
        trn_df = df.loc[df.date<=vld_split]
        vld_df = df.loc[(df.date>=ts_split) & (df.date<=ts_split)]
        ts_df  = df.loc[df.date>ts_split]
        trn_df, vld_df=to_fasttext_syntax(trn_df, Xcol, ycol, fasttext_txt), to_fasttext_syntax(vld_df, Xcol, ycol, fasttext_txt)
        print("Train: ", trn_df.shape); print("Valid: ", vld_df.shape); print("Test: ", ts_df.shape)
        return trn_df, vld_df, ts_df
    else:
        trn_df = df.loc[df.date<=vld_split]
        vld_df = df.loc[(df.date>vld_split)]
        trn_df=to_fasttext_syntax(trn_df, Xcol, ycol, fasttext_txt)
        print("Train: ", trn_df.shape); print("Valid: ", vld_df.shape)
        return trn_df, vld_df



In [19]:
def get_classification_report(y_true, y_pred):
    report=classification_report(y_true, y_pred, output_dict=True)
    report=pd.DataFrame(report).T.round(2)
    report.loc["harmonic avg"] = report.iloc[:-3, :].apply(lambda x: round(hmean(x), 2), axis=0)
    return report


In [20]:
def get_results(test_df, classifier, Xcol, ycol):
    y_pred=[]
    y_true=[]
    for txt, lbl in zip(test_df[Xcol], test_df[ycol]):
        y_pred.append(classifier.predict([txt])[0][0][0][9:])
        y_true.append(lbl)

    y_pred = np.array(y_pred, dtype='O')
    y_true = np.array(y_true, dtype='O')
    conf_mat=confusion_matrix(y_true, y_pred)
    cls_rpt=get_classification_report(y_true, y_pred)
    print(conf_mat)
    print("\n")
    print(cls_rpt)
    print(f"\n =================== \n")
    return conf_mat, cls_rpt
    

In [21]:
def classify_fasttext(trn_df, vld_df, ts_df, fasttext_txt, autotune_label):
    results={}
    
    trn_fn, vld_fn = "/tmp/df.train", "/tmp/df.valid"
    
    _, _=save_file(trn_df, trn_fn, fasttext_txt), save_file(vld_df, vld_fn, fasttext_txt)
    
    classifier1 = fasttext.train_supervised(trn_fn, loss='softmax')
    print("\nclassifier1: ")
    conf_mat1, cls_rpt1 = get_results(ts_df, classifier1, Xcol, ycol)
    results["classifier1"]=(classifier1,conf_mat1, cls_rpt1)
    
    classifier2 = fasttext.train_supervised(input=trn_fn, autotuneValidationFile=vld_fn, autotuneMetric=f"f1:__label__{autotune_label}")
    print("classifier2: ")
    conf_mat2, cls_rpt2 = get_results(ts_df, classifier2, Xcol, ycol)
    results["classifier2"]=(classifier2,onf_mat2, cls_rpt2)

    classifier3 = fasttext.train_supervised(input=trn_fn, autotuneValidationFile=vld_fn, autotuneDuration=60)
    print("classifier3: ")
    conf_mat3, cls_rpt3 = get_results(ts_df, classifier3, Xcol, ycol)
    results["classifier3"]=(classifier3,conf_mat3, cls_rpt3)
    
    return results


# Classification

In [22]:
Xcol="text"
ycol="target"
fasttext_txt="ft_txt"

# random split
# trn_df, vld_df, ts_df=split_random(df, Xcol, ycol, fasttext_txt)

# split on date, trn, vld, ts
# trn_split="2019-08-31";vld_split="2019-10-31"
# trn_df, vld_df, ts_df=split_dt(mydf, Xcol, ycol, fasttext_txt, trn_split, vld_split)

# split on date, trn, ts
split="2015-02-23"
trn_df, vld_df=split_dt(df, Xcol, ycol, fasttext_txt, split)


Train:  (10268, 16)
Valid:  (4372, 15)


In [23]:
# results = classify_fasttext(trn_df, vld_df, ts_df, fasttext_txt, "neutral")

## Select Best Classifier for Bagging

In [24]:
def subsample(df, Xcol, ycol, ratio=0.60):
    X_train, _, _, _ = train_test_split(df[Xcol], df[ycol], test_size=ratio, shuffle=True, stratify=df[ycol], random_state=42)
    
    return list(X_train)
    

In [25]:
def train_bag(df, fasttext_txt, ycol):
    trn_X_sample=subsample(df, fasttext_txt, ycol)
    trn_fn = "tmpdir/df_sample.train"
    _=save_file(trn_X_sample, trn_fn)
    
    
    classifier = fasttext.train_supervised(trn_fn, loss='softmax')
    return classifier
    

In [26]:
def get_preds(classifier, vld_df, Xcol, ycol, i):
    vld_df[f"pred_{i}"] = vld_df[Xcol].apply(lambda x: classifier.predict([x])[0][0][0][9:])
    vld_df[f"pred_{i}_correct"]=np.where(vld_df[f"pred_{i}"]==vld_df.target, True, False)
    
    return vld_df


In [27]:
n_folds=5

for i in range(1,n_folds+1):
    classifier=train_bag(trn_df.head(10), fasttext_txt, ycol)
    vld_df=get_preds(classifier, vld_df, Xcol, ycol, i)
    

In [28]:
vld_df.head(2)

Unnamed: 0,tweet_id,target,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,date,tweet_location,user_timezone,pred_1,pred_1_correct,pred_2,pred_2_correct,pred_3,pred_3_correct,pred_4,pred_4_correct,pred_5,pred_5_correct
0,570306133677760513,neutral,1.0,,,Virgin America,,cairdin,,0,virginamerica what dhepburn said,,2015-02-24 11:35:52-08:00,,Eastern Time (US & Canada),negative,False,negative,False,negative,False,negative,False,negative,False
1,570301130888122368,positive,0.3486,,0.0,Virgin America,,jnardino,,0,virginamerica plus youve added commercials to the experience tacky,,2015-02-24 11:15:59-08:00,,Pacific Time (US & Canada),negative,False,negative,False,negative,False,negative,False,negative,False


In [29]:
vld_df["n_correct"]=vld_df[[f"pred_{i}_correct" for i in range(1,n_folds+1)]].sum(axis=1)
vld_df["n_incorrect"]=n_folds-vld_df.n_correct

In [30]:
cols=["airline", "val_counts", "n_incorrect"]

In [31]:
vld_counts=vld_df[cols[0]].value_counts().to_dict()
vld_df["val_counts"]=vld_df[cols[0]].map(vld_counts)


In [32]:
grp=pd.DataFrame(vld_df[cols].groupby(cols).size()).rename(columns={0:"counts"})
grp["ratios"]=(grp.counts / grp.index.get_level_values(1)).round(2)


In [33]:
grp

# i.e. for Airline america, we are predicting 35% of its values incorrect all the time for our 5 classifiers

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,counts,ratios
airline,val_counts,n_incorrect,Unnamed: 3_level_1,Unnamed: 4_level_1
American,1767,0,1082,0.61
American,1767,1,14,0.01
American,1767,2,8,0.0
American,1767,3,21,0.01
American,1767,4,23,0.01
American,1767,5,619,0.35
Delta,504,0,158,0.31
Delta,504,1,3,0.01
Delta,504,2,5,0.01
Delta,504,3,4,0.01
