# Data Filtering

In [1]:
%load_ext lab_black
%load_ext autoreload
%autoreload 2

In [2]:
import os
import sys
import zipfile
from datetime import datetime
from glob import glob

import boto3
import pandas as pd
from sklearn.dummy import DummyClassifier
from sklearn.pipeline import Pipeline

In [3]:
PROJ_ROOT = os.path.join(os.pardir)
src_dir = os.path.join(PROJ_ROOT, "src")
sys.path.append(src_dir)

In [87]:
%aimport metrics_utils
from metrics_utils import calculate_metrics

%aimport s3_utils
from s3_utils import download_files_from_s3, upload_file_to_s3

## About

...

## User Inputs

In [17]:
# raw data on S3
path_to_folder = "/datasets/twitter/kinesis-demo/"

# processed data
processed_data_dir = "../data/processed"

label_mapper = {"does_not_need_support": 0, "needs_support": 1}

needs_support_labels = [0, 1]

model_output_dir = "../model-fine-tuned"

# Metadata - feature engineering
b = [0, 4, 8, 12, 16, 20, 24]
l = ["Late Night", "Early Morning", "Morning", "Afternoon", "Evening", "Night"]

upload_to_s3 = True
cleanup_local_files = True

In [6]:
s3_bucket_name = os.getenv("AWS_S3_BUCKET_NAME", "")

try:
    session = boto3.Session(profile_name="default")
    s3_client = session.client("s3")
    aws_region = session.region_name
    print("Retrieved AWS credentials from ~/.ssh/aws file")
except Exception as e:
    if str(e) == "The config profile (default) could not be found":
        aws_region = os.getenv("AWS_REGION")
        s3_client = boto3.client("s3", region_name=aws_region)
        print("Retrieved AWS credentials from .env file")

dtypes_dict = {
    "id": pd.StringDtype(),
    "contributors": pd.StringDtype(),  # pd.BooleanDtype(),
    "source_text": pd.StringDtype(),
    "place_country": pd.StringDtype(),
    "user_location": pd.StringDtype(),
    "user_followers": pd.Int32Dtype(),
    "user_friends": pd.Int32Dtype(),
    "user_listed": pd.Int32Dtype(),
    "user_favourites": pd.Int32Dtype(),
    "user_statuses": pd.Int32Dtype(),
    "user_protected": pd.StringDtype(),  # pd.BooleanDtype(),
    "user_verified": pd.StringDtype(),  # pd.BooleanDtype(),
    "is_quote_status": pd.StringDtype(),  # pd.BooleanDtype(),
    "retweeted": pd.StringDtype(),  # pd.BooleanDtype(),
    "retweeted_tweet": pd.StringDtype(),
    "in_reply_to_screen_name": pd.StringDtype(),
    "user_screen_name": pd.StringDtype(),
    "num_urls_in_tweet_text": pd.Int32Dtype(),
    "num_words": pd.Int32Dtype(),
    "text": pd.StringDtype(),
    "sentiment": pd.Int32Dtype(),
    "order": pd.Int32Dtype(),
    "hour": pd.Int32Dtype(),
    "day": pd.Int32Dtype(),
    "weekday": pd.StringDtype(),
    "time_of_day": pd.StringDtype(),
    "batch_num": pd.Int32Dtype(),
}

id2label = {v: k for k, v in label_mapper.items()}
id2label

Retrieved AWS credentials from .env file


{0: 'does_not_need_support', 1: 'needs_support'}

## Get Hourly Combined Data

In [14]:
%%time
download_files_from_s3(
    s3_client,
    s3_bucket_name,
    processed_data_dir,
    aws_region,
    f"{path_to_folder[1:]}processed/nlp_splits/",
    ".xlsx",
)
proc_files = sorted(glob(f"{processed_data_dir}/*_nlp_*.xlsx"))
proc_file_inference = [f for f in proc_files if "test_" in f][0]
print(proc_file_inference)
proc_files

File found at ../data/processed/test_nlp__inference_starts_20220110_000000.xlsx. Did nothing.
File found at ../data/processed/train_nlp__inference_starts_20220110_000000.xlsx. Did nothing.
File found at ../data/processed/val_nlp__inference_starts_20220110_000000.xlsx. Did nothing.
../data/processed/test_nlp__inference_starts_20220110_000000.xlsx
CPU times: user 12.3 ms, sys: 221 µs, total: 12.5 ms
Wall time: 176 ms


['../data/processed/test_nlp__inference_starts_20220110_000000.xlsx',
 '../data/processed/train_nlp__inference_starts_20220110_000000.xlsx',
 '../data/processed/val_nlp__inference_starts_20220110_000000.xlsx']

In [12]:
%%time
download_files_from_s3(
    s3_client,
    s3_bucket_name,
    processed_data_dir,
    aws_region,
    f"{path_to_folder[1:]}processed/nlp_splits/test_",
    ".parquet.gzip",
)
infer_file_with_pred = sorted(glob(f"{processed_data_dir}/*.parquet.gzip"))[-1]
infer_file_with_pred

File found at ../data/processed/test_nlp__inference_starts_20220110_000000__batch_1__with_preds.parquet.gzip. Did nothing.
CPU times: user 16.8 ms, sys: 700 µs, total: 17.5 ms
Wall time: 190 ms


'../data/processed/test_nlp__inference_starts_20220110_000000__batch_1__with_preds.parquet.gzip'

In [15]:
%%time
df_infer_data_with_preds = pd.read_parquet(infer_file_with_pred)
with pd.option_context('display.max_columns', 1000):
    display(df_infer_data_with_preds.head(3))

Unnamed: 0,id,created_at,contributors,user_joined,source_text,place_country,user_location,user_followers,user_friends,user_listed,user_favourites,user_statuses,user_protected,user_verified,is_quote_status,retweeted,retweeted_tweet,in_reply_to_screen_name,user_screen_name,num_urls_in_tweet_text,num_words,text,labels,order,hour,day,weekday,time_of_day,batch_num,split,pred,error,created_at_hour,created_at_day,user_joined_hour,user_joined_day,created_at_time_of_day,country
0,1480304997506113541,2022-01-09 22:26:28,,2016-01-13 22:11:35,Twitter for iPhone,,"Providence, Rhode Island, USA",334,703,9,5978,3939,False,False,False,False,no,AshleyGWinter,thegraindoctor,0,25,My sister works at the Space Telescope Science...,0,1,22,9,Sunday,Night,1,test,0,False,22,Sunday,22,Wednesday,Night,USA
1,1480245603233845251,2022-01-09 18:30:28,,2021-10-29 14:28:41,Twitter for iPhone,,Gn-z11,394,757,3,237,544,False,False,False,False,no,,inputhumanname,0,33,If the new James Webb space telescope was plac...,0,1,18,9,Sunday,Evening,1,test,0,False,18,Sunday,14,Friday,Evening,Other
2,1480122742678736899,2022-01-09 10:22:15,,2008-02-28 08:59:25,Twitter Web App,,,94,2,8,1,5,False,False,False,False,no,elonmusk,azizkin,0,20,could you create more affordable alternatives ...,1,1,10,9,Sunday,Morning,1,test,1,False,10,Sunday,8,Thursday,Morning,Other


CPU times: user 17.7 ms, sys: 5.59 ms, total: 23.3 ms
Wall time: 20.5 ms


In [61]:
%%time
df_infer_data, df_train = [
    (
        pd.read_excel(
            f,
            dtype=dtypes_dict,
            usecols=list(dtypes_dict)+['created_at', 'user_joined']
        ).rename(columns={"sentiment": "labels"})
        # .sort_values(by=['created_at'])
        .query("~text.str.startswith('RT')")
        .assign(split=st)
        .assign(labels=lambda df: df['labels'].isin(needs_support_labels).astype(pd.Int32Dtype()))
        .assign(
            text=lambda df: (
                df["text"]
                .str.lstrip()
                .str.rstrip()
                .str.replace("&gt;", ">")
                .str.replace("&lt;", "<")
                .str.replace("&amp;", "&")
            )
        )
        .assign(created_at_hour=lambda df: df["created_at"].dt.hour)
        .assign(
            created_at_time_of_day=lambda df: pd.cut(
                df["created_at_hour"], bins=b, labels=l, include_lowest=True
            )
        )
    )
    for f, st in zip(proc_files[:-1], ['inference', 'train'])
]
df_infer_data = df_infer_data.dropna(subset=["labels"]).astype(
    {
        "split": pd.StringDtype(),
        "created_at_hour": pd.Int32Dtype(),
        "created_at_time_of_day": pd.StringDtype(),
    }
)

CPU times: user 1.45 s, sys: 18.6 ms, total: 1.47 s
Wall time: 1.47 s


In [62]:
split_sizes = [{"train": len(df_train), "inference": len(df_infer_data)}]
df_split_sizes = pd.DataFrame.from_records(split_sizes).assign(type="raw")

In [63]:
batch_nums = df_infer_data["batch_num"].unique().tolist()

In [64]:
# inference_current_batch_num = df_infer_data["batch_num"].max()
# if df_infer_data["batch_num"].nunique() > 1:
#     df_train = pd.concat(
#         [df_train, df_infer_data.query(f"batch_num < {inference_current_batch_num}")]
#     )
#     df_infer_data = df_infer_data.query(f"batch_num == {inference_current_batch_num}")

if len(batch_nums) > 1:
    # get all but last batch numbers from test split (to use in training split)
    training_batch_nums = batch_nums[:-1]
    # get last batch number from test split (to use as current test split)
    inference_current_batch_num = batch_nums[-1]

    # Slice raw data splits based on batch numbers defined above
    df_train = pd.concat(
        [df_train, df_infer_data.query(f"batch_num.isin(@training_batch_nums)")]
    )
    df_test = df_infer_data.query(f"batch_num == {test_current_batch_num}")
else:
    inference_current_batch_num = df_infer_data["batch_num"].max()

In [65]:
df_train = df_train.drop_duplicates(subset=["text"])
split_sizes_no_dups = [{"train": len(df_train), "inference": len(df_infer_data)}]
df_split_sizes_no_dups = pd.DataFrame.from_records(split_sizes_no_dups).assign(
    type="without-duplicates"
)
df_split_sizes_comp = pd.concat(
    [df_split_sizes, df_split_sizes_no_dups], ignore_index=True
)
df_split_sizes_comp

Unnamed: 0,train,inference,type
0,2931,600,raw
1,2775,600,without-duplicates


In [66]:
df_split_dates = pd.DataFrame.from_records(
    [
        {
            "split": split_type,
            "start": df_nlp_spit["created_at"].min().strftime("%Y-%m-%d %H:%M:%S"),
            "end": df_nlp_spit["created_at"].max().strftime("%Y-%m-%d %H:%M:%S"),
        }
        for df_nlp_spit, split_type in zip(
            [df_train, df_infer_data], ["train", "inference"]
        )
    ]
)
df_split_dates

Unnamed: 0,split,start,end
0,train,2021-12-30 17:39:11,2022-01-08 15:14:33
1,inference,2022-01-09 01:18:13,2022-01-10 01:29:01


In [67]:
train_end = df_train["created_at"].max()
infer_start = df_infer_data["created_at"].min()
assert train_end < infer_start

In [69]:
assert df_infer_data.drop(columns=["text", "split"]).equals(
    df_infer_data_with_preds[list(df_infer_data)].drop(columns=["text", "split"])
)

In [78]:
X_train, y_train = [df_train["text"], df_train["labels"]]
X_infer_data, y_infer_data = [
    df_infer_data_with_preds["text"],
    df_infer_data_with_preds["labels"],
]

In [71]:
pipe = Pipeline([("clf", DummyClassifier(strategy="uniform", random_state=88))])

In [79]:
%%time
_ = pipe.fit(X_train, y_train)

CPU times: user 669 µs, sys: 195 µs, total: 864 µs
Wall time: 826 µs


In [91]:
%%time
y_infer_pred = (
    pd.Series(pipe.predict(X_infer_data), name='label', index=y_infer_data.index)
    .astype(pd.Int32Dtype())
)

CPU times: user 1.71 ms, sys: 0 ns, total: 1.71 ms
Wall time: 1.64 ms


In [97]:
df_infer_data_with_preds = df_infer_data_with_preds.assign(
    pred_naive=y_infer_pred
).assign(response_time=lambda df: df["num_words"] * (1 / (135 / 2)) * (60 / 1))

In [98]:
metrics_dict_inf, df_cm_inf, df_cr_inf = calculate_metrics(
    df_infer_data_with_preds["labels"].astype("float64").to_numpy(),
    df_infer_data_with_preds["pred_naive"].astype("float64").to_numpy(),
    list(label_mapper.values()),
    list(label_mapper.keys()),
    "weighted",
    0,
    use_sample_weights=False,
)
df_metrics = pd.DataFrame.from_dict(metrics_dict_inf, orient="index").T
df_cr_inf = df_cr_inf.merge(
    df_infer_data_with_preds["labels"]
    .value_counts(normalize=True)
    .rename("freq")
    .reset_index()
    .assign(index=lambda df: df["index"].map(id2label))
    .set_index("index"),
    left_index=True,
    right_index=True,
    how="left",
)
display(df_metrics)
display(df_cm_inf)
display(df_cr_inf)

Unnamed: 0,accuracy,balanced_accuracy,precision,recall,f1,f05,f2
0,0.515,0.516833,0.538858,0.515,0.520564,0.529993,0.515681


Unnamed: 0,Actual,does_not_need_support,needs_support
0,does_not_need_support,185,179
1,needs_support,112,124


Unnamed: 0,precision,recall,f1-score,support,freq
does_not_need_support,0.622896,0.508242,0.559758,364,0.606667
needs_support,0.409241,0.525424,0.460111,236,0.393333


In [99]:
metrics_dict_inf, df_cm_inf, df_cr_inf = calculate_metrics(
    df_infer_data_with_preds["labels"].astype("float64").to_numpy(),
    df_infer_data_with_preds["pred"].astype("float64").to_numpy(),
    list(label_mapper.values()),
    list(label_mapper.keys()),
    "weighted",
    0,
    use_sample_weights=False,
)
df_metrics = pd.DataFrame.from_dict(metrics_dict_inf, orient="index").T
df_cr_inf = df_cr_inf.merge(
    df_infer_data_with_preds["labels"]
    .value_counts(normalize=True)
    .rename("freq")
    .reset_index()
    .assign(index=lambda df: df["index"].map(id2label))
    .set_index("index"),
    left_index=True,
    right_index=True,
    how="left",
)
display(df_metrics)
display(df_cm_inf)
display(df_cr_inf)

Unnamed: 0,accuracy,balanced_accuracy,precision,recall,f1,f05,f2
0,0.805,0.80725,0.812648,0.805,0.806673,0.80975,0.805141


Unnamed: 0,Actual,does_not_need_support,needs_support
0,does_not_need_support,290,74
1,needs_support,43,193


Unnamed: 0,precision,recall,f1-score,support,freq
does_not_need_support,0.870871,0.796703,0.832138,364,0.606667
needs_support,0.722846,0.817797,0.767396,236,0.393333


In [101]:
(
    df_infer_data_with_preds[
        [
            "id",
            "created_at",
            "labels",
            "pred",
            "pred_naive",
            "num_words",
            "response_time",
        ]
    ].head(3)
)

Unnamed: 0,id,created_at,labels,pred,pred_naive,num_words,response_time
0,1480304997506113541,2022-01-09 22:26:28,0,0,0,25,22.222222
1,1480245603233845251,2022-01-09 18:30:28,0,0,0,33,29.333333
2,1480122742678736899,2022-01-09 10:22:15,1,1,1,20,17.777778


In [103]:
df_missed = df_infer_data_with_preds.query("(labels == 1) & (pred == 0)")
df_wasted = df_infer_data_with_preds.query("(labels == 0) & (pred == 1)")
df_missed_naive = df_infer_data_with_preds.query("(labels == 1) & (pred_naive == 0)")
df_wasted_naive = df_infer_data_with_preds.query("(labels == 0) & (pred_naive == 1)")

In [125]:
dict_missed = dict(
    batch_num=inference_current_batch_num,
    time_missed=df_missed["response_time"].sum() / 60,
    time_missed_naive=df_missed_naive["response_time"].sum() / 60,
)
dict_wasted = dict(
    batch_num=inference_current_batch_num,
    time_wasted=df_wasted["response_time"].sum() / 60,
    time_wasted_naive=df_wasted_naive["response_time"].sum() / 60,
)
df_summary = pd.concat(
    [
        pd.DataFrame.from_dict(dict_missed, orient="index"),
        pd.DataFrame.from_dict(dict_wasted, orient="index"),
    ]
)
df_summary.T.assign(total_number_tweets=len(df_infer_data_with_preds)).assign(
    num_needs_support=len(df_infer_data_with_preds.query("labels == 1"))
).assign(
    pct_reduction_in_time_wasted=lambda df: 100
    * (df["time_wasted_naive"] - df["time_wasted"])
    / df["time_wasted_naive"]
).assign(
    pct_reduction_in_time_missed=lambda df: 100
    * (df["time_missed_naive"] - df["time_missed"])
    / df["time_missed_naive"]
).assign(
    num_tweets_missed=len(df_missed)
).assign(
    num_tweets_missed_naive=len(df_missed_naive)
).assign(
    num_tweets_unnecessarily_read=len(df_wasted)
).assign(
    num_tweets_unnecessarily_read_naive=len(df_wasted_naive)
).assign(
    frac_tweets_unnecessarily_read=lambda df: df["num_tweets_unnecessarily_read"]
    / len(df_infer_data_with_preds)
).assign(
    frac_tweets_missed=lambda df: df["num_tweets_missed"]
    / len(df_infer_data_with_preds)
).assign(
    frac_tweets_unnecessarily_read_naive=lambda df: df[
        "num_tweets_unnecessarily_read_naive"
    ]
    / len(df_infer_data_with_preds)
).assign(
    frac_tweets_missed_naive=lambda df: df["num_tweets_missed_naive"]
    / len(df_infer_data_with_preds)
).assign(
    response_time_true=df_infer_data_with_preds.query("labels == 1")[
        "response_time"
    ].sum()
    / 60
).assign(
    response_time_pred=df_infer_data_with_preds.query("pred == 1")[
        "response_time"
    ].sum()
    / 60
).assign(
    response_time_pred_naive=df_infer_data_with_preds.query("pred_naive == 1")[
        "response_time"
    ].sum()
    / 60
).T

Unnamed: 0,0
batch_num,1.0
time_missed,13.2
time_missed_naive,45.244444
batch_num,1.0
time_wasted,29.555556
time_wasted_naive,59.422222
total_number_tweets,600.0
num_needs_support,236.0
pct_reduction_in_time_wasted,50.26178
pct_reduction_in_time_missed,70.825147


In [107]:
prefix = f"{path_to_folder[1:]}processed/nlp_splits/metrics__inference_starts_20220110_000000__batch_1__with_preds.parquet.gzip"
fpath = f"s3://{s3_bucket_name}/{prefix}"
fpath

's3://sagemakertestwillz3s/datasets/twitter/kinesis-demo/processed/nlp_splits/metrics__inference_starts_20220110_000000__batch_1__with_preds.parquet.gzip'

In [122]:
pd.read_parquet(fpath).T

Unnamed: 0,0,1
batch_num,1,1
total_number_tweets,600,600
num_needs_support,303,267
frac_needs_support,0.393333,0.393333
pred_frac_needs_support,0.505,0.445
num_tweets_needed_true,236,236
num_tweets_needed_pred,303,267
response_time_pred,107.748146,109.925926
response_time_true,93.570374,93.570374
pred_type,naive,ML


---

<span style="float:left;">
    <a href="./3-combine-data/notebooks/3_combine_data.ipynb"><< 3 - Combine Hourly Streamed Tweets</a>
</span>

<span style="float:right;">
    <a href="./5-process-data/notebooks/5_process_data.ipynb">5 - Data Processing >></a>
</span>