# Combining `.xlsx` Files for NLP Manual Labeling

In [1]:
%load_ext lab_black

In [2]:
import os
from glob import glob

import pandas as pd

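## About

For each of the `train`, `val` and `test` splits, this notebook reads every `{split}_*.xlsx` workbook found in `../data`, concatenates them, drops rows whose `text` is duplicated (rows are sorted by `sentiment` first, so a row that already has a sentiment value is kept in preference to one that does not), restores the original row order, and exports one combined workbook per split to `../data/processed`.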

## User Inputs

In [3]:
# Folder searched (non-recursively) for the per-split `{split}_*.xlsx` workbooks to combine
temp_processed_data_dir = "../data"
# Folder that receives one combined workbook per split
processed_data_dir = "../data/processed"

In [4]:
# Column dtypes passed to `pd.read_excel` for every workbook: `id` and `text`
# are read as pandas strings, `sentiment` as a nullable 32-bit integer.
dtypes_dict = {
    "id": pd.StringDtype(),
    "text": pd.StringDtype(),
    "sentiment": pd.Int32Dtype(),
}

## Get Data, Drop Last Duplicate and Re-Order

In [5]:
%%time
df_train, df_val, df_test = [
    pd.concat(
        [
            pd.read_excel(f, dtype=dtypes_dict, parse_dates=['created_at'])
            for f in glob(f"{temp_processed_data_dir}/{split_type}_*.xlsx")
        ],
        ignore_index=True,
    ).sort_values(by=['sentiment'])
    .drop_duplicates(subset=['text'], keep='first')
    .sort_index()
    for split_type in ['train', 'val', 'test']
]
display(df_train.sample(3).style.set_caption(f"Total training data for manual labeling = {len(df_train):,}"))
display(df_val.sample(3).style.set_caption(f"Total validation data for manual labeling = {len(df_val):,}"))
display(df_test.sample(3).style.set_caption(f"Total testing data for manual labeling = {len(df_test):,}"))

Unnamed: 0,id,created_at,text,sentiment
2742,1479141744700760075,2022-01-06 17:24:07,Why is there no camera onboard the James Webb Space Telescope?,
2,1478198711339536384,2022-01-04 02:56:51,Good morning! The James Webb Space Telescope is now 5.5 lakh kilometres from its L2 orbit. Its distance from Earth is 8.8 lakh kilometres. Its journey to L2 is now more than 61% complete.,2.0
205,1479832192545173514,2022-01-08 15:07:43,NASA live streaming the James Webb telescope unfolding. Science makes me happy....,


Unnamed: 0,id,created_at,text,sentiment
179,1480012233451229187,2022-01-09 03:03:08,"If you could imagine the Webb telescope trying to deploy far from intelligent life for almost 62 years, well you're looking at him",
152,1480004850507259906,2022-01-09 02:33:48,My BBC World Service Radio Interview on James Webb Telescope:,
206,1479911279460372483,2022-01-08 20:21:59,"Yesss!!! What a milestone! James Webb Space Telescope mirrors unfolded successfully today! Nice shirt (with ""Jan 08, 2022"" on the shirt) for celebrating this special day. Order link:",


Unnamed: 0,id,created_at,text,sentiment
3,1480126412593848322,2022-01-09 10:36:50,NASA's Webb Telescope reaches major milestone as mirror unfolds 719,2.0
198,1480279600739594241,2022-01-09 20:45:33,James Webb Telescope Ready to Reveal the Secrets of the Universe,
111,1480135789438586882,2022-01-09 11:14:06,"Good news for science, space exploration and humanity. James Webb telescope is now fully deployed.",


CPU times: user 333 ms, sys: 16.5 ms, total: 350 ms
Wall time: 348 ms


## Export to `.xlsx` File

In [6]:
# Recover the inference start timestamp from a training workbook's filename,
# i.e. the text between "nlp__inference_starts_" and the file extension.
train_files = sorted(glob(f"{temp_processed_data_dir}/train_*.xlsx"))
if not train_files:
    raise FileNotFoundError(
        f"No training files found in {temp_processed_data_dir!r}; "
        "cannot determine the inference start date"
    )
# glob order is arbitrary, so always use the alphabetically first file
_, marker, suffix = train_files[0].rpartition("nlp__inference_starts")
if not marker:
    # without the marker the parsing below would silently yield an empty string
    raise ValueError(
        f"'nlp__inference_starts' not found in filename {train_files[0]!r}"
    )
# drop the file extension, then the separator character that follows the marker
inference_start_date_str = suffix.split(".")[0][1:]
inference_start_date_str

'20220110_000000'

In [8]:
%%time
for df_split_to_export, split_type in zip([df_train, df_val, df_test], ['train', 'val', 'test']):
    fname = f"{processed_data_dir}/{split_type}_nlp__inference_starts_{inference_start_date_str}.xlsx"
    df_split_to_export.to_excel(fname, index=False)

CPU times: user 345 ms, sys: 12.4 ms, total: 357 ms
Wall time: 355 ms
