In [8]:
import os
import pandas as pd 
import numpy as np
from pyunpack import Archive
from tqdm import tqdm
import ast
 
def create_new_column(df, column_name):
    """Add a 0/1 indicator column named ``column_name`` to ``df``.

    A row gets 1 when ``column_name`` is contained in that row's ``labels``
    value (tested with ``in``), and 0 otherwise.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``labels`` column. It is modified in place.
    column_name : str
        Value to look for in ``labels``; also the name of the new column.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the indicator column added.
    """
    # Only the ``labels`` column is needed, so iterate over it directly rather
    # than materialising every row with DataFrame.apply(axis=1). The explicit
    # index and dtype keep the result well-formed for an empty frame too.
    df[column_name] = pd.Series(
        [1 if column_name in assigned else 0 for assigned in df['labels']],
        index=df.index,
        dtype="int64",
    )
    return df

def make_csv(path, label_names=None):
    """Read every file under ``path/<label>/`` into one DataFrame.

    Parameters
    ----------
    path : str
        Directory that contains one sub-directory per label.
    label_names : sequence of str, optional
        Sub-directory names to read. When omitted, the module-level
        ``labels`` sequence is used.

    Returns
    -------
    pandas.DataFrame
        One row per file with columns ``id`` (file name), ``text`` (file
        content decoded as UTF-8, undecodable bytes dropped) and ``label``
        (the sub-directory the file was found in). A file name present under
        several labels yields one row per label.
    """
    if label_names is None:
        label_names = labels

    records = []
    for label in tqdm(label_names):
        label = str(label)
        label_dir = os.path.join(path, label)
        # os.listdir returns entries in arbitrary order; sort so the row order
        # is reproducible across machines and runs.
        for item in sorted(os.listdir(label_dir)):
            # Open relative to the directory that was just listed, and close
            # the handle as soon as the content has been read.
            with open(os.path.join(label_dir, item), "r",
                      encoding='utf-8', errors='ignore') as handle:
                records.append([item, handle.read(), label])

    # Build the frame straight from the records; a NumPy string array would
    # pad every cell to the length of the longest document.
    return pd.DataFrame(records, columns=["id", "text", "label"])

DATA_DIR = "../data"

# Locations of everything this cell reads or writes for the dataset.
DATASET_DIR = f"{DATA_DIR}/twentynewsgroup"
TRAIN_CSV = f"{DATASET_DIR}/twentynewsgroup_train.csv"
TEST_CSV = f"{DATASET_DIR}/twentynewsgroup_test.csv"
LABELS_NPY = f"{DATASET_DIR}/labels.npy"
ARCHIVE_PATH = f"{DATASET_DIR}/20news-bydate.tar.gz"


def _to_multi_hot(df, label_names):
    """Collapse a one-row-per-(id, label) frame into one row per id.

    ``df`` must have ``id``, ``text`` and ``label`` columns. The result has
    columns ``id``, ``text`` and ``labels``, where ``labels`` is a list of
    0/1 ints with one entry per name in ``label_names`` (same order), set to
    1 when the id appeared under that label. Back quotes in ``text`` are
    replaced by single quotes. The first row of each id is the one kept.
    """
    label_names = [str(name) for name in label_names]
    # One grouping pass gathers every label an id appears under; a per-row
    # scan of the whole frame would be quadratic in the number of documents.
    assigned_by_id = df.groupby("id")["label"].agg(list)

    out = df.drop_duplicates("id").drop(columns="label").reset_index(drop=True)
    out["labels"] = out["id"].map(assigned_by_id).map(
        lambda assigned: [1 if name in assigned else 0 for name in label_names]
    )
    out["text"] = out["text"].str.replace("`", "'", regex=False)
    return out


# The cache is usable only if every artifact is present. Testing the directory
# alone is not enough: it is created before the download starts, so an
# interrupted build would leave it behind without the files.
if all(os.path.exists(artifact) for artifact in (TRAIN_CSV, TEST_CSV, LABELS_NPY)):
    #--------load dataframe
    print("[  dataset  ] twentynewsgroup directory already exists, loading...")
    train_val_df = pd.read_csv(TRAIN_CSV, encoding='utf-8')
    # The label vectors were written as their list repr; parse them back.
    train_val_df['labels'] = train_val_df['labels'].apply(ast.literal_eval)

    test_df = pd.read_csv(TEST_CSV, encoding='utf-8')
    test_df['labels'] = test_df['labels'].apply(ast.literal_eval)

    # #--------loading and storing labels to mlflow
    labels = np.load(LABELS_NPY)
    num_labels = len(labels)
    # mlflowLogger.store_param("col_names", labels)
    # mlflowLogger.store_param("num_labels", num_labels)
    print("[  dataset  ] loaded!")
else:
    os.makedirs(DATASET_DIR, exist_ok=True)

    print("[  dataset  ] twentynewsgroup dataset is being downloaded...")
    download_status = os.system(f'wget -N -P {DATASET_DIR} http://qwone.com/~jason/20Newsgroups/20news-bydate.tar.gz')
    if download_status != 0:
        raise RuntimeError(
            f"wget exited with status {download_status}; "
            "the twentynewsgroup archive was not downloaded"
        )

    # data/twentynewsgroup/20news-bydate.tar.gz
    print("[  dataset  ] Extracting twentynewsgroup dataset...")
    Archive(ARCHIVE_PATH).extractall(f"{DATASET_DIR}/")
    os.remove(ARCHIVE_PATH)

    #------label names are the sub-directory names of the train split
    labels = os.listdir(f"{DATASET_DIR}/20news-bydate-train")
    if ".DS_Store" in labels:
        labels.remove(".DS_Store")
    num_labels = len(labels)
    # mlflowLogger.store_param("col_names", labels)
    # mlflowLogger.store_param("num_labels", num_labels)

    #------convert the files to a dataframe
    train_val_df = make_csv(f"{DATASET_DIR}/20news-bydate-train")
    test_df = make_csv(f"{DATASET_DIR}/20news-bydate-test")

    # The raw files are in memory now; drop the extracted directories.
    os.system(f"rm -r {DATASET_DIR}/20news-bydate-*")

    #------one row per document, labels as a 0/1 vector
    print("\n[  dataset  ] twentynewsgroup preprocessing of labels begin...")
    train_val_df = _to_multi_hot(train_val_df, labels)
    test_df = _to_multi_hot(test_df, labels)

    #-------save the dataframes, then the label names
    train_val_df.to_csv(TRAIN_CSV, index=False)
    test_df.to_csv(TEST_CSV, index=False)
    # Written last so all three cache artifacts exist only after a full build.
    np.save(LABELS_NPY, labels)


[  dataset  ] twentynewsgroup directory already exists, loading...
[  dataset  ] loaded!


In [1]:
# Standalone cell: prints a fixed greeting and uses nothing defined in the cell above.
print("Hello world")

Hello world
