# Data extraction

## Undersample using ID and labels

In [None]:
import numpy as np
import pandas as pd

In [None]:
# Load the label table (one row per sample: Id and Class) from the working directory.
train_labels_df = pd.read_csv(filepath_or_buffer="trainLabels.csv")

In [None]:
# Display the loaded label table to check its columns (Id, Class) and row count.
train_labels_df

Unnamed: 0,Id,Class
0,01kcPWA9K2BOxQeS5Rju,1
1,04EjIdbPV5e1XroFOpiN,1
2,05EeG39MTRrI6VY21DPd,1
3,05rJTUWYAKNegBk2wE8X,1
4,0AnoOZDNbPXIr2MRBSCJ,1
...,...,...
10863,KFrZ0Lop1WDGwUtkusCi,9
10864,kg24YRJTB8DNdKMXpwOH,9
10865,kG29BLiFYPgWtpb350sO,9
10866,kGITL4OJxYMWEQ1bKBiP,9


In [None]:
# Count the rows per Class to see how imbalanced the labels are before undersampling.
train_labels_df["Class"].value_counts()

Class
3    2942
2    2478
1    1541
8    1228
9    1013
6     751
4     475
7     398
5      42
Name: count, dtype: int64

In [None]:
# Undersample: draw 200 ids from every class except class 5.
# Class 5 is skipped, presumably because it has only 42 rows (fewer than the 200 drawn per class).
# random_state is fixed so the same ids are selected on every run.
sampled_ids = (
    train_labels_df.loc[train_labels_df["Class"] != 5]
    .groupby("Class")
    .sample(n=200, random_state=42)["Id"]
    .tolist()
)

In [None]:
# Path of the .bytes file for each sampled id, in the same order as sampled_ids.
sampled_file_paths_bytes = [f"train/{sample_id}.bytes" for sample_id in sampled_ids]

In [None]:
# Path of the .asm file for each sampled id, in the same order as sampled_ids.
sampled_file_paths_asm = [f"train/{sample_id}.asm" for sample_id in sampled_ids]

## Extract files based on sample

In [None]:
import py7zr
import time

In [None]:
# Open the training archive read-only; the extraction cell below uses this handle.
train_file = py7zr.SevenZipFile("train.7z", mode="r")

In [None]:
# Extract the sampled .bytes/.asm members from the archive and report the elapsed time.
# perf_counter() is a monotonic clock meant for measuring intervals, so the
# reported duration is not affected by system clock adjustments.
start = time.perf_counter()

# Using the archive as a context manager closes it as soon as extraction
# finishes (or fails), so the file handle is not left open for the rest of the
# session. Re-run the cell that opens "train.7z" before running this cell again.
with train_file:
    # only extract files needed
    train_file.extract(targets=sampled_file_paths_bytes + sampled_file_paths_asm)

end = time.perf_counter()

# Split the whole number of elapsed seconds into minutes and leftover seconds.
elapsed_minutes, elapsed_seconds = divmod(int(end - start), 60)
print("Time taken: {}m {}s".format(elapsed_minutes, elapsed_seconds))

Time taken: 8 m 36 s


## Read extracted files

In [None]:
# example using one file
# Each line is split on whitespace and its first token is dropped (presumably
# the address/offset column — confirm against the file format); the remaining
# tokens are appended in file order.
flattened_data = []
with open(sampled_file_paths_bytes[0]) as byte_file:
    # Iterate over the file object directly instead of calling readlines(),
    # so the whole file is not first loaded into a separate list of lines.
    for line in byte_file:
        flattened_data.extend(line.split()[1:])

In [None]:
# Peek at ten consecutive tokens (positions 1000-1009) of the flattened list.
flattened_data[1000:1010]

## Train Test Split of IDs

To run this section, first run the entire 'Undersample using ID and labels' section. You do not need to run anything else (in particular, the file extraction): it is not required for this part and takes a long time to run.

In [None]:
# split train and test proportionally for each of the class
# Restrict the label table to the undersampled ids once, so the test draw and
# its complement are both computed from the same frame.
sampled_labels_df = train_labels_df.loc[train_labels_df["Id"].isin(sampled_ids)]

# Hold out 20% of the rows of every class; the fixed seed keeps the split reproducible.
test_df = sampled_labels_df.groupby("Class").sample(frac=0.2, random_state=42)

# Training rows are the sampled rows whose Id was not drawn into the test set.
# The mask is built from the frame being indexed, so it does not depend on
# implicit index alignment with the full label table.
train_df = sampled_labels_df.loc[~sampled_labels_df["Id"].isin(test_df["Id"])]

In [None]:
# Write the test and train label tables to two CSV files in the working directory.
# NOTE(review): `index` is left at the pandas default, so the row index is
# presumably written as an unnamed leading column — confirm that downstream
# readers expect it.
test_df.to_csv(path_or_buf="test_labels.csv")
train_df.to_csv(path_or_buf="train_labels.csv")