### Function Data Sampling
Draw samples from the dataset produced by _process_line_level_data.py_.

In [None]:
import pandas as pd
import numpy as np
import math

In [None]:
# Load the processed dataset into memory; whitespace following each
# delimiter is skipped while parsing.
df = pd.read_csv(
    './big-vul_dataset/processed_data.csv',
    skipinitialspace=True,
    low_memory=True,
)

In [None]:
# Total number of rows in the loaded dataset.
df.shape[0]

In [None]:
# Keep the rows that carry a flaw line, i.e. whose 'flaw_line' value is not NaN.
# notna() states this directly instead of relying on the self-comparison
# trick (x == x is False only when x is NaN).
df_with_vul = df.loc[df['flaw_line'].notna()]
len(df_with_vul)

# Also noteworthy (open question):
# Why does this yield 8243 results while 10900 entries in the original dataset have vul = 1 ?

In [None]:
# Keep the rows without a flaw line, i.e. whose 'flaw_line' value is NaN.
# isna() states this directly instead of relying on the self-comparison
# trick (x != x is True only when x is NaN).
df_without_vul = df.loc[df['flaw_line'].isna()]
len(df_without_vul)

In [None]:
# Sanity check: the vulnerable and non-vulnerable subsets together must
# account for every row of the full dataset.
assert len(df_with_vul) + len(df_without_vul) == len(df), (
    'Some entries were lost during vul-filtering. Please check code!'
)

In [None]:
# Preview the last ten rows of the vulnerable subset.
df_with_vul.iloc[-10:]

In [None]:
# Sample 10 percent of the original dataset while keeping the original ratio of vul to non-vul.
# Both subsets are sampled with the same fraction, so their proportion is preserved.
# A fixed random_state makes the exported sample reproducible across notebook runs.
sample_df_with_vul = df_with_vul.sample(frac=0.1, random_state=42)
sample_df_without_vul = df_without_vul.sample(frac=0.1, random_state=42)

# ignore_index=True renumbers the combined rows 0..n-1; to_csv's default
# index=True writes that row index as an unnamed first column.
sample_df_all = pd.concat([sample_df_with_vul, sample_df_without_vul], ignore_index=True)
sample_df_all.to_csv('./big-vul_dataset/sample_10p_original_ratio.csv', encoding='utf-8')

In [None]:
# Create a balanced dataset between vul and non-vul by keeping all original vulnerable entries and
# sampling the same number of non-vulnerable entries.
# A fixed random_state makes the exported sample reproducible across notebook runs.
# NOTE: sample() draws without replacement by default, so this raises a ValueError
# if there are fewer non-vulnerable than vulnerable rows.
sample_df_without_vul = df_without_vul.sample(n=len(df_with_vul), random_state=42)

# ignore_index=True renumbers the combined rows 0..n-1; to_csv's default
# index=True writes that row index as an unnamed first column.
sample_df_all = pd.concat([df_with_vul, sample_df_without_vul], ignore_index=True)
sample_df_all.to_csv('./big-vul_dataset/sample_all_balanced_ratio.csv', encoding='utf-8')

In [None]:
# Create a balanced dataset between vul and non-vul by sampling 20% of vulnerable entries and
# adding the same number of non-vulnerable entries.
# A fixed random_state makes the exported sample reproducible across notebook runs.
sample_df_with_vul = df_with_vul.sample(frac=0.2, random_state=42)
sample_df_without_vul = df_without_vul.sample(n=len(sample_df_with_vul), random_state=42)

# ignore_index=True renumbers the combined rows 0..n-1; to_csv's default
# index=True writes that row index as an unnamed first column.
sample_df_all = pd.concat([sample_df_with_vul, sample_df_without_vul], ignore_index=True)
sample_df_all.to_csv('./big-vul_dataset/sample_20p_balanced_ratio.csv', encoding='utf-8')