### Line Data Sampling
Create samples from a line-level dataset produced by _DataPreprocessing.ipynb_.

*Please note:* This notebook samples individual lines. To obtain all lines of the sampled functions, first sample at function level (_DataSamplingFunction.ipynb_) and then transform the result to line level (_DataPreprocessing.ipynb_).

In [35]:
# Path to the line-level dataset CSV that all sampling cells below draw from.
# The file must be in the format written by process_line_level_data.py.
input_file = './big-vul_dataset/line_all.csv'

In [36]:
import pandas as pd
import numpy as np
import math
import csv

In [37]:
# Read the complete line-level dataset into a DataFrame.
# keep_default_na=False keeps the literal text NULL (even when quoted) as a
# plain string; with the pandas default it would be read as NaN.
df = pd.read_csv(
    input_file,
    skipinitialspace=True,
    low_memory=True,
    keep_default_na=False,
)

In [38]:
# Split the dataset by label: rows whose 'vul' value is 1 are the vulnerable
# lines, rows whose 'vul' value is 0 are the non-vulnerable lines.
df_with_vul = df[df['vul'].eq(1)]
df_without_vul = df[df['vul'].eq(0)]

In [39]:
# Draw 10% of the vulnerable lines and 10% of the non-vulnerable lines
# separately, so the sample keeps the original vul / non-vul ratio.
sample_df_with_vul = df_with_vul.sample(frac=0.1)
sample_df_without_vul = df_without_vul.sample(frac=0.1)

# Stack both parts with a fresh running index and write the selected columns
# to disk with every field quoted.
sample_df_all = pd.concat(
    [sample_df_with_vul, sample_df_without_vul],
    ignore_index=True,
)
sample_df_all.loc[:, ['func_id', 'line', 'vul', 'idx_in_func']].to_csv(
    './big-vul_dataset/line_sample_10p_original_ratio.csv',
    encoding='utf-8',
    quoting=csv.QUOTE_ALL,
)

In [40]:
# Build a balanced dataset: keep every vulnerable entry and draw a random
# sample of non-vulnerable entries of exactly the same size.
sample_df_without_vul = df_without_vul.sample(n=len(df_with_vul))

# Stack both parts with a fresh running index and write the selected columns
# to disk with every field quoted.
sample_df_all = pd.concat(
    [df_with_vul, sample_df_without_vul],
    ignore_index=True,
)
sample_df_all.loc[:, ['func_id', 'line', 'vul', 'idx_in_func']].to_csv(
    './big-vul_dataset/line_sample_all_balanced_ratio.csv',
    encoding='utf-8',
    quoting=csv.QUOTE_ALL,
)

In [41]:
# Build a smaller balanced dataset: draw 20% of the vulnerable entries, then
# draw the same number of non-vulnerable entries.
sample_df_with_vul = df_with_vul.sample(frac=0.2)
sample_df_without_vul = df_without_vul.sample(n=len(sample_df_with_vul))

# Stack both parts with a fresh running index and write the selected columns
# to disk with every field quoted.
sample_df_all = pd.concat(
    [sample_df_with_vul, sample_df_without_vul],
    ignore_index=True,
)
sample_df_all.loc[:, ['func_id', 'line', 'vul', 'idx_in_func']].to_csv(
    './big-vul_dataset/line_sample_20p_balanced_ratio.csv',
    encoding='utf-8',
    quoting=csv.QUOTE_ALL,
)