# Sampling Methods

> This module helps to sample a statistical distribution for further comparison.

In [None]:
#| default_exp sampling

In [None]:
#| hide
import pandas as pd
import os
import time
import torch
import gc
import seaborn as sns
from scipy import stats
import matplotlib.pyplot as plt

In [None]:
#| export
import logging
# Configure logging when this module is loaded: records at INFO level and
# above are appended (filemode='a') to "logger_sampling.txt", formatted as
# "<timestamp> : <level> : <message>".
logging.basicConfig(
    filename="logger_sampling.txt",
    filemode='a',
    format='%(asctime)s : %(levelname)s : %(message)s', 
    level=logging.INFO
    )

In [None]:
#| export
import numpy as np
from statistics import NormalDist

## Bootstrapping
Create a bootstrap sample given data and a function, for instance a bootstrap sample of means or medians. Each bootstrap replicate is as long as the original data, and any observation can be chosen more than once (resampling with replacement: `np.random.choice`).

In [None]:
#| export
def bootstrapping( np_data, np_func, size, flag_clean_nan = False ):
    """
    Build bootstrap replicates of a statistic by resampling with replacement.

    @np_data: one-dimensional array-like of observations (list, tuple or numpy array)
    @np_func: numpy function for reducing each resample (e.g., median, mean, max)
    @size: number of bootstrapping samples
    @flag_clean_nan: flag to eliminate NaN values in the data before resampling
    @return: numpy array holding the `size` bootstrap replicates of np_func
    """
    #Coerce to an ndarray so plain lists/tuples also support the boolean mask below
    np_data = np.asarray( np_data )

    #Cleaning NaNs
    if flag_clean_nan:
        np_data = np_data[ ~np.isnan( np_data ) ]

    #Creating the bootstrap replicates as long as the original data size
    #(np.random.choice resamples with replacement by default)
    n_obs = len( np_data )
    bootstrap_repl = [ np_func( np.random.choice( np_data, size=n_obs ) ) for _ in range( size ) ]

    #Lazy %-style arguments: the message text is only built if the record is emitted
    logging.info( "Empirical Estimate: %s", np_func( np_data ) ) #Empirical Mean,Median,Max, etc
    logging.info( "Bootstrapped Estimate: %s", np_func( bootstrap_repl ) ) #Bootstrapped Mean,Median,Max, etc

    return np.array( bootstrap_repl )

### Confidence Intervals
To compute a confidence interval from sample data, the z-score is used. Here we assume that the sample size is big enough to use the standard normal distribution rather than Student's t distribution to compute the z value (Ref: [link](https://stackoverflow.com/questions/15033511/compute-a-confidence-interval-from-sample-data)).

In [None]:
#| export
def confidence_intervals_large_samples(data, confidence=0.95):
    """
    Normal-approximation (z-score) confidence interval for the mean of `data`.

    @data: sized collection of numeric samples (NormalDist.from_samples needs at least two)
    @confidence: confidence level, strictly between 0 and 1 (e.g., 0.95)
    @return: tuple (lowerbound, upperbound, h-value), where h is the interval half-width
    """
    dist = NormalDist.from_samples( data )
    #Two-sided critical value of the standard normal distribution
    z = NormalDist().inv_cdf((1 + confidence) / 2.)
    #from_samples() already yields the Bessel-corrected sample stdev, so the
    #standard error of the mean is stdev / sqrt(n), not stdev / sqrt(n - 1)
    h = dist.stdev * z / (len(data) ** .5)
    return dist.mean - h, dist.mean + h, h

### Standard Error
This particular estimate is for computing the error of a metric measured through bootstrapping.

In [None]:
#| export
def standard_error(bootstrapped_data):
    """
    @bootstrapped_data: array-like of bootstrap replicates
    @return: standard deviation of the replicates, as computed by np.std
    """
    spread = np.std( bootstrapped_data )
    return spread

# Testing

In [None]:
# Load the Galeras code-completion dataset into a DataFrame.
# NOTE(review): the path is hard-coded and environment-specific; adjust it before re-running.
galeras_pd = pd.read_json( '/workspaces/StatisticalFace/semeru-datasets/semeru/galeras/galeras_se_tasks_dataset_3k_deduplicated/code_completion_dataset_3k_deduped.json' )

In [None]:
galeras_pd.describe()

Unnamed: 0,id,n_ast_errors,ast_levels,n_whitespaces,n_words,vocab_size,complexity,nloc,token_counts,n_ast_nodes,n_identifiers
count,2931.0,2931.0,2931.0,2931.0,2931.0,2931.0,2931.0,2931.0,2931.0,2931.0,2931.0
mean,156450.464347,0.101672,12.220744,220.087001,60.243262,40.125554,3.053907,17.722279,123.120778,202.66769,17.425793
std,92465.983718,0.304518,3.02595,443.775386,86.843623,40.784097,5.197449,24.543615,158.555402,242.136645,12.794952
min,280.0,0.0,4.0,4.0,3.0,3.0,1.0,2.0,7.0,10.0,1.0
25%,75496.5,0.0,10.0,52.0,19.0,17.0,1.0,6.0,42.5,73.0,9.0
50%,154885.0,0.0,12.0,105.0,34.0,28.0,1.0,11.0,76.0,128.0,14.0
75%,241670.0,0.0,14.0,232.5,71.0,50.5,3.0,20.0,147.0,242.0,22.0
max,338614.0,2.0,28.0,13912.0,1945.0,677.0,151.0,546.0,3772.0,3598.0,157.0


In [None]:
# Draw 500 bootstrap replicates of the median of the `complexity` column (NaN cleaning disabled)
complexity_median_np = bootstrapping( galeras_pd.complexity.values, np_func=np.median, size=500, flag_clean_nan = False ) #Bootstrapped Complexity

In [None]:
# Draw 500 bootstrap replicates of the mean of the `complexity` column (NaN cleaning disabled)
complexity_mean_np = bootstrapping( galeras_pd.complexity.values, np_func=np.mean, size=500, flag_clean_nan = False ) #Bootstrapped Complexity

In [None]:
#Bootstrapped Estimates: median of the bootstrapped medians, mean of the bootstrapped means
np.median( complexity_median_np ) , np.mean( complexity_mean_np )

(1.0, 3.049542135789833)

In [None]:
#Standard Errors of the bootstrapped median and the bootstrapped mean of complexity
standard_error(complexity_median_np), standard_error(complexity_mean_np)

(0.4730285403651666, 0.0945633551509999)

In [None]:
## Confidence Interval Estimates
# 95% interval computed over the bootstrapped medians of complexity
test_confidence_cyclo_median = confidence_intervals_large_samples(data = complexity_median_np, confidence=0.95)

In [None]:
test_confidence_cyclo_median

(1.296454891987364, 1.3795451080126362, 0.04154510801263605)

In [None]:
# 95% interval computed over the bootstrapped means of complexity
test_confidence_cyclo_mean = confidence_intervals_large_samples(data = complexity_mean_np, confidence=0.95)

In [None]:
test_confidence_cyclo_mean

(3.041236833955925, 3.05784743762374, 0.008305301833907814)

In [None]:
! nbdev_export