## BootStrap Example

In this example, we will perform bookstrapping technique on a synthetic dataset 'skewdata.csv'

Check if the Spark and SparkSQL context have started successfully

In [1]:
print sc
print sqlContext
print sqlCtx

<pyspark.context.SparkContext object at 0x7f02d38dc650>
<pyspark.sql.context.HiveContext object at 0x7f02d38bce90>
<pyspark.sql.context.HiveContext object at 0x7f02d38bce90>


Load the dataset

In [2]:
data_df = (sqlContext.read
                  .format('com.databricks.spark.csv')
                  .option("header", "true") # Use first line of all files as header
                  .option("inferSchema", "true") # Automatically infer data types
                  .load("skewdata.csv")
                  )

In [3]:
data_df.show()

+-----------+
|     values|
+-----------+
|81.37291811|
|25.70097086|
|4.942646012|
|43.02085256|
|81.69058902|
|51.19523649|
|55.65990905|
|15.15315474|
|38.74578007|
|12.61038468|
|22.41509375|
| 18.3557207|
|38.08150137|
|48.17113476|
|18.46272527|
|44.64225129|
|25.39108197|
|20.41087394|
|15.77818657|
|19.35148454|
+-----------+
only showing top 20 rows



Define a function to find the confidence interval

In [4]:
import numpy as np
import pandas as pd
from pyspark.sql.types import *
from pyspark.sql.functions import *

Calculate the mean of a column in Spark DataFrame 
```
data_df.selectExpr("avg(values) as mean").collect()[0].asDict().get('mean')
```
Alternative way to calculate the mean
```
data_df.select(avg("values")).collect()[0].asDict().get('avg(values)')
```

In [5]:
## Function to get confidence interval

def getConfidenceInterval(inputDataFrame,num_of_samples, left_quantile_fraction, right_quantile_fraction):
    #Simulate by sampling and calculating averages for each subsamples
    sample_means = np.empty([num_of_samples])
    for n in range(0,num_of_samples):
        sample_means[n] = (inputDataFrame.sample(withReplacement = True, fraction=1.0)
                   .selectExpr("avg(values) as mean")
                   .collect()[0]
                   .asDict()
                   .get('mean'))
            
    ## Sort the means
    sample_means.sort()
    
    ## Create a Pandas Dataframe from the numpy array
    sampleMeans_local_df = pd.DataFrame(sample_means)
    
    ## Create a Spark Dataframe from the pandas dataframe
    fields = [StructField("mean_values", DoubleType(), True)]
    schema = StructType(fields)
    sampleMeans_df = sqlContext.createDataFrame(sampleMeans_local_df, schema)
    
    ## Calculate the left_quantile and right_quantiles 
    sqlContext.registerDataFrameAsTable(sampleMeans_df, 'Guru_SampleMeansTable')
    quantiles_df = sqlContext.sql("select percentile(cast(mean_values as bigint),"
                                  "array("+str(left_quantile_fraction)+","+str(right_quantile_fraction)+")) as "
                                  "percentiles from Guru_SampleMeansTable")
    return quantiles_df

In [6]:
## Get 95% confidence interval in a two-tailed hypothesis testing
quantiles_df = getConfidenceInterval(data_df, 1000, 0.025, 0.975)

In [7]:
## We can now look at these percentiles and determine the critical region of sampling distribution
quantiles_df.show()

+------------+
| percentiles|
+------------+
|[24.0, 37.0]|
+------------+

