In [20]:
import numpy as np
import pandas as pd

In [2]:
# First sample: seven integer observations, later compared against y.
s = pd.Series(
    [43, 32, 56, 124, 12, 76, 88]
)

In [3]:
s

0     43
1     32
2     56
3    124
4     12
5     76
6     88
dtype: int64

In [24]:
# frac=1 draws as many rows as s has; replace=True lets the same row be picked more than once.
s.sample(frac = 1, replace = True)

1     32
5     76
3    124
5     76
1     32
3    124
6     88
dtype: int64

In [25]:
# frac=2 draws twice as many rows as s has, which is only possible because replace=True.
s.sample(frac = 2, replace = True)

0     43
4     12
2     56
6     88
2     56
0     43
5     76
0     43
3    124
4     12
5     76
4     12
0     43
4     12
dtype: int64

In [4]:
# Second sample: seven integer observations, compared against s below.
y = pd.Series(
    [76, 56, 89, 54, 67, 91, 83]
)

In [5]:
y

0    76
1    56
2    89
3    54
4    67
5    91
6    83
dtype: int64

In [6]:
# Observed difference in sample means (y minus s) on the original data.
y.mean() - s.mean()

12.142857142857139

**Suppose we have two datasets from the same company, collected at different times, and we want to analyze the differences between them, such as the change in mean profit or a decrease in sales. The problem is that the datasets are small, yet we still want to do statistical analysis on them. One thing we can do is bootstrapping: we repeatedly resample the observations we already have, with replacement, to generate many new versions of each dataset. This gives us many more samples to analyze, although every resample is built only from the original observations and adds no new information. After bootstrapping, we can compute confidence intervals to judge whether our hypothesis is supported by the data.**

In [7]:
# One bootstrap resample of s: same length as s, drawn with replacement.
boot_s = s.sample(frac = 1, replace = True)

In [8]:
s, boot_s

(0     43
 1     32
 2     56
 3    124
 4     12
 5     76
 6     88
 dtype: int64,
 5     76
 3    124
 6     88
 1     32
 6     88
 2     56
 4     12
 dtype: int64)

In [15]:
# One bootstrap resample of y (same length, with replacement), shown next to the original.
boot_y = y.sample(frac = 1, replace = True)
y, boot_y

(0    76
 1    56
 2    89
 3    54
 4    67
 5    91
 6    83
 dtype: int64,
 1    56
 4    67
 2    89
 6    83
 2    89
 4    67
 4    67
 dtype: int64)

**Basically, using the `sample` method as shown above, we can resample our data with or without replacement (bootstrapping uses sampling with replacement). Each resample gives us a different version of the data, generated from the original one. If we repeat this operation thousands of times, we obtain a large collection of datasets that look like the original. This is a trick statisticians use to find confidence intervals for their hypotheses. Let's go further: create 1000 bootstrap resamples that look like our original data, compute the difference in means for each one, and then calculate the confidence intervals.**

In [16]:
# Accumulator for the bootstrapped differences in means (one entry per resample).
boot_mean_diff = []

In [17]:
# Bootstrap the difference in means (y minus s).
#
# - The list is (re)created in this cell so that re-running the cell always
#   yields exactly N_BOOT values instead of appending another batch to the
#   results of a previous run.
# - A single seeded RandomState drives every draw, so the resamples (and any
#   statistics computed from them) are reproducible across kernel restarts.
N_BOOT = 1000  # number of bootstrap resamples
SEED = 42      # fixed seed for reproducibility
rng = np.random.RandomState(SEED)

boot_mean_diff = []
for i in range(N_BOOT):
    # Resample each group independently: same size as the original, with replacement.
    boot_s = s.sample(frac=1, replace=True, random_state=rng)
    boot_y = y.sample(frac=1, replace=True, random_state=rng)
    boot_mean_diff.append(boot_y.mean() - boot_s.mean())

In [18]:
boot_mean_diff

[25.714285714285722,
 17.285714285714285,
 18.285714285714292,
 18.14285714285714,
 5.142857142857139,
 -26.714285714285708,
 17.714285714285715,
 12.142857142857153,
 18.999999999999993,
 -16.428571428571416,
 21.999999999999993,
 12.57142857142857,
 15.714285714285708,
 34.14285714285714,
 14.714285714285708,
 4.285714285714292,
 -6.428571428571431,
 8.0,
 -2.857142857142861,
 10.0,
 38.71428571428571,
 3.4285714285714306,
 3.0,
 8.857142857142861,
 -4.142857142857153,
 14.285714285714292,
 37.42857142857143,
 -8.714285714285722,
 6.285714285714292,
 -4.571428571428569,
 20.42857142857143,
 4.0,
 15.57142857142857,
 12.0,
 -6.571428571428569,
 17.857142857142854,
 9.857142857142854,
 0.5714285714285694,
 23.14285714285714,
 6.857142857142861,
 11.142857142857139,
 15.857142857142861,
 16.42857142857143,
 -3.2857142857142776,
 -5.0,
 -3.857142857142861,
 25.142857142857146,
 4.857142857142847,
 -9.0,
 13.285714285714285,
 -10.57142857142857,
 26.285714285714285,
 -5.857142857142861,
 

In [19]:
# Wrap the list of bootstrapped differences in a Series so pandas' mean() and quantile() can be used on it.
diff = pd.Series(boot_mean_diff)

In [20]:
len(diff)

1000

In [21]:
diff.mean()

12.259857142857145

**Let's find the 95% and 50% confidence intervals of the bootstrapped data.**

In [24]:
# The 2.5th and 97.5th percentiles bound the central 95% of the bootstrapped differences.
diff.quantile([0.025, 0.975])

0.025   -14.725000
0.975    38.717857
dtype: float64

In [25]:
# The 25th and 75th percentiles bound the central 50% of the bootstrapped differences.
diff.quantile([0.25,0.75])

0.25     2.285714
0.75    22.142857
dtype: float64

### Bootstrapping using Numpy

In [1]:
import numpy as np

In [2]:
# Source data for the NumPy bootstrap examples: ten integer observations.
my_array = np.array(
    [87, 98, 4, 56, 24, 3, 78, 55, 25, 9]
)

In [3]:
my_array

array([87, 98,  4, 56, 24,  3, 78, 55, 25,  9])

In [4]:
# Resample of my_array with the same number of elements as the original.
# `replace` is left at its default; the repeated values in the output show the draws are with replacement.
boot_array = np.random.choice(my_array, size = len(my_array))

In [5]:
boot_array

array([ 3, 98,  9,  3, 25, 24, 25, 87,  3,  4])

In [6]:
# A second resample drawn the same way; it differs from boot_array because each call draws new random values.
boot_array2 = np.random.choice(my_array, size = len(my_array))

In [7]:
boot_array2

array([55, 98, 55,  9, 87, 24, 78,  3, 56,  4])

In [12]:
# Accumulator for the resampled arrays (one array per bootstrap resample).
boot_array_list = []

In [13]:
# Build N_BOOT_ARRAYS bootstrap resamples of my_array.
#
# - The list is (re)created in this cell so that re-running the cell always
#   yields exactly N_BOOT_ARRAYS arrays instead of appending to a previous run.
# - A locally seeded RandomState is used instead of the global np.random state,
#   so the resamples are reproducible and independent of earlier cells.
N_BOOT_ARRAYS = 10  # number of bootstrap resamples
rng = np.random.RandomState(42)

boot_array_list = []
for i in range(N_BOOT_ARRAYS):
    # replace=True is choice()'s default; it is spelled out because sampling
    # with replacement is what makes this a bootstrap resample.
    boot_arr = rng.choice(my_array, size=len(my_array), replace=True)
    boot_array_list.append(boot_arr)

In [14]:
boot_array_list

[array([55, 24, 78, 25, 25, 78, 56, 56,  9, 98]),
 array([56, 24,  3, 55,  3, 24, 56, 87, 24, 24]),
 array([ 3, 98,  9,  9, 98,  4, 78,  3, 25, 98]),
 array([87, 87,  3,  3,  3, 55, 78, 78, 78,  4]),
 array([98, 78, 78, 87, 78, 24,  3, 24,  3,  9]),
 array([98, 56,  4, 24, 56, 55,  4, 55, 87,  9]),
 array([98, 24, 24, 25, 24, 24, 25, 24,  3, 24]),
 array([87, 78,  9, 55,  9, 78, 98, 78, 98, 56]),
 array([ 4, 87, 56,  9, 24,  9, 78, 24, 78,  3]),
 array([56, 78, 78, 55, 56, 78,  4, 56, 55, 24])]

In [15]:
# Combine the list of equal-length resamples into one 2-D array, one resample per row.
boot_arrays = np.array(boot_array_list)

In [16]:
boot_arrays

array([[55, 24, 78, 25, 25, 78, 56, 56,  9, 98],
       [56, 24,  3, 55,  3, 24, 56, 87, 24, 24],
       [ 3, 98,  9,  9, 98,  4, 78,  3, 25, 98],
       [87, 87,  3,  3,  3, 55, 78, 78, 78,  4],
       [98, 78, 78, 87, 78, 24,  3, 24,  3,  9],
       [98, 56,  4, 24, 56, 55,  4, 55, 87,  9],
       [98, 24, 24, 25, 24, 24, 25, 24,  3, 24],
       [87, 78,  9, 55,  9, 78, 98, 78, 98, 56],
       [ 4, 87, 56,  9, 24,  9, 78, 24, 78,  3],
       [56, 78, 78, 55, 56, 78,  4, 56, 55, 24]])

In [17]:
boot_arrays.ndim

2

In [18]:
len(boot_arrays)

10

In [19]:
boot_arrays[5]

array([98, 56,  4, 24, 56, 55,  4, 55, 87,  9])