Name: Muley, Tushar
Assignment: 9-1
Date: 25 October 2020

In [1]:
from __future__ import print_function, division

%matplotlib inline

import numpy as np

import random

import thinkstats2
import thinkplot

In [2]:
class HypothesisTest(object):

    def __init__(self, data):
        self.data = data
        self.MakeModel()
        self.actual = self.TestStatistic(data)

    def PValue(self, iters=1000):
        self.test_stats = [self.TestStatistic(self.RunModel()) 
                           for _ in range(iters)]

        count = sum(1 for x in self.test_stats if x >= self.actual)
        return count / iters

    def TestStatistic(self, data):
        raise UnimplementedMethodException()

    def MakeModel(self):
        pass

    def RunModel(self):
        raise UnimplementedMethodException()

In [3]:
class CoinTest(HypothesisTest):

    def TestStatistic(self, data):
        heads, tails = data
        test_stat = abs(heads - tails)
        return test_stat

    def RunModel(self):
        heads, tails = self.data
        n = heads + tails
        sample = [random.choice('HT') for _ in range(n)]
        hist = thinkstats2.Hist(sample)
        data = hist['H'], hist['T']
        return data

In [5]:
class DiffMeansPermute(thinkstats2.HypothesisTest):

    def TestStatistic(self, data):
        group1, group2 = data
        test_stat = abs(group1.mean() - group2.mean())
        return test_stat

    def MakeModel(self):
        group1, group2 = self.data
        self.n, self.m = len(group1), len(group2)
        self.pool = np.hstack((group1, group2))

    def RunModel(self):
        np.random.shuffle(self.pool)
        data = self.pool[:self.n], self.pool[self.n:]
        return data

In [6]:
import first

live, firsts, others = first.MakeFrames()
data = firsts.prglngth.values, others.prglngth.values

In [7]:
#Check data
live

Unnamed: 0,caseid,pregordr,howpreg_n,howpreg_p,moscurrp,nowprgdk,pregend1,pregend2,nbrnaliv,multbrth,...,laborfor_i,religion_i,metro_i,basewgt,adj_mod_basewgt,finalwgt,secu_p,sest,cmintvw,totalwgt_lb
0,1,1,,,,,6.0,,1.0,,...,0,0,0,3410.389399,3869.349602,6448.271112,2,9,,8.8125
1,1,2,,,,,6.0,,1.0,,...,0,0,0,3410.389399,3869.349602,6448.271112,2,9,,7.8750
2,2,1,,,,,5.0,,3.0,5.0,...,0,0,0,7226.301740,8567.549110,12999.542264,2,12,,9.1250
3,2,2,,,,,6.0,,1.0,,...,0,0,0,7226.301740,8567.549110,12999.542264,2,12,,7.0000
4,2,3,,,,,6.0,,1.0,,...,0,0,0,7226.301740,8567.549110,12999.542264,2,12,,6.1875
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13581,12568,2,,,,,5.0,,1.0,,...,0,0,0,2734.687353,4258.980140,7772.212858,2,28,,6.3750
13584,12569,2,,,,,6.0,,1.0,,...,0,0,0,2580.967613,2925.167116,5075.164946,2,61,,6.3750
13588,12571,1,,,,,6.0,,1.0,,...,0,0,0,4670.540953,5795.692880,6269.200989,1,78,,6.1875
13591,12571,4,,,,,6.0,,1.0,,...,0,0,0,4670.540953,5795.692880,6269.200989,1,78,,7.5000


In [8]:
#Check data
firsts

Unnamed: 0,caseid,pregordr,howpreg_n,howpreg_p,moscurrp,nowprgdk,pregend1,pregend2,nbrnaliv,multbrth,...,laborfor_i,religion_i,metro_i,basewgt,adj_mod_basewgt,finalwgt,secu_p,sest,cmintvw,totalwgt_lb
0,1,1,,,,,6.0,,1.0,,...,0,0,0,3410.389399,3869.349602,6448.271112,2,9,,8.8125
2,2,1,,,,,5.0,,3.0,5.0,...,0,0,0,7226.301740,8567.549110,12999.542264,2,12,,9.1250
5,6,1,,,,,6.0,,1.0,,...,0,0,0,4870.926435,5325.196999,8874.440799,1,23,,8.5625
8,7,1,,,,,5.0,,1.0,,...,0,0,0,3409.579565,3787.539000,6911.879921,2,14,,7.5625
10,12,1,,,,,5.0,,1.0,,...,0,0,0,3612.781968,4146.013572,6909.331618,1,31,,7.8125
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13576,12565,1,,,,,6.0,,1.0,,...,0,0,0,3195.641221,3834.241709,6652.409365,1,78,,6.4375
13578,12566,1,,,,,6.0,,1.0,,...,0,0,0,2080.317155,2422.820274,2627.548587,2,2,,6.0000
13581,12568,2,,,,,5.0,,1.0,,...,0,0,0,2734.687353,4258.980140,7772.212858,2,28,,6.3750
13584,12569,2,,,,,6.0,,1.0,,...,0,0,0,2580.967613,2925.167116,5075.164946,2,61,,6.3750


In [11]:
class DiffMeansOneSided(DiffMeansPermute):

    def TestStatistic(self, data):
        group1, group2 = data
        test_stat = group1.mean() - group2.mean()
        return test_stat

In [13]:
class DiffStdPermute(DiffMeansPermute):

    def TestStatistic(self, data):
        group1, group2 = data
        test_stat = group1.std() - group2.std()
        return test_stat

In [15]:
class CorrelationPermute(thinkstats2.HypothesisTest):

    def TestStatistic(self, data):
        xs, ys = data
        test_stat = abs(thinkstats2.Corr(xs, ys))
        return test_stat

    def RunModel(self):
        xs, ys = self.data
        xs = np.random.permutation(xs)
        return xs, ys

In [18]:
class DiceTest(thinkstats2.HypothesisTest):

    def TestStatistic(self, data):
        observed = data
        n = sum(observed)
        expected = np.ones(6) * n / 6
        test_stat = sum(abs(observed - expected))
        return test_stat

    def RunModel(self):
        n = sum(self.data)
        values = [1, 2, 3, 4, 5, 6]
        rolls = np.random.choice(values, n, replace=True)
        hist = thinkstats2.Hist(rolls)
        freqs = hist.Freqs(values)
        return freqs

In [20]:
class DiceChiTest(DiceTest):

    def TestStatistic(self, data):
        observed = data
        n = sum(observed)
        expected = np.ones(6) * n / 6
        test_stat = sum((observed - expected)**2 / expected)
        return test_stat

In [22]:
class PregLengthTest(thinkstats2.HypothesisTest):

    def MakeModel(self):
        firsts, others = self.data
        self.n = len(firsts)
        self.pool = np.hstack((firsts, others))

        pmf = thinkstats2.Pmf(self.pool)
        self.values = range(35, 44)
        self.expected_probs = np.array(pmf.Probs(self.values))

    def RunModel(self):
        np.random.shuffle(self.pool)
        data = self.pool[:self.n], self.pool[self.n:]
        return data
    
    def TestStatistic(self, data):
        firsts, others = data
        stat = self.ChiSquared(firsts) + self.ChiSquared(others)
        return stat

    def ChiSquared(self, lengths):
        hist = thinkstats2.Hist(lengths)
        observed = np.array(hist.Freqs(self.values))
        expected = self.expected_probs * len(lengths)
        stat = sum((observed - expected)**2 / expected)
        return stat

In [24]:
def FalseNegRate(data, num_runs=1000):
    """Computes the chance of a false negative based on resampling.

    data: pair of sequences
    num_runs: how many experiments to simulate

    returns: float false negative rate
    """
    group1, group2 = data
    count = 0

    for i in range(num_runs):
        sample1 = thinkstats2.Resample(group1)
        sample2 = thinkstats2.Resample(group2)
        ht = DiffMeansPermute((sample1, sample2))
        p_value = ht.PValue(iters=101)
        if p_value > 0.05:
            count += 1

    return count / num_runs

## Exercises

**Exercise:** As sample size increases, the power of a hypothesis test increases, which means it is more likely to be positive if the effect is real. Conversely, as sample size decreases, the test is less likely to be positive even if the effect is real.

To investigate this behavior, run the tests in this chapter with different subsets of the NSFG data. You can use `thinkstats2.SampleRows` to select a random subset of the rows in a DataFrame.

What happens to the p-values of these tests as sample size decreases? What is the smallest sample size that yields a positive test?

In [26]:
#What happens to the p-values of these tests as sample size decreases? 
#What is the smallest sample size that yields a positive test?

#My finds:
#Name	    Size	P1  	P2  	P3  	P4
#sample1	150 	0.067	0.598	0.4 	0.01
#sample2	300 	0.358	0.24	0.467	0.13
#sample3	500 	0.538	0.089	0.116	0.005
#sample4	1000	0.216	0.132	0.212	0.006
#sample5	2000	0.357	0.034	0.005	0.0
#sample6 	5000	0.637	0.0 	0.0 	0.0
#sample7	9148	0.163	0.0 	0.0 	0.0

# p1 = Compare pregnancy lengths
# p2 = Baby's weight 
# p3 = Test correlation
# p4 = Compare pregnancy lengths (chi-squared)


#If I understand this correctly... 
#P1 - First test Mean Pregnancy lengths goes up as sample size goes down. Seems all over the place. Sample size 150
#it is 0.067 but drops as sample size goes up until you get to sample 5. 
#It might because I have replacement to true.

#P2 - Second test of mean babies weight which also goes up as number samples goes down until sample 5. 
#Smaller size causes the mean to be exaggerated.

#P3 - Third test of correlation between mothers age and baby weight. This seems kind of all over the place. 
#It is not the same as the others. It stays 0.0 for the two largest sample sizes. Then it seem to be 
#all over the place. 

#P4 - Fourth test comparing pregency lenths. Similar to the third test is is erratic. 
#For the larger samples it remains zero. But for the smaller sample under 1000 there is a change between 150 and
#300 but between 500 and 1000 sample the change is small.

In [27]:
#What happens to the p-values of these tests as sample size decreases? 
#What is the smallest sample size that yields a positive test?

In [28]:
def SampleMyRows(df, nrows, replace=True):
    """Choose a sample of rows from a DataFrame.

    df: DataFrame
    nrows: number of rows
    replace: whether to sample with replacement

    returns: DataDf
    """
    indices = np.random.choice(df.index, nrows, replace=replace)
    sample = df.loc[indices]
    return sample

In [29]:
#Build samples:
firsts_sample1 = thinkstats2.SampleRows(firsts,150)
others_sample1 = thinkstats2.SampleRows(others,150)

firsts_sample2 = thinkstats2.SampleRows(firsts,300)
others_sample2 = thinkstats2.SampleRows(others,300)

firsts_sample3 = thinkstats2.SampleRows(firsts,500)
others_sample3 = thinkstats2.SampleRows(others,500)

firsts_sample4 = thinkstats2.SampleRows(firsts,1000)
others_sample4 = thinkstats2.SampleRows(others,1000)

firsts_sample5 = thinkstats2.SampleRows(firsts,2000)

firsts_sample7 = firsts #<--Full size
others_sample7 = others #<--Full size



In [30]:
#Copied SampleRows and changed it SampleMyRows to remove the False for replacement since I was getting and error.

others_sample5 = SampleMyRows(others,2000) #<--Replace as true

firsts_sample6 = SampleMyRows(firsts,5000) #<--Replace as true
others_sample6 = SampleMyRows(others,5000) #<--Replace as true

In [31]:
firsts_sample1

Unnamed: 0,caseid,pregordr,howpreg_n,howpreg_p,moscurrp,nowprgdk,pregend1,pregend2,nbrnaliv,multbrth,...,laborfor_i,religion_i,metro_i,basewgt,adj_mod_basewgt,finalwgt,secu_p,sest,cmintvw,totalwgt_lb
9949,9186,2,,,,,6.0,,1.0,,...,0,0,0,1914.471020,3211.192461,3482.538224,2,69,,8.2500
13439,12415,2,,,,,6.0,,1.0,,...,0,0,0,2580.625342,3002.831758,5050.462734,2,82,,7.0000
4524,4030,1,,,,,6.0,,1.0,,...,0,0,0,3410.735147,3684.576713,6140.347093,2,6,,8.8125
1593,1411,1,,,,,6.0,,1.0,,...,0,0,0,3409.702591,3785.794032,5325.476465,2,27,,7.5000
13484,12470,1,,,,,6.0,,1.0,,...,0,0,0,3762.204998,4232.574151,4578.375459,1,59,,7.0625
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5826,5335,1,,,,,6.0,,1.0,,...,0,0,0,2286.128835,2856.554004,4804.438179,2,10,,8.9375
7424,6725,1,,,,,6.0,,1.0,,...,0,0,0,6229.218323,10180.750374,13235.993561,1,67,,6.6875
645,600,1,,,,,6.0,,1.0,,...,0,0,0,1806.556032,2230.707644,3384.652708,1,2,,2.1875
747,695,1,,,,,6.0,,1.0,,...,0,0,0,2418.543712,2814.991732,3959.848870,2,78,,9.5000


In [32]:
print('first_sample1',len(firsts_sample1))
print('other_sample1',len(others_sample1))

print('first_sample2',len(firsts_sample2))
print('other_sample2',len(others_sample2))

print('first_sample3',len(firsts_sample3))
print('other_sample3',len(others_sample3))

print('first_sample4',len(firsts_sample4))
print('other_sample4',len(others_sample4))

print('first_sample5',len(firsts_sample5))
print('other_sample5',len(others_sample5))

print('first_sample6',len(firsts_sample6))
print('other_sample6',len(others_sample6))



first_sample1 150
other_sample1 150
first_sample2 300
other_sample2 300
first_sample3 500
other_sample3 500
first_sample4 1000
other_sample4 1000
first_sample5 2000
other_sample5 2000
first_sample6 5000
other_sample6 5000


In [33]:
live_sample1 = thinkstats2.SampleRows(live,150)

live_sample2 = thinkstats2.SampleRows(live,300)

live_sample3 = thinkstats2.SampleRows(others,500)

live_sample4 = thinkstats2.SampleRows(firsts,1000)
live_sample5= thinkstats2.SampleRows(firsts,2000)
live_sample6= SampleMyRows(firsts,5000)

live_sample7 = live #<--Full size


In [34]:
others_sample1

Unnamed: 0,caseid,pregordr,howpreg_n,howpreg_p,moscurrp,nowprgdk,pregend1,pregend2,nbrnaliv,multbrth,...,laborfor_i,religion_i,metro_i,basewgt,adj_mod_basewgt,finalwgt,secu_p,sest,cmintvw,totalwgt_lb
6346,5765,3,,,,,6.0,,1.0,,...,0,0,0,3409.479438,3977.136980,6627.898778,2,32,,7.5000
12067,11151,4,,,,,5.0,,1.0,,...,0,0,0,2335.358403,2677.674141,2903.937605,1,45,,9.0625
3387,3011,4,,,,,6.0,,1.0,,...,0,0,0,3771.785182,4368.145883,6144.670814,1,2,,6.7500
12970,11966,5,,,,,6.0,,1.0,,...,0,0,0,3409.498280,3761.106315,6863.642914,1,59,,9.1875
1298,1161,2,,,,,5.0,,1.0,,...,0,0,0,4871.584259,5829.366417,10638.010775,2,6,,7.9375
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4231,3764,4,,,,,6.0,,1.0,,...,0,0,0,4878.904636,6407.912985,11693.800406,1,10,,7.6250
3069,2719,3,,,,,5.0,,1.0,,...,0,0,0,2668.294914,2878.849757,3653.548226,2,10,,6.8750
1405,1251,3,,,,,6.0,,1.0,,...,0,0,0,3410.552837,5358.497639,6800.469353,1,69,,6.6250
9591,8839,4,,,,,6.0,,1.0,,...,0,0,0,2335.153425,2489.698542,3159.676420,2,56,,6.6875


In [35]:
#Run my tests sample 1:
# compare pregnancy lengths
data = firsts_sample1.prglngth.values, others_sample1.prglngth.values
ht = DiffMeansPermute(data)
p1 = ht.PValue(iters=1000)

print('sample1 pregency length',p1)

# Baby's weight
data = (firsts_sample1.totalwgt_lb.dropna().values,
        others_sample1.totalwgt_lb.dropna().values)
ht = DiffMeansPermute(data)
p2 = ht.PValue(iters=1000)

print('sample1 Babys weight', p2)

# test correlation
live2 = live_sample1.dropna(subset=['agepreg', 'totalwgt_lb'])
data = live2.agepreg.values, live2.totalwgt_lb.values
ht = CorrelationPermute(data)
p3 = ht.PValue(iters=1000)

print('sample1 correlation', p3)

# compare pregnancy lengths (chi-squared)
data = firsts_sample1.prglngth.values, others_sample1.prglngth.values
ht = PregLengthTest(data)
p4 = ht.PValue(iters=1000)

print('sample1 chi-squared', p4)


sample1 pregency length 0.067
sample1 Babys weight 0.598
sample1 correlation 0.4
sample1 chi-squared 0.01


In [36]:
#sample 2
# compare pregnancy lengths
data = firsts_sample2.prglngth.values, others_sample2.prglngth.values
ht = DiffMeansPermute(data)
p1 = ht.PValue(iters=1000)

print('sample2 pregency length',p1)

# Baby's weight
data = (firsts_sample2.totalwgt_lb.dropna().values,
        others_sample2.totalwgt_lb.dropna().values)
ht = DiffMeansPermute(data)
p2 = ht.PValue(iters=1000)

print('sample2 Babys weight', p2)

# test correlation
live2 = live_sample2.dropna(subset=['agepreg', 'totalwgt_lb'])
data = live2.agepreg.values, live2.totalwgt_lb.values
ht = CorrelationPermute(data)
p3 = ht.PValue(iters=1000)

print('sample2 correlation', p3)

# compare pregnancy lengths (chi-squared)
data = firsts_sample2.prglngth.values, others_sample2.prglngth.values
ht = PregLengthTest(data)
p4 = ht.PValue(iters=1000)

print('sample2 chi-squared', p4)

sample2 pregency length 0.358
sample2 Babys weight 0.24
sample2 correlation 0.467
sample2 chi-squared 0.13


In [37]:
#Sample 3
# compare pregnancy lengths 
data = firsts_sample3.prglngth.values, others_sample3.prglngth.values
ht = DiffMeansPermute(data)
p1 = ht.PValue(iters=1000)

print('sample3 pregency length',p1)

# Baby's weight
data = (firsts_sample3.totalwgt_lb.dropna().values,
        others_sample3.totalwgt_lb.dropna().values)
ht = DiffMeansPermute(data)
p2 = ht.PValue(iters=1000)

print('sample3 Babys weight', p2)

# test correlation
live2 = live_sample3.dropna(subset=['agepreg', 'totalwgt_lb'])
data = live2.agepreg.values, live2.totalwgt_lb.values
ht = CorrelationPermute(data)
p3 = ht.PValue(iters=1000)

print('sample3 correlation', p3)

# compare pregnancy lengths (chi-squared)
data = firsts_sample1.prglngth.values, others_sample1.prglngth.values
ht = PregLengthTest(data)
p4 = ht.PValue(iters=1000)

print('sample3 chi-squared', p4)


sample3 pregency length 0.534
sample3 Babys weight 0.089
sample3 correlation 0.116
sample3 chi-squared 0.005


In [38]:
#Sample 4
# compare pregnancy lengths 
data = firsts_sample4.prglngth.values, others_sample4.prglngth.values
ht = DiffMeansPermute(data)
p1 = ht.PValue(iters=1000)

print('sample4 pregency length',p1)

# Baby's weight
data = (firsts_sample4.totalwgt_lb.dropna().values,
        others_sample4.totalwgt_lb.dropna().values)
ht = DiffMeansPermute(data)
p2 = ht.PValue(iters=1000)

print('sample4 Babys weight', p2)

# test correlation
live2 = live_sample4.dropna(subset=['agepreg', 'totalwgt_lb'])
data = live2.agepreg.values, live2.totalwgt_lb.values
ht = CorrelationPermute(data)
p3 = ht.PValue(iters=1000)

print('sample4 correlation', p3)

# compare pregnancy lengths (chi-squared)
data = firsts_sample1.prglngth.values, others_sample1.prglngth.values
ht = PregLengthTest(data)
p4 = ht.PValue(iters=1000)

print('sample4 chi-squared', p4)

sample4 pregency length 0.216
sample4 Babys weight 0.132
sample4 correlation 0.212
sample4 chi-squared 0.006


In [45]:
#Sample 5
# compare pregnancy lengths 
data = firsts_sample5.prglngth.values, others_sample5.prglngth.values
ht = DiffMeansPermute(data)
p1 = ht.PValue(iters=1000)

print('sample5 pregency length',p1)

# Baby's weight
data = (firsts_sample5.totalwgt_lb.dropna().values,
        others_sample5.totalwgt_lb.dropna().values)
ht = DiffMeansPermute(data)
p2 = ht.PValue(iters=1000)

print('sample5 Babys weight', p2)

# test correlation
live2 = live_sample5.dropna(subset=['agepreg', 'totalwgt_lb'])
data = live2.agepreg.values, live2.totalwgt_lb.values
ht = CorrelationPermute(data)
p3 = ht.PValue(iters=1000)

print('sample5 correlation', p3)

# compare pregnancy lengths (chi-squared)
data = firsts_sample5.prglngth.values, others_sample5.prglngth.values
ht = PregLengthTest(data)
p4 = ht.PValue(iters=1000)

print('sample5 chi-squared', p4)

sample5 pregency length 0.357
sample5 Babys weight 0.034
sample5 correlation 0.005
sample5 chi-squared 0.0


In [40]:
#Sample 6
# compare pregnancy lengths 
data = firsts_sample6.prglngth.values, others_sample6.prglngth.values
ht = DiffMeansPermute(data)
p1 = ht.PValue(iters=1000)

print('sample6 pregency length',p1)

# Baby's weight
data = (firsts_sample6.totalwgt_lb.dropna().values,
        others_sample6.totalwgt_lb.dropna().values)
ht = DiffMeansPermute(data)
p2 = ht.PValue(iters=1000)

print('sample6 Babys weight', p2)

# test correlation
live2 = live_sample6.dropna(subset=['agepreg', 'totalwgt_lb'])
data = live2.agepreg.values, live2.totalwgt_lb.values
ht = CorrelationPermute(data)
p3 = ht.PValue(iters=1000)

print('sample6 correlation', p3)

# compare pregnancy lengths (chi-squared)
data = firsts_sample6.prglngth.values, others_sample6.prglngth.values
ht = PregLengthTest(data)
p4 = ht.PValue(iters=1000)

print('sample6 chi-squared', p4)

sample6 pregency length 0.637
sample6 Babys weight 0.0
sample6 correlation 0.0
sample6 chi-squared 0.0


In [41]:
len(live)

9148

In [42]:
#Sample 7
# compare pregnancy lengths 
data = firsts_sample7.prglngth.values, others_sample7.prglngth.values
ht = DiffMeansPermute(data)
p1 = ht.PValue(iters=1000)

print('sample7 pregency length',p1)

# Baby's weight
data = (firsts_sample7.totalwgt_lb.dropna().values,
        others_sample7.totalwgt_lb.dropna().values)
ht = DiffMeansPermute(data)
p2 = ht.PValue(iters=1000)

print('sample7 Babys weight', p2)

# test correlation
live2 = live_sample7.dropna(subset=['agepreg', 'totalwgt_lb'])
data = live2.agepreg.values, live2.totalwgt_lb.values
ht = CorrelationPermute(data)
p3 = ht.PValue(iters=1000)

print('sample7 correlation', p3)

# compare pregnancy lengths (chi-squared)
data = firsts_sample7.prglngth.values, others_sample7.prglngth.values
ht = PregLengthTest(data)
p4 = ht.PValue(iters=1000)

print('sample7 chi-squared', p4)

sample7 pregency length 0.163
sample7 Babys weight 0.0
sample7 correlation 0.0
sample7 chi-squared 0.0


In [50]:
print('others_sample5',len(others_sample5))
print('live_sample5',len(live_sample5.dropna(subset=['agepreg', 'totalwgt_lb'])))
print('firsts_sample5',len(firsts_sample5))

print('others_sample6',len(others_sample6))
print('live_sample6',len(live_sample6.dropna(subset=['agepreg', 'totalwgt_lb'])))
print('firsts_sample6',len(firsts_sample6))

print('others_sample7',len(others_sample7))
print('live_sample7',len(live_sample7.dropna(subset=['agepreg', 'totalwgt_lb'])))
print('firsts_sample7',len(firsts_sample7))

others_sample5 2000
live_sample5 1975
firsts_sample5 2000
others_sample6 5000
live_sample6 4950
firsts_sample6 5000
others_sample7 4735
live_sample7 9038
firsts_sample7 4413
