# Examples and Exercises from Think Stats, 2nd Edition

http://thinkstats2.com

Copyright 2016 Allen B. Downey

MIT License: https://opensource.org/licenses/MIT


In [4]:
from __future__ import print_function, division

import nsfg
import pandas as pd

## Examples from Chapter 1

Read NSFG data into a Pandas DataFrame.

In [5]:
preg = nsfg.ReadFemPreg()
preg.head()

Unnamed: 0,caseid,pregordr,howpreg_n,howpreg_p,moscurrp,nowprgdk,pregend1,pregend2,nbrnaliv,multbrth,...,laborfor_i,religion_i,metro_i,basewgt,adj_mod_basewgt,finalwgt,secu_p,sest,cmintvw,totalwgt_lb
0,1,1,,,,,6.0,,1.0,,...,0,0,0,3410.389399,3869.349602,6448.271112,2,9,,8.8125
1,1,2,,,,,6.0,,1.0,,...,0,0,0,3410.389399,3869.349602,6448.271112,2,9,,7.875
2,2,1,,,,,5.0,,3.0,5.0,...,0,0,0,7226.30174,8567.54911,12999.542264,2,12,,9.125
3,2,2,,,,,6.0,,1.0,,...,0,0,0,7226.30174,8567.54911,12999.542264,2,12,,7.0
4,2,3,,,,,6.0,,1.0,,...,0,0,0,7226.30174,8567.54911,12999.542264,2,12,,6.1875


Print the column names.

In [60]:
for item in sorted(preg.columns):
    print(item)

adj_mod_basewgt
ageatend
agecon
agecon_i
agepreg
agepreg_i
ageqtnur
ageqtnur2
ageqtnur3
ageqtnur_n
ageqtnur_n2
ageqtnur_n3
ageqtnur_p
ageqtnur_p2
ageqtnur_p3
ager
ager_i
agescrn
alivenow
alivenow2
alivenow3
anynurse
anynurse2
anynurse3
anyusint
babysex
babysex2
babysex3
basewgt
bfeedwks
bfeedwks_i
bgnprena
birthord
birthord_i
birthplc
birthwgt_lb
birthwgt_lb2
birthwgt_lb3
birthwgt_oz
birthwgt_oz2
birthwgt_oz3
bpa_bdscheck1
bpa_bdscheck2
bpa_bdscheck3
brnout
caseid
cmbabdob
cmbirth
cmfstprg
cmintfin
cmintfincr
cmintfinop
cmintstr
cmintstrcr
cmintstrop
cmintvw
cmkidied
cmkidied2
cmkidied3
cmkidlft
cmkidlft2
cmkidlft3
cmlastlb
cmlstprg
cmotpreg
cmprgbeg
cmprgend
cnfrmno
cohpbeg
cohpend
datecon
datecon_i
datend
datend_i
didwork
dk1gest
dk2gest
dk3gest
educat
educat_i
evuseint
fedsolid
fedsolid2
fedsolid3
feelinpg
finalwgt
flgdkmo1
fmarcon5
fmarcon5_i
fmarital
fmarital_i
fmarout5
fmarout5_i
frsteatd
frsteatd2
frsteatd3
frsteatd_n
frsteatd_n2
frsteatd_n3
frsteatd_p
frsteatd_p2
frsteatd_p3
ge

Select a single column name.

In [7]:
preg.columns[1]

'pregordr'

Select a column and check what type it is.

In [8]:
pregordr = preg['pregordr']
type(pregordr)

pandas.core.series.Series

Print a column.

In [9]:
pregordr

0        1
1        2
2        1
3        2
4        3
5        1
6        2
7        3
8        1
9        2
10       1
11       1
12       2
13       3
14       1
15       2
16       3
17       1
18       2
19       1
20       2
21       1
22       2
23       1
24       2
25       3
26       1
27       1
28       2
29       3
        ..
13563    2
13564    3
13565    1
13566    1
13567    1
13568    2
13569    1
13570    2
13571    3
13572    4
13573    1
13574    2
13575    1
13576    1
13577    2
13578    1
13579    2
13580    1
13581    2
13582    3
13583    1
13584    2
13585    1
13586    2
13587    3
13588    1
13589    2
13590    3
13591    4
13592    5
Name: pregordr, Length: 13593, dtype: int64

Select a single element from a column.

In [10]:
pregordr[0]

1

Select a slice from a column.

In [11]:
pregordr[2:5]

2    1
3    2
4    3
Name: pregordr, dtype: int64

Select a column using dot notation.

In [12]:
pregordr = preg.pregordr

Count the number of times each value occurs.

In [13]:
preg.outcome.value_counts().sort_index()

1    9148
2    1862
3     120
4    1921
5     190
6     352
Name: outcome, dtype: int64

Check the values of another variable.

In [14]:
preg.birthwgt_lb.describe()

count    9084.000000
mean        6.832122
std         1.411447
min         0.000000
25%         6.000000
50%         7.000000
75%         8.000000
max        15.000000
Name: birthwgt_lb, dtype: float64

In [15]:
bigboys = preg.birthwgt_lb[preg.birthwgt_lb > 10]
bigboys

104      12.0
105      12.0
106      14.0
178      11.0
253      11.0
256      11.0
819      11.0
903      13.0
1061     11.0
1238     11.0
2556     11.0
2824     11.0
2894     11.0
3045     11.0
3595     11.0
3959     11.0
4097     13.0
4100     12.0
4173     11.0
4412     11.0
4421     12.0
5010     11.0
5093     12.0
5117     11.0
6024     11.0
6733     13.0
7298     12.0
7299     11.0
7308     15.0
7398     12.0
7406     12.0
7485     11.0
9561     11.0
9599     11.0
9661     11.0
9734     12.0
9735     12.0
10139    11.0
10969    11.0
11103    11.0
12099    11.0
12801    14.0
12802    14.0
Name: birthwgt_lb, dtype: float64

In [16]:
preg.birthwgt_lb.value_counts().sort_index()

0.0        8
1.0       40
2.0       53
3.0       98
4.0      229
5.0      697
6.0     2223
7.0     3049
8.0     1889
9.0      623
10.0     132
11.0      26
12.0      10
13.0       3
14.0       3
15.0       1
Name: birthwgt_lb, dtype: int64

Make a dictionary that maps from each respondent's `caseid` to a list of indices into the pregnancy `DataFrame`.  Use it to select the pregnancy outcomes for a single respondent.

In [17]:
caseid = 10229
preg_map = nsfg.MakePregMap(preg)
indices = preg_map[caseid]
preg.outcome[indices].values

print(type(preg_map))
type(indices)

<class 'collections.defaultdict'>


list

## Exercises

Select the `birthord` column, print the value counts, and compare to results published in the [codebook](http://www.icpsr.umich.edu/nsfg6/Controller?displayPage=labelDetails&fileCode=PREG&section=A&subSec=8016&srtLabel=611933)

In [18]:
preg.birthord.head()

0    1.0
1    2.0
2    1.0
3    2.0
4    3.0
Name: birthord, dtype: float64

In [19]:
preg.birthord.value_counts().sort_index()

1.0     4413
2.0     2874
3.0     1234
4.0      421
5.0      126
6.0       50
7.0       20
8.0        7
9.0        2
10.0       1
Name: birthord, dtype: int64

We can also use `isnull` to count the number of nans.

In [20]:
preg.birthord.isnull().sum()

4445

Select the `prglngth` column, print the value counts, and compare to results published in the [codebook](http://www.icpsr.umich.edu/nsfg6/Controller?displayPage=labelDetails&fileCode=PREG&section=A&subSec=8016&srtLabel=611931)

In [21]:
preg.prglngth.value_counts()

39    4744
40    1120
38     609
9      594
41     591
6      543
37     457
13     446
4      412
8      409
35     357
36     329
42     328
17     253
11     202
30     198
5      181
7      175
12     170
3      151
43     148
22     147
10     137
32     122
26     117
2       78
34      60
33      50
44      46
16      44
15      39
28      38
21      37
19      34
24      31
31      29
14      29
29      23
20      18
18      17
0       15
25      15
23      12
45      10
1        9
27       8
48       7
50       2
46       1
47       1
Name: prglngth, dtype: int64

In [22]:
bins = [-1,13,26,50,100]

r = pd.cut(preg.prglngth,bins)
print(r)
result = r.value_counts().sort_index()
result

0        (26, 50]
1        (26, 50]
2        (26, 50]
3        (26, 50]
4        (26, 50]
5        (26, 50]
6        (26, 50]
7        (26, 50]
8        (26, 50]
9        (26, 50]
10       (26, 50]
11       (26, 50]
12       (26, 50]
13       (-1, 13]
14       (-1, 13]
15       (26, 50]
16       (26, 50]
17       (26, 50]
18       (-1, 13]
19       (26, 50]
20       (26, 50]
21       (26, 50]
22       (-1, 13]
23       (26, 50]
24       (26, 50]
25       (26, 50]
26       (26, 50]
27       (26, 50]
28       (26, 50]
29       (26, 50]
           ...   
13563    (26, 50]
13564    (26, 50]
13565    (26, 50]
13566    (26, 50]
13567    (-1, 13]
13568    (-1, 13]
13569    (26, 50]
13570    (26, 50]
13571    (26, 50]
13572    (26, 50]
13573    (26, 50]
13574    (26, 50]
13575    (-1, 13]
13576    (26, 50]
13577    (26, 50]
13578    (26, 50]
13579    (26, 50]
13580    (-1, 13]
13581    (26, 50]
13582    (-1, 13]
13583    (13, 26]
13584    (26, 50]
13585    (-1, 13]
13586    (-1, 13]
13587    (

(-1, 13]     3522
(13, 26]      793
(26, 50]     9278
(50, 100]       0
Name: prglngth, dtype: int64

To compute the mean of a column, you can invoke the `mean` method on a Series.  For example, here is the mean birthweight in pounds:

In [23]:
preg.prglngth.isnull().sum()

0

In [24]:
len(preg.prglngth)

13593

In [25]:
preg.totalwgt_lb.mean()

7.265628457623368

Create a new column named <tt>totalwgt_kg</tt> that contains birth weight in kilograms.  Compute its mean.  Remember that when you create a new column, you have to use dictionary syntax, not dot notation.

In [26]:
preg['totalwgt_kg'] = preg.totalwgt_lb * 0.453592
preg.totalwgt_kg.head()

0    3.997279
1    3.572037
2    4.139027
3    3.175144
4    2.806601
Name: totalwgt_kg, dtype: float64

In [27]:
preg.totalwgt_lb.head()

0    8.8125
1    7.8750
2    9.1250
3    7.0000
4    6.1875
Name: totalwgt_lb, dtype: float64

`nsfg.py` also provides `ReadFemResp`, which reads the female respondents file and returns a `DataFrame`:

In [28]:
resp = nsfg.ReadFemResp()

`DataFrame` provides a method `head` that displays the first five rows:

In [29]:
resp.head()

Unnamed: 0,caseid,rscrinf,rdormres,rostscrn,rscreenhisp,rscreenrace,age_a,age_r,cmbirth,agescrn,...,pubassis_i,basewgt,adj_mod_basewgt,finalwgt,secu_r,sest,cmintvw,cmlstyr,screentime,intvlngth
0,2298,1,5,5,1,5.0,27,27,902,27,...,0,3247.916977,5123.759559,5556.717241,2,18,1234,1222,18:26:36,110.492667
1,5012,1,5,1,5,5.0,42,42,718,42,...,0,2335.279149,2846.79949,4744.19135,2,18,1233,1221,16:30:59,64.294
2,11586,1,5,1,5,5.0,43,43,708,43,...,0,2335.279149,2846.79949,4744.19135,2,18,1234,1222,18:19:09,75.149167
3,6794,5,5,4,1,5.0,15,15,1042,15,...,0,3783.152221,5071.464231,5923.977368,2,18,1234,1222,15:54:43,28.642833
4,616,1,5,4,1,5.0,20,20,991,20,...,0,5341.329968,6437.335772,7229.128072,2,18,1233,1221,14:19:44,69.502667


Select the `age_r` column from `resp` and print the value counts.  How old are the youngest and oldest respondents?

In [30]:
resp.age_r.value_counts().sort_index()

15    217
16    223
17    234
18    235
19    241
20    258
21    267
22    287
23    282
24    269
25    267
26    260
27    255
28    252
29    262
30    292
31    278
32    273
33    257
34    255
35    262
36    266
37    271
38    256
39    215
40    256
41    250
42    215
43    253
44    235
Name: age_r, dtype: int64

We can use the `caseid` to match up rows from `resp` and `preg`.  For example, we can select the row from `resp` for `caseid` 2298 like this:

In [31]:
resp[resp.caseid==2298]

Unnamed: 0,caseid,rscrinf,rdormres,rostscrn,rscreenhisp,rscreenrace,age_a,age_r,cmbirth,agescrn,...,pubassis_i,basewgt,adj_mod_basewgt,finalwgt,secu_r,sest,cmintvw,cmlstyr,screentime,intvlngth
0,2298,1,5,5,1,5.0,27,27,902,27,...,0,3247.916977,5123.759559,5556.717241,2,18,1234,1222,18:26:36,110.492667


And we can get the corresponding rows from `preg` like this:

In [32]:
preg[preg.caseid==2298]

Unnamed: 0,caseid,pregordr,howpreg_n,howpreg_p,moscurrp,nowprgdk,pregend1,pregend2,nbrnaliv,multbrth,...,religion_i,metro_i,basewgt,adj_mod_basewgt,finalwgt,secu_p,sest,cmintvw,totalwgt_lb,totalwgt_kg
2610,2298,1,,,,,6.0,,1.0,,...,0,0,3247.916977,5123.759559,5556.717241,2,18,,6.875,3.118445
2611,2298,2,,,,,6.0,,1.0,,...,0,0,3247.916977,5123.759559,5556.717241,2,18,,5.5,2.494756
2612,2298,3,,,,,6.0,,1.0,,...,0,0,3247.916977,5123.759559,5556.717241,2,18,,4.1875,1.899417
2613,2298,4,,,,,6.0,,1.0,,...,0,0,3247.916977,5123.759559,5556.717241,2,18,,6.875,3.118445


How old is the respondent with `caseid` 1?

In [33]:
resp[resp.caseid==1].age_r

1069    44
Name: age_r, dtype: int64

What are the pregnancy lengths for the respondent with `caseid` 2298?

In [34]:
preg[preg.caseid==2298].prglngth

2610    40
2611    36
2612    30
2613    40
Name: prglngth, dtype: int64

What was the birthweight of the first baby born to the respondent with `caseid` 5012?

In [35]:
preg[preg.caseid==5012].totalwgt_lb

5515    6.0
Name: totalwgt_lb, dtype: float64

In [36]:
pregnum_map = nsfg.MakePregMap(preg)

In [37]:
len(pregnum_map.keys())

5033

In [38]:
len(resp.keys())

3087

In [39]:
len(preg.keys())

245

In [40]:
len(pregnum_map[481])

7

In [41]:
resp[resp.caseid == 481].pregnum

2736    7
Name: pregnum, dtype: int64

In [42]:
# DOESN'T 
# count = 0
# matches = 0
# for key, value in pregnum_map.items():
#     if len(pregnum_map[key]) :
#         preg_val = len(pregnum_map[key])
#     resp_val = resp[resp.caseid == key].pregnum
    
#     if preg_val == resp_val:
#         print("TRUE")
#         matches += 1
#     count += 1

# print(count)
# print(matches)

In [66]:
col = ['caseid', 'prglngth', 'outcome', 
       'pregordr', 'birthord', 'birthwgt_lb', 
       'birthwgt_oz','agepreg','finalwgt']

In [67]:
pregClean = preg[col]
pregClean

Unnamed: 0,caseid,prglngth,outcome,pregordr,birthord,birthwgt_lb,birthwgt_oz,agepreg,finalwgt
0,1,39,1,1,1.0,8.0,13.0,33.16,6448.271112
1,1,39,1,2,2.0,7.0,14.0,39.25,6448.271112
2,2,39,1,1,1.0,9.0,2.0,14.33,12999.542264
3,2,39,1,2,2.0,7.0,0.0,17.83,12999.542264
4,2,39,1,3,3.0,6.0,3.0,18.33,12999.542264
5,6,38,1,1,1.0,8.0,9.0,27.00,8874.440799
6,6,40,1,2,2.0,9.0,9.0,28.83,8874.440799
7,6,42,1,3,3.0,8.0,6.0,30.16,8874.440799
8,7,39,1,1,1.0,7.0,9.0,28.08,6911.879921
9,7,35,1,2,2.0,6.0,10.0,32.33,6911.879921


In [70]:
resp[['caseid','pregnum']]

Unnamed: 0,caseid,pregnum
0,2298,4
1,5012,1
2,11586,1
3,6794,0
4,616,0
5,845,8
6,10333,0
7,855,0
8,8656,3
9,3566,0


In [101]:
len(pregClean[pregClean['caseid'] == 4184])

3

In [102]:
type(pregClean[pregClean['caseid'] == 4184])

pandas.core.frame.DataFrame

In [79]:
type(pregnum_map)

collections.defaultdict

In [103]:
import collections
counts = collections.Counter(pregnum_map)
counts.most_common(10)

cnt = collections.Counter()
for k, v in pregnum_map.items():
    cnt[k] += len(v)

cnt.most_common(5)

cnt

# cnt  ==  CounterCounter()()
#  for  director, movies in directors.items():
#     cnt[director] += len(movies)

# cnt.most_common(5)

# determine if the resp['pregnum'] value is equal to the cnt vale

for k, v in cnt.items():
    if v == resp['pregnum'][k]:
        print('Match')
        print(k)
        print(v)
        print(resp['pregnum'][k])
        break
    else:
#         print('No Match')
        continue

Match
18
2
2
