In [None]:
import pandas as pd
import gzip
import pickle
import dateutil
import datetime
from dateutil.relativedelta import *
import numpy.random as ra
import math
import random
import seaborn as sns
import numpy as np
import numbers
import uuid
import person
import quilt

In [None]:
from person import datetime

# Selecting a surname for a person

I have installed on `quiltdata.com` a file "2010_surnames.csv" that contains surname information from the 2010 US census. We will be using this file to generate names for the persons we simulate with our `Person` class. The head of the file looks like this:

<table border="1" class="dataframe">
  <thead>
    <tr style="text-align: right;">
      <th></th>
      <th>name</th>
      <th>rank</th>
      <th>count</th>
      <th>prop100k</th>
      <th>cum_prop100k</th>
      <th>pctwhite</th>
      <th>pctblack</th>
      <th>pctapi</th>
      <th>pctaian</th>
      <th>pct2prace</th>
      <th>pcthispanic</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <th>0</th>
      <td>SMITH</td>
      <td>1</td>
      <td>2376206</td>
      <td>880.85</td>
      <td>880.85</td>
      <td>73.35</td>
      <td>22.22</td>
      <td>0.40</td>
      <td>0.85</td>
      <td>1.63</td>
      <td>1.56</td>
    </tr>
    <tr>
      <th>1</th>
      <td>JOHNSON</td>
      <td>2</td>
      <td>1857160</td>
      <td>688.44</td>
      <td>1569.30</td>
      <td>61.55</td>
      <td>33.80</td>
      <td>0.42</td>
      <td>0.91</td>
      <td>1.82</td>
      <td>1.50</td>
    </tr>
    <tr>
      <th>2</th>
      <td>WILLIAMS</td>
      <td>3</td>
      <td>1534042</td>
      <td>568.66</td>
      <td>2137.96</td>
      <td>48.52</td>
      <td>46.72</td>
      <td>0.37</td>
      <td>0.78</td>
      <td>2.01</td>
      <td>1.60</td>
    </tr>
    <tr>
      <th>3</th>
      <td>BROWN</td>
      <td>4</td>
      <td>1380145</td>
      <td>511.62</td>
      <td>2649.58</td>
      <td>60.71</td>
      <td>34.54</td>
      <td>0.41</td>
      <td>0.83</td>
      <td>1.86</td>
      <td>1.64</td>
    </tr>
    <tr>
      <th>4</th>
      <td>JONES</td>
      <td>5</td>
      <td>1362755</td>
      <td>505.17</td>
      <td>3154.75</td>
      <td>57.69</td>
      <td>37.73</td>
      <td>0.35</td>
      <td>0.94</td>
      <td>1.85</td>
      <td>1.44</td>
    </tr>
  </tbody>
</table>

Use Pandas to read this file into a DataFrame named `surnames` that only has columns `count` and `name`.

In [None]:
# Install the surname data package "u0069295/n2010_surnames" through quilt.
quilt.install("u0069295/n2010_surnames")
# Import the installed package under the short name `data`.
# NOTE(review): presumably this import only resolves after the install call above has run — keep the order.
from quilt.data.u0069295 import n2010_surnames as data

### Accessing our data

Calling `data()` will return a Pandas DataFrame

In [None]:
# Keep only the two columns requested in the task statement: the surname and
# how many times it occurred in the census.
surnames = data()[["name", "count"]]

In [None]:
# Preview the first rows of the full table returned by data().
data().head()

#### Create a DataFrame called `surnames` that only contains the `name` column

In [None]:
# Start from None; the solution region below rebinds the name to the real table.
surnames = None
### BEGIN SOLUTION
# Double brackets select a one-column DataFrame (not a Series) holding `name`.
surnames = data()[["name"]]
### END SOLUTION
# Print (rows, columns) and then display the first rows as a quick check.
print(surnames.shape)
surnames.head()

#### Write a function `get_lastname1`

* takes a DataFrame with the surnames
* a keyword argument `col` containing the name of the column with the names
* a keyword argument `seed` with default value `None`

In [None]:
def get_lastname1(surnames, col="name", seed=None):
    """Return one surname picked uniformly at random from ``surnames[col]``.

    Every row is equally likely, regardless of how common the name is.

    Parameters
    ----------
    surnames : pandas.DataFrame
        Table holding the candidate surnames.
    col : str, optional
        Name of the column that contains the surnames (default ``"name"``).
    seed : optional
        Value passed to ``random.seed`` before drawing. With the default
        ``None`` the ``random`` module chooses its own seed.

    Returns
    -------
    The value stored in the randomly selected row of ``surnames[col]``.

    Raises
    ------
    IndexError
        If ``surnames`` has no rows.
    """
    ### BEGIN SOLUTION
    random.seed(seed)
    names = surnames[col]
    if len(names) == 0:
        raise IndexError("Cannot choose a surname from an empty table")
    # Draw a row *position* and read it with .iloc. random.choice() would index
    # the Series by *label*, which raises KeyError as soon as the index is not
    # the default 0..n-1 range (for example after rows have been filtered out).
    position = random.randrange(len(names))
    return names.iloc[position]
    ### END SOLUTION

In [None]:
# Draw a single surname with the default column and no fixed seed.
get_lastname1(surnames)

#### Create a population of 1000 `Person` objects with randomly generated surnames (`last_name`)

In [None]:
# Build 1000 Person objects, each with an empty first name and a surname
# drawn by get_lastname1.
population = [
    person.Person(first_name="", last_name=get_lastname1(surnames))
    for _ in range(1000)
]

#### How could I look at the frequency of names?

In [None]:
from collections import Counter

# Count how often each surname appears in the population and show the
# 50 most frequent ones as (name, count) pairs.
Counter(member.last_name for member in population).most_common(50)

### Create `DataFrame` with columns `name` and `count`

In [None]:
# Rebuild `surnames` from the full table, this time keeping the count column too.
surnames = data()[["name", "count"]]
# Display the first rows to confirm both columns are present.
surnames.head()

### Filter `surnames` so we only keep names that occurred at least 10000 times

In [None]:
# Keep surnames that occurred at least 10000 times. The explicit .copy() makes
# the result an independent DataFrame, so adding columns to it afterwards does
# not trigger pandas' SettingWithCopyWarning.
surnames = surnames[surnames["count"] >= 10000].copy()

### Let's talk briefly about probabilities

#### Example: 

I fill a vase with 60 red balls and 40 black balls. If I shake all the balls up and blindly draw one from the vase what is the probability that the ball will be red?

How did we arrive at this?

What is the probability of drawing a black ball?

#### Add a column to `surnames` representing the probability of `name` occurring in the Census

### Some basic probability concepts

* A probability ($p$) has a value between zero and one: $0 \le p \le 1$
* The sum of all the probabilities of potential outcomes (e.g. flipping a coin is heads or tails) is one: $\sum p = 1$

* $\text{ probability heads } + \text{ probability tails} = 1$
* For a 6-sided die $p(1)+p(2)+p(3)+p(4)+p(5)+p(6)=1$
* For our names $p(\text{KASPER})+p(\text{ZELLER})+\cdots+p(\text{SMITH})=1$

In [None]:
# Each name's probability is its count divided by the total count of all kept
# names. The column is added with .assign(), which returns a new DataFrame,
# instead of writing into the filtered `surnames` slice (that write can raise
# pandas' SettingWithCopyWarning).
surnames = (
    surnames
    .assign(probability=lambda table: table["count"] / table["count"].sum())
    # Ascending order: the least common names come first.
    .sort_values(by="probability", ascending=True)
)
surnames.head()

#### The most common name occurs about 1.6% of the time

### Cumulative Probability

We need one more concept: [cumulative probability](https://stattrek.com/statistics/dictionary.aspx?definition=cumulative_probability).

>A cumulative probability refers to the probability that the value of a random variable falls within a specified range. Frequently, cumulative probabilities refer to the probability that a random variable is less than or equal to a specified value.

#### Example:

Cumulative probability that a die roll is less than or equal to 4:

\begin{eqnarray}
    \sum_{i\le 4} p(i) & = \\
    p(1) + p(2) + p(3) + p(4) & =\\
    \frac{1}{6} + \frac{1}{6} + \frac{1}{6} + \frac{1}{6} &=\\
    \frac{2}{3}
\end{eqnarray}    


### Compute the cumulative probabilities for our names

The cumulative probability of a name (e.g. "SKIPPER") is the sum of its probability plus the probability of all the names with lower probability (less common).

For example, the cumulative probability of the name "SKIPPER" is

\begin{eqnarray}
p(\text{KASPER})+p(\text{ZELLER}) + p(\text{SKIPPER}) & = &\\
0.000067 + 0.000067 + 0.000067 & = &0.000202
\end{eqnarray}

Result tail should look like this:

<table border="1" class="dataframe">
  <thead>
    <tr style="text-align: right;">
      <th></th>
      <th>name</th>
      <th>count</th>
      <th>probability</th>
      <th>cumulative_probability</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <th>4</th>
      <td>JONES</td>
      <td>1362755</td>
      <td>0.009179</td>
      <td>0.951855</td>
    </tr>
    <tr>
      <th>3</th>
      <td>BROWN</td>
      <td>1380145</td>
      <td>0.009296</td>
      <td>0.961152</td>
    </tr>
    <tr>
      <th>2</th>
      <td>WILLIAMS</td>
      <td>1534042</td>
      <td>0.010333</td>
      <td>0.971485</td>
    </tr>
    <tr>
      <th>1</th>
      <td>JOHNSON</td>
      <td>1857160</td>
      <td>0.012510</td>
      <td>0.983994</td>
    </tr>
    <tr>
      <th>0</th>
      <td>SMITH</td>
      <td>2376206</td>
      <td>0.016006</td>
      <td>1.000000</td>
    </tr>
  </tbody>
</table>

In [None]:
# Running total of the probabilities in the current row order.
surnames["cumulative_probability"] = surnames["probability"].cumsum()
# Show the last rows: the expected result described in the text is the *tail*
# of the table, where the cumulative probability reaches 1.
surnames.tail()

### Write a function `get_lastname2`

This should be similar to `get_lastname1` except the probability of a name being returned is equal to the probability of that name occurring in our population.

* Use `random.random`
* Use `surnames.iterrows()`

In [None]:
def get_lastname2(surnames, col="name", seed=None):
    """Return a surname drawn according to the table's cumulative probabilities.

    A number is drawn with ``random.random()`` and the first row (in table
    order) whose ``cumulative_probability`` is greater than or equal to that
    number is selected.

    Parameters
    ----------
    surnames : pandas.DataFrame
        Table with the surname column ``col`` and a
        ``cumulative_probability`` column.
    col : str, optional
        Name of the column that contains the surnames (default ``"name"``).
    seed : optional
        Value passed to ``random.seed`` before drawing. With the default
        ``None`` the ``random`` module chooses its own seed.

    Returns
    -------
    The value of ``col`` in the selected row.
    """
    ### BEGIN SOLUTION
    random.seed(seed)
    draw = random.random()
    # Read the caller-supplied column `col` rather than a hard-coded "name".
    candidates = surnames.loc[surnames["cumulative_probability"] >= draw, col]
    if candidates.empty:
        # Floating-point rounding in a running sum can leave the largest
        # cumulative value just below the draw; fall back to the last row
        # instead of failing with IndexError.
        return surnames[col].iloc[-1]
    return candidates.iloc[0]
    ### END SOLUTION

In [None]:
# Build 1000 Person objects whose surnames are drawn by get_lastname2.
population2 = [
    person.Person(first_name="", last_name=get_lastname2(surnames))
    for _ in range(1000)
]

In [None]:
# Show the 50 most frequent surnames in population2 as (name, count) pairs.
Counter(member.last_name for member in population2).most_common(50)