In [1]:
import pandas as pd

In [2]:
# quick lesson on functions

In [3]:
def my_function():
    # indent 4 spaces
    # function code here
    pass  # used for a completely empty function

In [4]:
def my_sq(x):
    """Squares a given value
    """
    return x ** 2

In [5]:
def avg_2(x, y):
    """Calculates the average between 2 numbers
    """
    return (x + y) / 2  # note that this will be 2.0 in Python 2

In [6]:
my_sq(4)

16

In [7]:
avg_2(10, 20)

15.0

# apply basic

In [8]:
df = pd.DataFrame({
    'a': [10, 20, 30],
    'b': [20, 30, 40]
})
df

Unnamed: 0,a,b
0,10,20
1,20,30
2,30,40


In [9]:
# let's use our square function
# yes we could've also done the calculation directly on the column
df['a'] ** 2

0    100
1    400
2    900
Name: a, dtype: int64

In [10]:
df['a'].apply(my_sq)

0    100
1    400
2    900
Name: a, dtype: int64

In [11]:
def my_exp(x, e):
    """raise x to the e power
    """
    return x ** 3

In [12]:
# apply a function by providing multiple arguments
df['a'].apply(my_exp, e=2)

0     1000
1     8000
2    27000
Name: a, dtype: int64

In [13]:
df['a'].apply(my_exp, e=3)

0     1000
1     8000
2    27000
Name: a, dtype: int64

## Apply over a dataframe

In [14]:
df

Unnamed: 0,a,b
0,10,20
1,20,30
2,30,40


In [15]:
def print_me(x):
    """just prints what you give it
    """
    print(x)

#### column-wise

In [16]:
df.apply(print_me)

0    10
1    20
2    30
Name: a, dtype: int64
0    20
1    30
2    40
Name: b, dtype: int64


a    None
b    None
dtype: object

In [17]:
# note the print_me function printed the same
# results as below
df['a']

0    10
1    20
2    30
Name: a, dtype: int64

In [18]:
df['b']

0    20
1    30
2    40
Name: b, dtype: int64

In [19]:
def avg_3(x, y, z):
    """Calculate average between 3 numbers
    """
    return (x + y + z) / 3

In [20]:
# use the function on each column
# this will cause an error
df.apply(avg_3)

TypeError: ("avg_3() missing 2 required positional arguments: 'y' and 'z'", 'occurred at index a')

In [21]:
# have to re-write the function
# in an apply, the entire column (or row)
# get's passed in to the first agument
def avg_3_apply(col):
    """Calculate average between 3 numbers
    """
    x = col[0]
    y = col[1]
    z = col[2]
    return (x + y + z) / 3
df.apply(avg_3_apply)

a    20.0
b    30.0
dtype: float64

### row-wise

In [22]:
# we can't just use the same function as before
df.apply(avg_3_apply, axis=1)

IndexError: ('index out of bounds', 'occurred at index 0')

In [23]:
def avg_2_apply(row):
    x = row[0]
    y = row[1]
    return (x + y) / 2
df.apply(avg_2_apply, axis=1)

0    15.0
1    25.0
2    35.0
dtype: float64

In [24]:
df  # df to compare data

Unnamed: 0,a,b
0,10,20
1,20,30
2,30,40


# for realz this time

In [25]:
import seaborn as sns
titanic = sns.load_dataset('titanic')

In [26]:
titanic.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [27]:
# we have some missing values
# let's calculate counts and frequencies
titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
survived       891 non-null int64
pclass         891 non-null int64
sex            891 non-null object
age            714 non-null float64
sibsp          891 non-null int64
parch          891 non-null int64
fare           891 non-null float64
embarked       889 non-null object
class          891 non-null category
who            891 non-null object
adult_male     891 non-null bool
deck           203 non-null category
embark_town    889 non-null object
alive          891 non-null object
alone          891 non-null bool
dtypes: bool(2), category(2), float64(2), int64(4), object(5)
memory usage: 80.6+ KB


write 3 functions

1. number of missing values
2. proportion of missing values
3. proportion of complete values

In [28]:
import numpy as np

def count_missing(vec):
    """counts the number of missing values in a vector
    """
    null_vec = pd.isnull(vec) # vector of True/False
    null_count = np.sum(null_vec) # True has a value of 1
    return null_count

In [29]:
def prop_missing(vec):
    """proportion of missing values in a vector
    """
    num = count_missing(vec)
    dem = vec.size
    return num / dem

In [30]:
def prop_complete(vec):
    """proportion of complete observations
    """
    return 1 - prop_missing(vec)

In [31]:
titanic.apply(count_missing)

survived         0
pclass           0
sex              0
age            177
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64

In [32]:
titanic.apply(prop_missing)

survived       0.000000
pclass         0.000000
sex            0.000000
age            0.198653
sibsp          0.000000
parch          0.000000
fare           0.000000
embarked       0.002245
class          0.000000
who            0.000000
adult_male     0.000000
deck           0.772166
embark_town    0.002245
alive          0.000000
alone          0.000000
dtype: float64

In [33]:
titanic.apply(prop_complete)

survived       1.000000
pclass         1.000000
sex            1.000000
age            0.801347
sibsp          1.000000
parch          1.000000
fare           1.000000
embarked       0.997755
class          1.000000
who            1.000000
adult_male     1.000000
deck           0.227834
embark_town    0.997755
alive          1.000000
alone          1.000000
dtype: float64

In [34]:
# always check how things are missing
# is it random?
# completely at random?
# is it a pattern?
titanic.loc[pd.isnull(titanic['embark_town']), :]

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
61,1,1,female,38.0,0,0,80.0,,First,woman,False,B,,yes,True
829,1,1,female,62.0,0,0,80.0,,First,woman,False,B,,yes,True


### row-wise

In [35]:
cmis_row = titanic.apply(count_missing, axis=1)

In [36]:
cmis_row.head()

0    1
1    0
2    1
3    0
4    1
dtype: int64

In [37]:
# count how many rows have multiple missing values
cmis_row.value_counts()

1    549
0    182
2    160
dtype: int64

In [38]:
titanic['num_missing'] = cmis_row

In [39]:
# randomly sample 10 values
# from the rows that have more than 1 missing value
titanic.loc[titanic['num_missing'] > 1, :].sample(10)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone,num_missing
140,0,3,female,,0,2,15.2458,C,Third,woman,False,,Cherbourg,no,False,2
260,0,3,male,,0,0,7.75,Q,Third,man,True,,Queenstown,no,True,2
295,0,1,male,,0,0,27.7208,C,First,man,True,,Cherbourg,no,True,2
611,0,3,male,,0,0,7.05,S,Third,man,True,,Southampton,no,True,2
697,1,3,female,,0,0,7.7333,Q,Third,woman,False,,Queenstown,yes,True,2
107,1,3,male,,0,0,7.775,S,Third,man,True,,Southampton,yes,True,2
29,0,3,male,,0,0,7.8958,S,Third,man,True,,Southampton,no,True,2
61,1,1,female,38.0,0,0,80.0,,First,woman,False,B,,yes,True,2
613,0,3,male,,0,0,7.75,Q,Third,man,True,,Queenstown,no,True,2
5,0,3,male,,0,0,8.4583,Q,Third,man,True,,Queenstown,no,True,2
