In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

# Apply
__apply__ takes a function and “applies” (i.e., runs it) across each row or column of a dataframe “simultaneously.”

In [2]:
# Create a dataframe
df = pd.DataFrame({
    'a':[10,20,30,40,50],
    'b':[20,40,60,80,100],
    'c':[30,60,90,120,150]
})

In [3]:
df

Unnamed: 0,a,b,c
0,10,20,30
1,20,40,60
2,30,60,90
3,40,80,120
4,50,100,150


- When using __apply__ , all the objects (DataFrame or Series) will be passed on the first parameter of the function.
- If there are __two__ parameters, the other one must be assinged with value.

### Example 1

In [4]:
# Create a function
def div_by_x(val, x):
    return val/x

In [5]:
# Use "apply" on Series
df['a'].apply(div_by_x, x=2)

0     5.0
1    10.0
2    15.0
3    20.0
4    25.0
Name: a, dtype: float64

In [6]:
# Use "apply" on DataFrame
df.apply(div_by_x, x=3)

Unnamed: 0,a,b,c
0,3.333333,6.666667,10.0
1,6.666667,13.333333,20.0
2,10.0,20.0,30.0
3,13.333333,26.666667,40.0
4,16.666667,33.333333,50.0


In [7]:
# Use "apply" on DataFrame in column-wise
df.apply(div_by_x, x=4, axis=0)

Unnamed: 0,a,b,c
0,2.5,5.0,7.5
1,5.0,10.0,15.0
2,7.5,15.0,22.5
3,10.0,20.0,30.0
4,12.5,25.0,37.5


In [8]:
# Use "apply" on DataFrame in row-wise
df.apply(div_by_x, x=5, axis=1)

Unnamed: 0,a,b,c
0,2.0,4.0,6.0
1,4.0,8.0,12.0
2,6.0,12.0,18.0
3,8.0,16.0,24.0
4,10.0,20.0,30.0


### Example 2

In [9]:
# Create planets dataset
planets = sns.load_dataset("planets")

In [10]:
planets.head()

Unnamed: 0,method,number,orbital_period,mass,distance,year
0,Radial Velocity,1,269.3,7.1,77.4,2006
1,Radial Velocity,1,874.774,2.21,56.95,2008
2,Radial Velocity,1,763.0,2.6,19.84,2011
3,Radial Velocity,1,326.03,19.4,110.62,2007
4,Radial Velocity,1,516.22,10.5,119.47,2009


In [11]:
planets.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1035 entries, 0 to 1034
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   method          1035 non-null   object 
 1   number          1035 non-null   int64  
 2   orbital_period  992 non-null    float64
 3   mass            513 non-null    float64
 4   distance        808 non-null    float64
 5   year            1035 non-null   int64  
dtypes: float64(3), int64(2), object(1)
memory usage: 48.6+ KB


In [12]:
def count_missing(vec):
    """Counts the number of missing values in a vector
    """
    # get a vector of True/False values depending whether the value is missing
    null_vec = pd.isnull(vec)
    # take the sum of the null_vec since non-null values do not contribute to the sum
    null_count = np.sum(null_vec)
    # return the number of missing values in the vector
    return null_count

In [13]:
def prop_missing(vec):
    """Percentage of missing values in a vector
    """
    # numerator: number of missing values we can use the count_missing function we just wrote!
    num = count_missing(vec)
    # denominator: total number of values in the vector we also need to count the missing values
    dem = vec.size
    # return the proportion/percentage of missing
    return num / dem

In [14]:
def prop_complete(vec):
    """Percentage of nonmissing values in a vector
    """
    # we can utilize the percent_missing function we just wrote by subtracting its value from 1
    return 1 - prop_missing(vec)

#### apply on column-wise
this way, we can see how many null values are missing in every column

In [15]:
planets.apply(count_missing)

method              0
number              0
orbital_period     43
mass              522
distance          227
year                0
dtype: int64

In [16]:
planets.apply(prop_missing)

method            0.000000
number            0.000000
orbital_period    0.041546
mass              0.504348
distance          0.219324
year              0.000000
dtype: float64

In [17]:
planets.apply(prop_complete)

method            1.000000
number            1.000000
orbital_period    0.958454
mass              0.495652
distance          0.780676
year              1.000000
dtype: float64

#### apply on row-wise
this way, we can create new column that can store number of null values for every row

In [18]:
planets['num_missing'] = planets.apply(count_missing, axis=1)
planets.sample(5)

Unnamed: 0,method,number,orbital_period,mass,distance,year,num_missing
386,Radial Velocity,1,256.78,8.44,38.99,1999,0
1008,Transit,1,2.838971,,455.0,2012,1
767,Transit,3,8.919346,,,2012,2
186,Transit,1,1.327347,,317.0,2011,1
125,Radial Velocity,4,5.36874,0.049,6.27,2005,0


In [19]:
planets['pct_missing'] = planets.apply(prop_missing, axis=1)
planets.sample(5)

Unnamed: 0,method,number,orbital_period,mass,distance,year,num_missing,pct_missing
627,Radial Velocity,1,16.2,1.25,223.21,2010,0,0.0
930,Transit,1,3.101278,,,2004,2,0.285714
339,Radial Velocity,2,95.415,0.0565,34.07,2011,0,0.0
474,Radial Velocity,3,27.582,0.0358,14.56,2011,0,0.0
274,Radial Velocity,1,1561.0,7.71,51.97,2003,0,0.0


In [20]:
planets['pct_nonmissing'] = planets.apply(prop_complete, axis=1)
planets.sample(5)

Unnamed: 0,method,number,orbital_period,mass,distance,year,num_missing,pct_missing,pct_nonmissing
3,Radial Velocity,1,326.03,19.4,110.62,2007,0,0.0,1.0
75,Imaging,1,8679.7,,26.67,2009,1,0.142857,0.875
991,Transit,1,3.577469,,343.0,2010,1,0.142857,0.875
168,Transit,1,4.008778,,215.0,2010,1,0.142857,0.875
639,Imaging,1,,,131.93,2010,2,0.285714,0.75
