In [32]:
import pandas as pd
import numpy as np

In this subsection I am going to search if there are missing data (NaN, None).

In [33]:
def missing_values(df):
    # Identifying the number of missing values in each column:
    missing_values = df.isnull().sum()

    # Identifying columns WITH missing values:
    columns_with_nulls = missing_values[missing_values > 0].index

    # DataFrame for columns with missing values and their count:
    missing_values_df = missing_values[columns_with_nulls].reset_index()
    missing_values_df.columns = ['Column', 'Missing Values']
    
    # Percent of data that is missing:
    total_cells = np.product(df.shape)
    total_missing = missing_values.sum()
    percent_missing = (total_missing/total_cells)*100
    
    print("\n DataFrame with columns having missing values and their count of missing values:")
    print(missing_values_df)
    print(f"\n The number of columns with missing data is {len(columns_with_nulls)}. That is a {percent_missing.round(2)} %.")

---------------------
Consider why there are missing values. If it's necessary:
- You can **drop** the rows/columns with missing values. 
- Another option is **filling in** the missing values **with zeros**.
- Another option could be **replace missing values** with whatever value comes directly after in the same column.

In [34]:
# Dropping columns and/or rows with missing data and view how much data we miss:
def treat_missing_values(df, drop_columns, drop_rows, missing_data_to_zeros, replace):
    original_shape = df.shape
    
    # remove all the columns that contain a missing value
    if drop_columns == True:
        df = df.dropna(axis=0)
        # just how much data did we lose?
        print("Columns in original dataset: %d \n" % original_shape[1])
        print("Columns with na's dropped: %d \n" % df.shape[1])

    # remove all rows with at least one missing value
    if drop_rows == True:
        df = df.dropna()
        # just how much data did we lose?
        print("Rows in original dataset: %d \n" % original_shape[0])
        print("Rows with na's dropped: %d \n" % df.shape[0])
        
    if missing_data_to_zeros == True:
        df = df.fillna(0)
    
    if replace == True:
        df = df.fillna(method='bfill', axis=0).fillna(0)
        
    return df

## Example

In [35]:
# Reading in all our data:
food = pd.read_csv('food_coded.csv')

# Looking at the first five rows of the data:
food.head()

Unnamed: 0,GPA,Gender,breakfast,calories_chicken,calories_day,calories_scone,coffee,comfort_food,comfort_food_reasons,comfort_food_reasons_coded,...,soup,sports,thai_food,tortilla_calories,turkey_calories,type_sports,veggies_day,vitamins,waffle_calories,weight
0,2.4,2,1,430,,315.0,1,none,we dont have comfort,9.0,...,1.0,1.0,1,1165.0,345,car racing,5,1,1315,187
1,3.654,1,1,610,3.0,420.0,2,"chocolate, chips, ice cream","Stress, bored, anger",1.0,...,1.0,1.0,2,725.0,690,Basketball,4,2,900,155
2,3.3,1,1,720,4.0,420.0,2,"frozen yogurt, pizza, fast food","stress, sadness",1.0,...,1.0,2.0,5,1165.0,500,none,5,1,900,I'm not answering this.
3,3.2,1,1,430,3.0,420.0,2,"Pizza, Mac and cheese, ice cream",Boredom,2.0,...,1.0,2.0,5,725.0,690,,3,1,1315,"Not sure, 240"
4,3.5,1,1,720,2.0,420.0,2,"Ice cream, chocolate, chips","Stress, boredom, cravings",1.0,...,1.0,1.0,4,940.0,500,Softball,4,2,760,190


In [36]:
missing_values(food)


 DataFrame with columns having missing values and their count of missing values:
                        Column  Missing Values
0                          GPA               2
1                 calories_day              19
2               calories_scone               1
3                 comfort_food               1
4         comfort_food_reasons               1
5   comfort_food_reasons_coded              19
6                         cook               3
7                      cuisine              17
8                 diet_current               1
9                        drink               2
10              eating_changes               3
11                  employment               9
12                    exercise              13
13            father_education               1
14           father_profession               3
15                 fav_cuisine               2
16                    fav_food               2
17              food_childhood               1
18                healthy

In [37]:
food_without_nans = treat_missing_values(food, 
                                         drop_columns=True, 
                                         drop_rows=True, 
                                         missing_data_to_zeros=False, 
                                         replace=False)


Columns in original dataset: 61 

Columns with na's dropped: 61 

Rows in original dataset: 125 

Rows with na's dropped: 52 



In [38]:
missing_values(food_without_nans)


 DataFrame with columns having missing values and their count of missing values:
Empty DataFrame
Columns: [Column, Missing Values]
Index: []

 The number of columns with missing data is 0. That is a 0.0 %.
