# Table of Contents

0.1 Import Libraries

0.2 Import Data: donors-census.pkl

0.3 Explore Original Dataframes

0.4 Enriching data
   
0.5 Export New Dataframe: donors-census-plus.pkl



### 0.1 Import Libraries

In [1]:
# Import libraries
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns

### 0.2 Import Data: donors-census.pkl


In [2]:
# Identify the file pathway to data files
path = r'C:\Users\CJ\Documents\_CJ-Stuff\Career Foundry\Data Immersion\Ach 6 - Adv Analytics and Dashboard\Donate Life Project'

In [3]:
# Import data
df = pd.read_pickle(os.path.join(path, '02 Data', 'Prepared Data', 'donors-census.pkl'))

### 0.3 Explore Original Dataframes

#### Confirming shape, column names, datatyupes, and overall 'look' of data

In [4]:
df.shape

(56000, 10)

In [5]:
df.head()

Unnamed: 0,population,year,state,age_group,gender,ethnicity,key,living_donors,deceased_donors,total_donors
0,96.0,2013,Alabama,< 1 Year,Male,American Indian/Alaska Native (Non-Hispanic),2013-Alabama-<1Year-Male-AmericanIndian/Alaska...,0.0,0.0,0.0
1,123.0,2014,Alabama,< 1 Year,Male,American Indian/Alaska Native (Non-Hispanic),2014-Alabama-<1Year-Male-AmericanIndian/Alaska...,0.0,0.0,0.0
2,78.0,2015,Alabama,< 1 Year,Male,American Indian/Alaska Native (Non-Hispanic),2015-Alabama-<1Year-Male-AmericanIndian/Alaska...,0.0,0.0,0.0
3,85.0,2016,Alabama,< 1 Year,Male,American Indian/Alaska Native (Non-Hispanic),2016-Alabama-<1Year-Male-AmericanIndian/Alaska...,0.0,0.0,0.0
4,95.0,2017,Alabama,< 1 Year,Male,American Indian/Alaska Native (Non-Hispanic),2017-Alabama-<1Year-Male-AmericanIndian/Alaska...,0.0,0.0,0.0


In [6]:
df.tail()

Unnamed: 0,population,year,state,age_group,gender,ethnicity,key,living_donors,deceased_donors,total_donors
55995,45781.0,2018,Wyoming,65+,Female,White (Non-Hispanic),2018-Wyoming-65+-Female-White(Non-Hispanic),1.0,1.0,2.0
55996,47316.0,2019,Wyoming,65+,Female,White (Non-Hispanic),2019-Wyoming-65+-Female-White(Non-Hispanic),0.0,0.0,0.0
55997,47690.0,2020,Wyoming,65+,Female,White (Non-Hispanic),2020-Wyoming-65+-Female-White(Non-Hispanic),0.0,0.0,0.0
55998,49349.0,2021,Wyoming,65+,Female,White (Non-Hispanic),2021-Wyoming-65+-Female-White(Non-Hispanic),0.0,0.0,0.0
55999,50885.0,2022,Wyoming,65+,Female,White (Non-Hispanic),2022-Wyoming-65+-Female-White(Non-Hispanic),0.0,0.0,0.0


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 56000 entries, 0 to 55999
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype   
---  ------           --------------  -----   
 0   population       56000 non-null  float64 
 1   year             56000 non-null  int16   
 2   state            56000 non-null  category
 3   age_group        56000 non-null  category
 4   gender           56000 non-null  category
 5   ethnicity        56000 non-null  category
 6   key              56000 non-null  object  
 7   living_donors    56000 non-null  float64 
 8   deceased_donors  56000 non-null  float64 
 9   total_donors     56000 non-null  float64 
dtypes: category(4), float64(4), int16(1), object(1)
memory usage: 2.9+ MB


In [8]:
df.describe()

Unnamed: 0,population,year,living_donors,deceased_donors,total_donors
count,56000.0,56000.0,56000.0,56000.0,56000.0
mean,57996.9,2017.5,1.091036,1.936482,3.027518
std,163215.5,2.872307,5.079285,7.377671,11.684521
min,0.0,2013.0,0.0,0.0,0.0
25%,929.0,2015.0,0.0,0.0,0.0
50%,5414.5,2017.5,0.0,0.0,0.0
75%,32426.5,2020.0,0.0,1.0,1.0
max,2247291.0,2022.0,98.0,139.0,171.0


Data appears as expected.

#### Checking for missing values, duplicates & mixed data types

In [9]:
# Checking for nulls across the df
df.isnull().sum()

population         0
year               0
state              0
age_group          0
gender             0
ethnicity          0
key                0
living_donors      0
deceased_donors    0
total_donors       0
dtype: int64

In [10]:
# Check for mixed data types
for col in df.columns.tolist():
      weird = (df[[col]].applymap(type) != df[[col]].iloc[0].apply(type)).any(axis = 1)
      if len (df[weird]) > 0:
        print (col)

In [11]:
dups = df.duplicated()
dups.shape

(56000,)

No nulls, mixed data types, or duplicates found.

### 0.4 Enriching data

Extract summed data grouped by year for population, living donors, and deceased donors.

In [95]:
for yr in range (2013, 2023):
    df.loc[df['year'] == yr, 'natl_pop'] = df[df['year'] == yr].population.sum()
    df.loc[df['year'] == yr, 'natl_liv_don'] = df[df['year'] == yr].living_donors.sum()
    df.loc[df['year'] == yr, 'natl_dec_don'] = df[df['year'] == yr].deceased_donors.sum()

Confirming that the new columns have the right data and counts

In [102]:
df['natl_pop'].value_counts()

315343134.0    5600
317638680.0    5600
319959763.0    5600
322255496.0    5600
324290633.0    5600
325985954.0    5600
327533774.0    5600
330840644.0    5600
331362763.0    5600
332615754.0    5600
Name: natl_pop, dtype: int64

In [103]:
df['natl_liv_don'].value_counts()

5813.0    5600
5606.0    5600
5768.0    5600
5773.0    5600
6001.0    5600
6646.0    5600
7202.0    5600
5593.0    5600
6400.0    5600
6296.0    5600
Name: natl_liv_don, dtype: int64

In [104]:
df['natl_dec_don'].value_counts()

8120.0     5600
8439.0     5600
8920.0     5600
9825.0     5600
10129.0    5600
10547.0    5600
11693.0    5600
12419.0    5600
13666.0    5600
14685.0    5600
Name: natl_dec_don, dtype: int64

In [105]:
df.head(30)

Unnamed: 0,population,year,state,age_group,gender,ethnicity,key,living_donors,deceased_donors,total_donors,natl_pop,natl_liv_don,natl_dec_don
0,96.0,2013,Alabama,< 1 Year,Male,American Indian/Alaska Native (Non-Hispanic),2013-Alabama-<1Year-Male-AmericanIndian/Alaska...,0.0,0.0,0.0,315343134.0,5813.0,8120.0
1,123.0,2014,Alabama,< 1 Year,Male,American Indian/Alaska Native (Non-Hispanic),2014-Alabama-<1Year-Male-AmericanIndian/Alaska...,0.0,0.0,0.0,317638680.0,5606.0,8439.0
2,78.0,2015,Alabama,< 1 Year,Male,American Indian/Alaska Native (Non-Hispanic),2015-Alabama-<1Year-Male-AmericanIndian/Alaska...,0.0,0.0,0.0,319959763.0,5768.0,8920.0
3,85.0,2016,Alabama,< 1 Year,Male,American Indian/Alaska Native (Non-Hispanic),2016-Alabama-<1Year-Male-AmericanIndian/Alaska...,0.0,0.0,0.0,322255496.0,5773.0,9825.0
4,95.0,2017,Alabama,< 1 Year,Male,American Indian/Alaska Native (Non-Hispanic),2017-Alabama-<1Year-Male-AmericanIndian/Alaska...,0.0,0.0,0.0,324290633.0,6001.0,10129.0
5,131.0,2018,Alabama,< 1 Year,Male,American Indian/Alaska Native (Non-Hispanic),2018-Alabama-<1Year-Male-AmericanIndian/Alaska...,0.0,0.0,0.0,325985954.0,6646.0,10547.0
6,165.0,2019,Alabama,< 1 Year,Male,American Indian/Alaska Native (Non-Hispanic),2019-Alabama-<1Year-Male-AmericanIndian/Alaska...,0.0,0.0,0.0,327533774.0,7202.0,11693.0
7,114.0,2020,Alabama,< 1 Year,Male,American Indian/Alaska Native (Non-Hispanic),2020-Alabama-<1Year-Male-AmericanIndian/Alaska...,0.0,0.0,0.0,330840644.0,5593.0,12419.0
8,55.0,2021,Alabama,< 1 Year,Male,American Indian/Alaska Native (Non-Hispanic),2021-Alabama-<1Year-Male-AmericanIndian/Alaska...,0.0,0.0,0.0,331362763.0,6400.0,13666.0
9,60.0,2022,Alabama,< 1 Year,Male,American Indian/Alaska Native (Non-Hispanic),2022-Alabama-<1Year-Male-AmericanIndian/Alaska...,0.0,0.0,0.0,332615754.0,6296.0,14685.0


# 0.5 Export New Dataframe

In [106]:
# Confirming final shape and datetypes
df.shape

(56000, 13)

In [107]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 56000 entries, 0 to 55999
Data columns (total 13 columns):
 #   Column           Non-Null Count  Dtype   
---  ------           --------------  -----   
 0   population       56000 non-null  float64 
 1   year             56000 non-null  int16   
 2   state            56000 non-null  category
 3   age_group        56000 non-null  category
 4   gender           56000 non-null  category
 5   ethnicity        56000 non-null  category
 6   key              56000 non-null  object  
 7   living_donors    56000 non-null  float64 
 8   deceased_donors  56000 non-null  float64 
 9   total_donors     56000 non-null  float64 
 10  natl_pop         56000 non-null  float64 
 11  natl_liv_don     56000 non-null  float64 
 12  natl_dec_don     56000 non-null  float64 
dtypes: category(4), float64(7), int16(1), object(1)
memory usage: 6.2+ MB


In [108]:
# Export df as a pickle file for future analysis in Python
df.to_pickle(os.path.join(path, '02 Data','Prepared Data', 'donors-census-plus.pkl'))

In [109]:
# Export a copy of the df as .csv that can be opened in Excel
df.to_csv(os.path.join(path, '02 Data','Prepared Data', 'donors-census-plus.csv'), index = False)