In [1]:
import pandas as pd
import numpy as np

## Loading Data Into Pandas DataFrames

In [2]:
# example: loading the patients.csv file from the Massachusetts output folder
patients_ma = pd.read_csv('./output_ma/csv/patients.csv')

In [9]:
# display the first 5 rows
patients_ma.head(5)

Unnamed: 0,Id,BIRTHDATE,DEATHDATE,SSN,DRIVERS,PASSPORT,PREFIX,FIRST,MIDDLE,LAST,...,CITY,STATE,COUNTY,FIPS,ZIP,LAT,LON,HEALTHCARE_EXPENSES,HEALTHCARE_COVERAGE,INCOME
0,46780ba0-ce27-0fe2-6d9b-edbc9f806f46,2019-03-20,,999-97-7491,,,,Carroll471,Claribel706,Nolan344,...,Quincy,Massachusetts,Norfolk County,25021.0,2186,42.33029,-70.97801,20937.49,571.37,165325
1,5bfe5035-4142-4a70-c95b-04da612933e5,2016-08-24,,999-36-1936,,,,Elnora999,Felica690,Dare640,...,West Newbury,Massachusetts,Essex County,,0,42.797858,-71.003094,14041.48,16602.12,52690
2,04fbbd37-4ed3-bc81-8b0d-fe009da6ae23,2010-01-27,,999-43-1260,,,,Marilu588,Yolanda648,Blanco851,...,Georgetown,Massachusetts,Essex County,,0,42.700239,-70.972921,32734.87,15220.86,26408
3,7de648c7-1709-be32-c942-5f50dec468fd,2020-02-01,,999-74-1134,,,,Isaac321,Rudolf736,Zulauf375,...,Worthington,Massachusetts,Hampshire County,,0,42.356327,-72.903883,19370.52,1765.91,38627
4,06f1ee83-05f1-eb9a-9b78-40b4a8935da8,1998-11-28,,999-70-1906,S99989195,X58475593X,Ms.,Laurinda852,,Dicki44,...,Boston,Massachusetts,Suffolk County,25025.0,2199,42.37701,-70.970433,121925.91,0.0,35358


In [10]:
# display the columns in the DataFrame
patients_ma.columns

Index(['Id', 'BIRTHDATE', 'DEATHDATE', 'SSN', 'DRIVERS', 'PASSPORT', 'PREFIX',
       'FIRST', 'MIDDLE', 'LAST', 'SUFFIX', 'MAIDEN', 'MARITAL', 'RACE',
       'ETHNICITY', 'GENDER', 'BIRTHPLACE', 'ADDRESS', 'CITY', 'STATE',
       'COUNTY', 'FIPS', 'ZIP', 'LAT', 'LON', 'HEALTHCARE_EXPENSES',
       'HEALTHCARE_COVERAGE', 'INCOME'],
      dtype='object')

In [11]:
# display the columns and their datatypes
patients_ma.dtypes

Id                      object
BIRTHDATE               object
DEATHDATE               object
SSN                     object
DRIVERS                 object
PASSPORT                object
PREFIX                  object
FIRST                   object
MIDDLE                  object
LAST                    object
SUFFIX                  object
MAIDEN                  object
MARITAL                 object
RACE                    object
ETHNICITY               object
GENDER                  object
BIRTHPLACE              object
ADDRESS                 object
CITY                    object
STATE                   object
COUNTY                  object
FIPS                   float64
ZIP                      int64
LAT                    float64
LON                    float64
HEALTHCARE_EXPENSES    float64
HEALTHCARE_COVERAGE    float64
INCOME                   int64
dtype: object

In [12]:
# accessing the data in a particular column
patients_ma['FIRST']

0        Carroll471
1         Elnora999
2         Marilu588
3          Isaac321
4       Laurinda852
           ...     
2403       Ollie731
2404       Bryon392
2405        Todd315
2406      Dennis979
2407      Donnie175
Name: FIRST, Length: 2408, dtype: object

## Basic Data Manipulation with DataFrames

### Counting

In [30]:
# count the number of male and female patients
patients_ma['GENDER'].value_counts()

GENDER
M    1205
F    1203
Name: count, dtype: int64

### Descriptive Statistics

In [14]:
# compute the average healthcare expenses
patients_ma['HEALTHCARE_EXPENSES'].mean()

195597.66257475084

In [15]:
# accessing a larger suite of descriptive statistics
patients_ma['HEALTHCARE_EXPENSES'].describe()

count    2.408000e+03
mean     1.955977e+05
std      2.569421e+05
min      1.000000e+02
25%      2.332395e+04
50%      1.013151e+05
75%      2.503299e+05
max      1.885940e+06
Name: HEALTHCARE_EXPENSES, dtype: float64

### Filtering Rows and Columns

In [21]:
# Selecting only the BIRTHDATE, RACE, ETHNICITY, and SEX columns
subset = patients_ma[['BIRTHDATE', 'RACE', 'ETHNICITY', 'GENDER']]

In [22]:
subset

Unnamed: 0,BIRTHDATE,RACE,ETHNICITY,GENDER
0,2019-03-20,white,nonhispanic,F
1,2016-08-24,white,nonhispanic,F
2,2010-01-27,white,hispanic,F
3,2020-02-01,white,nonhispanic,M
4,1998-11-28,white,nonhispanic,F
...,...,...,...,...
2403,1923-11-28,white,nonhispanic,M
2404,1923-11-28,white,nonhispanic,M
2405,1923-11-28,white,nonhispanic,M
2406,1923-11-28,white,nonhispanic,M


In [23]:
# Selecting only patients who are female
females = patients_ma[patients_ma['GENDER'] == 'F'] # or patients_ma.query('GENDER == "F"')

In [24]:
females

Unnamed: 0,Id,BIRTHDATE,DEATHDATE,SSN,DRIVERS,PASSPORT,PREFIX,FIRST,MIDDLE,LAST,...,CITY,STATE,COUNTY,FIPS,ZIP,LAT,LON,HEALTHCARE_EXPENSES,HEALTHCARE_COVERAGE,INCOME
0,46780ba0-ce27-0fe2-6d9b-edbc9f806f46,2019-03-20,,999-97-7491,,,,Carroll471,Claribel706,Nolan344,...,Quincy,Massachusetts,Norfolk County,25021.0,2186,42.330290,-70.978010,20937.49,571.37,165325
1,5bfe5035-4142-4a70-c95b-04da612933e5,2016-08-24,,999-36-1936,,,,Elnora999,Felica690,Dare640,...,West Newbury,Massachusetts,Essex County,,0,42.797858,-71.003094,14041.48,16602.12,52690
2,04fbbd37-4ed3-bc81-8b0d-fe009da6ae23,2010-01-27,,999-43-1260,,,,Marilu588,Yolanda648,Blanco851,...,Georgetown,Massachusetts,Essex County,,0,42.700239,-70.972921,32734.87,15220.86,26408
4,06f1ee83-05f1-eb9a-9b78-40b4a8935da8,1998-11-28,,999-70-1906,S99989195,X58475593X,Ms.,Laurinda852,,Dicki44,...,Boston,Massachusetts,Suffolk County,25025.0,2199,42.377010,-70.970433,121925.91,0.00,35358
5,48de520c-98d2-c34a-120d-d8b9a9173bcd,1998-02-05,,999-66-8611,S99931688,X61450447X,Ms.,Julian715,Azzie965,Beahan375,...,Peabody,Massachusetts,Essex County,25009.0,1960,42.566324,-71.009472,782522.84,492598.03,48191
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2376,f6cdb670-ef8e-b240-0ee7-22e8bfd2ced9,2007-05-31,,999-72-7858,S99980477,,,Shonna561,Stacey209,Kerluke267,...,Lee,Massachusetts,Berkshire County,25003.0,1238,42.265367,-73.229778,42077.62,13476.46,27543
2378,ece94bdf-e9c1-b103-e856-48f36c1c2e33,1999-10-27,,999-10-8493,S99913853,X54595216X,Ms.,Yun266,,Johns824,...,Boston,Massachusetts,Suffolk County,25025.0,2127,42.381922,-71.098051,89627.93,108596.18,166569
2382,073e0b9c-5dc0-1f6c-1967-c3708bb7eb5a,1941-10-09,,999-92-6109,S99957589,X24293347X,Mrs.,Neely59,Rebecka733,Lockman863,...,Lawrence,Massachusetts,Essex County,25009.0,1843,42.661309,-71.184608,822564.46,385715.03,87376
2387,1d29ba45-ec53-fcc2-aa24-e7e2535881b0,1981-05-23,,999-34-1281,S99989158,X2831604X,Mrs.,Brittni468,Alana17,Renner328,...,Rockport,Massachusetts,Essex County,25009.0,1966,42.684309,-70.609680,258942.13,494071.44,186139


In [28]:
# get patients born after 1990
patients_ma[patients_ma['BIRTHDATE'] >= '1990-01-01']

Unnamed: 0,Id,BIRTHDATE,DEATHDATE,SSN,DRIVERS,PASSPORT,PREFIX,FIRST,MIDDLE,LAST,...,CITY,STATE,COUNTY,FIPS,ZIP,LAT,LON,HEALTHCARE_EXPENSES,HEALTHCARE_COVERAGE,INCOME
0,46780ba0-ce27-0fe2-6d9b-edbc9f806f46,2019-03-20,,999-97-7491,,,,Carroll471,Claribel706,Nolan344,...,Quincy,Massachusetts,Norfolk County,25021.0,2186,42.330290,-70.978010,20937.49,571.37,165325
1,5bfe5035-4142-4a70-c95b-04da612933e5,2016-08-24,,999-36-1936,,,,Elnora999,Felica690,Dare640,...,West Newbury,Massachusetts,Essex County,,0,42.797858,-71.003094,14041.48,16602.12,52690
2,04fbbd37-4ed3-bc81-8b0d-fe009da6ae23,2010-01-27,,999-43-1260,,,,Marilu588,Yolanda648,Blanco851,...,Georgetown,Massachusetts,Essex County,,0,42.700239,-70.972921,32734.87,15220.86,26408
3,7de648c7-1709-be32-c942-5f50dec468fd,2020-02-01,,999-74-1134,,,,Isaac321,Rudolf736,Zulauf375,...,Worthington,Massachusetts,Hampshire County,,0,42.356327,-72.903883,19370.52,1765.91,38627
4,06f1ee83-05f1-eb9a-9b78-40b4a8935da8,1998-11-28,,999-70-1906,S99989195,X58475593X,Ms.,Laurinda852,,Dicki44,...,Boston,Massachusetts,Suffolk County,25025.0,2199,42.377010,-70.970433,121925.91,0.00,35358
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2369,36a05364-9f82-8c3c-a89f-f070b0bb397c,2004-07-03,,999-19-7979,S99999765,X2421749X,Ms.,Candelaria844,Hettie215,Christiansen251,...,Lincoln,Massachusetts,Middlesex County,,0,42.455421,-71.304861,79943.78,847989.01,106226
2374,59d9d297-74fe-29d4-e0bd-e1df83bf849a,1991-01-17,,999-22-9811,S99986275,X40679841X,Mrs.,Launa267,Renita685,McKenzie376,...,Sherborn,Massachusetts,Middlesex County,,0,42.210696,-71.419405,134472.57,18846.76,39645
2376,f6cdb670-ef8e-b240-0ee7-22e8bfd2ced9,2007-05-31,,999-72-7858,S99980477,,,Shonna561,Stacey209,Kerluke267,...,Lee,Massachusetts,Berkshire County,25003.0,1238,42.265367,-73.229778,42077.62,13476.46,27543
2378,ece94bdf-e9c1-b103-e856-48f36c1c2e33,1999-10-27,,999-10-8493,S99913853,X54595216X,Ms.,Yun266,,Johns824,...,Boston,Massachusetts,Suffolk County,25025.0,2127,42.381922,-71.098051,89627.93,108596.18,166569


### Sorting

In [29]:
# get patients born after 1990, and sort by birthdate
patients_ma[patients_ma['BIRTHDATE'] >= '1990-01-01'].sort_values(by='BIRTHDATE')

Unnamed: 0,Id,BIRTHDATE,DEATHDATE,SSN,DRIVERS,PASSPORT,PREFIX,FIRST,MIDDLE,LAST,...,CITY,STATE,COUNTY,FIPS,ZIP,LAT,LON,HEALTHCARE_EXPENSES,HEALTHCARE_COVERAGE,INCOME
1684,29b01929-38bb-3061-0ffb-6465e74cd562,1990-02-05,,999-48-9795,S99917585,X38711068X,Mr.,Thurman577,Mark765,Rodriguez71,...,Boston,Massachusetts,Suffolk County,25025.0,2127,42.345043,-71.046470,87011.77,27911.04,58790
1178,416c4ad9-6d1e-9e86-de14-ad32131a8151,1990-02-11,,999-60-6542,S99913832,X45317331X,Mr.,Craig656,Victor265,Emmerich580,...,Danvers,Massachusetts,Essex County,25009.0,1923,42.623495,-70.977659,88999.54,19321.59,60068
1176,7f974b69-e4b9-5d25-3db5-11c63f39b737,1990-02-24,,999-69-4072,S99964553,X49149018X,Mr.,Bennie663,Wally311,Smith67,...,Tewksbury,Massachusetts,Middlesex County,,0,42.635723,-71.263745,82738.30,25320.72,46872
1165,b464fdb9-5e22-1c32-f1b4-a6614ddadc10,1990-02-24,1991-08-13,999-31-1381,,,,Warren653,Oliver401,Nicolas769,...,Tewksbury,Massachusetts,Middlesex County,,0,42.596540,-71.252044,9524.83,10191.92,46872
1454,c6fbfd1c-f3ad-7064-530f-2294ecd34c3b,1990-03-05,,999-69-2956,S99999161,X28184093X,Mrs.,Danette111,Kyung736,Bashirian201,...,Peabody,Massachusetts,Essex County,25009.0,1940,42.536428,-70.927714,10483.19,525270.91,15585
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
510,dbf4f095-8a24-36af-20f2-a42539d3a0ad,2024-09-10,,999-91-2121,,,,Larry532,Whitney250,Buckridge80,...,Randolph,Massachusetts,Norfolk County,25021.0,2368,42.214425,-71.050241,100.00,2975.31,23038
1693,5d866d4c-4cec-dd1c-97c7-ae8a33fa1ffb,2024-10-26,,999-96-4175,,,,Willis868,Bernardo699,Wehner319,...,Weymouth,Massachusetts,Norfolk County,25021.0,2190,42.211073,-70.954914,1237.70,0.00,49442
412,e02c7c61-481f-9bf4-d892-79e90f5bfa64,2024-12-03,,999-34-5287,,,,Bennett146,Sebastian508,Johns824,...,Milton,Massachusetts,Norfolk County,25021.0,2186,42.226589,-71.096970,620.18,0.00,376406
865,3c972dc6-715c-51ce-2063-f02b1b387b85,2025-01-09,,999-59-7083,,,,Christia477,Lakeisha206,Breitenberg711,...,Medford,Massachusetts,Middlesex County,25017.0,2145,42.420141,-71.136372,347.38,0.00,42149


### Assigning New Columns/Variables

In [41]:
# assign a full name column
patients_ma['FULLNAME'] = patients_ma['FIRST'] + ' ' + patients_ma['MIDDLE'] + ' ' + patients_ma['LAST']

In [42]:
patients_ma['FULLNAME']

0       Carroll471 Claribel706 Nolan344
1           Elnora999 Felica690 Dare640
2        Marilu588 Yolanda648 Blanco851
3          Isaac321 Rudolf736 Zulauf375
4                                   NaN
                     ...               
2403         Ollie731 Evan94 Quigley282
2404        Bryon392 Jamar151 Becker968
2405       Todd315 Kendall673 Jacobs452
2406        Dennis979 Billy698 Ferry570
2407     Donnie175 Neville893 Zemlak964
Name: FULLNAME, Length: 2408, dtype: object

## Quick Exercises

### 1. What is the median income of patients born after January 1st 1995

In [1]:
# Write and run your solution here

### 2. Count the number of patients in each COUNTY

In [2]:
# Write and run your solution here

### 3. What is the mean age (in years) of all patients?
#### Hint: You can convert BIRTHDATE to the datetime data type using pd.to_datetime()
#### Hint 2: You can get today's date with pd.to_datetime("today")
#### Hint 3: You can use the .dt.days method on a datetime column/series to get the value of the date expressed in days

In [3]:
# Write and run your solution here

### 4. Count the number of patients of each RACE who are over 60
#### Hint: If you didn't save the AGE that we computed in the last exercise to a column/variable, do that first

In [4]:
# Write and run your solution here