# US Medical Insurance Project
## Author: Gian Baldonado
Purpose: Investigate US medical insurance cost with core Python (no external libraries). The goal is to analyze the attributes of the **insurance.csv** file to learn more about general patient information. 

In [90]:
import csv
import math

My approach is to create lists containing the data for each attribute in the csv file, then pass those lists as arguments to a Patients class that can then perform various analyses. The insurance.csv contains the following attributes:

* Patient age
* Patient sex
* Patient bmi
* Patient number of children
* Patient smoking status
* Patient region
* Patient annual insurance cost

In [91]:
#Create empty lists for the various attributes in insurance.csv
patient_ages = []
patient_sexes = []
patient_bmis = []
patient_num_children = []
patient_smoker_statuses = []
patient_regions = []
patient_insurance_charges = []

In [92]:
#Helper unction to help load data from csv to the lists
def load_lists(csv_file, col_name, lst):
    with open("insurance.csv") as insurance_file:
        insurance_content = csv.DictReader(insurance_file)
        
        for patient_row in insurance_content:
            lst.append(patient_row[col_name])
        
        return lst

In [93]:
load_lists('insurance_csv', 'age', patient_ages)
load_lists('insurance_csv', 'sex', patient_sexes)
load_lists('insurance_csv', 'bmi', patient_bmis)
load_lists('insurance_csv', 'children', patient_num_children)
load_lists('insurance_csv', 'smoker', patient_smoker_statuses)
load_lists('insurance_csv', 'region', patient_regions)
load_lists('insurance_csv', 'charges', patient_insurance_charges)

['16884.924',
 '1725.5523',
 '4449.462',
 '21984.47061',
 '3866.8552',
 '3756.6216',
 '8240.5896',
 '7281.5056',
 '6406.4107',
 '28923.13692',
 '2721.3208',
 '27808.7251',
 '1826.843',
 '11090.7178',
 '39611.7577',
 '1837.237',
 '10797.3362',
 '2395.17155',
 '10602.385',
 '36837.467',
 '13228.84695',
 '4149.736',
 '1137.011',
 '37701.8768',
 '6203.90175',
 '14001.1338',
 '14451.83515',
 '12268.63225',
 '2775.19215',
 '38711',
 '35585.576',
 '2198.18985',
 '4687.797',
 '13770.0979',
 '51194.55914',
 '1625.43375',
 '15612.19335',
 '2302.3',
 '39774.2763',
 '48173.361',
 '3046.062',
 '4949.7587',
 '6272.4772',
 '6313.759',
 '6079.6715',
 '20630.28351',
 '3393.35635',
 '3556.9223',
 '12629.8967',
 '38709.176',
 '2211.13075',
 '3579.8287',
 '23568.272',
 '37742.5757',
 '8059.6791',
 '47496.49445',
 '13607.36875',
 '34303.1672',
 '23244.7902',
 '5989.52365',
 '8606.2174',
 '4504.6624',
 '30166.61817',
 '4133.64165',
 '14711.7438',
 '1743.214',
 '14235.072',
 '6389.37785',
 '5920.1041',
 '176

In [94]:
#Convert string to ints
def cast_type(lst, datatype):
    new_lst = []
    for strng in lst:
        if(datatype == 'int'):
            new_lst.append(int(strng))
        elif (datatype == 'float'):
            new_lst.append(float(strng))
    return new_lst

#Test
a = ["1", "2", "3"]
cast_type(a, "float")

[1.0, 2.0, 3.0]

In [96]:
#Helper function for average
average = lambda lst: round(sum(lst)/len(lst),2)

#For sample variance
var = lambda lst: sum([(x - average(lst)) ** 2 for x in lst]) / (len(lst) - 1)

#For sample stdev
stdev = lambda lst: round(math.sqrt(var(lst)), 2)

#Test
a = [1,2,3,4,5]
print(f'average: {average(a)}, sample variance: {var(a)}, sample standard deviation: {stdev(a)}')

average: 3.0, sample variance: 2.5, sample standard deviation: 1.58


I built a class called **Patients** with the following functions to help us for analysis:
   * analyze_ages()
   * analyze_sexes()
   * analyze_bmi()
   * get_regions()
   * calculate_stats_insurance_charges()
   * create_patient_table() - allows us to use the following functions:
       * calculate_charges_stats_by_age
       * calculate_charges_stats_by_sexes
       * calculate_charges_stats_by_bmis
       * calculate_charges_stats_by_smoker
       * calculate_charges_stats_by_region
       

In [430]:
class Patients:
    def __init__(self, ages, sexes, bmis, num_children, smoker_statuses, regions, insurance_charges):
        self.ages = ages
        self.sexes = sexes
        self.bmis = bmis
        self.num_children = num_children
        self.smoker_statuses = smoker_statuses
        self.regions = regions
        self.insurance_charges = insurance_charges
        
    def analyze_ages(self):
        #Convert strings to ints
        int_ages = cast_type(self.ages, "int")
        
        #Calculate average, sample variance, and sample stdev
        ages_average = average(int_ages)
        var(int_ages)
        ages_stdev = stdev(int_ages)
        
        return (f'Average patient age: {ages_average}, Standard deviation: {ages_stdev}')
    
    def analyze_sexes(self):
        females = 0
        males = 0
        
        for sex in self.sexes:
            if sex == 'female': 
                females+=1
            elif sex == 'male': 
                males+=1
            
        return(f'Count for female: {females}, Count for males: {males}')
    
    def analyze_bmis(self):
        #Convert strings to floats
        float_bmis = cast_type(self.bmis, "float")
        
        bmis_average = average(float_bmis)
        var(float_bmis)
        bmis_stdev = stdev(float_bmis)
        
        return (f'Average patient bmi: {bmis_average}, Standard deviation: {bmis_stdev}')
    
    def get_regions(self):
        regions = []
    
        for region in self.regions:
            if region not in regions:
                regions.append(region)
        
        return regions
    
    def calculate_stats_insurance_charges(self):
        #Convert charges to float
        float_charges = cast_type(self.insurance_charges, 'float')
        
        charges_average = average(float_charges)
        var(float_charges)
        charges_stdev = stdev(float_charges)
        
        return (f'Average patient charge: $ {charges_average}, Standard deviation: $ {charges_stdev}')
    
    
    def create_patient_table(self, c): 
        self.patients_dictionary = {}
        patient_id_list = []
        patient_info_list = []
        
        with open(c, "r") as file:
            content = csv.DictReader(file)
            id = 1
            
            for row in content:
                patient_id_list.append(f'Patient {id}')
                patient_info_list.append(row)
                id+=1
        
        zipped_patient_id_info = zip(patient_id_list, patient_info_list)
        self.patients_dictionary = {key:value for key, value in zipped_patient_id_info}
        return self.patients_dictionary
    
    ### More analysis functions
    def calculate_charges_stats_by_age(self, age_groups):
        stats = {} 
        ages = [] #list of all ages
        charges = [] #list of all charges
        age_categories = [] #list of age groupings
        
        #Create list of age categories from age_groups
        for key in age_groups.keys():
            age_categories.append(key)
            #Append an empty array for each category in ages and charges
            ages.append([])
            charges.append([])
            
        # print(age_categories)
            
        #Gather all ages and corresponding charges to calculate average
        for key,value in self.patients_dictionary.items():
            age = int(value.get("age"))
            cost = float(value.get("charges"))
        
            for i in range(len(age_categories)):
                if age in age_groups[age_categories[i]]:
                    ages[i].append(age)
                    charges[i].append(cost)
            
        # print(ages)
        
        #Make stats dic
        for i in range(len(age_categories)):
            stats[age_categories[i]] = {"average_costs": average(charges[i]),
                                         "stdev_costs": stdev(charges[i])
            }
        
        return stats
    
    def calculate_charges_stats_by_sexes(self):
        stats = {} 
        sexes = [] 
        charges = [] 
        sex_categories = [] 
        
        #Create list of sex categories and append appropriate sex and charge in the right array of sexes and charges respectively
        for key, value in self.patients_dictionary.items():
            sex = value.get("sex")
            cost = float(value.get("charges"))
            
            if sex not in sex_categories:
                sex_categories.append(sex)
                sexes.append([])
                charges.append([])
            
            if sex == 'female':
                sexes[0].append(sex)
                charges[0].append(cost)
            elif sex == 'male':
                sexes[1].append(sex)
                charges[1].append(cost)

        #Make stats dic
        for i in range(len(sex_categories)):
            stats[sex_categories[i]] = {"average_costs": average(charges[i]),
                                         "stdev_costs": stdev(charges[i])
            }
        
        return stats
    
    def calculate_charges_stats_by_bmis(self):
        bmi_definitions = { "underweight": 18.5, "healthy": 24.9,"overweight": 29.9,"obesity": 30.0}
        stats = {}
        bmis = []
        charges = []
        bmi_categories = []
        
        for key in bmi_definitions.keys():
            bmi_categories.append(key)
            bmis.append([])
            charges.append([])
    
        for key, value in self.patients_dictionary.items():
            bmi = float(value.get("bmi"))
            cost = float(value.get("charges"))

            if bmi > bmi_definitions["obesity"]:
                    bmis[3].append(bmi)
                    charges[3].append(cost)

            for i in range(len(bmi_categories)):
                if bmi < bmi_definitions[bmi_categories[i]]:
                    bmis[i].append(bmi)
                    charges[i].append(cost)
    
        for i in range(len(bmi_categories)):
            stats[bmi_categories[i]] = {"average_costs": average(charges[i]),
                                        "stdev_costs": stdev(charges[i]),
                                        }
        return stats
    
    def calculate_charges_stats_by_smoker(self):
        stats = {} 
        smoker_statuses = [] 
        charges = [] 
        smoker_categories = [] 
        
        #Create list of sex categories and append appropriate sex and charge in the right array of sexes and charges respectively
        for key, value in self.patients_dictionary.items():
            smoker = value.get("smoker")
            cost = float(value.get("charges"))
            
            if smoker not in smoker_categories:
                smoker_categories.append(smoker)
                smoker_statuses.append([])
                charges.append([])
            
            if smoker == 'yes':
                smoker_statuses[0].append(smoker)
                charges[0].append(cost)
            elif smoker == 'no':
                smoker_statuses[1].append(smoker)
                charges[1].append(cost)

        #Make stats dic
        for i in range(len(smoker_categories)):
            stats[smoker_categories[i]] = {"average_costs": average(charges[i]),
                                         "stdev_costs": stdev(charges[i])
            }
        
        return stats
    
    def calculate_charges_stats_by_region(self):
        stats = {} 
        regions = [] 
        charges = [] 
        region_categories = self.get_regions()
        
        #Create list of sex categories and append appropriate sex and charge in the right array of sexes and charges respectively
        for key, value in self.patients_dictionary.items():
            region = value.get("region")
            cost = float(value.get("charges"))
            
            for i in range(len(region_categories)):
                regions.append([])
                charges.append([])
            
            for i in range(len(region_categories)):
                if region == region_categories[i]:
                    regions[i].append(region)
                    charges[i].append(cost)
        
        #Make stats dic
        for i in range(len(region_categories)):
            stats[region_categories[i]] = {"average_costs": average(charges[i]),
                                         "stdev_costs": stdev(charges[i])
            }
        
        return stats

    

Create instance of the class called **patient_info**

In [410]:
patient_info = Patients(patient_ages, patient_sexes, patient_bmis, patient_num_children, patient_smoker_statuses, patient_regions, patient_insurance_charges)

## Analysis

In [405]:
patient_info.analyze_ages()

'Average patient age: 39.21, Standard deviation: 14.05'

The average age of the patients in the sample is 39 years old plus or minus 14.05 years (1 standard deviation away from the mean). This important to check to ensure that our sample is representative of the broader population.

In [421]:
patient_info.analyze_sexes()

'Count for female: 662, Count for males: 676'

There is about a balance number of females and males in this study, which is good.

In [422]:
patient_info.analyze_bmis()

'Average patient bmi: 30.66, Standard deviation: 6.1'

The average patient has a BMI of 30.66, which is considered overweight in the BMI scale.

In [423]:
patient_info.calculate_stats_insurance_charges()

'Average patient charge: $ 13270.42, Standard deviation: $ 12110.01'

Average medical cost for a patient in the US is $ 13,270.42. What attributes contribute strongly to these high/low medical insurance charges? Check out the further analyses below.

In [415]:
patient_info.create_patient_table("insurance.csv")

{'Patient 1': {'age': '19',
  'sex': 'female',
  'bmi': '27.9',
  'children': '0',
  'smoker': 'yes',
  'region': 'southwest',
  'charges': '16884.924'},
 'Patient 2': {'age': '18',
  'sex': 'male',
  'bmi': '33.77',
  'children': '1',
  'smoker': 'no',
  'region': 'southeast',
  'charges': '1725.5523'},
 'Patient 3': {'age': '28',
  'sex': 'male',
  'bmi': '33',
  'children': '3',
  'smoker': 'no',
  'region': 'southeast',
  'charges': '4449.462'},
 'Patient 4': {'age': '33',
  'sex': 'male',
  'bmi': '22.705',
  'children': '0',
  'smoker': 'no',
  'region': 'northwest',
  'charges': '21984.47061'},
 'Patient 5': {'age': '32',
  'sex': 'male',
  'bmi': '28.88',
  'children': '0',
  'smoker': 'no',
  'region': 'northwest',
  'charges': '3866.8552'},
 'Patient 6': {'age': '31',
  'sex': 'female',
  'bmi': '25.74',
  'children': '0',
  'smoker': 'no',
  'region': 'southeast',
  'charges': '3756.6216'},
 'Patient 7': {'age': '46',
  'sex': 'female',
  'bmi': '33.44',
  'children': '1',
 

In [424]:
age_definitions = {"young_adults": range(18,36), "middle_adults": range(35, 55),"older_adults": range(56, 120)}
patient_info.calculate_charges_stats_by_age(age_definitions)

{'young_adults': {'average_costs': 9744.48, 'stdev_costs': 11579.68},
 'middle_adults': {'average_costs': 14561.17, 'stdev_costs': 11823.25},
 'older_adults': {'average_costs': 18795.99, 'stdev_costs': 11482.26}}

We can interpret from the data above that the older the person, the higher the average insurance cost. Some older adults pay about double the amount of some younger adults pay.

In [426]:
patient_info.calculate_charges_stats_by_sexes()

{'female': {'average_costs': 12569.58, 'stdev_costs': 11128.7},
 'male': {'average_costs': 13956.75, 'stdev_costs': 12971.03}}

From this sample, male patients pay $1000 more on average compared to female patients.

In [427]:
patient_info.calculate_charges_stats_by_bmis()

{'underweight': {'average_costs': 8852.2, 'stdev_costs': 7735.04},
 'healthy': {'average_costs': 10253.28, 'stdev_costs': 7515.74},
 'overweight': {'average_costs': 10726.54, 'stdev_costs': 7836.29},
 'obesity': {'average_costs': 13271.54, 'stdev_costs': 12113.32}}

Patients who are obese have the highest average insurance cost, while patients who are considered overweight and healthy pay about roughly the same on average. People who are underweight pay the least on average.

In [428]:
patient_info.calculate_charges_stats_by_smoker()

{'yes': {'average_costs': 32050.23, 'stdev_costs': 11541.55},
 'no': {'average_costs': 8434.27, 'stdev_costs': 5993.78}}

Smokers pay about roughly 25% than their non-smoker counterparts in insurance costs.

In [429]:
patient_info.calculate_charges_stats_by_region()

{'southwest': {'average_costs': 12346.94, 'stdev_costs': 11557.18},
 'southeast': {'average_costs': 14735.41, 'stdev_costs': 13971.1},
 'northwest': {'average_costs': 12417.58, 'stdev_costs': 11072.28},
 'northeast': {'average_costs': 13406.38, 'stdev_costs': 11255.8}}

Regions in the northwest and southwest pay roughly the same average costs, while regions in the northeast and southeast pay about $1000 more. 