# U.S. Medical Insurance Costs

In [355]:
# Open dataset
with open('insurance.csv') as insurance_file:
    # Read the first line to have a look at what data
    # is available to us
    print(insurance_file.readline())
    # age,sex,bmi,children,smoker,region,charges

age,sex,bmi,children,smoker,region,charges



# Project objectives and scope
## Objectives
Try find something interesting in the dataset

As I'm not really sure what could be interesting in the dataset just yet, I'm going to pull some general metrics to see if there is some anomaly that stands out

## Path
- Open the data file
- Extract entries into a dictionary
- Process calculations into functions
  - Average cost of insurance
  - Average cost based on gender
  - Total cost based on age range 0 -20, 20 -30, 30 - 40, 40 - 50, 50+
  - Regions with the highest insurance costs
  - Average cost saving if no longer smoking
- Write up a meaningful report on findings

In [356]:
# Plan of action for data to be extracted

total_entries = 0
total_charges = 0
gender_charges = {
    "male":0,
    "female":0
}
age_charges = {}
region_charges = {}
smokes_vs_non_charges = {
    "yes":0,
    "no":0
}

# Plan of action for calculating results

def avgInsuranceCost(total_entries, total_insurance_cost):
    return total_insurance_cost / total_entries

def avgInsuranceCostByGender(gender_charges,total_entries):
    # key (Gender) += value + charges
    return {
        "male": gender_charges["male"] / total_entries,
        "female": gender_charges["female"] / total_entries,
    }
def totalInsuranceCostByAgeRange(age_charges):

    ranges = {
        "0 - 19": 0,
        "20 - 29": 0,
        "30 - 39": 0,
        "40 - 49": 0,
        "50+": 0,
    }
    for age in age_charges:
        if(age > 0 and age <= 19): ranges["0 - 19"] += age_charges[age]
        elif(age > 20 and age <= 29): ranges["20 - 29"] += age_charges[age]
        elif(age > 30 and age <= 39): ranges["30 - 39"] += age_charges[age]
        elif(age > 40 and age <= 49): ranges["40 - 49"] += age_charges[age]
        else: ranges["50+"] += age_charges[age]
    return ranges
def highestInsuranceRegion(region_charges):
    max = 0
    region = 0
    for key,value in region_charges.items():
        if(value > max):
            max = value
            region = key
    return {
        "region" : region,
        "insurance_cost": max
    }

def avgSavingIfStopSmoking(smokes_vs_non_charges, total_entries):
    return (smokes_vs_non_charges["yes"] - smokes_vs_non_charges["no"]) / total_entries


In [357]:
# Open dataset
import csv
# Adding the fields we want to calculate here just as a reminder
# so we don't need to keep scrolling up and down
# total_entries = 0
# total_charges = 0
# gender_charges = {}
# age_charges = {}
# region_charges = {}
# smokes_vs_non_charges = {}

with open('insurance.csv') as insurance_file:
    # Read the first line to have a look at what data
    # is available to us
    #print(insurance_file.readline())
    insurance_data = csv.DictReader(insurance_file)
    # age,sex,bmi,children,smoker,region,charges
    # Going to pre-calculate and store data we actually need
    # While looping through file in order to
    # optimize resource usage
    for entry in insurance_data:
        total_entries += 1
        charges = float(entry["charges"])
        total_charges += charges
        gender_charges[entry["sex"]] += charges
        age = int(entry["age"])
        if age not in age_charges:
            age_charges[age] = charges
        else:
            age_charges[age] += charges
        if entry["region"] not in region_charges:
            region_charges[entry["region"]] = charges
        else:
            region_charges[entry["region"]] += charges
        smokes_vs_non_charges[entry["smoker"]] += charges

In [358]:
# Now we calculate and print out our meaningful data

print("The average insurance cost is {} from total charges of {} in a sample of {} people".format(round(avgInsuranceCost(total_entries, total_charges),2), round(total_charges,2), total_entries))

average_per_gender = avgInsuranceCostByGender(gender_charges, total_entries)
print("The average cost of insurance per gender is male : {} female: {}".format(round(average_per_gender["male"],2),round(average_per_gender["female"],2)))

charges_by_age_group = totalInsuranceCostByAgeRange(age_charges)
for key,value in charges_by_age_group.items():
    print("Total cost per age range {} is {}".format(key,round(value,2)))

highestRegion = highestInsuranceRegion(region_charges)
print("The region with the highest paying insurance is {} who paid a total of {} ".format(highestRegion["region"],round(highestRegion["insurance_cost"],2)))

print("On average smokers where paying {} more then non smokers in our sample data".format(round(avgSavingIfStopSmoking(smokes_vs_non_charges, total_entries),2)))

The average insurance cost is 13270.42 from total charges of 17755824.99 in a sample of 1338 people
The average cost of insurance per gender is male : 7051.39 female: 6219.03
Total cost per age range 0 - 19 is 1151806.85
Total cost per age range 20 - 29 is 2382659.05
Total cost per age range 30 - 39 is 2673451.54
Total cost per age range 40 - 49 is 3699527.01
Total cost per age range 50+ is 7848380.55
The region with the highest paying insurance is southeast who paid a total of 5363689.76 
On average smokers where paying -143.72 more then non smokers in our sample data


# Expanding on our findings
Based on our investigation, one would have expected smokers to have been paying more on insure then non smokers

We will expand on our investigation by including other metrics in our dataset against smoking vs non

In [359]:
entries_charges = []
entries_smoking_vs_non = []
# Age range smoke vs non
ages_by_entry = []
# Genders smoking vs non
gender_by_entry = []
# Has children smoking vs non
# Going to keep the number of children for now
# instead of translating this into True / False
# Because the number of children may also show us
# some other unexpected insights
no_children_by_entry = []

def genderSmokingVsNon(entries_smoking_vs_non,gender_by_entry,entries_charges):
    byGender = list(zip(gender_by_entry,entries_smoking_vs_non,entries_charges))
    byGenderCount = {
        "male":{
            "yes":0,
            "no":0,
        },
        "female":{
            "yes":0,
            "no":0,
        }
    }
    for entry in byGender:
        byGenderCount[entry[0]][entry[1]] += entry[2]
    return byGenderCount

def withChildrenSmokingVsNon(entries_smoking_vs_non,no_children_by_entry,entries_charges):
    byChildren = list(zip(no_children_by_entry,entries_smoking_vs_non,entries_charges))
    byChildrenCount = {
        "children":{
            "yes":0,
            "no":0,
        },
        "no-children":{
            "yes":0,
            "no":0,
        }
    }
    for entry in byChildren:
        if entry[0] > 0:
            byChildrenCount["children"][entry[1]] += entry[2]
        else:
            byChildrenCount["no-children"][entry[1]] += entry[2]
    return byChildrenCount

def ageRangesSmokingVsNon(entries_smoking_vs_non,ages_by_entry,entries_charges):
    byAge = list(zip(ages_by_entry,entries_smoking_vs_non,entries_charges))
    byAgeCount = {
        "0 - 19": { "yes": 0, "no":0 },
        "20 - 29": { "yes": 0, "no":0 },
        "30 - 39": { "yes": 0, "no":0 },
        "40 - 49": { "yes": 0, "no":0 },
        "50+": { "yes": 0, "no":0 }
    }
    for entry in byAge:
        range = ""
        if(entry[0] > 0 and entry[0] <= 19): range = "0 - 19"
        elif(entry[0] > 20 and entry[0] <= 29): range = "20 - 29"
        elif(entry[0] > 30 and entry[0] <= 39): range = "30 - 39"
        elif(entry[0] > 40 and entry[0] <= 49): range = "40 - 49"
        else: range = "50+"

        byAgeCount[range][entry[1]] += entry[2]

    return byAgeCount

In [360]:
with open('insurance.csv') as insurance_file:
    insurance_data = csv.DictReader(insurance_file)
    for entry in insurance_data:
        # Create a list for each entry for smoking
        entries_smoking_vs_non.append(entry["smoker"])
        entries_charges.append(float(entry["charges"]))
        gender_by_entry.append(entry["sex"])
        no_children_by_entry.append(int(entry["children"]))
        ages_by_entry.append(int(entry["age"]))

In [361]:
chargesByGenderSmokingVsNon = genderSmokingVsNon(entries_smoking_vs_non,gender_by_entry,entries_charges)
print("Males who smoke paid {} more then those that don't".format(
    round(chargesByGenderSmokingVsNon["male"]['yes'] - chargesByGenderSmokingVsNon["male"]['no'],2)
))
print("Females who smoke paid {} more then those that don't".format(
    round(chargesByGenderSmokingVsNon["female"]['yes'] - chargesByGenderSmokingVsNon["male"]['no'],2)
))
chargesByHavingChildrenSmokingVsNon = withChildrenSmokingVsNon(entries_smoking_vs_non,no_children_by_entry,entries_charges)
print("People with children who smoke paid {} more then those who don't smoke".format(
    round(chargesByHavingChildrenSmokingVsNon["children"]['yes'] - chargesByHavingChildrenSmokingVsNon["children"]['no'],2)
))
print("People without children who smoke paid {} more then those who don't smoke".format(
    round(chargesByHavingChildrenSmokingVsNon["no-children"]['yes'] - chargesByHavingChildrenSmokingVsNon["no-children"]['no'],2)
))
chargesByAgeRangeSmokingVsNon = ageRangesSmokingVsNon(entries_smoking_vs_non,ages_by_entry,entries_charges)
for key in chargesByAgeRangeSmokingVsNon:
    print("Ages {} who smoke paid {} more then those who didn't".format(key,round(chargesByAgeRangeSmokingVsNon[key]['yes'] - chargesByAgeRangeSmokingVsNon[key]['no'],2)))


Males who smoke paid 1072594.1 more then those that don't
Females who smoke paid -653000.27 more then those that don't
People with children who smoke paid -302741.66 more then those who don't smoke
People without children who smoke paid 110443.71 more then those who don't smoke
Ages 0 - 19 who smoke paid 431607.06 more then those who didn't
Ages 20 - 29 who smoke paid 324691.16 more then those who didn't
Ages 30 - 39 who smoke paid 340780.23 more then those who didn't
Ages 40 - 49 who smoke paid 73490.07 more then those who didn't
Ages 50+ who smoke paid -1362866.46 more then those who didn't


# Findings
Females who smoke paid less then those who didn't
People with children who paid less then those without children
People who are older then 50 who smoke paid less then those who didn't