# U.S. Medical Insurance Costs

# Loading data

In [1]:
import csv

age = []
sex = []
bmi = []
children = []
smoker = []
region = []
charges = []

with open("insurance.csv") as data:
  data_dict = csv.DictReader(data)
  for row in data_dict:
    age.append(int(row["age"]))
    sex.append(row["sex"])
    bmi.append(float(row["bmi"]))
    children.append(int(row["children"]))
    smoker.append(row["smoker"])
    region.append(row["region"])
    charges.append(float(row["charges"]))
    
print(age[0:5], sex[0:5], bmi[0:5], children[0:5])
print(smoker[0:5], region[0:5], charges[0:5])

[19, 18, 28, 33, 32] ['female', 'male', 'male', 'male', 'male'] [27.9, 33.77, 33.0, 22.705, 28.88] [0, 1, 3, 0, 0]
['yes', 'no', 'no', 'no', 'no'] ['southwest', 'southeast', 'southeast', 'northwest', 'northwest'] [16884.924, 1725.5523, 4449.462, 21984.47061, 3866.8552]


In [2]:
#representing sex as 0 (female) or 1 (male)
sex_num = [0 if person == 'female' else 1 if person == 'male' else None for person in sex]

print(sex_num[0:5])

#representing smoking status as 0 (not a smoker) or 1 (smoker)
smoker_num = [0 if ans == 'no' else 1 if ans == 'yes' else None for ans in smoker]

print(smoker_num[0:5])

[0, 1, 1, 1, 1]
[1, 0, 0, 0, 0]


In [3]:
#list with dictionaries for every record
datapoints = []
for i in range(len(age)):
    record = {
        "id" : i+1,
        "age" : age[i],
        "sex" : sex_num[i],
        "bmi" : bmi[i], 
        "children" : children[i],
        "smoker" : smoker_num[i],
        "region" : region[i],
        "charges" : charges[i]
    }
    datapoints.append(record)

print(datapoints[0:2])

[{'id': 1, 'age': 19, 'sex': 0, 'bmi': 27.9, 'children': 0, 'smoker': 1, 'region': 'southwest', 'charges': 16884.924}, {'id': 2, 'age': 18, 'sex': 1, 'bmi': 33.77, 'children': 1, 'smoker': 0, 'region': 'southeast', 'charges': 1725.5523}]


# Classes and helper functions
## Record Class
This class will be used to represent all info about a person (a record) in an object.

In [4]:
class Record():
    
    def __init__(self, age, sex, bmi, children, smoker, region, charges):
        self.age = age
        self.sex = sex #numeric format
        self.bmi = bmi
        self.children = children
        self.smoker = smoker #numeric format
        self.region = region
        self.charges = charges  


## Population Class
This class will represent populations (groups) of people as an object

In [58]:
class Group():
    
    def __init__(self, name):
        self.size = 0
        self.group = []
        self.name = name
        
    def __repr__(self):
        return f"group '{self.name}'"
    
    #adds a record to a group
    def add_person(self, record):
        self.size += 1
        self.group.append(record)
        
    def average_age(self, roundto = 3):
        total = 0
        for record in self.group:
            total += record.age
        return round(total / self.size, roundto)
    
    def fraction_of_men(self, roundto = 3):
        total = 0
        for record in self.group:
            total += record.sex
        return round(total / self.size, roundto)
    
    def fraction_of_women(self, roundto = 3):
        result = 1 - self.fraction_of_men(roundto)
        return round(result, roundto)
    
    def average_bmi(self, roundto = 3):
        total = 0
        for record in self.group:
            total += record.bmi
        return round(total / self.size, roundto) 
    
    def average_children(self, roundto = 3):
        total = 0
        for record in self.group:
            total += record.children
        return round(total / self.size, roundto)
    
    def fraction_of_smokers(self, roundto = 3):
        total = 0
        for record in self.group:
            total += record.smoker
        return round(total / self.size, roundto)
    
    def fraction_of_nonsmokers(self, roundto = 3):
        result = 1 - self.fraction_of_smokers(roundto)
        return round(result, roundto)
    
    def fraction_per_region(self, one_region, roundto = 3):
        total = 0
        for record in self.group:
            if record.region == one_region:
                total += 1
        return round(total / self.size, roundto)
    
    def average_charges(self, roundto = 4):
        total = 0
        for record in self.group:
            total += record.charges
        return round(total / self.size, roundto) 

## Function compare_groups
This function allows to compare a list of group averages for different variables (or parameters).

It compares groups with regard to 1 parameter, which could be:
1. "age" - prints average ages (for a list of groups),
2. "sex" - prints fraction of men, 
3. "bmi" - prints average bmi,
4. "children" - prints average amount of children per person, 
5. "smoker" - prints fraction of smokers,
6. region_name from the global variable region - prints fraction of people living in "region_name",
7. "smoker" - prints fraction of smokers,
6. "charges" - prints average charges.

In [73]:
def compare_groups(group_list,parameter):
    
    global region
    
    result = []
    value = None

    for group in group_list:
        
        if parameter == "age":
            value = group.average_age()
            print(f"The {group} has average age {value}.")
        
        elif parameter == "sex":
            value = group.fraction_of_men() * 100
            print(f"The {group} has {value}% of men.")
        
        elif parameter == "bmi":
            value = group.average_bmi()
            print(f"The {group} has average BMI of {value}.")
        
        elif parameter == "children":
            value = group.average_children()
            if float(value) == 1.0:
                ending = "child"
            else:
                ending = "children"
            print(f"The {group} has on average {value} {ending}.")
        
        elif parameter == "smoker":
            value = group.fraction_of_smokers() * 100
            print(f"The {group} has {value}% of smokers.")
    
        elif parameter in set(region):
            one_region = parameter
            value = group.fraction_per_region(one_region) * 100
            value = round(value,3) #internal rounding in the class method somehow not always works..
            print(f"The {group} has {value}% of people in {one_region} region.")
        
        elif parameter == "charges":
            value = group.average_charges()
            print(f"The {group} has average charges of {value}.")
            
        result.append((group, value))
       
    result.sort(key=lambda y: y[1])
    return result

In [49]:
#testing creation of Groups and group methods

group1 = Group("first ten")

for datapoint in datapoints[0:10]:
    record = Record(
        datapoint["age"],
        datapoint["sex"],
        datapoint["bmi"],
        datapoint["children"],
        datapoint["smoker"],
        datapoint["region"],
        datapoint["charges"])
    group1.add_person(record)

group2 = Group("11-20")    
    
for datapoint in datapoints[10:20]:
    record = Record(
        datapoint["age"],
        datapoint["sex"],
        datapoint["bmi"],
        datapoint["children"],
        datapoint["smoker"],
        datapoint["region"],
        datapoint["charges"])
    group2.add_person(record)
    
#tested all methods and they worked as expected
#example of a test
print(group1.fraction_per_region('southeast'))
region[0:10]

0.4


['southwest',
 'southeast',
 'southeast',
 'northwest',
 'northwest',
 'southeast',
 'southeast',
 'northwest',
 'northeast',
 'northwest']

In [50]:
#testing the comparison function showed expected behaviour, for example:

print(compare_groups([group1,group2],"children")) #function prints a sentence per group and returns sorted list
print(children[0:10])
print(children[10:20], "\n")

print(compare_groups([group1,group2],"southeast"))
print(region[0:10])
print(region[10:20],  "\n")

print(compare_groups([group1,group2],"sex"))
print(sex[0:10])
print(sex[10:20],  "\n")

print(compare_groups([group1,group2],"charges"))
print(charges[0:10])
print(charges[10:20])

The group 'first ten' has on average 1.0 child.
The group '11-20' has on average 0.2 children.
[(group '11-20', 0.2), (group 'first ten', 1.0)]
[0, 1, 3, 0, 0, 0, 1, 3, 2, 0]
[0, 0, 0, 0, 0, 1, 1, 0, 0, 0] 

The group 'first ten' has 40.0% of people in southeast region.
The group '11-20' has 30.0% of people in southeast region.
[(group '11-20', 30.0), (group 'first ten', 40.0)]
['southwest', 'southeast', 'southeast', 'northwest', 'northwest', 'southeast', 'southeast', 'northwest', 'northeast', 'northwest']
['northeast', 'southeast', 'southwest', 'southeast', 'southeast', 'southwest', 'northeast', 'northeast', 'southwest', 'southwest'] 

group 'first ten' has 50.0% of men.
group '11-20' has 70.0% of men.
[(group 'first ten', 50.0), (group '11-20', 70.0)]
['female', 'male', 'male', 'male', 'male', 'female', 'female', 'female', 'male', 'female']
['male', 'female', 'male', 'female', 'male', 'male', 'female', 'male', 'male', 'male'] 

group 'first ten' has average charges of 10351.9529.
gro

# Exploring the data

## Smokers vs. nonsmokers
### Compare their average insurance costs, bmi, children amount

In [60]:
smokers = Group("smokers")
nonsmokers = Group("nonsmokers")    

for datapoint in datapoints:
    record = Record(
        datapoint["age"],
        datapoint["sex"],
        datapoint["bmi"],
        datapoint["children"],
        datapoint["smoker"],
        datapoint["region"],
        datapoint["charges"])
    if record.smoker == 1:
        smokers.add_person(record)
    elif record.smoker == 0:
        nonsmokers.add_person(record)

#check that every record is either a smoker or not, no missing data
print(len(datapoints), smokers.size, nonsmokers.size)

#sanity check:
print(compare_groups([smokers,nonsmokers],"smoker"))

1338 274 1064
group 'smokers' has 100.0% of smokers.
group 'nonsmokers' has 0.0% of smokers.
[(group 'nonsmokers', 0.0), (group 'smokers', 100.0)]


In [61]:
print(compare_groups([smokers,nonsmokers],"charges")) #smokers pay much more

group 'smokers' has average charges of 32050.2318.
group 'nonsmokers' has average charges of 8434.2683.
[(group 'nonsmokers', 8434.2683), (group 'smokers', 32050.2318)]


In [62]:
print(compare_groups([smokers,nonsmokers],"bmi")) #smokers and nonsmokers have similar average bmi

group 'smokers' has average BMI of 30.708.
group 'nonsmokers' has average BMI of 30.652.
[(group 'nonsmokers', 30.652), (group 'smokers', 30.708)]


In [63]:
print(compare_groups([smokers,nonsmokers],"children")) #smokers and nonsmokers have similar average children amount

The group 'smokers' has on average 1.113 children.
The group 'nonsmokers' has on average 1.09 children.
[(group 'nonsmokers', 1.09), (group 'smokers', 1.113)]


## Different BMI categories
### Compare their average insurance costs, age, smoking status, children amount, regions

In [74]:
underweight = Group("underweight")
normal = Group("normal")    
overweight = Group("overweight")
obese = Group("obese")

for datapoint in datapoints:
    record = Record(
        datapoint["age"],
        datapoint["sex"],
        datapoint["bmi"],
        datapoint["children"],
        datapoint["smoker"],
        datapoint["region"],
        datapoint["charges"])
    if record.bmi < 18.5:
        underweight.add_person(record)
    elif record.bmi <= 24.9:
        normal.add_person(record)
    elif record.bmi <= 29.9:
        overweight.add_person(record)
    elif record.bmi > 29.9:
        obese.add_person(record)
    else:
        print("Sth is wrong..")
        
#check that every record falls into one of the categories (groups), no missing data
print(len(datapoints), underweight.size, normal.size, overweight.size, obese.size)

#sanity check:
print(compare_groups([underweight, normal, overweight, obese],"bmi"))

1338 20 222 380 716
group 'underweight' has average BMI of 17.566.
group 'normal' has average BMI of 22.584.
group 'overweight' has average BMI of 27.527.
group 'obese' has average BMI of 35.199.
[(group 'underweight', 17.566), (group 'normal', 22.584), (group 'overweight', 27.527), (group 'obese', 35.199)]


In [75]:
print(compare_groups([underweight, normal, overweight, obese],"charges")) # smaller bmi groups -> smaller charges

group 'underweight' has average charges of 8852.2006.
group 'normal' has average charges of 10379.4997.
group 'overweight' has average charges of 11006.81.
group 'obese' has average charges of 15491.5422.
[(group 'underweight', 8852.2006), (group 'normal', 10379.4997), (group 'overweight', 11006.81), (group 'obese', 15491.5422)]


In [76]:
print(compare_groups([underweight, normal, overweight, obese],"age")) # smaller bmi groups -> smaller age

group 'underweight' has average age 32.35.
group 'normal' has average age 36.748.
group 'overweight' has average age 38.874.
group 'obese' has average age 40.338.
[(group 'underweight', 32.35), (group 'normal', 36.748), (group 'overweight', 38.874), (group 'obese', 40.338)]


In [77]:
print(compare_groups([underweight, normal, overweight, obese],"smoker")) 
# biggest % of smokers in the underweight category
# lowest % of smokers in the overweight category, second lowest in obese category.

group 'underweight' has 25.0% of smokers.
group 'normal' has 22.5% of smokers.
group 'overweight' has 18.9% of smokers.
group 'obese' has 20.5% of smokers.
[(group 'overweight', 18.9), (group 'obese', 20.5), (group 'normal', 22.5), (group 'underweight', 25.0)]


In [78]:
print(compare_groups([underweight, normal, overweight, obese],"children")) 
# biggest average children amount per person is in the normal BMI category, next is the obese category

The group 'underweight' has on average 1.05 children.
The group 'normal' has on average 1.117 children.
The group 'overweight' has on average 1.068 children.
The group 'obese' has on average 1.103 children.
[(group 'underweight', 1.05), (group 'overweight', 1.068), (group 'obese', 1.103), (group 'normal', 1.117)]


In [79]:
for a_region in set(region):
    print(compare_groups([underweight, normal, overweight, obese], a_region))
    print("")

# Among underweight people most (50%) are from northeast and no one is from southeast.
# Among obese people most (34.8%) are from southeast.

The group 'underweight' has 15.0% of people in southwest region.
The group 'normal' has 21.6% of people in southwest region.
The group 'overweight' has 26.6% of people in southwest region.
The group 'obese' has 24.2% of people in southwest region.
[(group 'underweight', 15.0), (group 'normal', 21.6), (group 'obese', 24.2), (group 'overweight', 26.6)]

The group 'underweight' has 0.0% of people in southeast region.
The group 'normal' has 18.0% of people in southeast region.
The group 'overweight' has 19.7% of people in southeast region.
The group 'obese' has 34.8% of people in southeast region.
[(group 'underweight', 0.0), (group 'normal', 18.0), (group 'overweight', 19.7), (group 'obese', 34.8)]

The group 'underweight' has 35.0% of people in northwest region.
The group 'normal' has 27.9% of people in northwest region.
The group 'overweight' has 27.9% of people in northwest region.
The group 'obese' has 20.9% of people in northwest region.
[(group 'obese', 20.9), (group 'normal', 27.9)