# U.S. Medical Insurance Costs

In [1]:
# Import the csv library
import csv

In [2]:
# One list per CSV column.
# Values extracted from the CSV file are stored in these lists as strings;
# the analysis code converts them to numbers where needed.
ages, sex, bmi, num_children = [], [], [], []
smoker, region, costs = [], [], []

In [3]:
# Helper function to extract data from csv file
def extract_data(lst, csv_file, column_name):
    """Append every value of one CSV column to ``lst`` and return ``lst``.

    Args:
        lst: List that receives the values; it is modified in place.
        csv_file: Path of the CSV file. Its first row is used as the header.
        column_name: Header name of the column to extract.

    Returns:
        The same ``lst`` object, extended with the column values as strings.

    Raises:
        KeyError: if a data row has no column called ``column_name``.
    """
    # newline='' lets the csv module do its own line-ending handling, so
    # line breaks embedded in quoted fields are not altered while reading.
    with open(csv_file, newline='') as csv_data:
        # DictReader maps each row to {header name: value}
        csv_dict = csv.DictReader(csv_data)
        # Append the requested column's value from every row
        lst.extend(row[column_name] for row in csv_dict)

    return lst
            

In [5]:
# Creation of a class to hold all patient data and allow for easier manipulation 
class PatientInfo:
    """Column-oriented holder for the insurance dataset with summary helpers.

    Every constructor argument is a list holding one column of the dataset.
    The lists are stored by reference, not copied. Methods convert values
    with ``int()`` / ``float()`` where they need numbers, so the lists may
    contain the string values read from the CSV file.
    """

    def __init__(self, age, sex, bmi, num_children, smoker, region, costs):
        self.age = age
        self.sex = sex
        self.bmi = bmi
        self.num_children = num_children
        self.smoker = smoker
        self.region = region
        self.costs = costs

    def analyze_ages(self):
        """Return a sentence stating the average patient age.

        The average uses floor division, i.e. it is rounded down to a
        whole number of years.

        Raises:
            ZeroDivisionError: if the age list is empty.
        """
        total = sum(int(age) for age in self.age)
        avg_age = total // len(self.age)

        return "The average age of a patient is: {0} years old.".format(avg_age)

    def analyze_sex(self):
        """Return a sentence with the number of male and female patients.

        Only the exact values "male" and "female" are counted; any other
        value is ignored.
        """
        males = 0
        females = 0
        for value in self.sex:
            if value == "male":
                males += 1
            elif value == "female":
                females += 1

        return "The number of male patients in the dataset is : {0} and the number of female patients in the dataset is: {1}.".format(males, females)

    def analyze_locations(self):
        """Print how many patients live in each unique region.

        One line is printed per region, in order of first appearance in
        the region list. Nothing is returned.
        """
        # Single pass: a dict keeps insertion order, so regions stay in
        # first-seen order without a separate uniqueness pass.
        location_dictionary = {}
        for location in self.region:
            location_dictionary[location] = location_dictionary.get(location, 0) + 1

        for region, patient_count in location_dictionary.items():
            print("The number of patients in the {0} region is {1}.".format(region, patient_count))

    def average_charges(self):
        """Return a sentence stating the average yearly charge per patient.

        The average is rounded to two decimal places.

        Raises:
            ZeroDivisionError: if the cost list is empty.
        """
        total = sum(float(charge) for charge in self.costs)
        avg_charge = round(total / len(self.costs), 2)

        return "The average yearly charge per patient is: ${0} dollars.".format(avg_charge)

    def create_dictionary(self):
        """Build, store and return a dictionary with one list per column.

        Numeric columns are converted ('Age' and 'Number of Children' to
        int, 'BMI' to float, 'Yearly Cost' to float rounded to two
        decimals). 'Sex', 'Smoker' and 'Region' reference the original
        lists unchanged. The result is also kept on
        ``self.patient_dictionary``.
        """
        self.patient_dictionary = {
            'Age': [int(age) for age in self.age],
            'Sex': self.sex,
            'BMI': [float(bmi) for bmi in self.bmi],
            'Number of Children': [int(child) for child in self.num_children],
            'Smoker': self.smoker,
            'Region': self.region,
            'Yearly Cost': [round(float(cost), 2) for cost in self.costs],
        }

        return self.patient_dictionary

In [6]:
# Instance of PatientInfo class with dataset loaded into it
# NOTE(review): the column lists are created empty and no extract_data() call is
# visible in this export (cell In [4] is absent) - presumably they are filled there; confirm.
insurance_dataset = PatientInfo(ages, sex, bmi, num_children, smoker, region, costs)

In [7]:
# Test of Analyze Age Method
# The method returns its summary as a string; the average is floor-divided to whole years.
insurance_dataset.analyze_ages()

'The average age of a patient is: 39 years old.'

In [8]:
# Test of Analyze Sex Method
# The method returns one string containing both the male and the female count.
insurance_dataset.analyze_sex()

'The number of male patients in the dataset is : 676 and the number of female patients in the dataset is: 662.'

In [9]:
# Test of Analyze Location Method
# Unlike the other analysis methods, this one prints one line per region and returns nothing.
insurance_dataset.analyze_locations()

The number of patients in the southwest region is 325.
The number of patients in the southeast region is 364.
The number of patients in the northwest region is 325.
The number of patients in the northeast region is 324.


In [10]:
# Test of Average Charge Method
# The method returns its summary as a string; the average is rounded to two decimals.
insurance_dataset.average_charges()

'The average yearly charge per patient is: $13270.42 dollars.'

In [11]:
# Test of Create Dictionary Method
# Besides returning the dictionary, this call stores it on
# insurance_dataset.patient_dictionary, which the later cells read.
insurance_dataset.create_dictionary()


{'Age': [19,
  18,
  28,
  33,
  32,
  31,
  46,
  37,
  37,
  60,
  25,
  62,
  23,
  56,
  27,
  19,
  52,
  23,
  56,
  30,
  60,
  30,
  18,
  34,
  37,
  59,
  63,
  55,
  23,
  31,
  22,
  18,
  19,
  63,
  28,
  19,
  62,
  26,
  35,
  60,
  24,
  31,
  41,
  37,
  38,
  55,
  18,
  28,
  60,
  36,
  18,
  21,
  48,
  36,
  40,
  58,
  58,
  18,
  53,
  34,
  43,
  25,
  64,
  28,
  20,
  19,
  61,
  40,
  40,
  28,
  27,
  31,
  53,
  58,
  44,
  57,
  29,
  21,
  22,
  41,
  31,
  45,
  22,
  48,
  37,
  45,
  57,
  56,
  46,
  55,
  21,
  53,
  59,
  35,
  64,
  28,
  54,
  55,
  56,
  38,
  41,
  30,
  18,
  61,
  34,
  20,
  19,
  26,
  29,
  63,
  54,
  55,
  37,
  21,
  52,
  60,
  58,
  29,
  49,
  37,
  44,
  18,
  20,
  44,
  47,
  26,
  19,
  52,
  32,
  38,
  59,
  61,
  53,
  19,
  20,
  22,
  19,
  22,
  54,
  22,
  34,
  26,
  34,
  29,
  30,
  29,
  46,
  51,
  53,
  19,
  35,
  48,
  32,
  42,
  40,
  44,
  48,
  18,
  30,
  50,
  42,
  18,
  54,
  32,
  37,
  4

In [12]:
# Compare the insurance cost of smokers with that of non-smokers.
# Pull the two relevant columns out of the patient dictionary
# (it exists once create_dictionary() has been called).
smoker_data = insurance_dataset.patient_dictionary['Smoker']
costs_data = insurance_dataset.patient_dictionary['Yearly Cost']

# Pair every smoker flag with the matching yearly cost
smoker_to_costs = list(zip(smoker_data, costs_data))

# Split the (flag, cost) pairs by smoker status
smoke_yes = [pair for pair in smoker_to_costs if pair[0] == 'yes']
smoke_no = [pair for pair in smoker_to_costs if pair[0] == 'no']
# This function will only be used to extract the average cost from the list       
def average_cost(lst):
    """Return the mean cost of a list of tuples, rounded to two decimals.

    The cost is read from position 1 of every tuple in ``lst``.
    An empty ``lst`` raises ZeroDivisionError.
    """
    # Position 1 of each tuple holds the cost
    cost_sum = sum(entry[1] for entry in lst)
    mean_cost = cost_sum / len(lst)
    return round(mean_cost, 2)

# Average yearly cost of each group (each already rounded to two decimals)
smoker_avg_cost = average_cost(smoke_yes)
non_smoker_avg_cost = average_cost(smoke_no)
# Subtracting two rounded floats can reintroduce binary noise
# (e.g. 0.3 - 0.1 gives 0.19999999999999998), so round the difference as well.
difference_in_cost = round(smoker_avg_cost - non_smoker_avg_cost, 2)

print("The average cost for a smoker is: ${0} dollars.".format(smoker_avg_cost))
print("The average cost for a non-smoker is: ${0} dollars.".format(non_smoker_avg_cost))
print("The difference in cost between a smoker and a non-smoker is: ${0} dollars.".format(difference_in_cost))

The average cost for a smoker is: $32050.23 dollars.
The average cost for a non-smoker is: $8434.27 dollars.
The difference in cost between a smoker and a non-smoker is: $23615.96 dollars.


In [20]:
# Average age and average insurance cost of patients who have at least one child.
# NOTE: costs_data is reused from the smoker comparison cell.

children_data = insurance_dataset.patient_dictionary['Number of Children']
age_data = insurance_dataset.patient_dictionary['Age']

# One (age, number of children, yearly cost) tuple per patient
age_children_cost = list(zip(age_data, children_data, costs_data))

# Keep only the patients with one or more children (tuple position 1)
has_kids = [record for record in age_children_cost if record[1] > 0]

# Totals over the patients with children: age is position 0, cost is position 2
total_age_with_kids = sum(record[0] for record in has_kids)
total_cost_with_kids = sum(record[2] for record in has_kids)

# Both averages are rounded to whole numbers
average_age_with_kids = round(total_age_with_kids / len(has_kids))
average_cost_with_kids = round(total_cost_with_kids / len(has_kids))

print("The average age of a patient with at least one child is: {0} years old and the average cost for a patient with children is: ${1} dollars.".format(average_age_with_kids, average_cost_with_kids))

The average age of a patient with at least one child is: 40 years old and the average cost for a patient with children is: $13950 dollars.
