In [1]:
# Dependencies and Setup
import pandas as pd
import numpy as np

In [2]:
# Python SQL toolkit and Object Relational Mapper
import sqlalchemy
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session
from sqlalchemy import create_engine, func

In [3]:
# create engine to hawaii.sqlite
engine = create_engine("sqlite:///healthcare.sqlite")

In [4]:
# reflect an existing database into a new model
# reflect the tables
Base = automap_base()
Base.prepare(autoload_with=engine)

In [5]:
# View all of the classes that automap found
Base.classes.keys()

[]

In [3]:
# File to Load
health_care_file = "watson_healthcare_data.csv"

In [4]:
healthcare_data = pd.read_csv(health_care_file)
healthcare_data

Unnamed: 0,EmployeeID,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,...,RelationshipSatisfaction,StandardHours,Shift,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,1313919,41,No,Travel_Rarely,1102,Cardiology,1,2,Life Sciences,1,...,1,80,0,8,0,1,6,4,0,5
1,1200302,49,No,Travel_Frequently,279,Maternity,8,1,Life Sciences,1,...,4,80,1,10,3,3,10,7,1,7
2,1060315,37,Yes,Travel_Rarely,1373,Maternity,2,2,Other,1,...,2,80,0,7,3,3,0,0,0,0
3,1272912,33,No,Travel_Frequently,1392,Maternity,3,4,Life Sciences,1,...,3,80,0,8,3,3,8,7,3,0
4,1414939,27,No,Travel_Rarely,591,Maternity,2,1,Medical,1,...,4,80,1,6,3,3,2,2,2,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1671,1117656,26,Yes,Travel_Rarely,471,Neurology,24,3,Technical Degree,1,...,2,80,0,1,3,1,1,0,0,0
1672,1152327,46,No,Travel_Rarely,1125,Cardiology,10,3,Marketing,1,...,3,80,1,15,3,3,3,2,1,2
1673,1812428,20,No,Travel_Rarely,959,Maternity,1,3,Life Sciences,1,...,4,80,0,1,0,4,1,0,0,0
1674,1812429,39,No,Travel_Rarely,466,Neurology,1,1,Life Sciences,1,...,3,80,1,21,3,3,21,6,11,8


In [5]:
# Check the data type
healthcare_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1676 entries, 0 to 1675
Data columns (total 35 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   EmployeeID                1676 non-null   int64 
 1   Age                       1676 non-null   int64 
 2   Attrition                 1676 non-null   object
 3   BusinessTravel            1676 non-null   object
 4   DailyRate                 1676 non-null   int64 
 5   Department                1676 non-null   object
 6   DistanceFromHome          1676 non-null   int64 
 7   Education                 1676 non-null   int64 
 8   EducationField            1676 non-null   object
 9   EmployeeCount             1676 non-null   int64 
 10  EnvironmentSatisfaction   1676 non-null   int64 
 11  Gender                    1676 non-null   object
 12  HourlyRate                1676 non-null   int64 
 13  JobInvolvement            1676 non-null   int64 
 14  JobLevel                

In [6]:
# Get the columns 
healthcare_data.columns

Index(['EmployeeID', 'Age', 'Attrition', 'BusinessTravel', 'DailyRate',
       'Department', 'DistanceFromHome', 'Education', 'EducationField',
       'EmployeeCount', 'EnvironmentSatisfaction', 'Gender', 'HourlyRate',
       'JobInvolvement', 'JobLevel', 'JobRole', 'JobSatisfaction',
       'MaritalStatus', 'MonthlyIncome', 'MonthlyRate', 'NumCompaniesWorked',
       'Over18', 'OverTime', 'PercentSalaryHike', 'PerformanceRating',
       'RelationshipSatisfaction', 'StandardHours', 'Shift',
       'TotalWorkingYears', 'TrainingTimesLastYear', 'WorkLifeBalance',
       'YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion',
       'YearsWithCurrManager'],
      dtype='object')

In [9]:
unique_departments = healthcare_data['Department'].unique()
departments = [department for department in unique_departments]
print(departments)

['Cardiology', 'Maternity', 'Neurology']


In [13]:
unique_age = healthcare_data['Age'].unique()
ages = [age for age in unique_age]
print(ages)
print(f"number of ages in the data: {len(ages)}")

[41, 49, 37, 33, 27, 32, 59, 30, 38, 36, 35, 29, 31, 34, 28, 22, 53, 24, 21, 42, 44, 46, 39, 43, 50, 26, 48, 55, 45, 56, 23, 51, 40, 54, 58, 20, 25, 19, 57, 52, 47, 18, 60]
number of ages in the data: 43


In [21]:
uniqueIncome = healthcare_data['HourlyRate'].unique()
print(len(uniqueIncome))

71


In [27]:
# Drop unwanted columns
healthcare_data_df = healthcare_data[['EmployeeID', 'Age', 'Attrition',
       'Department', 'DistanceFromHome','Gender', 'HourlyRate', 'JobSatisfaction',
       'MaritalStatus','TotalWorkingYears']]
healthcare_data_df

Unnamed: 0,EmployeeID,Age,Attrition,Department,DistanceFromHome,Gender,HourlyRate,JobSatisfaction,MaritalStatus,TotalWorkingYears
0,1313919,41,No,Cardiology,1,Female,94,4,Single,8
1,1200302,49,No,Maternity,8,Male,61,2,Married,10
2,1060315,37,Yes,Maternity,2,Male,92,3,Single,7
3,1272912,33,No,Maternity,3,Female,56,3,Married,8
4,1414939,27,No,Maternity,2,Male,40,2,Married,6
...,...,...,...,...,...,...,...,...,...,...
1671,1117656,26,Yes,Neurology,24,Male,66,4,Single,1
1672,1152327,46,No,Cardiology,10,Female,94,4,Married,15
1673,1812428,20,No,Maternity,1,Female,83,2,Single,1
1674,1812429,39,No,Neurology,1,Female,65,4,Married,21


In [28]:
healthcare_data_df.to_csv("healthcare_data_new.csv", index=False)

In [30]:
healthcare_data_df.to_json("healthcare_data.json")

In [13]:
sqlite3.connect("healthcare_db")

NameError: name 'sqlite3' is not defined