# Introduction to ETL operations with Pandas and SQLAlchemy


Install the necessary dependencies: pandas, psycopg2, and sqlalchemy.
  

In [None]:
pip install pandas  #Python library used for data manipulation and analysis.

In [None]:
pip install psycopg2  #DBAPI, Python postgreSQL database adapter driver to for sqlachemy

In [None]:
pip install sqlalchemy  #SQLAlchemy is the Python SQL toolkit and Object Relational Mapper that gives application developers the full power and flexibility of SQL


Import the dependencies


In [35]:
import pandas as pd
from sqlalchemy import create_engine
import datetime

Using pandas, read the project CSV file into a DataFrame named <mark>df_1</mark>.

In [None]:
# Load the project CSV into df_1. The path is relative, so the file is looked up
# in the kernel's current working directory.
df_1 = pd.read_csv('weight-height-updated.csv')

In [22]:
# Preview the first rows of the loaded frame.
df_1.head()

Unnamed: 0,Gender,Height,Weight,Born_Year
0,Male,73.847017,241.893563,1967.0
1,Male,68.781904,162.310473,1977.0
2,Male,74.110105,212.740856,1991.0
3,Male,71.730978,220.04247,1998.0
4,Male,69.881796,206.349801,1985.0


In [15]:
df_1.shape  # Size of the frame as a (rows, columns) tuple.

(10000, 4)

In [26]:
df_1.isnull().sum()  # Number of null values in each column.

Gender       0
Height       0
Weight       0
Born_Year    0
dtype: int64

In [16]:
df_1.dtypes  # Data type of each column.

Gender        object
Height       float64
Weight       float64
Born_Year    float64
dtype: object

In [None]:
df_1.describe()  # Descriptive statistics (count, mean, std, min, quartiles, max).

In [19]:
# Ratio of two hard-coded figures expressed as a percentage; together with the
# next cell this reads as a coefficient of variation (std / mean * 100).
# NOTE(review): 32.11 and 161.44 look like a std and mean copied by hand from the
# describe() output (presumably for Weight) — confirm; they go stale if the data changes.
cv = (32.11/161.44)*100
cv

19.889742319127848

In [20]:
# Same percentage ratio for a second pair of hard-coded figures
# (presumably the Born_Year std and mean from describe() — confirm).
# The result is below 10%, which the author takes as grounds for filling nulls with the mean.
# NOTE(review): the imputation cell that follows fills with the mode, not the mean.
c_v1 = (11.44/1985.36)*100
c_v1

0.5762179151388162

In [24]:
# Impute missing Born_Year values with the column's most frequent value
# (the first entry returned by mode()).
df_1['Born_Year'] = df_1['Born_Year'].fillna(df_1['Born_Year'].mode()[0])

In [25]:
df_1.isnull().sum()  # Re-count nulls per column after the Born_Year fill.

Gender       0
Height       0
Weight       0
Born_Year    0
dtype: int64

In [None]:
# Alternative to imputation, left disabled: drop the missing data instead.
# Both calls return a new frame; neither modifies df_1 in place.
#df_1.dropna()  # drops rows that contain missing values
#df_1.dropna(axis=1)  # drops columns that contain missing values

In [47]:
# Born_Year was read as float64 (see the earlier dtypes output); cast it to an
# integer dtype, then list the dtypes again to confirm the change.
df_1['Born_Year'] = df_1['Born_Year'].astype(int)
df_1.dtypes

Gender           object
Height          float64
Weight          float64
Born_Year         int64
Born_year         int64
Current_Year      int64
Age             float64
bmi             float64
bmi_category     object
dtype: object

In [31]:
# Preview the first rows after the Born_Year cast.
df_1.head()

Unnamed: 0,Gender,Height,Weight,Born_Year,Born_year
0,Male,73.847017,241.893563,1967.0,1967
1,Male,68.781904,162.310473,1977.0,1977
2,Male,74.110105,212.740856,1991.0,1991
3,Male,71.730978,220.04247,1998.0,1998
4,Male,69.881796,206.349801,1985.0,1985


In [52]:
# Calculated column for body mass index: weight / height**2, scaled by 703.
# assumes Weight is in pounds and Height is in inches, for which 703 is the
# usual conversion factor to the metric kg/m^2 scale — TODO confirm against the data source
df_1['BMI'] = (df_1['Weight'] / (df_1['Height'])**2) * 703
df_1.head()

Unnamed: 0,Gender,Height,Weight,Born_Year,Current_Year,Age,BMI
0,Male,73.847017,241.893563,1967,2025,58.0,31.1827
1,Male,68.781904,162.310473,1977,2025,48.0,24.118677
2,Male,74.110105,212.740856,1991,2025,34.0,27.230233
3,Male,71.730978,220.04247,1998,2025,27.0,30.064108
4,Male,69.881796,206.349801,1985,2025,40.0,29.705117


In [37]:
# Calculated column for Current Year: every row gets the calendar year in which
# this cell is executed, so the value (and the Age derived from it) changes when
# the notebook is re-run in a later year.
df_1['Current_Year'] = datetime.datetime.now().year
df_1.head()

Unnamed: 0,Gender,Height,Weight,Born_Year,Born_year,Bmi,Current_Year
0,Male,73.847017,241.893563,1967.0,1967,31.1827,2025
1,Male,68.781904,162.310473,1977.0,1977,24.118677,2025
2,Male,74.110105,212.740856,1991.0,1991,27.230233,2025
3,Male,71.730978,220.04247,1998.0,1998,30.064108,2025
4,Male,69.881796,206.349801,1985.0,1985,29.705117,2025


In [38]:
# Calculated column for Age: a plain year difference, so it does not account for
# whether the person's birthday has already passed in Current_Year.
df_1['Age'] = df_1['Current_Year'] - df_1['Born_Year']
df_1.head()

Unnamed: 0,Gender,Height,Weight,Born_Year,Born_year,Bmi,Current_Year,Age
0,Male,73.847017,241.893563,1967.0,1967,31.1827,2025,58.0
1,Male,68.781904,162.310473,1977.0,1977,24.118677,2025,48.0
2,Male,74.110105,212.740856,1991.0,1991,27.230233,2025,34.0
3,Male,71.730978,220.04247,1998.0,1998,30.064108,2025,27.0
4,Male,69.881796,206.349801,1985.0,1985,29.705117,2025,40.0


In [53]:
#Function that returns bmi category for each value in the BMI column.

def bmi_category(bmi):
    """Return the weight-status category for one body mass index value.

    The classes are contiguous half-open ranges, so every number maps to
    exactly one label and values such as 24.95 or 39.95 stay in the lower
    class instead of spilling into the next one:

        bmi < 18.5          -> "Underweight"
        18.5 <= bmi < 25    -> "Normal Weight"
        25 <= bmi < 30      -> "Overweight"
        30 <= bmi < 35      -> "Obesity Class I"
        35 <= bmi < 40      -> "Obesity Class II"
        bmi >= 40           -> "Obesity Class III"

    Parameters
    ----------
    bmi : float
        Body mass index value.

    Returns
    -------
    str or None
        The category label, or None when ``bmi`` is NaN, so that a missing
        measurement is not reported as "Obesity Class III".
    """
    # NaN is the only float that is not equal to itself; every `<` comparison
    # against it is False, which would otherwise fall through to the last class.
    if bmi != bmi:
        return None
    if bmi < 18.5:
        return "Underweight"
    elif bmi < 25:
        return "Normal Weight"
    elif bmi < 30:
        return "Overweight"
    elif bmi < 35:
        return "Obesity Class I"
    elif bmi < 40:
        return "Obesity Class II"
    else:
        return "Obesity Class III"

# Label every row by passing its BMI value through bmi_category, one value at a time.
df_1['BMI_Category'] = df_1['BMI'].apply(bmi_category)
df_1.head()

Unnamed: 0,Gender,Height,Weight,Born_Year,Current_Year,Age,BMI,BMI_Category
0,Male,73.847017,241.893563,1967,2025,58.0,31.1827,Obesity Class I
1,Male,68.781904,162.310473,1977,2025,48.0,24.118677,Normal Weight
2,Male,74.110105,212.740856,1991,2025,34.0,27.230233,Overweight
3,Male,71.730978,220.04247,1998,2025,27.0,30.064108,Obesity Class I
4,Male,69.881796,206.349801,1985,2025,40.0,29.705117,Overweight


In [48]:
# List the column dtypes after the calculated columns have been added.
# NOTE(review): the saved output of this cell lists Born_year, bmi and bmi_category,
# names that no cell visible here creates (the cells write BMI and BMI_Category);
# it appears to come from an earlier kernel state — restart and re-run to refresh it.
df_1.dtypes

Gender           object
Height          float64
Weight          float64
Born_Year         int64
Born_year         int64
Current_Year      int64
Age             float64
bmi             float64
bmi_category     object
dtype: object

In [49]:
# Preview the first rows of the transformed frame.
df_1.head()

Unnamed: 0,Gender,Height,Weight,Born_Year,Born_year,Current_Year,Age,bmi,bmi_category
0,Male,73.847017,241.893563,1967,1967,2025,58.0,31.1827,Obesity Class I
1,Male,68.781904,162.310473,1977,1977,2025,48.0,24.118677,Normal Weight
2,Male,74.110105,212.740856,1991,1991,2025,34.0,27.230233,Overweight
3,Male,71.730978,220.04247,1998,1998,2025,27.0,30.064108,Obesity Class I
4,Male,69.881796,206.349801,1985,1985,2025,40.0,29.705117,Overweight


In [54]:
# Preview the first rows again after the BMI and BMI_Category columns were added.
df_1.head()

Unnamed: 0,Gender,Height,Weight,Born_Year,Current_Year,Age,BMI,BMI_Category
0,Male,73.847017,241.893563,1967,2025,58.0,31.1827,Obesity Class I
1,Male,68.781904,162.310473,1977,2025,48.0,24.118677,Normal Weight
2,Male,74.110105,212.740856,1991,2025,34.0,27.230233,Overweight
3,Male,71.730978,220.04247,1998,2025,27.0,30.064108,Obesity Class I
4,Male,69.881796,206.349801,1985,2025,40.0,29.705117,Overweight
