# Cloud ETL - Cervical Cancer Risk

In [2]:
import os
# Find the latest version of spark 3.2  from http://www.apache.org/dist/spark/ and enter as the spark version
# For example:
# spark_version = 'spark-3.2.2'
spark_version = 'spark-3.3.0'
os.environ['SPARK_VERSION']=spark_version

# Install Spark and Java
!apt-get update
!apt-get install --only-upgrade openjdk-11-jdk-headless -qq > /dev/null
!wget -q -c http://www.apache.org/dist/spark/$SPARK_VERSION/$SPARK_VERSION-bin-hadoop3.tgz
!tar xf $SPARK_VERSION-bin-hadoop3.tgz
!pip install -q findspark

# Set Environment Variables
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
os.environ["SPARK_HOME"] = f"/content/{spark_version}-bin-hadoop3"

# Start a SparkSession
import findspark
findspark.init()

0% [Working]            Get:1 http://security.ubuntu.com/ubuntu bionic-security InRelease [88.7 kB]
Get:2 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/ InRelease [3,626 B]
Ign:3 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  InRelease
Hit:4 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  InRelease
Hit:5 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  Release
Get:6 http://ppa.launchpad.net/c2d4u.team/c2d4u4.0+/ubuntu bionic InRelease [15.9 kB]
Hit:7 http://archive.ubuntu.com/ubuntu bionic InRelease
Get:8 http://archive.ubuntu.com/ubuntu bionic-updates InRelease [88.7 kB]
Hit:9 http://ppa.launchpad.net/cran/libgit2/ubuntu bionic InRelease
Hit:10 http://ppa.launchpad.net/deadsnakes/ppa/ubuntu bionic InRelease
Get:12 http://archive.ubuntu.com/ubuntu bionic-backports InRelease [83.3 kB]
Hit:13 http://ppa.launchpad.net/graphics-drivers/ppa/ubuntu bionic InRelease
Get:

In [3]:
# Download the PostgreSQL JDBC driver jar (version 42.2.9) into the working
# directory; -c resumes a partial download instead of starting over.
# The SparkSession cell expects this jar at /content/postgresql-42.2.9.jar.
!wget -c https://jdbc.postgresql.org/download/postgresql-42.2.9.jar


--2022-09-25 05:38:32--  https://jdbc.postgresql.org/download/postgresql-42.2.9.jar
Resolving jdbc.postgresql.org (jdbc.postgresql.org)... 72.32.157.228, 2001:4800:3e1:1::228
Connecting to jdbc.postgresql.org (jdbc.postgresql.org)|72.32.157.228|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 914037 (893K) [application/java-archive]
Saving to: ‘postgresql-42.2.9.jar’


2022-09-25 05:38:33 (10.4 MB/s) - ‘postgresql-42.2.9.jar’ saved [914037/914037]



In [4]:
from pyspark.sql import SparkSession

# Build (or reuse) the Spark session, with the PostgreSQL JDBC jar placed on
# the driver classpath so the JDBC read/write calls can load the driver.
spark = (
    SparkSession.builder
    .appName("Cloud-ETL")
    .config("spark.driver.extraClassPath", "/content/postgresql-42.2.9.jar")
    .getOrCreate()
)

In [5]:
# Read in data from S3 Buckets
from pyspark import SparkFiles

url = "https://myawsbucket-092422.s3.us-west-1.amazonaws.com/kag_risk_factors_cervical_cancer.csv"

# Register the remote CSV with Spark, then resolve the path of its local copy
spark.sparkContext.addFile(url)
local_csv_path = SparkFiles.get("kag_risk_factors_cervical_cancer.csv")

# Load the CSV using its header row for column names and letting Spark infer types
risk_factors_df = spark.read.csv(
    local_csv_path,
    sep=",",
    header=True,
    inferSchema=True,
)

# Show DataFrame
risk_factors_df.show()

+---+-------------------------+------------------------+------------------+------+--------------+-------------------+-----------------------+-------------------------------+---+-----------+----+-------------+-------------------+----------------------------+---------------------------+----------------------------------+-------------+--------------------------------+-------------------+--------------------------+---------+--------+----------------+--------+-------------------------+--------------------------------+-------------------------------+---------+------+------+---+----------+--------+--------+------+
|Age|Number of sexual partners|First sexual intercourse|Num of pregnancies|Smokes|Smokes (years)|Smokes (packs/year)|Hormonal Contraceptives|Hormonal Contraceptives (years)|IUD|IUD (years)|STDs|STDs (number)|STDs:condylomatosis|STDs:cervical condylomatosis|STDs:vaginal condylomatosis|STDs:vulvo-perineal condylomatosis|STDs:syphilis|STDs:pelvic inflammatory disease|STDs:genital herpe

# Count the number of records (rows) in the dataset

In [6]:
# Count the number of records (rows) in the dataset
record_total = risk_factors_df.count()
print(record_total)



858


# Transform dataset to fit the tables in the schema file

In [7]:
# Print the column names and the types Spark inferred while reading the CSV
risk_factors_df.printSchema()

root
 |-- Age: integer (nullable = true)
 |-- Number of sexual partners: string (nullable = true)
 |-- First sexual intercourse: string (nullable = true)
 |-- Num of pregnancies: string (nullable = true)
 |-- Smokes: string (nullable = true)
 |-- Smokes (years): string (nullable = true)
 |-- Smokes (packs/year): string (nullable = true)
 |-- Hormonal Contraceptives: string (nullable = true)
 |-- Hormonal Contraceptives (years): string (nullable = true)
 |-- IUD: string (nullable = true)
 |-- IUD (years): string (nullable = true)
 |-- STDs: string (nullable = true)
 |-- STDs (number): string (nullable = true)
 |-- STDs:condylomatosis: string (nullable = true)
 |-- STDs:cervical condylomatosis: string (nullable = true)
 |-- STDs:vaginal condylomatosis: string (nullable = true)
 |-- STDs:vulvo-perineal condylomatosis: string (nullable = true)
 |-- STDs:syphilis: string (nullable = true)
 |-- STDs:pelvic inflammatory disease: string (nullable = true)
 |-- STDs:genital herpes: string (nulla

In [8]:
# List the column names as taken from the CSV header row
risk_factors_df.columns

['Age',
 'Number of sexual partners',
 'First sexual intercourse',
 'Num of pregnancies',
 'Smokes',
 'Smokes (years)',
 'Smokes (packs/year)',
 'Hormonal Contraceptives',
 'Hormonal Contraceptives (years)',
 'IUD',
 'IUD (years)',
 'STDs',
 'STDs (number)',
 'STDs:condylomatosis',
 'STDs:cervical condylomatosis',
 'STDs:vaginal condylomatosis',
 'STDs:vulvo-perineal condylomatosis',
 'STDs:syphilis',
 'STDs:pelvic inflammatory disease',
 'STDs:genital herpes',
 'STDs:molluscum contagiosum',
 'STDs:AIDS',
 'STDs:HIV',
 'STDs:Hepatitis B',
 'STDs:HPV',
 'STDs: Number of diagnosis',
 'STDs: Time since first diagnosis',
 'STDs: Time since last diagnosis',
 'Dx:Cancer',
 'Dx:CIN',
 'Dx:HPV',
 'Dx',
 'Hinselmann',
 'Schiller',
 'Citology',
 'Biopsy']

In [9]:
# NOTE(review): describe() only returns a new DataFrame; without .show() this
# cell displays just that DataFrame's column/type repr, not the statistics.
risk_factors_df.describe()

DataFrame[summary: string, Age: string, Number of sexual partners: string, First sexual intercourse: string, Num of pregnancies: string, Smokes: string, Smokes (years): string, Smokes (packs/year): string, Hormonal Contraceptives: string, Hormonal Contraceptives (years): string, IUD: string, IUD (years): string, STDs: string, STDs (number): string, STDs:condylomatosis: string, STDs:cervical condylomatosis: string, STDs:vaginal condylomatosis: string, STDs:vulvo-perineal condylomatosis: string, STDs:syphilis: string, STDs:pelvic inflammatory disease: string, STDs:genital herpes: string, STDs:molluscum contagiosum: string, STDs:AIDS: string, STDs:HIV: string, STDs:Hepatitis B: string, STDs:HPV: string, STDs: Number of diagnosis: string, STDs: Time since first diagnosis: string, STDs: Time since last diagnosis: string, Dx:Cancer: string, Dx:CIN: string, Dx:HPV: string, Dx: string, Hinselmann: string, Schiller: string, Citology: string, Biopsy: string]

In [10]:
# Import struct fields that we can use (this is to import data types)
from pyspark.sql.types import StructField, StringType, IntegerType, FloatType, StructType


In [11]:
# Next we need to create the list of struct fields.
# Most columns were inferred as strings, so the types are spelled out here:
# the columns named in integer_columns are read as integers and every other
# column is read as a float.
integer_columns = {
    "Age",
    "STDs: Number of diagnosis",
    "Dx:Cancer",
    "Dx:CIN",
    "Dx:HPV",
    "Dx",
    "Hinselmann",
    "Schiller",
    "Citology",
    "Biopsy",
}

# Column names, in the same order as the CSV header
column_names = [
    "Age",
    "Number of sexual partners",
    "First sexual intercourse",
    "Num of pregnancies",
    "Smokes",
    "Smokes (years)",
    "Smokes (packs/year)",
    "Hormonal Contraceptives",
    "Hormonal Contraceptives (years)",
    "IUD",
    "IUD (years)",
    "STDs",
    "STDs (number)",
    "STDs:condylomatosis",
    "STDs:cervical condylomatosis",
    "STDs:vaginal condylomatosis",
    "STDs:vulvo-perineal condylomatosis",
    "STDs:syphilis",
    "STDs:pelvic inflammatory disease",
    "STDs:genital herpes",
    "STDs:molluscum contagiosum",
    "STDs:AIDS",
    "STDs:HIV",
    "STDs:Hepatitis B",
    "STDs:HPV",
    "STDs: Number of diagnosis",
    "STDs: Time since first diagnosis",
    "STDs: Time since last diagnosis",
    "Dx:Cancer",
    "Dx:CIN",
    "Dx:HPV",
    "Dx",
    "Hinselmann",
    "Schiller",
    "Citology",
    "Biopsy",
]

# One nullable StructField per column, typed according to integer_columns
schema = [
    StructField(
        column_name,
        IntegerType() if column_name in integer_columns else FloatType(),
        True,
    )
    for column_name in column_names
]
schema


[StructField('Age', IntegerType(), True),
 StructField('Number of sexual partners', FloatType(), True),
 StructField('First sexual intercourse', FloatType(), True),
 StructField('Num of pregnancies', FloatType(), True),
 StructField('Smokes', FloatType(), True),
 StructField('Smokes (years)', FloatType(), True),
 StructField('Smokes (packs/year)', FloatType(), True),
 StructField('Hormonal Contraceptives', FloatType(), True),
 StructField('Hormonal Contraceptives (years)', FloatType(), True),
 StructField('IUD', FloatType(), True),
 StructField('IUD (years)', FloatType(), True),
 StructField('STDs', FloatType(), True),
 StructField('STDs (number)', FloatType(), True),
 StructField('STDs:condylomatosis', FloatType(), True),
 StructField('STDs:cervical condylomatosis', FloatType(), True),
 StructField('STDs:vaginal condylomatosis', FloatType(), True),
 StructField('STDs:vulvo-perineal condylomatosis', FloatType(), True),
 StructField('STDs:syphilis', FloatType(), True),
 StructField('STD

In [44]:
# Wrap the list of StructFields in a StructType so it can be passed as the
# explicit schema when the CSV is re-read.
final = StructType(fields=schema)
final

StructType([StructField('Age', IntegerType(), True), StructField('Number of sexual partners', FloatType(), True), StructField('First sexual intercourse', FloatType(), True), StructField('Num of pregnancies', FloatType(), True), StructField('Smokes', FloatType(), True), StructField('Smokes (years)', FloatType(), True), StructField('Smokes (packs/year)', FloatType(), True), StructField('Hormonal Contraceptives', FloatType(), True), StructField('Hormonal Contraceptives (years)', FloatType(), True), StructField('IUD', FloatType(), True), StructField('IUD (years)', FloatType(), True), StructField('STDs', FloatType(), True), StructField('STDs (number)', FloatType(), True), StructField('STDs:condylomatosis', FloatType(), True), StructField('STDs:cervical condylomatosis', FloatType(), True), StructField('STDs:vaginal condylomatosis', FloatType(), True), StructField('STDs:vulvo-perineal condylomatosis', FloatType(), True), StructField('STDs:syphilis', FloatType(), True), StructField('STDs:pelvi

In [45]:
# Re-read the same CSV, this time applying the explicit schema instead of
# letting Spark infer the column types.
typed_csv_path = SparkFiles.get("kag_risk_factors_cervical_cancer.csv")
dataframe = (
    spark.read
    .schema(final)
    .options(sep=",", header=True)
    .csv(typed_csv_path)
)
dataframe.show()

+---+-------------------------+------------------------+------------------+------+--------------+-------------------+-----------------------+-------------------------------+----+-----------+----+-------------+-------------------+----------------------------+---------------------------+----------------------------------+-------------+--------------------------------+-------------------+--------------------------+---------+--------+----------------+--------+-------------------------+--------------------------------+-------------------------------+---------+------+------+---+----------+--------+--------+------+
|Age|Number of sexual partners|First sexual intercourse|Num of pregnancies|Smokes|Smokes (years)|Smokes (packs/year)|Hormonal Contraceptives|Hormonal Contraceptives (years)| IUD|IUD (years)|STDs|STDs (number)|STDs:condylomatosis|STDs:cervical condylomatosis|STDs:vaginal condylomatosis|STDs:vulvo-perineal condylomatosis|STDs:syphilis|STDs:pelvic inflammatory disease|STDs:genital her

In [46]:
# Confirm the explicit schema was applied (columns now report the declared
# integer/float types rather than string).
dataframe.printSchema()

root
 |-- Age: integer (nullable = true)
 |-- Number of sexual partners: float (nullable = true)
 |-- First sexual intercourse: float (nullable = true)
 |-- Num of pregnancies: float (nullable = true)
 |-- Smokes: float (nullable = true)
 |-- Smokes (years): float (nullable = true)
 |-- Smokes (packs/year): float (nullable = true)
 |-- Hormonal Contraceptives: float (nullable = true)
 |-- Hormonal Contraceptives (years): float (nullable = true)
 |-- IUD: float (nullable = true)
 |-- IUD (years): float (nullable = true)
 |-- STDs: float (nullable = true)
 |-- STDs (number): float (nullable = true)
 |-- STDs:condylomatosis: float (nullable = true)
 |-- STDs:cervical condylomatosis: float (nullable = true)
 |-- STDs:vaginal condylomatosis: float (nullable = true)
 |-- STDs:vulvo-perineal condylomatosis: float (nullable = true)
 |-- STDs:syphilis: float (nullable = true)
 |-- STDs:pelvic inflammatory disease: float (nullable = true)
 |-- STDs:genital herpes: float (nullable = true)
 |-- ST

In [50]:
# List the column names of the typed DataFrame (still the original header names)
dataframe.columns

['Age',
 'Number of sexual partners',
 'First sexual intercourse',
 'Num of pregnancies',
 'Smokes',
 'Smokes (years)',
 'Smokes (packs/year)',
 'Hormonal Contraceptives',
 'Hormonal Contraceptives (years)',
 'IUD',
 'IUD (years)',
 'STDs',
 'STDs (number)',
 'STDs:condylomatosis',
 'STDs:cervical condylomatosis',
 'STDs:vaginal condylomatosis',
 'STDs:vulvo-perineal condylomatosis',
 'STDs:syphilis',
 'STDs:pelvic inflammatory disease',
 'STDs:genital herpes',
 'STDs:molluscum contagiosum',
 'STDs:AIDS',
 'STDs:HIV',
 'STDs:Hepatitis B',
 'STDs:HPV',
 'STDs: Number of diagnosis',
 'STDs: Time since first diagnosis',
 'STDs: Time since last diagnosis',
 'Dx:Cancer',
 'Dx:CIN',
 'Dx:HPV',
 'Dx',
 'Hinselmann',
 'Schiller',
 'Citology',
 'Biopsy']

In [19]:
# Select every column of the typed DataFrame into table_df and preview it
all_columns = dataframe.columns
table_df = dataframe.select(all_columns)
table_df.show()

+---+-------------------------+------------------------+------------------+------+--------------+-------------------+-----------------------+-------------------------------+----+-----------+----+-------------+-------------------+----------------------------+---------------------------+----------------------------------+-------------+--------------------------------+-------------------+--------------------------+---------+--------+----------------+--------+-------------------------+--------------------------------+-------------------------------+---------+------+------+---+----------+--------+--------+------+
|Age|Number of sexual partners|First sexual intercourse|Num of pregnancies|Smokes|Smokes (years)|Smokes (packs/year)|Hormonal Contraceptives|Hormonal Contraceptives (years)| IUD|IUD (years)|STDs|STDs (number)|STDs:condylomatosis|STDs:cervical condylomatosis|STDs:vaginal condylomatosis|STDs:vulvo-perineal condylomatosis|STDs:syphilis|STDs:pelvic inflammatory disease|STDs:genital her

In [90]:
### Rename Columns
# Maps each original CSV column name to the snake_case name used for the
# database table. Insertion order matches the column order of the DataFrame.
column_renames = {
    "Age": "age",
    "Number of sexual partners": "num_of_sexual_partners",
    "First sexual intercourse": "first_sexual_intercourse",
    "Num of pregnancies": "num_of_pregnancies",
    "Smokes": "smokes",
    "Smokes (years)": "smokes_years",
    "Smokes (packs/year)": "smokes_packs_per_year",
    "Hormonal Contraceptives": "hormonal_contraceptives",
    "Hormonal Contraceptives (years)": "hormonal_contraceptives_years",
    "IUD": "iud",
    "IUD (years)": "iud_years",
    "STDs": "stds",
    "STDs (number)": "stds_number",
    "STDs:condylomatosis": "stds_condylomatosis",
    "STDs:cervical condylomatosis": "stds_cervical_condylomatosis",
    "STDs:vaginal condylomatosis": "stds_vaginal_condylomatosis",
    "STDs:vulvo-perineal condylomatosis": "stds_vulvo_perineal_condylomatosis",
    "STDs:syphilis": "stds_syphilis",
    "STDs:pelvic inflammatory disease": "stds_pelvic_inflamatory_disease",
    "STDs:genital herpes": "stds_genital_herpes",
    "STDs:molluscum contagiosum": "stds_molluscum_contagiosum",
    "STDs:AIDS": "stds_aids",
    "STDs:HIV": "stds_hiv",
    "STDs:Hepatitis B": "stds_hepatitis_b",
    "STDs:HPV": "stds_hpv",
    "STDs: Number of diagnosis": "stds_num_of_diagnosis",
    "STDs: Time since first diagnosis": "stds_time_since_first_diagnosis",
    "STDs: Time since last diagnosis": "stds_times_since_last_diagnosis",
    "Dx:Cancer": "dx_cancer",
    "Dx:CIN": "dx_cin",
    "Dx:HPV": "dx_hpv",
    "Dx": "dx",
    "Hinselmann": "hinselmann",
    "Schiller": "schiller",
    "Citology": "citology",
    "Biopsy": "biopsy",
}

# Select the source columns explicitly, in mapping order
dataframe1 = dataframe.select(list(column_renames))

# Apply the renames one column at a time, in mapping order
renamed_df = dataframe1
for original_name, table_name in column_renames.items():
    renamed_df = renamed_df.withColumnRenamed(original_name, table_name)
renamed_df.show()



+---+----------------------+------------------------+------------------+------+------------+---------------------+-----------------------+-----------------------------+----+---------+----+-----------+-------------------+----------------------------+---------------------------+----------------------------------+-------------+-------------------------------+-------------------+--------------------------+---------+--------+----------------+--------+---------------------+-------------------------------+-------------------------------+---------+------+------+---+----------+--------+--------+------+
|age|num_of_sexual_partners|first_sexual_intercourse|num_of_pregnancies|smokes|smokes_years|smokes_packs_per_year|hormonal_contraceptives|hormonal_contraceptives_years| iud|iud_years|stds|stds_number|stds_condylomatosis|stds_cervical_condylomatosis|stds_vaginal_condylomatosis|stds_vulvo_perineal_condylomatosis|stds_syphilis|stds_pelvic_inflamatory_disease|stds_genital_herpes|stds_molluscum_contag

In [87]:
# List the column names after renaming.
# NOTE(review): the recorded output for this cell was produced by an earlier
# execution than the rename cell and shows names such as 'smokes(years)' that
# the current rename code does not produce -- re-run to refresh it.
renamed_df.columns

['age',
 'num_of_sexual_partners',
 'first_sexual_intercourse',
 'num_of_pregnancies',
 'smokes',
 'smokes(years)',
 'smokes(packs_per_year)',
 'hormonal_contraceptives',
 'hormonal_contraceptives(years)',
 'iud',
 'iud(years)',
 'stds',
 'stds_number',
 'stds_condylomatosis',
 'stds_cervical_condylomatosis',
 'stds_vaginal_condylomatosis',
 'stds_vulvo_perineal_condylomatosis',
 'stds_syphilis',
 'stds_pelvic_inflamatory_disease',
 'stds_genital_herpes',
 'stds_molluscum_contagiosum',
 'stds_aids',
 'stds_hiv',
 'stds_hepatitis_b',
 'stds_hpv',
 'stds_num_of_diagnosis',
 'stds_time_since_first_diagnosis',
 'stds_times_since_last_diagnosis',
 'dx_cancer',
 'dx_cin',
 'dx_hpv',
 'dx',
 'hinselmann',
 'schiller',
 'citology',
 'biopsy']

# Configuration for RDS instance

In [97]:
# Connection settings for the RDS PostgreSQL instance.
# The endpoint and credentials are taken from the RDS_ENDPOINT, RDS_USER and
# RDS_PASSWORD environment variables when they are set, so real secrets never
# have to be typed into (or saved with) the notebook. The original placeholder
# strings remain as fallbacks.
# NOTE(review): with mode "append", re-running the write cell presumably adds
# the same rows to the table again -- confirm this is intended.
mode = "append"
rds_endpoint = os.environ.get("RDS_ENDPOINT", "<endpoint>")
jdbc_url = f"jdbc:postgresql://{rds_endpoint}:5432/cancer_risk_db"
config = {
    "user": os.environ.get("RDS_USER", "<user>"),
    "password": os.environ.get("RDS_PASSWORD", "<password>"),
    "driver": "org.postgresql.Driver",
}

# Write the DataFrame to a table in AWS RDS

In [92]:
# Write renamed_df to the 'cancer_risk' table in RDS, using the JDBC URL,
# write mode and connection properties defined in the configuration cell.
renamed_df.write.jdbc(url=jdbc_url, table='cancer_risk', mode=mode, properties=config)

# Read table from AWS RDS

In [101]:
# Read the 'cancer_risk' table back from RDS.
# This reuses the SparkSession created earlier in the notebook (which already
# has the PostgreSQL JDBC jar on its classpath) and the connection settings
# from the configuration cell, rather than rebuilding a session with a
# placeholder jar path and keeping a second copy of the endpoint/credentials.
# `url` and `properties` stay defined for any later cell that refers to them;
# `properties` is a copy, so editing it does not alter `config`.
url = jdbc_url
properties = dict(config)
postgres_df = spark.read.jdbc(url=url, table='cancer_risk', properties=properties)
postgres_df.show()

+---+----------------------+------------------------+------------------+------+------------------+---------------------+-----------------------+-----------------------------+----+---------+----+-----------+-------------------+----------------------------+---------------------------+----------------------------------+-------------+-------------------------------+-------------------+--------------------------+---------+--------+----------------+--------+---------------------+-------------------------------+-------------------------------+---------+------+------+---+----------+--------+--------+------+
|age|num_of_sexual_partners|first_sexual_intercourse|num_of_pregnancies|smokes|      smokes_years|smokes_packs_per_year|hormonal_contraceptives|hormonal_contraceptives_years| iud|iud_years|stds|stds_number|stds_condylomatosis|stds_cervical_condylomatosis|stds_vaginal_condylomatosis|stds_vulvo_perineal_condylomatosis|stds_syphilis|stds_pelvic_inflamatory_disease|stds_genital_herpes|stds_moll

# Run Queries

In [102]:
# Keep only the smoking-related columns of the table read back from RDS
smoking_columns = ["smokes", "smokes_years", "smokes_packs_per_year"]
smokes_df = postgres_df.select(*smoking_columns)
smokes_df.show()

+------+------------------+---------------------+
|smokes|      smokes_years|smokes_packs_per_year|
+------+------------------+---------------------+
|   0.0|               0.0|                  0.0|
|   0.0|               0.0|                  0.0|
|   0.0|               0.0|                  0.0|
|   1.0|              37.0|                 37.0|
|   0.0|               0.0|                  0.0|
|   0.0|               0.0|                  0.0|
|   1.0|              34.0|   3.4000000953674316|
|   0.0|               0.0|                  0.0|
|   0.0|               0.0|                  0.0|
|   1.0|1.2669728994369507|    2.799999952316284|
|   0.0|               0.0|                  0.0|
|   0.0|               0.0|                  0.0|
|   0.0|               0.0|                  0.0|
|   0.0|               0.0|                  0.0|
|   0.0|               0.0|                  0.0|
|   0.0|               0.0|                  0.0|
|   0.0|               0.0|                  0.0|


In [103]:
# Display summary statistics for the columns of the table read back from RDS
postgres_df.summary().show()

+-------+-----------------+----------------------+------------------------+------------------+-------------------+------------------+---------------------+-----------------------+-----------------------------+-------------------+------------------+-------------------+-------------------+-------------------+----------------------------+---------------------------+----------------------------------+-------------------+-------------------------------+--------------------+--------------------------+---------+-------------------+--------------------+--------------------+---------------------+-------------------------------+-------------------------------+-------------------+-------------------+-------------------+--------------------+-------------------+-------------------+-------------------+------------------+
|summary|              age|num_of_sexual_partners|first_sexual_intercourse|num_of_pregnancies|             smokes|      smokes_years|smokes_packs_per_year|hormonal_contraceptives|ho