# Reading / Writing files from HDFS in CDSW

In [None]:
%%bash
## Cleanup - delete file if exits 
hdfs dfs -rm -f /tmp/1988.csv.bz2
hdfs dfs -rm -f /tmp/airports.csv
hdfs dfs -rm -f /tmp/airports_python.csv

## Option 1 : Use HDF CLI (from the command line or python)
### Copy data To and From the local file system and work from there
#### **Applicable to:** **"small and medium"** datasets ; ie : small enough to be easily managed/processed locally. <br>
**NOTE:** To be used in the context of **sensitive data and/or data that requiere strong governance** as it creates a break in the data chain of custody / security. <br> 
This is not a best practice in terms of Data Management/Governance.

### 1. Copy data TO HDFS from local file system 
#### Using the command line (works in workbench as well)

In [2]:
%%bash
### Copy to HDFS - using HDFS client - from Bash
# NOTE: Only functional from a Bash script or an interactive session 
hdfs dfs -copyFromLocal -f /home/cdsw/airlines/airports/airports.csv /tmp/
hdfs dfs -ls /tmp

Found 4 items
d---------   - hdfs    supergroup          0 2020-03-08 21:50 /tmp/.cloudera_health_monitoring_canary_files
-rw-r--r--   2 systest supergroup     244438 2020-03-08 21:51 /tmp/airports.csv
drwxrwxrwx   - hive    supergroup          0 2020-03-06 18:30 /tmp/hive
drwxrwxrwt   - mapred  hadoop              0 2020-03-06 18:24 /tmp/logs




#### Using Python - Write data to the HDFS CLI (via subprocess)

In [3]:
### local and HDFS Paths 
local_path="/home/cdsw/airlines/airports/airports.csv"
hdfs_path="/tmp/airports_python.csv"

In [4]:
### Copy to HDFS - Using HDFS Client - From Python ( using subprocess )
from subprocess import Popen, PIPE
import sys

def hdfs_write(local_path,hdfs_path):
    ### Copy to HDFS - Python (using subprocess)
    from subprocess import Popen, PIPE
    put = Popen(["hadoop","fs","-put","-f", local_path, hdfs_path], stdin=PIPE,stdout=PIPE,stderr=PIPE)
    stdout, stderr = put.communicate()
    
    ## Error handling
    if put.returncode != 0: 
        raise IOError(stderr)

hdfs_write(local_path,hdfs_path)

#Show hdfs path
!hdfs dfs -ls /tmp

Found 5 items
d---------   - hdfs    supergroup          0 2020-03-08 21:50 /tmp/.cloudera_health_monitoring_canary_files
-rw-r--r--   2 systest supergroup     244438 2020-03-08 21:51 /tmp/airports.csv
-rw-r--r--   2 systest supergroup     244438 2020-03-08 21:51 /tmp/airports_python.csv
drwxrwxrwx   - hive    supergroup          0 2020-03-06 18:30 /tmp/hive
drwxrwxrwt   - mapred  hadoop              0 2020-03-06 18:24 /tmp/logs


####  Tip: Use a pipe to download files directly to HDFS (command line)

In [5]:
%%bash
## NOTE: Only functional from a Bash script or an interactive session 
export DOWNLOAD_LINK='https://mlamairesse.s3-eu-west-1.amazonaws.com/Airlines_Dataset/1988.csv.bz2'
curl $DOWNLOAD_LINK | hadoop fs -put - /tmp/1988.csv.bz2

hdfs dfs -ls /tmp

Found 6 items
d---------   - hdfs    supergroup          0 2020-03-08 21:50 /tmp/.cloudera_health_monitoring_canary_files
-rw-r--r--   2 systest supergroup   49499025 2020-03-08 21:51 /tmp/1988.csv.bz2
-rw-r--r--   2 systest supergroup     244438 2020-03-08 21:51 /tmp/airports.csv
-rw-r--r--   2 systest supergroup     244438 2020-03-08 21:51 /tmp/airports_python.csv
drwxrwxrwx   - hive    supergroup          0 2020-03-06 18:30 /tmp/hive
drwxrwxrwt   - mapred  hadoop              0 2020-03-06 18:24 /tmp/logs


  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0  0 47.2M    0 85614    0     0  46153      0  0:17:52  0:00:01  0:17:51 46153  5 47.2M    5 2443k    0     0   988k      0  0:00:48  0:00:02  0:00:46  988k 30 47.2M   30 14.4M    0     0  4281k      0  0:00:11  0:00:03  0:00:08 4280k 56 47.2M   56 26.5M    0     0  6116k      0  0:00:07  0:00:04  0:00:03 6115k 81 47.2M   81 38.6M    0     0  7283k      0  0:00:06  0:00:05  0:00:01 8044k100 47.2M  100 47.2M    0     0  7974k      0  0:00:06  0:00:06 --:--:-- 11.2M


### 2. Copy data FROM HDFS to the local file sytem 
#### Command line 

In [6]:
## Copy the file from HDFS
!hdfs dfs -get airlines/airports/airports.csv /home/cdsw/airlines/from_hdfs/
#show content of directory
!ls -lh ~/airlines/from_hdfs/

get: `/home/cdsw/airlines/from_hdfs/airports.csv': File exists
total 240K
-rw-r--r-- 1 cdsw cdsw 239K Mar  8 20:55 airports.csv


### 3. Read Data FROM HDFS

#### Using the console ( get a feel for what data looks like ) 
**NOTE:** using `-text` rather than `-cat` allows reading from compressed files (zip,gz,bz2,...)

In [7]:
!hdfs dfs -text airlines/airports/airports.csv | head -n 5

"iata","airport","city","state","country","lat","long"
"00M","Thigpen ","Bay Springs","MS","USA",31.95376472,-89.23450472
"00R","Livingston Municipal","Livingston","TX","USA",30.68586111,-95.01792778
"00V","Meadow Lake","Colorado Springs","CO","USA",38.94574889,-104.5698933
"01G","Perry-Warsaw","Perry","NY","USA",42.74134667,-78.05208056
text: Unable to write to output stream.


#### Using Python - Read data from HDFS using the HDFS CLI (via subprocess) 

In [11]:
from subprocess import Popen, PIPE
import sys

def hdfs_read(hdfs_path):
    proc = Popen(['hadoop', 'fs', '-text' , hdfs_path], stdout=PIPE, stderr=PIPE, universal_newlines=False)
    stdout, stderr = proc.communicate()
    
    if proc.returncode != 0:
        if 'No such file or directory' in stderr.decode('utf-8'):
            raise FileNotFoundError('No such file or directory: {}'.format(local_path))
        else : 
            raise IOError(stderr)
    
    return stdout
    #Return a "bytes" object ; 

HDFS_file = hdfs_read("airlines/airports/airports.csv").decode('utf-8') #decode as a string - File in Memory
HDFS_file[0:50] # get first 50 char

'"iata","airport","city","state","country","lat","l'

#### Using Pandas - Read data from HDFS using HDFS CLI (via subprocess) 


In [17]:
### NOTE: Can be read directly from Pandas with a bit of transformation: 
### Pandas accepts file, path or StringIO object 
### https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html
from io import StringIO
import pandas as pd 
airlines_pd_df = pd.read_csv(StringIO(hdfs_read("airlines/airports/airports.csv").decode('utf-8')),
                             sep=',', delimiter=None, header='infer')
airlines_pd_df.head()

Unnamed: 0,iata,airport,city,state,country,lat,long
0,00M,Thigpen,Bay Springs,MS,USA,31.953765,-89.234505
1,00R,Livingston Municipal,Livingston,TX,USA,30.685861,-95.017928
2,00V,Meadow Lake,Colorado Springs,CO,USA,38.945749,-104.569893
3,01G,Perry-Warsaw,Perry,NY,USA,42.741347,-78.052081
4,01J,Hilliard Airpark,Hilliard,FL,USA,30.688012,-81.905944


## Option 2 - Read/Write using spark
### **Applicable to:**  All datasets and large ones in particular <br> 
Using spark, allows us to use the data without having to copy first. It's much cleaner in terms of chain of custody <br>
**NOTE:** For large dataset, it also allows us to do filtering, pre-processing and filtering in a distributed manner which is much more efficient. 

### 1. Reading data from HDFS

#### 1.1 Start the spark session
Custom session configuration can be defined either in the session parameters as below OR
inside a `spark-defaults.conf` file stored at the root of the project (in which case the configs become project wide)

In [18]:
from pyspark.sql import SparkSession

spark = SparkSession.builder\
    .master("yarn")\
    .appName("Airline")\
    .config("spark.executor.memory","2g")\
    .config("spark.executor.cores","2")\
    .config("spark.driver.memory","2g")\
    .config("spark.executor.instances","2")\
    .getOrCreate()

In [19]:
## Adding a link to the Spark UI for demo purposes
## Also available in the session tab
from IPython.core.display import HTML
import os
HTML('<a href="http://spark-{}.{}" target="_blank" >Spark UI</a>'.\
    format(os.getenv("CDSW_ENGINE_ID"),os.getenv("CDSW_DOMAIN")))

#### 1.2 Read Data - CSV file stored on HDFS 

In [20]:
path='airlines/airports/airports.csv' #HDFS location (relative path)

airports_df = spark.read.csv(
    path=path,
    header=True,
    sep=',',
    inferSchema=True,
    nullValue=None
)
airports_df.printSchema()

root
 |-- iata: string (nullable = true)
 |-- airport: string (nullable = true)
 |-- city: string (nullable = true)
 |-- state: string (nullable = true)
 |-- country: string (nullable = true)
 |-- lat: double (nullable = true)
 |-- long: double (nullable = true)



**Note** : in the above example, I'm infering the schema from the file. <br>
In a re-usable script, it's actually good practice to set the schema to prevent erroneous type casting

In [21]:
from pyspark.sql.types import *

path='airlines/airports/airports.csv' #HDFS location
schema = StructType([StructField("iata", StringType(), True),
                     StructField("airport", StringType(), True),
                     StructField("city", StringType(), True),
                     StructField("state", StringType(), True),
                     StructField("country", StringType(), True),
                     StructField("lat",  DoubleType(), True),
                     StructField("long",  DoubleType(), True)
                    ])

airports_df = spark.read.csv(
    path=path,
    schema=schema,
    header=True,
    sep=',',
    nullValue=None) 
airports_df.show(5)

+----+--------------------+----------------+-----+-------+-----------+------------+
|iata|             airport|            city|state|country|        lat|        long|
+----+--------------------+----------------+-----+-------+-----------+------------+
| 00M|            Thigpen |     Bay Springs|   MS|    USA|31.95376472|-89.23450472|
| 00R|Livingston Municipal|      Livingston|   TX|    USA|30.68586111|-95.01792778|
| 00V|         Meadow Lake|Colorado Springs|   CO|    USA|38.94574889|-104.5698933|
| 01G|        Perry-Warsaw|           Perry|   NY|    USA|42.74134667|-78.05208056|
| 01J|    Hilliard Airpark|        Hilliard|   FL|    USA| 30.6880125|-81.90594389|
+----+--------------------+----------------+-----+-------+-----------+------------+
only showing top 5 rows



#### 1.3 (optional) Transform data to Pandas Dataframe
#### Once converted **ALL DATA will be brought locally** and distributed processing ends 
* **Applicable to : SMALL to MEDIUM size datasets** - ie : datasets that can be easily managed/processed locally
* When working with **LARGE datasets** :  **data should be sampled** before bringing it locally

> **Good Practice**:  Spark context should be stopped `spark.stop()` to release cluster ressources once data is copied

In [23]:
#without sampling
import pandas 
airport_pandas_df = airports_df.toPandas()
airport_pandas_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3376 entries, 0 to 3375
Data columns (total 7 columns):
iata       3376 non-null object
airport    3376 non-null object
city       3376 non-null object
state      3376 non-null object
country    3376 non-null object
lat        3376 non-null float64
long       3376 non-null float64
dtypes: float64(2), object(5)
memory usage: 184.8+ KB


In [24]:
#with sampling
sample_pandas_df = airports_df.sample(1/3,seed=30).toPandas()
sample_pandas_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1111 entries, 0 to 1110
Data columns (total 7 columns):
iata       1111 non-null object
airport    1111 non-null object
city       1111 non-null object
state      1111 non-null object
country    1111 non-null object
lat        1111 non-null float64
long       1111 non-null float64
dtypes: float64(2), object(5)
memory usage: 60.9+ KB


### 2. Write data from PANDAS to HDFS - Using Spark

#### 2.1 Read data locally using Pandas

In [25]:
## read from pandas
import pandas as pd
airlines_pd_df = pd.read_csv("/home/cdsw/airlines/airports/airports.csv",sep=',', delimiter=None, header='infer')
airlines_pd_df.sort_values(by=['state','airport'],inplace=True) # ordering to keep same visulisation order as below
airlines_pd_df.head()

Unnamed: 0,iata,airport,city,state,country,lat,long
776,ADK,Adak,Adak,AK,USA,51.877964,-176.646031
818,AKK,Akhiok,Akhiok,AK,USA,56.938691,-154.182556
3363,Z13,Akiachak,Akiachak,AK,USA,60.904532,-161.42091
817,AKI,Akiak,Akiak,AK,USA,60.904812,-161.227019
1994,KQA,Akutan SPB,Akutan,AK,USA,54.132467,-165.785311


#### 2.2 Transform Pandas DataFrame to Spark DataFrame
With spark 2.3 and up, integration with Pandas has been reinforced notably with the use of Arrow for faster data transfers [https://issues.apache.org/jira/browse/SPARK-20791]

In [34]:
# #(optional) Enable Arrow-based optimised columnar data transfers ; Note : still marked as experimental
# #PyArrow must be intalled on all spark nodes
# #https://spark.apache.org/docs/latest/sql-pyspark-pandas-with-arrow.html#ensure-pyarrow-installed
# spark.conf.set("spark.sql.execution.arrow.enabled", "true") # Note : Compatible only with pyarrow 0.8.0 

#(optional) good practice to define schema to prevent any type casting errors
from pyspark.sql.types import *

schema = StructType([StructField("iata", StringType(), True),
                     StructField("airport", StringType(), True),
                     StructField("city", StringType(), True),
                     StructField("state", StringType(), True),
                     StructField("country", StringType(), True),
                     StructField("lat",  DoubleType(), True),
                     StructField("long",  DoubleType(), True)
                    ])

spark_df=spark.createDataFrame(airlines_pd_df,schema=schema)
spark_df.orderBy(['state','airport']).show(5) # ordering to keep same visulisation order 

+----+----------+--------+-----+-------+-----------+-------------------+
|iata|   airport|    city|state|country|        lat|               long|
+----+----------+--------+-----+-------+-----------+-------------------+
| ADK|      Adak|    Adak|   AK|    USA|51.87796389|       -176.6460306|
| AKK|    Akhiok|  Akhiok|   AK|    USA|56.93869083|       -154.1825556|
| Z13|  Akiachak|Akiachak|   AK|    USA|60.90453167|-161.42091000000002|
| AKI|     Akiak|   Akiak|   AK|    USA|60.90481194|       -161.2270189|
| KQA|Akutan SPB|  Akutan|   AK|    USA|54.13246694|       -165.7853111|
+----+----------+--------+-----+-------+-----------+-------------------+
only showing top 5 rows



In [None]:
# example of a typecasting error... 
spark_df=spark.createDataFrame(airlines_pd_df)

#### 2.3 Write to HDFS - using Spark

In [8]:
## It's good practice to restructure data before writing to HDFS : Spark write a file by partition. 
## this can lead to lots of small files which is counterproductive both for read and write. 
## Re-organize data using the "coalesce" function to define the number of files to be saved
spark_df.coalesce(2).write.parquet('/tmp/airlines/', mode='overwrite')

!hdfs dfs -ls /tmp/airlines/

Found 3 items
-rw-r--r--   2 systest supergroup          0 2020-01-07 11:53 /tmp/airlines/_SUCCESS
-rw-r--r--   2 systest supergroup      72942 2020-01-07 11:53 /tmp/airlines/part-00000-cc364f67-10e5-46e6-8468-d85f79b60ef3-c000.snappy.parquet
-rw-r--r--   2 systest supergroup      72504 2020-01-07 11:53 /tmp/airlines/part-00001-cc364f67-10e5-46e6-8468-d85f79b60ef3-c000.snappy.parquet


#### 2.4 Write Data to Hive - using Spark
Spark to hive integration makes it very easy to interact with the cluster. 

In [35]:
# Note : Ordering on write can help optimise reads later on. 
spark_df.orderBy(['state','airport']).coalesce(2)\
    .write.format('parquet').mode("overwrite")\
    .saveAsTable('default.airports')

### 3. Read Data from Hive 
All hive configurations are already injected into spark.  Therefore Hive can be called directly using a spark sql context.

#### 3.1 Read data from hive

In [39]:
# use *spark.sql("some SQL statement")* to execture queries against hive
sql_statement = '''show tables in default'''
spark.sql(sql_statement).show()

+--------+---------+-----------+
|database|tableName|isTemporary|
+--------+---------+-----------+
| default| airports|      false|
| default|customers|      false|
| default|sample_07|      false|
| default|sample_08|      false|
| default| web_logs|      false|
+--------+---------+-----------+



In [40]:
# Ex. read table
sql_statement = '''select * from default.airports where state = "AK" '''
airports_df = spark.sql(sql_statement)
airports_df.show(10)

+----+----------+--------------+-----+-------+-----------+-------------------+
|iata|   airport|          city|state|country|        lat|               long|
+----+----------+--------------+-----+-------+-----------+-------------------+
| ADK|      Adak|          Adak|   AK|    USA|51.87796389|       -176.6460306|
| AKK|    Akhiok|        Akhiok|   AK|    USA|56.93869083|       -154.1825556|
| Z13|  Akiachak|      Akiachak|   AK|    USA|60.90453167|-161.42091000000002|
| AKI|     Akiak|         Akiak|   AK|    USA|60.90481194|       -161.2270189|
| KQA|Akutan SPB|        Akutan|   AK|    USA|54.13246694|       -165.7853111|
| AUK|  Alakanuk|      Alakanuk|   AK|    USA|62.68004417|       -164.6599253|
| 5A8| Aleknagik|     Aleknagik|   AK|    USA|59.28256167|       -158.6176725|
| 6A8| Allakaket|     Allakaket|   AK|    USA|66.55194444|-152.62222219999998|
| BIG| Allen AAF|Delta Junction|   AK|    USA|63.99454722|       -145.7216417|
| AFM|    Ambler|        Ambler|   AK|    USA|67.106

#### 3.2 (optional) convert to pandas dataframe (see 1.3)

In [41]:
#(OPTIONAL) convert to pandas 
airlines_pd_df = airports_df.toPandas()
airlines_pd_df.head()

Unnamed: 0,iata,airport,city,state,country,lat,long
0,ADK,Adak,Adak,AK,USA,51.877964,-176.646031
1,AKK,Akhiok,Akhiok,AK,USA,56.938691,-154.182556
2,Z13,Akiachak,Akiachak,AK,USA,60.904532,-161.42091
3,AKI,Akiak,Akiak,AK,USA,60.904812,-161.227019
4,KQA,Akutan SPB,Akutan,AK,USA,54.132467,-165.785311


In [42]:
spark.stop() ## Release spark ressources

#### ***NOTE:*** Pandas Dataframe is still available

In [43]:
airport_pandas_df.head()

Unnamed: 0,iata,airport,city,state,country,lat,long
0,00M,Thigpen,Bay Springs,MS,USA,31.953765,-89.234505
1,00R,Livingston Municipal,Livingston,TX,USA,30.685861,-95.017928
2,00V,Meadow Lake,Colorado Springs,CO,USA,38.945749,-104.569893
3,01G,Perry-Warsaw,Perry,NY,USA,42.741347,-78.052081
4,01J,Hilliard Airpark,Hilliard,FL,USA,30.688012,-81.905944


## 4 - Read Directly from HIVE - over JDBC 
**NOTE:** Must know Hive host and port (default 10000) information

**To Do before running** : Replace **hive_host** and **hive_port** information with Hive host information pertinent to the cluster used

In [21]:
hive_host='mlamairesse-1.vpc.cloudera.com'
hive_port=10000 #default 10000

### 4.1 using PyHive

In [47]:
# create a connection

# doc for connection string : 
#connection=hive.Connection(host=<hive_host>, port=<hive_port (default=10000)>, auth='KERBEROS', kerberos_service_name='hive')

connection=hive.Connection(host=hive_host, port=hive_port, auth='KERBEROS', kerberos_service_name='hive')
type(connection)

pyhive.hive.Connection

In [48]:
# Basic reading - using cursor
cur = connection.cursor()
cur.execute('Select * from default.airports LIMIT 20')
results = cur.fetchall()
for line in results : 
    print(line)

('ADK', 'Adak', 'Adak', 'AK', 'USA', 51.87796389, -176.6460306)
('AKK', 'Akhiok', 'Akhiok', 'AK', 'USA', 56.93869083, -154.1825556)
('Z13', 'Akiachak', 'Akiachak', 'AK', 'USA', 60.90453167, -161.42091000000002)
('AKI', 'Akiak', 'Akiak', 'AK', 'USA', 60.90481194, -161.2270189)
('KQA', 'Akutan SPB', 'Akutan', 'AK', 'USA', 54.13246694, -165.7853111)
('AUK', 'Alakanuk', 'Alakanuk', 'AK', 'USA', 62.68004417, -164.6599253)
('5A8', 'Aleknagik', 'Aleknagik', 'AK', 'USA', 59.28256167, -158.6176725)
('6A8', 'Allakaket', 'Allakaket', 'AK', 'USA', 66.55194444, -152.62222219999998)
('BIG', 'Allen AAF', 'Delta Junction', 'AK', 'USA', 63.99454722, -145.7216417)
('AFM', 'Ambler', 'Ambler', 'AK', 'USA', 67.10610472, -157.85362030000002)
('AKP', 'Anaktuvuk Pass', 'Anaktuvuk Pass', 'AK', 'USA', 68.1343225, -151.74168)
('AGN', 'Angoon SPB', 'Angoon', 'AK', 'USA', 57.50355528, -134.5850939)
('ANI', 'Aniak', 'Aniak', 'AK', 'USA', 61.58159694, -159.5430428)
('ANV', 'Anvik', 'Anvik', 'AK', 'USA', 62.64858333,

In [49]:
### Using Pandas API ###
# Pandas accepts db connection as a reader 
# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_sql.html
import pandas as pd
airlines_pd_df = pd.read_sql('select * from default.airports limit 20',connection)
airlines_pd_df.head()

Unnamed: 0,airports.iata,airports.airport,airports.city,airports.state,airports.country,airports.lat,airports.long
0,ADK,Adak,Adak,AK,USA,51.877964,-176.646031
1,AKK,Akhiok,Akhiok,AK,USA,56.938691,-154.182556
2,Z13,Akiachak,Akiachak,AK,USA,60.904532,-161.42091
3,AKI,Akiak,Akiak,AK,USA,60.904812,-161.227019
4,KQA,Akutan SPB,Akutan,AK,USA,54.132467,-165.785311


In [51]:
connection.close() #don't forget to close the connection !!

### 4.1. Read using SQLAlchemy
##### https://docs.sqlalchemy.org/en/13/core/connections.html

#### 4.1 Create an engine and connect

In [52]:
#with SQLAlchemy
from sqlalchemy import create_engine
hive_host='mlamairesse-1.vpc.cloudera.com'
hive_port='10000' #default 10000

# Doc for connnection string : 
# engine = create_engine("hive://<kerberos-username>@<hive-host>:<hive-port>/<db-name>",connect_args={'auth': 'KERBEROS','kerberos_service_name': 'hive'})

engine = create_engine("hive://"+hive_host+":"+hive_port+"/default",
                     connect_args={'auth': 'KERBEROS','kerberos_service_name': 'hive'})
connection = engine.connect()
type(connection)

sqlalchemy.engine.base.Connection

In [53]:
## basic reading 
data = connection.execute('select * from default.airports LIMIT 20')
for row in data : 
    print(row)

('ADK', 'Adak', 'Adak', 'AK', 'USA', 51.87796389, -176.6460306)
('AKK', 'Akhiok', 'Akhiok', 'AK', 'USA', 56.93869083, -154.1825556)
('Z13', 'Akiachak', 'Akiachak', 'AK', 'USA', 60.90453167, -161.42091000000002)
('AKI', 'Akiak', 'Akiak', 'AK', 'USA', 60.90481194, -161.2270189)
('KQA', 'Akutan SPB', 'Akutan', 'AK', 'USA', 54.13246694, -165.7853111)
('AUK', 'Alakanuk', 'Alakanuk', 'AK', 'USA', 62.68004417, -164.6599253)
('5A8', 'Aleknagik', 'Aleknagik', 'AK', 'USA', 59.28256167, -158.6176725)
('6A8', 'Allakaket', 'Allakaket', 'AK', 'USA', 66.55194444, -152.62222219999998)
('BIG', 'Allen AAF', 'Delta Junction', 'AK', 'USA', 63.99454722, -145.7216417)
('AFM', 'Ambler', 'Ambler', 'AK', 'USA', 67.10610472, -157.85362030000002)
('AKP', 'Anaktuvuk Pass', 'Anaktuvuk Pass', 'AK', 'USA', 68.1343225, -151.74168)
('AGN', 'Angoon SPB', 'Angoon', 'AK', 'USA', 57.50355528, -134.5850939)
('ANI', 'Aniak', 'Aniak', 'AK', 'USA', 61.58159694, -159.5430428)
('ANV', 'Anvik', 'Anvik', 'AK', 'USA', 62.64858333,

In [54]:
### Using Pandas API ###
# Pandas accepts db connection as a reader 
# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_sql.html
import pandas as pd
airlines_pd_df = pd.read_sql('select * from default.airports limit 20',connection)
airlines_pd_df

Unnamed: 0,iata,airport,city,state,country,lat,long
0,ADK,Adak,Adak,AK,USA,51.877964,-176.646031
1,AKK,Akhiok,Akhiok,AK,USA,56.938691,-154.182556
2,Z13,Akiachak,Akiachak,AK,USA,60.904532,-161.42091
3,AKI,Akiak,Akiak,AK,USA,60.904812,-161.227019
4,KQA,Akutan SPB,Akutan,AK,USA,54.132467,-165.785311
5,AUK,Alakanuk,Alakanuk,AK,USA,62.680044,-164.659925
6,5A8,Aleknagik,Aleknagik,AK,USA,59.282562,-158.617672
7,6A8,Allakaket,Allakaket,AK,USA,66.551944,-152.622222
8,BIG,Allen AAF,Delta Junction,AK,USA,63.994547,-145.721642
9,AFM,Ambler,Ambler,AK,USA,67.106105,-157.85362


In [55]:
connection.close() ### Don't forget to close the connection