## Setup Connection to S3 Bucket: 

In [89]:
import boto3
import sagemaker
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline
%config InlineBackend.figure_format='retina'

sagemaker_session = sagemaker.Session()
role = sagemaker.get_execution_role()
bucket = sagemaker_session.default_bucket()
region = boto3.Session().region_name

sm = boto3.Session().client(service_name="sagemaker", region_name=region)

In [90]:
%%bash

aws s3 ls s3://${bucket}/

2023-03-04 00:23:32 sagemaker-studio-328876407652-1zzlgzyu0f7
2023-03-04 00:28:42 sagemaker-us-east-1-328876407652
2023-03-18 04:47:03 testfinalproject088


In [91]:
# To list the S3 Bucket content
!aws s3 ls s3://sagemaker-us-east-1-993410942383/content-project/airline_data/

                           PRE Airline_Customer_Service/
                           PRE WeatherUS_2016/
                           PRE airlineID_carrier_table/
                           PRE airline_flights_table/
                           PRE airport_codes_table/
                           PRE cancellation_table/
2023-03-12 22:40:29  941254313 WeatherEvents_Jan2016-Dec2021.csv


## Set S3 Source Location (Public S3 Bucket):

In [5]:
s3_public_path = "s3://sagemaker-us-east-1-993410942383/content-project/airline_data/"

In [6]:
%store s3_public_path

Stored 's3_public_path' (str)


## Set S3 Destination Location (Our Private S3 Bucket):

In [7]:
s3_private_path = "s3://testfinalproject088/content/".format(bucket)
print(s3_private_path)

s3://testfinalproject088/content/


In [8]:
%store s3_private_path

Stored 's3_private_path' (str)


## Create Athena Database & Establish Connection: 

In [6]:
!pip install --disable-pip-version-check -q PyAthena==2.1.0

[0m

In [134]:
# Import PyAthena
from pyathena import connect

In [157]:
ingest_create_athena_db_passed = False

In [158]:
%store -r ingest_create_athena_table_parquet_passed

no stored variable or alias ingest_create_athena_table_parquet_passed


In [159]:
# Create Athena Database
database_name = "testanalysis"

# Set S3 staging directory -- this is a temporary directory used for Athena queries
s3_staging_dir = "s3://{0}/athena/staging".format(bucket)

In [160]:
conn = connect(region_name=region, s3_staging_dir=s3_staging_dir)

In [161]:
statement = "CREATE DATABASE IF NOT EXISTS {}".format(database_name)
print(statement)

CREATE DATABASE IF NOT EXISTS testanalysis


In [162]:
# Verify The Database Has Been Created Succesfully
statement = "SHOW DATABASES"

df_show = pd.read_sql(statement, conn)
df_show.head(5)

Unnamed: 0,database_name
0,default
1,testanalysis


## Create Tables For Athena Database: 

### Cancellation Codes Table -

In [123]:
cancellation_s3 = 's3://sagemaker-us-east-1-993410942383/content-project/airline_data/cancellation_table'

In [114]:
table1_name = "cancellation_code"

statement1 = """CREATE EXTERNAL TABLE IF NOT EXISTS {}.{}(
         Index string,
         Code string,
         Description string
) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\,' LINES TERMINATED BY '\\n' LOCATION '{}'
TBLPROPERTIES ('skip.header.line.count'='1')""".format(
    database_name, table1_name, cancellation_s3
)

In [115]:
pd.read_sql(statement1, conn)

In [150]:
# Verify the table was created by way of a query
query1 = "SELECT * FROM {}.{}".format(database_name, table1_name)
pd.read_sql(query1, conn)

Unnamed: 0,index,code,description
0,0,A,Carrier
1,1,B,Weather
2,2,C,National Air System
3,3,D,Security


### Airlline Customer Service Table -

In [163]:
customer_service_s3 = 's3://sagemaker-us-east-1-993410942383/content-project/airline_data/Airline_Customer_Service/'

In [164]:
table2_name = "airline_customer_service"

statement2 = """CREATE EXTERNAL TABLE IF NOT EXISTS {}.{}(
         Carrier string,
         Commitment string,
         Provided string
) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\,' LINES TERMINATED BY '\\n' LOCATION '{}'
TBLPROPERTIES ('skip.header.line.count'='1')""".format(
    database_name, table2_name, customer_service_s3
)

In [165]:
pd.read_sql(statement2, conn)

In [166]:
query2 = "SELECT * FROM {}.{}".format(database_name, table2_name)
pd.read_sql(query2, conn)

Unnamed: 0,carrier,commitment,provided


### Airline Carrier ID Table - 

In [82]:
carrier_id_s3 = 's3://sagemaker-us-east-1-993410942383/content-project/airline_data/airlineID_carrier_table/airlineID_carrier_table.csv'

In [83]:
table3_name = "airline_carrier_id"

statement3 = """CREATE EXTERNAL TABLE IF NOT EXISTS {}.{}(
         Index string,
         Code string,
         Description string,
) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\,' LINES TERMINATED BY '\\n' LOCATION '{}'
TBLPROPERTIES ('skip.header.line.count'='1')""".format(
    database_name, table3_name, carrier_id_s3
)

In [84]:
pd.read_sql(statement3, conn)

Failed to execute query.
Traceback (most recent call last):
  File "/opt/conda/lib/python3.7/site-packages/pyathena/common.py", line 305, in _execute
    **request
  File "/opt/conda/lib/python3.7/site-packages/pyathena/util.py", line 84, in retry_api_call
    return retry(func, *args, **kwargs)
  File "/opt/conda/lib/python3.7/site-packages/tenacity/__init__.py", line 379, in __call__
    do = self.iter(retry_state=retry_state)
  File "/opt/conda/lib/python3.7/site-packages/tenacity/__init__.py", line 314, in iter
    return fut.result()
  File "/opt/conda/lib/python3.7/concurrent/futures/_base.py", line 428, in result
    return self.__get_result()
  File "/opt/conda/lib/python3.7/concurrent/futures/_base.py", line 384, in __get_result
    raise self._exception
  File "/opt/conda/lib/python3.7/site-packages/tenacity/__init__.py", line 382, in __call__
    result = fn(*args, **kwargs)
  File "/opt/conda/lib/python3.7/site-packages/botocore/client.py", line 530, in _api_call
    return

DatabaseError: Execution failed on sql: CREATE EXTERNAL TABLE IF NOT EXISTS testanalysis.airline_carrier_id(
         Index string,
         Code string,
         Description string,
) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\,' LINES TERMINATED BY '\n' LOCATION 's3://sagemaker-us-east-1-993410942383/content-project/airline_data/airlineID_carrier_table/airlineID_carrier_table.csv'
TBLPROPERTIES ('skip.header.line.count'='1')
An error occurred (InvalidRequestException) when calling the StartQueryExecution operation: line 1:8: mismatched input 'EXTERNAL'. Expecting: 'OR', 'SCHEMA', 'TABLE', 'VIEW'
unable to rollback

In [34]:
query = "SELECT * FROM {}.{}".format(database_name, table_name)
pd.read_sql(query, conn)

Unnamed: 0,index,code,description
0,0,A,Carrier
1,1,B,Weather
2,2,C,National Air System
3,3,D,Security


### Airport Codes Table -

In [128]:
airport_codes_s3 = 's3://sagemaker-us-east-1-993410942383/content-project/airline_data/airport_codes_table'

In [154]:
table4_name = "airline_codes"

statement4 = """CREATE EXTERNAL TABLE IF NOT EXISTS {}.{}(
         Index string,
         Code string,
         Description string
) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\,' LINES TERMINATED BY '\\n' LOCATION '{}'
TBLPROPERTIES ('skip.header.line.count'='1')""".format(
    database_name, table4_name, airport_codes_s3
)

In [155]:
pd.read_sql(statement4, conn)

In [156]:
query4 = "SELECT * FROM {}.{}".format(database_name, table4_name)
pd.read_sql(query4, conn)

Unnamed: 0,index,code,description
0,0,01A,"""Afognak Lake"
1,1,03A,"""Granite Mountain"
2,2,04A,"""Lik"
3,3,05A,"""Little Squaw"
4,4,06A,"""Kizhuyak"
...,...,...,...
6380,6380,ZXZ,"""Waterville"
6381,6381,ZYL,"""Sylhet"
6382,6382,ZZU,"""Mzuzu"
6383,6383,ZZV,"""Zanesville"


## Visualize Dataset: 

In [16]:
df = pd.read_csv("s3://sagemaker-us-east-1-993410942383/content-project/airline_data/airport_codes_table.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,Code,Description
0,0,01A,"Afognak Lake, AK: Afognak Lake Airport"
1,1,03A,"Granite Mountain, AK: Bear Creek Mining Strip"
2,2,04A,"Lik, AK: Lik Mining Camp"
3,3,05A,"Little Squaw, AK: Little Squaw Airport"
4,4,06A,"Kizhuyak, AK: Kizhuyak Bay"


In [20]:
df2 = pd.read_csv("s3://sagemaker-us-east-1-993410942383/content-project/airline_data/cancellation_table/CancellationCode_table.csv")
df2.head()

Unnamed: 0.1,Unnamed: 0,Code,Description
0,0,A,Carrier
1,1,B,Weather
2,2,C,National Air System
3,3,D,Security


## Possible questions to explore includes:

1. What is the most common reason for flights being cancelled? 

### 1. What is the most common reason for flights being cancelled? 

In [None]:
# SQL statement
statement = """
SELECT product_category, AVG(star_rating) AS avg_star_rating
FROM {}.{} 
GROUP BY product_category 
ORDER BY avg_star_rating DESC
""".format(
    database_name, table_name
)

print(statement)