## Setup Dependencies

In [1]:
!pip install --disable-pip-version-check -q pip --upgrade > /dev/null
!pip install --disable-pip-version-check -q wrapt --upgrade > /dev/null

### AWS CLI and AWS Python SDK (boto3)

In [2]:
!pip install --disable-pip-version-check -q awscli boto3

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
autogluon-multimodal 1.1.1 requires nvidia-ml-py3==7.352.0, which is not installed.
aiobotocore 2.13.3 requires botocore<1.34.163,>=1.34.70, but you have botocore 1.36.4 which is incompatible.
amazon-sagemaker-sql-magic 0.1.3 requires sqlparse==0.5.0, but you have sqlparse 0.5.3 which is incompatible.
autogluon-core 1.1.1 requires scikit-learn<1.4.1,>=1.3.0, but you have scikit-learn 1.5.2 which is incompatible.
autogluon-core 1.1.1 requires scipy<1.13,>=1.5.4, but you have scipy 1.14.1 which is incompatible.
autogluon-features 1.1.1 requires scikit-learn<1.4.1,>=1.3.0, but you have scikit-learn 1.5.2 which is incompatible.
autogluon-multimodal 1.1.1 requires jsonschema<4.22,>=4.18, but you have jsonschema 4.23.0 which is incompatible.
autogluon-multimodal 1.1.1 requires omegaconf<2.3.0,>=2.1.1, but you have 

### SageMaker

In [3]:
!pip install --disable-pip-version-check -q sagemaker
!pip install --disable-pip-version-check -q smdebug
!pip install --disable-pip-version-check -q sagemaker-experiments

### PyAthena

In [4]:
!pip install --disable-pip-version-check -q PyAthena

### AWS Data Wrangler

In [5]:
!pip install --disable-pip-version-check -q awswrangler

### Zip

In [6]:
!conda install -y zip

Channels:
 - conda-forge
Platform: linux-64
Collecting package metadata (repodata.json): done
Solving environment: done

## Package Plan ##

  environment location: /opt/conda

  added / updated specs:
    - zip


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    conda-25.1.0               |  py311h38be061_0         1.1 MB  conda-forge
    fastapi-0.115.7            |     pyhd8ed1ab_0          72 KB  conda-forge
    libxgboost-2.1.3           |   cpu_h3a1dfae_1         3.1 MB  conda-forge
    py-xgboost-2.1.3           | cpu_pyh1ce2f49_1         131 KB  conda-forge
    py-xgboost-cpu-2.1.3       |     pyhc1a9e11_1          15 KB  conda-forge
    xgboost-2.1.3              | cpu_pyhc1a9e11_1          15 KB  conda-forge
    ------------------------------------------------------------
                                           Total:         4.4 MB

The following NEW packages will be INSTALLE

### Matplotlib

In [7]:
!pip install --disable-pip-version-check -q matplotlib

### Seaborn

In [8]:
!pip install --disable-pip-version-check -q seaborn

## Data lake Setup

In [9]:
import boto3
import sagemaker

session = boto3.session.Session()
region = session.region_name
sagemaker_session = sagemaker.Session()
bucket = 'flightfinalapproachanomalydetection'#'sagemaker-us-east-1-817854913925' #sagemaker_session.default_bucket()

s3 = boto3.Session().client(service_name="s3", region_name=region)

print("Bucket name: {}".format(bucket))

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml
Bucket name: flightfinalapproachanomalydetection


### Verify Bucket Creation

In [10]:
from botocore.client import ClientError

response = None
try:
    response = s3.head_bucket(Bucket=bucket)
    print(response)
    setup_s3_bucket_passed = True
except ClientError as e: 
    print("[ERROR] Cannot find bucket {} in {} due to {}.".format(bucket, response, e))

{'ResponseMetadata': {'RequestId': '6G5RA4H54KXTRB0N', 'HostId': 'cK+UleLI+TS1RY9Pmn6xqORrpkM1OEQ4VzmHcgFKDhtxNgDjEojBsBLueTrKlJfQQJlpecJjvaj2YKK7XBxmT9fm1JMj8XkPArhG7YDwROQ=', 'HTTPStatusCode': 200, 'HTTPHeaders': {'x-amz-id-2': 'cK+UleLI+TS1RY9Pmn6xqORrpkM1OEQ4VzmHcgFKDhtxNgDjEojBsBLueTrKlJfQQJlpecJjvaj2YKK7XBxmT9fm1JMj8XkPArhG7YDwROQ=', 'x-amz-request-id': '6G5RA4H54KXTRB0N', 'date': 'Thu, 23 Jan 2025 02:49:58 GMT', 'x-amz-bucket-region': 'us-east-1', 'x-amz-access-point-alias': 'false', 'content-type': 'application/xml', 'transfer-encoding': 'chunked', 'server': 'AmazonS3'}, 'RetryAttempts': 0}, 'BucketRegion': 'us-east-1', 'AccessPointAlias': False}


In [11]:
role = sagemaker.get_execution_role()
region = boto3.Session().region_name 
account_id = boto3.client(service_name="sagemaker", region_name=region)

In [12]:
from IPython.core.display import display, HTML

display(
    HTML(
        '<b>Review <a target="blank" href="https://s3.console.aws.amazon.com/s3/buckets/flightfinalapproachanomalydetection/?region={}&tab=overview">S3 Bucket</a></b>'.format(
            region, account_id, region
        )
    )
)

  from IPython.core.display import display, HTML


## Create Athena Database Scheme
### Import PyAthena

In [13]:
from pyathena import connect

In [14]:
table_name_csv = "flight_csv_data"
s3_staging_dir = "s3://{0}/athena/staging".format(bucket)

### Create Athena Database

In [15]:
database_name = "flightdata_db"

In [16]:
s3_staging_dir = "s3://{0}/athena/staging".format(bucket)

In [17]:
conn = connect(region_name=region, s3_staging_dir=s3_staging_dir)

In [18]:
statement = "CREATE DATABASE IF NOT EXISTS {}".format(database_name)
print(statement)

CREATE DATABASE IF NOT EXISTS flightdata_db


In [19]:
cursor = conn.cursor()
cursor.execute(statement)
print("Database created successfully")


Database created successfully


In [20]:
import pandas as pd

statement = "SHOW DATABASES"

df_show = pd.read_sql(statement, conn)
df_show.head(5)

  df_show = pd.read_sql(statement, conn)


Unnamed: 0,database_name
0,assignment2_aws
1,default
2,dsoaws
3,flightdata_db


## Register CSV with Athena

In [30]:
drop_statement = "DROP TABLE flightdata_db.flight_csv_data"
cursor = conn.cursor()
cursor.execute(drop_statement)
print("Table dropped successfully.")

Table dropped successfully.


In [31]:
statement = """CREATE EXTERNAL TABLE IF NOT EXISTS flightdata_db.flight_csv_data (
        sample_id int,
        timestep int,
        aileron_pos_lh_deg float,
        aileron_pos_rh_deg float,
        corrected_angle_of_attack_deg float,
        baro_correct_alt_lsp_ft float,
        computed_airspeed_lsp_knots float,
        selected_course_deg float,
        drift_angle_deg float,
        elevator_pos_left_deg float,
        te_flap_pos_disc float,
        glideslope_dev_perc float,
        selected_heading_deg float,
        localizer_dev_perc float,
        core_speed_avg_perc float,
        total_pressure_lsp_millibar float,
        pitch_angle_lsp_deg float,
        roll_angle_lsp_deg float,
        rudder_pos_deg float,
        true_heading_lsp_deg float,
        vertical_accel_g float,
        wind_speed_knots float,
        label int
)
STORED AS TEXTFILE
LOCATION 's3://flightfinalapproachanomalydetection/csv/'
TBLPROPERTIES (
    'skip.header.line.count'='1',
    'field.delim'=',',
    'compressionType'='none'
)"""

In [32]:
cursor = conn.cursor()
cursor.execute(statement)

<pyathena.cursor.Cursor at 0x7f03d5fab210>

In [33]:
import pandas as pd
pd.read_sql(statement, conn)

  pd.read_sql(statement, conn)


In [34]:
statement = "SHOW TABLES IN {}".format(database_name)
cursor = conn.cursor()
cursor.execute(statement)

<pyathena.cursor.Cursor at 0x7f03d605bdd0>

In [35]:
if table_name_csv in df_show.values:
    ingest_create_athena_table_csv_passed = True
else:
    ingest_create_athena_table_csv_passed = False
print(ingest_create_athena_table_csv_passed)

False


In [36]:
print(database_name)
print(table_name_csv)

flightdata_db
flight_csv_data


In [37]:
statement = """SELECT sample_id, aileron_pos_lh_deg, timestep, label FROM {}.{}
    LIMIT 10""".format(
    database_name, table_name_csv
)

print(statement)

df = pd.read_sql(statement, conn)
df.head(5)

SELECT sample_id, aileron_pos_lh_deg, timestep, label FROM flightdata_db.flight_csv_data
    LIMIT 10


  df = pd.read_sql(statement, conn)


Unnamed: 0,sample_id,aileron_pos_lh_deg,timestep,label
0,14013,84.00256,29,0
1,14013,83.081955,30,0
2,14013,84.57539,31,0
3,14013,88.38058,32,0
4,14013,85.59829,33,0


In [39]:
statement = """SELECT label, COUNT(*) AS count
    FROM {}.{}
    GROUP BY label
    ORDER BY label""".format(
    database_name, table_name_csv
)


df = pd.read_sql(statement, conn)
df.head(10)

  df = pd.read_sql(statement, conn)


Unnamed: 0,label,count
0,0,14346080
1,1,1122080
2,2,353120
3,3,152640


The classes are significantly imbalanced, with 14 million nominal examples, 1 million speed high examples, 350,000 path high examples and 152,000 flaps late setting examples.