## Setup Dependencies

In [3]:
!pip install --disable-pip-version-check -q pip --upgrade > /dev/null
!pip install --disable-pip-version-check -q wrapt --upgrade > /dev/null

### AWS CLI and AWS Python SDK (boto3)

In [4]:
!pip install --disable-pip-version-check -q awscli boto3

### SageMaker

In [5]:
!pip install --disable-pip-version-check -q sagemaker
!pip install --disable-pip-version-check -q smdebug
!pip install --disable-pip-version-check -q sagemaker-experiments

### PyAthena

In [6]:
!pip install --disable-pip-version-check -q PyAthena

### AWS Data Wrangler

In [7]:
!pip install --disable-pip-version-check -q awswrangler

### Zip

In [8]:
!conda install -y zip

Channels:
 - conda-forge
Platform: linux-64
Collecting package metadata (repodata.json): done
Solving environment: done

# All requested packages already installed.



### Matplotlib

In [9]:
!pip install --disable-pip-version-check -q matplotlib

### Seaborn

In [10]:
!pip install --disable-pip-version-check -q seaborn

## Data lake Setup

In [11]:
import boto3
import sagemaker

session = boto3.session.Session()
region = session.region_name
sagemaker_session = sagemaker.Session()
bucket = 'flightfinalapproachanomalydetection'#'sagemaker-us-east-1-817854913925' #sagemaker_session.default_bucket()

s3 = boto3.Session().client(service_name="s3", region_name=region)

print("Bucket name: {}".format(bucket))

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml
Bucket name: flightfinalapproachanomalydetection


### Verify Bucket Creation

In [12]:
from botocore.client import ClientError

response = None
try:
    response = s3.head_bucket(Bucket=bucket)
    print(response)
    setup_s3_bucket_passed = True
except ClientError as e: 
    print("[ERROR] Cannot find bucket {} in {} due to {}.".format(bucket, response, e))

{'ResponseMetadata': {'RequestId': 'H59G1FBRW43JAHMB', 'HostId': 'GBjj1QDvwW8EebNyx8DvHMMDvhC+JE/L/iLk5lO2biFeuNuKkwa7FZRksPxFPmBYESCc8fu2sXqUtGc51ojRf7yu3VmSckucj8uwl+Dnf9w=', 'HTTPStatusCode': 200, 'HTTPHeaders': {'x-amz-id-2': 'GBjj1QDvwW8EebNyx8DvHMMDvhC+JE/L/iLk5lO2biFeuNuKkwa7FZRksPxFPmBYESCc8fu2sXqUtGc51ojRf7yu3VmSckucj8uwl+Dnf9w=', 'x-amz-request-id': 'H59G1FBRW43JAHMB', 'date': 'Fri, 31 Jan 2025 02:08:41 GMT', 'x-amz-bucket-region': 'us-east-1', 'x-amz-access-point-alias': 'false', 'content-type': 'application/xml', 'transfer-encoding': 'chunked', 'server': 'AmazonS3'}, 'RetryAttempts': 0}, 'BucketRegion': 'us-east-1', 'AccessPointAlias': False}


In [13]:
role = sagemaker.get_execution_role()
region = boto3.Session().region_name 
account_id = boto3.client(service_name="sagemaker", region_name=region)

In [14]:
from IPython.core.display import display, HTML

display(
    HTML(
        '<b>Review <a target="blank" href="https://s3.console.aws.amazon.com/s3/buckets/flightfinalapproachanomalydetection/?region={}&tab=overview">S3 Bucket</a></b>'.format(
            region, account_id, region
        )
    )
)

  from IPython.core.display import display, HTML


## Create Athena Database Scheme
### Import PyAthena

In [15]:
from pyathena import connect

In [16]:
table_name_csv = "flight_csv_data"
s3_staging_dir = "s3://{0}/athena/staging".format(bucket)

### Create Athena Database

In [17]:
database_name = "flightdata_db"

In [18]:
s3_staging_dir = "s3://{0}/athena/staging".format(bucket)

In [19]:
conn = connect(region_name=region, s3_staging_dir=s3_staging_dir)

In [20]:
statement = "CREATE DATABASE IF NOT EXISTS {}".format(database_name)
print(statement)

CREATE DATABASE IF NOT EXISTS flightdata_db


In [21]:
cursor = conn.cursor()
cursor.execute(statement)
print("Database created successfully")


Database created successfully


In [22]:
import pandas as pd

statement = "SHOW DATABASES"

df_show = pd.read_sql(statement, conn)
df_show.head(5)

  df_show = pd.read_sql(statement, conn)


Unnamed: 0,database_name
0,assignment2_aws
1,default
2,dsoaws
3,flightdata_db
4,sagemaker_featurestore


## Register CSV with Athena

In [23]:
drop_statement = "DROP TABLE flightdata_db.flight_csv_data"
cursor = conn.cursor()
cursor.execute(drop_statement)
print("Table dropped successfully.")

Table dropped successfully.


In [24]:
statement = """CREATE EXTERNAL TABLE IF NOT EXISTS flightdata_db.flight_csv_data (
        sample_id int,
        timestep int,
        aileron_pos_lh_deg float,
        aileron_pos_rh_deg float,
        corrected_angle_of_attack_deg float,
        baro_correct_alt_lsp_ft float,
        computed_airspeed_lsp_knots float,
        selected_course_deg float,
        drift_angle_deg float,
        elevator_pos_left_deg float,
        te_flap_pos_disc float,
        glideslope_dev_perc float,
        selected_heading_deg float,
        localizer_dev_perc float,
        core_speed_avg_perc float,
        total_pressure_lsp_millibar float,
        pitch_angle_lsp_deg float,
        roll_angle_lsp_deg float,
        rudder_pos_deg float,
        true_heading_lsp_deg float,
        vertical_accel_g float,
        wind_speed_knots float,
        label int
)
STORED AS TEXTFILE
LOCATION 's3://flightfinalapproachanomalydetection/csv/'
TBLPROPERTIES (
    'skip.header.line.count'='1',
    'field.delim'=',',
    'compressionType'='none'
)"""

In [25]:
cursor = conn.cursor()
cursor.execute(statement)

<pyathena.cursor.Cursor at 0x7f9ed6ce2ed0>

In [24]:
import pandas as pd
pd.read_sql(statement, conn)

  pd.read_sql(statement, conn)


In [25]:
statement = "SHOW TABLES IN {}".format(database_name)
cursor = conn.cursor()
cursor.execute(statement)

<pyathena.cursor.Cursor at 0x7f2f4c78f1d0>

In [26]:
if table_name_csv in df_show.values:
    ingest_create_athena_table_csv_passed = True
else:
    ingest_create_athena_table_csv_passed = False
print(ingest_create_athena_table_csv_passed)

False


In [27]:
print(database_name)
print(table_name_csv)

flightdata_db
flight_csv_data


In [28]:
statement = """SELECT sample_id, aileron_pos_lh_deg, timestep, label FROM {}.{}
    LIMIT 10""".format(
    database_name, table_name_csv
)

print(statement)

df = pd.read_sql(statement, conn)
df.head(5)

SELECT sample_id, aileron_pos_lh_deg, timestep, label FROM flightdata_db.flight_csv_data
    LIMIT 10


  df = pd.read_sql(statement, conn)


Unnamed: 0,sample_id,aileron_pos_lh_deg,timestep,label
0,0,81.26119,0,0
1,0,79.604095,1,0
2,0,81.30211,2,0
3,0,82.34547,3,0
4,0,81.87493,4,0


In [29]:
statement = """SELECT label, COUNT(*) AS count
    FROM {}.{}
    GROUP BY label
    ORDER BY label""".format(
    database_name, table_name_csv
)


df = pd.read_sql(statement, conn)
df.head(10)

  df = pd.read_sql(statement, conn)


Unnamed: 0,label,count
0,0,14346080
1,1,1122080
2,2,353120
3,3,152640


The classes are significantly imbalanced, with 14 million nominal examples, 1 million speed high examples, 350,000 path high examples and 152,000 flaps late setting examples. It should be noted that this is time series data, so the number of individual flight recordings is a fraction of this.

### Check for Duplicates

In [33]:
statement = """SELECT *, COUNT(*)
    FROM {}.{}
    GROUP BY sample_id, aileron_pos_lh_deg, timestep, aileron_pos_rh_deg, corrected_angle_of_attack_deg, baro_correct_alt_lsp_ft, computed_airspeed_lsp_knots,
    selected_course_deg, drift_angle_deg, elevator_pos_left_deg, te_flap_pos_disc, glideslope_dev_perc, selected_heading_deg, localizer_dev_perc, core_speed_avg_perc,
    total_pressure_lsp_millibar, pitch_angle_lsp_deg, roll_angle_lsp_deg, rudder_pos_deg, true_heading_lsp_deg, vertical_accel_g, wind_speed_knots, label
    HAVING COUNT(*) > 1""".format(
    database_name, table_name_csv
)

df = pd.read_sql(statement, conn)
df.head(10)


  df = pd.read_sql(statement, conn)


Unnamed: 0,sample_id,timestep,aileron_pos_lh_deg,aileron_pos_rh_deg,corrected_angle_of_attack_deg,baro_correct_alt_lsp_ft,computed_airspeed_lsp_knots,selected_course_deg,drift_angle_deg,elevator_pos_left_deg,...,core_speed_avg_perc,total_pressure_lsp_millibar,pitch_angle_lsp_deg,roll_angle_lsp_deg,rudder_pos_deg,true_heading_lsp_deg,vertical_accel_g,wind_speed_knots,label,_col23


#### Check for Null or Missing Values

In [24]:
!pwd

/home/sagemaker-user/FlightFinalApproachAnomalyDetection


In [26]:
import pandas as pd
df = pd.read_csv("/home/sagemaker-user/FlightFinalApproachAnomalyDetection/training_flight_data.csv")

In [28]:

df.columns = ['sample_id', 'time_step', 'aileron_pos_lh_deg', 'aileron_pos_rh_deg', 'corrected_angle_of_attack_deg', 'baro_correct_alt_lsp_ft', 'computed_airspeed_lsp_knots',
    'selected_course_deg', 'drift_angle_deg', 'elevator_pos_left_deg', 'te_flap_pos_disc', 'glideslope_dev_perc', 'selected_heading_deg', 'localizer_dev_perc', 'core_speed_avg_perc',
    'total_pressure_lsp_millibar', 'pitch_angle_lsp_deg', 'roll_angle_lsp_deg', 'rudder_pos_deg', 'true_heading_lsp_deg', 'vertical_accel_g', 'wind_speed_knots', 'label']

In [29]:
missing_count = df.isnull().sum() 

print(df.isnull().sum())

sample_id                        0
time_step                        0
aileron_pos_lh_deg               0
aileron_pos_rh_deg               0
corrected_angle_of_attack_deg    0
baro_correct_alt_lsp_ft          0
computed_airspeed_lsp_knots      0
selected_course_deg              0
drift_angle_deg                  0
elevator_pos_left_deg            0
te_flap_pos_disc                 0
glideslope_dev_perc              0
selected_heading_deg             0
localizer_dev_perc               0
core_speed_avg_perc              0
total_pressure_lsp_millibar      0
pitch_angle_lsp_deg              0
roll_angle_lsp_deg               0
rudder_pos_deg                   0
true_heading_lsp_deg             0
vertical_accel_g                 0
wind_speed_knots                 0
label                            0
dtype: int64


#### Boxplots of Data Distribution

In [None]:
import matplotlib.pyplot as plt

df.plot(kind='box', subplots=True, layout=(5, 5), figsize=(15, 15))
plt.show()

### Variable Correlation

In [None]:
import seaborn as sns

corr_matrix = df.corr()
fig, ax = plt.subplots(figsize=(15, 15))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', )

plt.show()

In [30]:

df["record_identifier"] = df["sample_id"].astype(str) + "_" + df["time_step"].astype(str)

df["time_step"] = df["time_step"].astype(float)
df.head()

Unnamed: 0,sample_id,time_step,aileron_pos_lh_deg,aileron_pos_rh_deg,corrected_angle_of_attack_deg,baro_correct_alt_lsp_ft,computed_airspeed_lsp_knots,selected_course_deg,drift_angle_deg,elevator_pos_left_deg,...,core_speed_avg_perc,total_pressure_lsp_millibar,pitch_angle_lsp_deg,roll_angle_lsp_deg,rudder_pos_deg,true_heading_lsp_deg,vertical_accel_g,wind_speed_knots,label,record_identifier
0,0,0.0,81.26119,82.652336,-8.111792,1969.6174,155.5714,-2.109358,-0.692778,-4.952854,...,70.74118,985.4255,-3.662262,0.785912,-0.390141,-1.08187,0.972379,12.625183,0.0,0_0
1,0,1.0,79.604095,81.0157,-7.644611,1955.6995,154.51205,-2.109358,-0.867216,-5.198349,...,70.71775,985.5203,-3.665276,0.046774,-0.756234,-0.70482,0.770077,11.893839,0.0,0_1
2,0,2.0,81.30211,80.7702,-7.552573,1940.0267,153.32867,-2.109358,-1.424093,-4.830105,...,70.70276,985.4165,-3.940319,0.80482,-1.325632,-0.240446,0.543937,12.559112,0.0,0_2
3,0,3.0,82.34547,83.900276,-8.395265,1924.5493,150.88818,-2.109358,-1.141912,-4.625526,...,70.74159,984.8171,-4.275129,1.077102,-0.326884,-0.191627,1.062817,10.542998,0.0,0_3
4,0,4.0,81.87493,82.75462,-7.854284,1905.367,150.69461,-2.109358,-0.72466,-4.40049,...,70.57045,985.23065,-4.241483,1.654806,0.129545,-0.528425,0.867628,9.713539,0.0,0_4


## Feature Store

In [31]:
from sagemaker.feature_store.feature_group import FeatureGroup
from time import gmtime, strftime, sleep
from sagemaker.session import Session

boto_session = boto3.Session(region_name=region)

sagemaker_client = boto_session.client(service_name="sagemaker", region_name=region)
featurestore_runtime = boto_session.client(
    service_name="sagemaker-featurestore-runtime", region_name=region
)

record_identifer_feature_name = 'record_identifier'
event_time_feature_name = 'time_step'

feature_store_session = Session(
    boto_session=boto_session,
    sagemaker_client=sagemaker_client,
    sagemaker_featurestore_runtime_client=featurestore_runtime,
)

flight_feature_group_name = "flightfeature-group-" + strftime("%d-%H-%M-%S", gmtime())
flight_feature_group = FeatureGroup(name=flight_feature_group_name, sagemaker_session=feature_store_session
                                     )

In [32]:
df.head()

Unnamed: 0,sample_id,time_step,aileron_pos_lh_deg,aileron_pos_rh_deg,corrected_angle_of_attack_deg,baro_correct_alt_lsp_ft,computed_airspeed_lsp_knots,selected_course_deg,drift_angle_deg,elevator_pos_left_deg,...,core_speed_avg_perc,total_pressure_lsp_millibar,pitch_angle_lsp_deg,roll_angle_lsp_deg,rudder_pos_deg,true_heading_lsp_deg,vertical_accel_g,wind_speed_knots,label,record_identifier
0,0,0.0,81.26119,82.652336,-8.111792,1969.6174,155.5714,-2.109358,-0.692778,-4.952854,...,70.74118,985.4255,-3.662262,0.785912,-0.390141,-1.08187,0.972379,12.625183,0.0,0_0
1,0,1.0,79.604095,81.0157,-7.644611,1955.6995,154.51205,-2.109358,-0.867216,-5.198349,...,70.71775,985.5203,-3.665276,0.046774,-0.756234,-0.70482,0.770077,11.893839,0.0,0_1
2,0,2.0,81.30211,80.7702,-7.552573,1940.0267,153.32867,-2.109358,-1.424093,-4.830105,...,70.70276,985.4165,-3.940319,0.80482,-1.325632,-0.240446,0.543937,12.559112,0.0,0_2
3,0,3.0,82.34547,83.900276,-8.395265,1924.5493,150.88818,-2.109358,-1.141912,-4.625526,...,70.74159,984.8171,-4.275129,1.077102,-0.326884,-0.191627,1.062817,10.542998,0.0,0_3
4,0,4.0,81.87493,82.75462,-7.854284,1905.367,150.69461,-2.109358,-0.72466,-4.40049,...,70.57045,985.23065,-4.241483,1.654806,0.129545,-0.528425,0.867628,9.713539,0.0,0_4


In [33]:
flight_feature_group.load_feature_definitions(data_frame=df)

[FeatureDefinition(feature_name='sample_id', feature_type=<FeatureTypeEnum.INTEGRAL: 'Integral'>, collection_type=None),
 FeatureDefinition(feature_name='time_step', feature_type=<FeatureTypeEnum.FRACTIONAL: 'Fractional'>, collection_type=None),
 FeatureDefinition(feature_name='aileron_pos_lh_deg', feature_type=<FeatureTypeEnum.FRACTIONAL: 'Fractional'>, collection_type=None),
 FeatureDefinition(feature_name='aileron_pos_rh_deg', feature_type=<FeatureTypeEnum.FRACTIONAL: 'Fractional'>, collection_type=None),
 FeatureDefinition(feature_name='corrected_angle_of_attack_deg', feature_type=<FeatureTypeEnum.FRACTIONAL: 'Fractional'>, collection_type=None),
 FeatureDefinition(feature_name='baro_correct_alt_lsp_ft', feature_type=<FeatureTypeEnum.FRACTIONAL: 'Fractional'>, collection_type=None),
 FeatureDefinition(feature_name='computed_airspeed_lsp_knots', feature_type=<FeatureTypeEnum.FRACTIONAL: 'Fractional'>, collection_type=None),
 FeatureDefinition(feature_name='selected_course_deg', feat

### Creation of FeatureGroups in SageMaker FeatureStore

In [34]:
import time

def wait_for_feature_group_creation_complete(feature_group):
    status = feature_group.describe().get("FeatureGroupStatus")
    while status == "Creating":
        print("Waiting for Feature Group Creation")
        time.sleep(5)
        status = feature_group.describe().get("FeatureGroupStatus")
    if status != "Created":
        raise RuntimeError(f"Failed to create feature group {feature_group.name}")
    print(f"FeatureGroup {feature_group.name} successfully created.")


flight_feature_group.create(
    s3_uri=f"s3://{bucket}/feature_group",
    record_identifier_name=record_identifer_feature_name,
    event_time_feature_name=event_time_feature_name,
    role_arn=role,
    enable_online_store=True,
)


wait_for_feature_group_creation_complete(feature_group=flight_feature_group)

Waiting for Feature Group Creation
Waiting for Feature Group Creation
Waiting for Feature Group Creation
Waiting for Feature Group Creation
FeatureGroup flightfeature-group-31-02-10-56 successfully created.


In [35]:
flight_feature_group.describe()

{'FeatureGroupArn': 'arn:aws:sagemaker:us-east-1:817854913925:feature-group/flightfeature-group-31-02-10-56',
 'FeatureGroupName': 'flightfeature-group-31-02-10-56',
 'RecordIdentifierFeatureName': 'record_identifier',
 'EventTimeFeatureName': 'time_step',
 'FeatureDefinitions': [{'FeatureName': 'sample_id',
   'FeatureType': 'Integral'},
  {'FeatureName': 'time_step', 'FeatureType': 'Fractional'},
  {'FeatureName': 'aileron_pos_lh_deg', 'FeatureType': 'Fractional'},
  {'FeatureName': 'aileron_pos_rh_deg', 'FeatureType': 'Fractional'},
  {'FeatureName': 'corrected_angle_of_attack_deg',
   'FeatureType': 'Fractional'},
  {'FeatureName': 'baro_correct_alt_lsp_ft', 'FeatureType': 'Fractional'},
  {'FeatureName': 'computed_airspeed_lsp_knots', 'FeatureType': 'Fractional'},
  {'FeatureName': 'selected_course_deg', 'FeatureType': 'Fractional'},
  {'FeatureName': 'drift_angle_deg', 'FeatureType': 'Fractional'},
  {'FeatureName': 'elevator_pos_left_deg', 'FeatureType': 'Fractional'},
  {'Featu

In [36]:
sagemaker_client.list_feature_groups() 

{'FeatureGroupSummaries': [{'FeatureGroupName': 'neighborhood-feature-group-26-05-02-41',
   'FeatureGroupArn': 'arn:aws:sagemaker:us-east-1:817854913925:feature-group/neighborhood-feature-group-26-05-02-41',
   'CreationTime': datetime.datetime(2025, 1, 26, 5, 2, 52, 140000, tzinfo=tzlocal()),
   'FeatureGroupStatus': 'Created',
   'OfflineStoreStatus': {'Status': 'Active'}},
  {'FeatureGroupName': 'neighborhood-feature-group-26-04-11-14',
   'FeatureGroupArn': 'arn:aws:sagemaker:us-east-1:817854913925:feature-group/neighborhood-feature-group-26-04-11-14',
   'CreationTime': datetime.datetime(2025, 1, 26, 4, 11, 18, 797000, tzinfo=tzlocal()),
   'FeatureGroupStatus': 'Created'},
  {'FeatureGroupName': 'flightfeature-group-31-02-10-56',
   'FeatureGroupArn': 'arn:aws:sagemaker:us-east-1:817854913925:feature-group/flightfeature-group-31-02-10-56',
   'CreationTime': datetime.datetime(2025, 1, 31, 2, 10, 57, 185000, tzinfo=tzlocal()),
   'FeatureGroupStatus': 'Created'},
  {'FeatureGroup

In [None]:
flight_feature_group.ingest(data_frame=df, max_workers=5, wait=True)

In [None]:
feats_to_quer = ['0_0','0_150']

for val in feats_to_quer:
    record_identifier_value = val
    
    response = featurestore_runtime.get_record(
        FeatureGroupName=neighborhood_feature_group_name,
        RecordIdentifierValueAsString=record_identifier_value,
    )
    
    if 'Record' in response and response['Record']:
        print("Record",val,"found:")
        for feature in response['Record']:
            print(f"{feature['FeatureName']}: {feature['ValueAsString']}")
        print("\n\n")
    else:
        print("Record",val,"not found.")
        print("\n\n")
