### DATASCI 209 Final Project - EDA, Hypothesis 3
Courtney Chen

#### Loading the Data for EDA

In [None]:
# imports
import pandas as pd
import numpy as np
import os
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
import altair as alt
alt.data_transformers.disable_max_rows()

DataTransformerRegistry.enable('default')

In [None]:
def _first_column_ending_with(df, suffix):
    """Return the name of the first column of *df* that ends with *suffix*.

    Raises:
        KeyError: if no column name ends with *suffix*.
    """
    for col in df.columns:
        if col.endswith(suffix):
            return col
    raise KeyError(f"no column ending with '{suffix}'")


def data_cleaning(filename):
    """Load one CSV and keep only the columns of interest for its file type.

    The file type is inferred from the base name of ``filename``:

    * starts with ``2020``: treated as ground truth; keeps ``datetime``,
      ``latitude``, ``longitude`` and ``altitude``.
    * starts with ``ALVIRA`` or ``ARCUS``: keeps ``latitude``, ``longitude``,
      ``altitude`` (the first columns whose names end in
      ``TrackPosition_Latitude`` / ``_Longitude`` / ``_Altitude``) and
      ``datetime``; ``source_type`` is the lower-cased text before the first
      underscore of the base name.
    * contains ``DIANA`` (any case): keeps the sensor position columns plus
      ``bearing``, ``range``, ``classification`` and ``score``.
    * contains ``VENUS`` (any case): keeps ``name``, ``is_threat``,
      ``azimuth``, ``deviation``, ``start_time`` and ``stop_time`` alongside
      ``datetime``.
    * anything else: every column is kept, a message is printed and
      ``source_type`` is ``"unknown"``.

    In every case a constant ``source_type`` column is added, rows containing
    any missing value are dropped and the index is reset.

    Args:
        filename: Path to the CSV file to load.

    Returns:
        The cleaned ``pandas.DataFrame``.

    Raises:
        KeyError: if a column required for the detected file type is missing.
    """
    df = pd.read_csv(filename, low_memory=False)
    base = os.path.basename(filename)

    # if ground truth file, get 4 features of interest
    if base.startswith('2020'):
        columns_to_keep = {
            "datetime(utc)": "datetime",
            "latitude": "latitude",
            "longitude": "longitude",
            "altitude(m)": "altitude",
        }
        df = df[list(columns_to_keep.keys())].rename(columns=columns_to_keep)
        df["source_type"] = "truth"

    # if radar system, get 4 features of interest and standardize names
    elif base.startswith(('ALVIRA', 'ARCUS')):
        # The position columns carry a sensor-specific prefix, so they are
        # located by suffix instead of by full name.
        latitude = _first_column_ending_with(df, "TrackPosition_Latitude")
        longitude = _first_column_ending_with(df, "TrackPosition_Longitude")
        altitude = _first_column_ending_with(df, "TrackPosition_Altitude")

        renamed = df.rename(columns={
            "datetime(utc)": "datetime",
            latitude: "latitude",
            longitude: "longitude",
            altitude: "altitude",
        })
        # Work on an explicit copy so the column assignment below does not
        # write into a slice of the frame that was read from disk.
        df = renamed[["latitude", "longitude", "altitude", "datetime"]].copy()
        df["source_type"] = base.split('_')[0].lower()

    # if DIANA, get 4 features of interest, bearing, range, and classification data
    elif "DIANA" in base.upper():
        columns_to_keep = {
            "datetime(utc)": "datetime",
            "DianaSensorPosition_latitude_deg": "latitude",
            "DianaSensorPosition_longitude_deg": "longitude",
            "DianaSensorPosition_altitude_m": "altitude",
            "DianaTargetsTargetSignal_bearing_deg": "bearing",
            "DianaTargetsTargetSignal_range_m": "range",
            "DianaTargetsTargetClassification_type": "classification",
            "DianaTargetsTargetClassification_score": "score",
        }
        df = df[list(columns_to_keep.keys())].rename(columns=columns_to_keep)
        df["source_type"] = "diana"

    # if VENUS, get VENUS specific features
    elif "VENUS" in base.upper():
        columns_to_keep = {
            "datetime(utc)": "datetime",
            "VenusTrigger_VenusName": "name",
            "VenusTriggerVenusName_isThreat": "is_threat",
            "VenusTrigger_Azimuth": "azimuth",
            "VenusTrigger_Deviation": "deviation",
            "VenusTrigger_OnAirStartTime": "start_time",
            "VenusTrigger_StopTime": "stop_time",
        }
        df = df[list(columns_to_keep.keys())].rename(columns=columns_to_keep)
        df["source_type"] = "venus"

    else:
        print(f"❓ Unknown file type for: {base}")
        df["source_type"] = "unknown"

    # Any row with a missing value in a kept column is discarded.
    return df.dropna().reset_index(drop=True)

In [None]:
# Root folder that holds one "Scenario_<id>" sub-folder per scenario.
base_path = "/Users/courtneychen/Desktop/DATASCI209/final project/icmcis-drone-detection (2)/train"

# Clean every CSV in every scenario folder and publish the result as a
# notebook-level DataFrame named "<source_type>_<scenario_id>" (for example
# venus_2_2 or truth_2_2, which later cells reference directly).
# Listings are sorted so that, if two files in one scenario map to the same
# name, the one that ends up bound to it does not depend on filesystem order.
for folder_name in sorted(os.listdir(base_path)):
    folder_path = os.path.join(base_path, folder_name)

    if not (os.path.isdir(folder_path) and folder_name.startswith("Scenario_")):
        continue

    # "Scenario_2_2" -> "2_2"
    scenario_id = folder_name.split("_", 1)[1]

    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith(".csv"):
            continue

        file_path = os.path.join(folder_path, filename)

        # Best effort: one unreadable file must not abort the whole load.
        try:
            df = data_cleaning(file_path)
        except Exception as e:
            print(f"Failed to process {filename} in {folder_name}: {e}")
            continue

        # data_cleaning drops rows with missing values, so a file can come
        # back empty; there is then no source_type value to read.
        if df.empty:
            print(f"Failed to process {filename} in {folder_name}: no rows left after cleaning")
            continue

        source_type = df["source_type"].iloc[0]
        var_name = f"{source_type}_{scenario_id}"
        globals()[var_name] = df

#### Visualization 1: Venus Detected vs. True Drone (Confusion "Matrix")

In [None]:
# Build a one-row "confusion matrix": the true drone is always a DJI Mavic Pro,
# so only the distribution of VENUS-reported names varies.

# count as DJI Mavic Pro if it's labelled as any version of that
venus_label = ['DJI Mavic Pro' if 'Mavic Pro' in drone else drone for drone in venus_2_2['name']]
true_label = ["DJI Mavic Pro"] * len(venus_label)
labels = sorted(set(venus_label + true_label))  # not used by the chart below

# create DataFrame of counts
df = pd.DataFrame({
    'true': true_label,
    'predicted': venus_label
})

# count predicted label frequencies
conf_df = df['predicted'].value_counts().reset_index()
conf_df.columns = ['Predicted', 'Count']
conf_df['True'] = 'DJI Mavic Pro'  # constant true label

# value_counts() lists labels by descending count; pass that order to the
# x axis explicitly. The y channel holds a constant label, so it cannot be
# used to drive the ordering, and both layers must agree on the same order.
predicted_order = conf_df['Predicted'].tolist()

# Plain Python float so the threshold embeds as a bare number in the
# chart expression below.
half_max_count = float(conf_df['Count'].max()) / 2

# create Altair heatmap
chart = alt.Chart(conf_df).mark_rect().encode(
    x=alt.X('Predicted:N', sort=predicted_order, title='Predicted Drone Type', axis=alt.Axis(labelAngle=45)),
    y=alt.Y('True:N', title='True Drone Type'),
    color=alt.Color('Count:Q', scale=alt.Scale(scheme='oranges')),
    tooltip=['Predicted', 'Count']
).properties(
    title='Venus Detected vs. True Drone',
    width=600,
    height=50
)

# text labels: white on the darker (high-count) cells, black elsewhere
text = alt.Chart(conf_df).mark_text(baseline='middle').encode(
    x=alt.X('Predicted:N', sort=predicted_order),
    y=alt.Y('True:N'),
    text='Count:Q',
    color=alt.condition(
        alt.datum.Count > half_max_count,
        alt.value('white'),
        alt.value('black')
    )
)

chart + text

#### Visualization 2: Mean Deviation vs. Misclassifications by Detected Drone Type (Scatter Plot)

In [None]:
# Per detected drone type: how many VENUS rows carried that (wrong) label and
# what the mean deviation of those rows was.
df = venus_2_2.copy()

# Any name mentioning "Mavic Pro" counts as the true drone type.
df['predicted'] = df['name'].map(
    lambda name: 'DJI Mavic Pro' if 'Mavic Pro' in name else name
)
df['true'] = 'DJI Mavic Pro'
df['is_misclassified'] = df['predicted'].ne(df['true'])

# create metric_df: one row per predicted type
per_type = df.groupby('predicted').agg(
    misclassifications=('is_misclassified', 'sum'),
    mean_deviation=('deviation', 'mean'),
)
metric_df = per_type.rename_axis('Predicted').reset_index()

# Flag the correctly identified type so it can be drawn with a different size.
metric_df['highlight'] = metric_df['Predicted'].eq('DJI Mavic Pro')

x_axis = alt.X('misclassifications:Q', title='Misclassifications')
y_axis = alt.Y('mean_deviation:Q', title='Mean Deviation')
point_color = alt.Color('Predicted:N', title='Detected Drone Type')
point_size = alt.Size('highlight:N', legend=None, scale=alt.Scale(range=[100, 300]))

chart = (
    alt.Chart(metric_df)
    .mark_circle()
    .encode(
        x=x_axis,
        y=y_axis,
        tooltip=['Predicted', 'misclassifications', 'mean_deviation'],
        color=point_color,
        size=point_size,
    )
    .properties(
        width=500,
        height=400,
        title='Mean Deviation vs Misclassifications by Detected Drone Type',
    )
)

chart

#### Visualization 3: Drone Path with VENUS Misclassifications (Geographic Projection)

In [8]:
# Draw the ground-truth flight path on a map, coloured by what VENUS reported
# at each (rounded) second, plus a marker for the VENUS sensor position.

# clean copies and prep
venus = venus_2_2.copy()
truth = truth_2_2.copy()

venus['predicted'] = ['DJI Mavic Pro' if 'Mavic Pro' in name else name for name in venus['name']]
# Round both sources to whole seconds so they can be joined on timestamp.
venus['datetime'] = pd.to_datetime(venus['datetime']).dt.round('1s')
truth['datetime'] = pd.to_datetime(truth['datetime']).dt.round('1s')

# Single-row frame used to draw the sensor marker and its label.
venus_df = pd.DataFrame({
    'latitude': [51.5192716],
    'longitude': [5.8579155],
    'label': ['VENUS']
})

# join and set classification logic
df = pd.merge(truth, venus[['datetime', 'predicted']], on='datetime', how='left')
# Several VENUS rows can share one rounded second; keep one row per truth fix.
df = df.drop_duplicates(subset=['datetime', 'latitude', 'longitude'])
# NOTE(review): seconds with no VENUS row are filled with the true label and
# therefore count as "Correct" below - confirm this is the intended reading.
df['predicted'] = df['predicted'].fillna('DJI Mavic Pro')
df['is_correct'] = df['predicted'] == 'DJI Mavic Pro'

# label = 'Correct' or actual mislabel (vectorised; also valid for an empty frame)
df['status'] = df['predicted'].where(~df['is_correct'], 'Correct')

# assign unique id to each continuous segment
df['group_id'] = (df['status'] != df['status'].shift()).cumsum()

# define color scale for mislabels only (excluding 'Correct')
# NOTE(review): the domain is a fixed list of labels; a mislabel that is not
# listed here has no colour assigned by this scale - verify against the data.
color_scale = alt.Scale(
    domain=[
        'DJI OcuSync', 'Hubsan', 'DJI Mavic Mini', 'Cheerson Leopard 2',
        'AscTec Falcon 8 Downlink, DJI Mavic Mini', 'MJX X901', 'Spektrum DSMX downlink'
    ],
    range=['#ff7f0e', '#2ca02c', '#d62728', '#9467bd', "#ff0090", "#00d9ff", "#f2ff00"]
)

# split into correct and mislabeled
df_correct = df[df['status'] == 'Correct']
df_mislabeled = df[df['status'] != 'Correct']

# line for correct predictions (blue, no legend)
correct_path = alt.Chart(df_correct).mark_line(color='#1f77b4').encode(
    longitude='longitude:Q',
    latitude='latitude:Q',
    detail='group_id:N',  # one line per continuous segment
    tooltip=['datetime:T', 'predicted']
)

# line for misclassified predictions (colored by label, with legend)
mislabeled_path = alt.Chart(df_mislabeled).mark_line().encode(
    longitude='longitude:Q',
    latitude='latitude:Q',
    color=alt.Color('status:N', scale=color_scale, title='VENUS Classification'),
    detail='group_id:N',  # one line per continuous segment
    tooltip=['datetime:T', 'predicted']
)

# combine the two path layers
colored_path = (correct_path + mislabeled_path).project(
    type='mercator'
).properties(
    width=1000,
    height=1000,
    title='Drone Path with VENUS Misclassifications'
)

# define static point chart for venus
venus_dot = alt.Chart(venus_df).mark_point(
    color='red',
    size=200,
    shape='circle'
).encode(
    longitude='longitude:Q',
    latitude='latitude:Q',
    tooltip=['label']
)

# add a text label slightly above the dot
venus_label = alt.Chart(venus_df).mark_text(
    align='left',
    baseline='bottom',
    dx=5,
    dy=-5,
    fontWeight='bold',
    color='black'
).encode(
    longitude='longitude:Q',
    latitude='latitude:Q',
    text='label:N'
)

# final layered chart
final_chart = colored_path + venus_dot + venus_label
final_chart
