In [30]:
import numpy as np
import pandas as pd
import matplotlib
import plotly.express as px
from ipywidgets import interact, IntSlider
# import csv

### Load csv

In [31]:
# Raw string so the Windows backslashes are taken literally; the previous
# literal relied on "\F" being an unrecognised (and deprecated) escape sequence.
df = pd.read_csv(r"D:\For Interns\3B\pdwInterns.csv")

print(df.head())
print("\n")
# DataFrame.info() prints its own report and returns None, so it is called
# directly instead of being wrapped in print() (which emitted a stray "None").
df.info()

        TOA(ns)  PW(microsec)    FREQ(MHz)  Amp_S0(dBm)  AZ_S0(deg)  \
0  2.997516e+06      1.449427  3039.036917   -94.942050    2.673879   
1  3.117654e+06      0.502616  3078.079470   -97.506206    3.178568   
2  3.126066e+06      0.341271  3023.056431   -96.659029    0.754417   
3  3.169086e+06      0.149947  3055.587037  -100.822499   -0.296220   
4  3.212527e+06      0.275051  3085.775965   -89.743904    0.561838   

   EL_S0(deg)  EmitterId  
0   10.408996       3604  
1   11.628477       3602  
2    2.837058       3581  
3    5.670362       3578  
4    8.005784       3592  


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1189617 entries, 0 to 1189616
Data columns (total 7 columns):
 #   Column        Non-Null Count    Dtype  
---  ------        --------------    -----  
 0   TOA(ns)       1189617 non-null  float64
 1   PW(microsec)  1189617 non-null  float64
 2   FREQ(MHz)     1189617 non-null  float64
 3   Amp_S0(dBm)   1189617 non-null  float64
 4   AZ_S0(deg)    1189617 

1. Plot 

In [32]:
# Pick the plotting columns by position from the loaded frame; the printed
# Index below shows which names these positions resolve to.
selected_positions = [1, 2, 4, 5, 6]
column_names = df.columns[selected_positions]
print(column_names)

# The last of the five selected columns carries the emitter identifier.
emitter_col = column_names[-1]
print(emitter_col)

# Store emitter IDs as strings so they are handled as categories, not numbers.
emitter_ids_as_text = df[emitter_col].astype(str)
df[emitter_col] = emitter_ids_as_text



Index(['PW(microsec)', 'FREQ(MHz)', 'AZ_S0(deg)', 'EL_S0(deg)', 'EmitterId'], dtype='object')
EmitterId


1. Plot PW (pulse width) vs Frequency, colored by emitter ID

In [33]:
def filter_df_by_emitters(df, max_emitters=5, emitter_col='EmitterId'):
    """
    Restrict a DataFrame to the rows of its most common emitters.

    Args:
        df (pd.DataFrame): Frame to filter; it is not modified.
        max_emitters (int): How many of the most frequent emitter IDs to retain.
        emitter_col (str): Column holding the emitter IDs.

    Returns:
        pd.DataFrame: The rows of `df` whose emitter ID is among the
        `max_emitters` most frequent ones, in their original order.
    """
    # Occurrence count per emitter ID, then the IDs with the largest counts.
    occurrences = df[emitter_col].value_counts()
    keep_ids = occurrences.nlargest(max_emitters).index

    # Boolean row mask: True where the row belongs to a retained emitter.
    row_mask = df[emitter_col].isin(keep_ids)
    return df[row_mask]

In [None]:
def plot_pdw_vs_freq(n, max_emitters=5):
    """
    Show a scatter plot of column_names[0] against column_names[1].

    Reads the notebook globals `df`, `column_names` and `emitter_col`.

    Args:
        n (int): Number of leading rows of `df` to consider.
        max_emitters (int): Number of most frequent emitter IDs (within those
            rows) whose points are kept.
    """
    pw_col, freq_col = column_names[0], column_names[1]

    # Slice the leading rows first, then keep only the dominant emitters.
    subset_df = filter_df_by_emitters(df[:n], max_emitters, emitter_col)
    new_n = len(subset_df)

    scatter_options = {
        "x": pw_col,
        "y": freq_col,
        "color": emitter_col,
        # new_n is the number of rows remaining after the emitter filter.
        "title": f"PW vs Frequency (First {new_n} Rows)",
        "labels": {pw_col: 'PW', freq_col: 'Frequency'},
        "height": 500,
    }
    fig = px.scatter(subset_df, **scatter_options)
    fig.show()

# Sliders driving the plot: `n` bounds the rows considered, `max_emitters`
# bounds how many emitters are drawn.
pw_freq_row_slider = IntSlider(value=10000, min=100, max=len(df), step=1000)
pw_freq_emitter_slider = IntSlider(value=5, min=1, step=1)
interact(plot_pdw_vs_freq, n=pw_freq_row_slider, max_emitters=pw_freq_emitter_slider)

interactive(children=(IntSlider(value=10000, description='n', max=1189617, min=100, step=1000), IntSlider(valu…

<function __main__.plot_pdw_vs_freq(n, max_emitters=5)>

Findings:
1. Normally, signals from the same emitter have the same (unchanging) PW and frequency.
2. However, some emitters vary their pulse width at the same frequency. In this case, every signal with a higher PW (> 3) has a corresponding lower-PW signal from the same emitter. This may not be realistic, though.

In [None]:

# Azimuth vs Elevation scatter, one colour per emitter ID
def plot_az_vs_el(n, max_emitters=5):
    """
    Show a scatter plot of column_names[2] against column_names[3].

    Reads the notebook globals `df`, `column_names` and `emitter_col`.

    Args:
        n (int): Number of leading rows of `df` to consider.
        max_emitters (int): Number of most frequent emitter IDs (within those
            rows) whose points are kept.
    """
    az_col, el_col = column_names[2], column_names[3]

    # Slice the leading rows first, then keep only the dominant emitters.
    subset_df = filter_df_by_emitters(df[:n], max_emitters, emitter_col)
    new_n = len(subset_df)

    scatter_options = {
        "x": az_col,
        "y": el_col,
        "color": emitter_col,
        # new_n is the number of rows remaining after the emitter filter.
        "title": f"Azimuth vs Elevation (First {new_n} Rows)",
        "labels": {az_col: 'Azimuth', el_col: 'Elevation'},
        "height": 500,
    }
    fig = px.scatter(subset_df, **scatter_options)
    fig.show()

# Sliders driving the plot: `n` bounds the rows considered, `max_emitters`
# bounds how many emitters are drawn.
az_el_row_slider = IntSlider(value=10000, min=100, max=len(df), step=1000)
az_el_emitter_slider = IntSlider(value=5, min=1, step=1)
interact(plot_az_vs_el, n=az_el_row_slider, max_emitters=az_el_emitter_slider)

interactive(children=(IntSlider(value=10000, description='n', max=1189617, min=100, step=1000), IntSlider(valu…

<function __main__.plot_az_vs_el(n, max_emitters=5)>

3. Effectiveness of standard clustering

In [36]:
import sklearn as sk
from sklearn.cluster import KMeans, MeanShift, HDBSCAN
import time
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import adjusted_rand_score

# Helper to extract and standardise the feature matrix used for clustering
def extract_features(df, num_features, columns=None):
    """
    Build a standardised feature matrix from selected columns of `df`.

    By default the features are chosen by position: the `num_features`
    columns that follow the first column (the first column is skipped).
    With the PDW layout printed by df.info() earlier in this notebook
    (TOA, PW, FREQ, Amp, AZ, ...), num_features=3 therefore adds
    Amp_S0(dBm) rather than azimuth. Pass `columns` to choose features by
    name when the positional selection is not what is wanted.

    Args:
        df (pd.DataFrame): Source data.
        num_features (int): Number of consecutive columns, starting at
            position 1, to use. Ignored when `columns` is given.
        columns (sequence of str, optional): Explicit feature column names.

    Returns:
        tuple: (X_scaled, feature_cols) where X_scaled is the
        StandardScaler-transformed array of the selected columns and
        feature_cols is a pandas Index of their names.

    Raises:
        ValueError: If the selection contains no columns.
    """
    if columns is not None:
        feature_cols = pd.Index(columns)
    else:
        # Positional selection; slicing clips silently if num_features
        # exceeds the number of available columns.
        feature_cols = df.columns[1:num_features + 1]

    # Fail early with a clear message instead of deep inside the scaler.
    if len(feature_cols) == 0:
        raise ValueError(
            f"No feature columns selected (num_features={num_features!r}, columns={columns!r})"
        )

    X = df[feature_cols].values
    X_scaled = StandardScaler().fit_transform(X)
    return X_scaled, feature_cols

# K-Means
def k_means(X, n_clusters, random_state=None):
    """
    Cluster `X` with scikit-learn's KMeans and return one label per row.

    Args:
        X: Feature matrix passed to KMeans.fit_predict.
        n_clusters (int): Number of clusters to form.
        random_state (int, optional): Seed forwarded to KMeans so a run can be
            reproduced. The default (None) keeps the unseeded behaviour.

    Returns:
        Array of cluster labels from fit_predict.
    """
    model = KMeans(n_clusters=n_clusters, n_init=10, random_state=random_state)
    labels = model.fit_predict(X)
    return labels

# Mean Shift
def mean_shift(X, bandwidth=None, bin_seeding=False, n_jobs=None):
    """
    Cluster `X` with scikit-learn's MeanShift and return one label per row.

    The defaults match MeanShift's own defaults, so existing calls behave as
    before. The run recorded in this notebook on ~62k points was interrupted
    by hand; `bin_seeding=True`, a fixed `bandwidth` or `n_jobs=-1` can be
    passed to reduce the runtime.

    Args:
        X: Feature matrix passed to MeanShift.fit_predict.
        bandwidth (float, optional): Kernel bandwidth; None lets MeanShift
            estimate it.
        bin_seeding (bool): Seed from a coarse grid of the points instead of
            from every point.
        n_jobs (int, optional): Number of parallel jobs.

    Returns:
        Array of cluster labels from fit_predict.
    """
    model = MeanShift(bandwidth=bandwidth, bin_seeding=bin_seeding, n_jobs=n_jobs)
    labels = model.fit_predict(X)
    return labels

# HDBSCAN
def hdbscan_clustering(X, min_cluster_size=5):
    """
    Cluster `X` with scikit-learn's HDBSCAN and return one label per row.

    Args:
        X: Feature matrix passed to HDBSCAN.fit_predict.
        min_cluster_size (int): Smallest group of points treated as a cluster.
            Defaults to 5, the value this helper previously hard-coded.

    Returns:
        Array of cluster labels from fit_predict; the benchmarking code in
        this notebook treats the label -1 as "not a cluster".
    """
    clusterer = HDBSCAN(min_cluster_size=min_cluster_size)
    labels = clusterer.fit_predict(X)
    return labels

# Score a clustering against the ground-truth labels
def benchmark_performance(df, labels, true_labels):
    """
    Print and return the Adjusted Rand Index of a clustering.

    Args:
        df: Not used by this function.
        labels: Predicted cluster label per point.
        true_labels: Ground-truth label per point.

    Returns:
        The adjusted_rand_score of `labels` against `true_labels`.
    """
    ari = adjusted_rand_score(true_labels, labels)

    # Distinct predicted labels, leaving out the label -1.
    n_clusters = len({label for label in labels if label != -1})

    print(f"Adjusted Rand Index: {ari:.4f}")
    print(f"Clusters found (excluding -1): {n_clusters}")
    return ari

# Visualisation (2D only)
def visualise_clusters(X, labels, feature_names):
    """
    Scatter-plot a 2-feature clustering, coloured by cluster label.

    For any other number of features a message is printed and nothing is
    drawn. (Previously the 3-feature case printed the message and then
    crashed on an unbound `fig`.)

    Args:
        X (np.ndarray): Feature matrix of shape (n_points, n_features).
        labels (np.ndarray): Cluster label per point.
        feature_names: Sequence whose first two entries label the x and y axes.

    Returns:
        None.
    """
    dims = X.shape[1]
    if dims != 2:
        # 3D plotting (px.scatter_3d) is deliberately left out for now.
        print("Visualization only supported for 2D, 3D is too much for technet.")
        return

    # Labels are cast to str so each cluster gets a discrete colour.
    fig = px.scatter(x=X[:, 0], y=X[:, 1], color=labels.astype(str),
                     labels={"x": feature_names[0], "y": feature_names[1]},
                     title="Clustered Data (2D)")
    fig.show()

Actual Benchmarking

In [37]:
# Benchmark on the two most frequent emitters of the full dataset.
subset_df = filter_df_by_emitters(df=df, max_emitters=2)

# Ground-truth emitter label per row, and how many distinct emitters there are.
true_emitter_ids = subset_df['EmitterId'].astype(str).values
n_true_emitters = np.unique(true_emitter_ids).size

# num_features takes that many consecutive columns after the first one.
# With the column order printed by df.info() above this means:
#   1 -> PW
#   2 -> PW, FREQ
#   3 -> PW, FREQ, Amp
#   4 -> PW, FREQ, Amp, AZ
X, features_used = extract_features(subset_df, num_features=2)

print("Number of true emitters included: ", n_true_emitters)
print("Number of points included in df: ", len(X))

Number of true emitters included:  2
Number of points included in df:  61820


In [38]:
print("KMeans:")
# Time only the clustering call; scoring and plot rendering used to be
# included in the reported figure.
start = time.perf_counter()
kmeans_labels = k_means(X, n_clusters=n_true_emitters)
kmeans_seconds = time.perf_counter() - start

benchmark_performance(subset_df, kmeans_labels, true_emitter_ids)
visualise_clusters(X, kmeans_labels, features_used)
print("Time:", kmeans_seconds)


KMeans:
Adjusted Rand Index: 1.0000
Clusters found (excluding -1): 2


Time: 0.3346116542816162


In [39]:
print("\nMean Shift:")
print("Number of true emitters included: ", n_true_emitters)
print("Number of points included in df: ", len(X))

# Time only the clustering call; scoring and plot rendering used to be
# included in the reported figure.
# NOTE: the run recorded below was interrupted manually before finishing.
start = time.perf_counter()
ms_labels = mean_shift(X)
ms_seconds = time.perf_counter() - start

benchmark_performance(subset_df, ms_labels, true_emitter_ids)
visualise_clusters(X, ms_labels, features_used)
print("Time:", ms_seconds)


Mean Shift:
Number of true emitters included:  2
Number of points included in df:  61820


KeyboardInterrupt: 

In [40]:
print("\nHDBSCAN:")
print("Number of true emitters included: ", n_true_emitters)
print("Number of points included in df: ", len(X))

# Time only the clustering call; scoring and plot rendering used to be
# included in the reported figure.
start = time.perf_counter()
hdb_labels = hdbscan_clustering(X)
hdb_seconds = time.perf_counter() - start

benchmark_performance(subset_df, hdb_labels, true_emitter_ids)
visualise_clusters(X, hdb_labels, features_used)
print("Time:", hdb_seconds)


HDBSCAN:
Number of true emitters included:  2
Number of points included in df:  61820
Adjusted Rand Index: 1.0000
Clusters found (excluding -1): 2


Time: 15.84824824333191


In [41]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Sentinel SNR values (dB) that bracket the sweep: an infinite SNR and -100 dB.
INT_MAX = float("inf")
NEG_INT_MAX = -100

# SNR levels to simulate, from cleanest to noisiest (higher SNR = clearer signal).
SNR_dB = [INT_MAX, *range(25, 0, -5), NEG_INT_MAX]

# Columns that receive additive noise.
features = ['PW(microsec)', 'FREQ(MHz)', 'AZ_S0(deg)', 'EL_S0(deg)']

# Column holding the emitter IDs.
emitter_col = 'EmitterId'

# Plot limits: number of emitters shown and number of leading rows considered.
max_emitters = 2
n = 100

def estimate_snr_db(signal, noise):
    """
    Return the signal-to-noise ratio in decibels.

    Power is taken as the mean of the squared values of each input, and the
    result is 10 * log10(signal power / noise power).
    """
    power_ratio = np.mean(signal ** 2) / np.mean(noise ** 2)
    return 10 * np.log10(power_ratio)

def add_gaussian_noise(data, snr_db=None, std=None):
    """
    Add zero-mean Gaussian noise to a dataset.

    When `snr_db` is given, the noise standard deviation is derived from the
    mean squared value of *all* elements of `data` taken together, so every
    column receives noise of the same absolute scale regardless of its own
    magnitude.

    Parameters:
    - data: numpy array, array-like, or pandas DataFrame
    - snr_db: desired Signal-to-Noise Ratio in dB. If set, overrides std.
    - std: standard deviation of the noise. Used only if snr_db is None.

    Returns:
    - noisy_data: data with added Gaussian noise. A DataFrame input yields a
      DataFrame with the same index and columns; anything else yields a
      numpy array.

    Raises:
    - ValueError: if neither snr_db nor std is provided.
    """
    if snr_db is None and std is None:
        raise ValueError("Provide either snr_db or std")

    is_frame = isinstance(data, pd.DataFrame)
    data_array = data.to_numpy() if is_frame else np.array(data)

    if snr_db is not None:
        # noise power = signal power / linear SNR; std is its square root.
        signal_power = np.mean(data_array ** 2)
        snr_linear = 10 ** (snr_db / 10)
        noise_power = signal_power / snr_linear
        std = np.sqrt(noise_power)

    noise = np.random.normal(loc=0.0, scale=std, size=data_array.shape)
    noisy_data = data_array + noise

    if is_frame:
        # Keep the caller's index: assigning the result back into a frame
        # aligns on index, so a reset index would misplace or drop rows.
        return pd.DataFrame(noisy_data, index=data.index, columns=data.columns)
    return noisy_data

# One subplot row per SNR level: PW vs FREQ on the left, AZ vs EL on the right.
panel_titles = [
    f"SNR {level} dB: {axis_pair}"
    for level in SNR_dB
    for axis_pair in ("PW vs FREQ", "AZ vs EL")
]

fig = make_subplots(
    rows=len(SNR_dB),
    cols=2,
    subplot_titles=panel_titles,
    horizontal_spacing=0.1,
    vertical_spacing=0.1,
)

for idx, snr in enumerate(SNR_dB):
    # Noise is added to the feature columns of the whole frame; only the
    # first n rows are then considered for plotting.
    df_noisy = df.copy()
    df_noisy[features] = add_gaussian_noise(df[features], snr_db=snr)

    filtered_df = filter_df_by_emitters(df_noisy[:n], max_emitters, emitter_col)

    is_first_row = idx == 0
    # (x column, y column, show in legend) for the left and right panel.
    # Legend entries come from the left panel of the first row only.
    panels = (
        ("PW(microsec)", "FREQ(MHz)", is_first_row),
        ("AZ_S0(deg)", "EL_S0(deg)", False),
    )

    for col_number, (x_col, y_col, in_legend) in enumerate(panels, start=1):
        for emitter in filtered_df[emitter_col].unique():
            emitter_rows = filtered_df[filtered_df[emitter_col] == emitter]
            trace = go.Scattergl(
                x=emitter_rows[x_col],
                y=emitter_rows[y_col],
                mode='markers',
                name=f'Emitter {emitter}' if is_first_row else None,
                legendgroup=str(emitter),
                showlegend=in_legend,
            )
            fig.add_trace(trace, row=idx + 1, col=col_number)

fig.update_layout(
    height=300 * len(SNR_dB),
    width=1000,
    title_text="PW vs FREQ and AZ vs EL across SNR levels",
    showlegend=True,
)

fig.show()


In [44]:
import numpy as np
from scipy import stats
import plotly.graph_objects as go

def add_normal_distribution_overlay(fig, x_data, y_data, x_col_name, y_col_name):
    """
    Overlay 1σ/2σ/3σ ellipses of a fitted bivariate normal on a plotly figure.

    The normal is parameterised by the sample means, population standard
    deviations (np.std) and Pearson correlation of the two inputs. Besides the
    three rings, a marker at the mean and an annotation with the fitted
    statistics are added.

    If the fit is degenerate (fewer than two points, a zero or non-finite
    standard deviation, or |correlation| of 1) no overlay is possible and the
    figure is returned unchanged instead of raising.

    Args:
        fig: Plotly figure object
        x_data: X-axis data (pandas Series or array)
        y_data: Y-axis data (pandas Series or array)
        x_col_name: Name of x column (currently unused)
        y_col_name: Name of y column (currently unused)

    Returns:
        fig: The same figure, with the overlay added when it could be computed
    """
    x_values = np.asarray(x_data, dtype=float)
    y_values = np.asarray(y_data, dtype=float)

    # Degenerate inputs: nothing sensible to fit.
    if x_values.size < 2 or y_values.size < 2:
        return fig

    x_mean, x_std = np.mean(x_values), np.std(x_values)
    y_mean, y_std = np.mean(y_values), np.std(y_values)
    # Written as "not (> 0)" so NaN standard deviations are rejected too.
    if not (x_std > 0 and y_std > 0):
        return fig

    correlation = np.corrcoef(x_values, y_values)[0, 1]
    # |rho| == 1 makes the covariance matrix singular and the density
    # normaliser below divide by zero.
    if not np.isfinite(correlation) or abs(correlation) >= 1:
        return fig

    # Evaluation grid covering +/- 3.5 standard deviations around the mean.
    x_range = np.linspace(x_mean - 3.5 * x_std, x_mean + 3.5 * x_std, 100)
    y_range = np.linspace(y_mean - 3.5 * y_std, y_mean + 3.5 * y_std, 100)
    X, Y = np.meshgrid(x_range, y_range)

    covariance = correlation * x_std * y_std
    rv = stats.multivariate_normal(
        [x_mean, y_mean],
        [[x_std**2, covariance],
         [covariance, y_std**2]],
    )
    pdf_values = rv.pdf(np.dstack((X, Y)))

    # Density on the ellipse at Mahalanobis distance k is peak * exp(-k^2 / 2),
    # where peak = 1 / normaliser is the density at the mean.
    normaliser = 2 * np.pi * x_std * y_std * np.sqrt(1 - correlation**2)
    ring_specs = [(1, '1σ', 'red'), (2, '2σ', 'orange'), (3, '3σ', 'yellow')]

    # Each ring is its own single-level contour trace: the three density
    # levels are not evenly spaced, so a single start/end/size specification
    # cannot land on all of them.
    for sigma, label, color in ring_specs:
        level = np.exp(-0.5 * sigma**2) / normaliser
        fig.add_trace(go.Contour(
            x=x_range,
            y=y_range,
            z=pdf_values,
            showscale=False,
            contours=dict(
                coloring='lines',
                showlabels=True,
                labelfont=dict(size=10, color='black'),
                start=level,
                end=level,
                size=0.001  # start == end, so exactly one contour is drawn
            ),
            line=dict(color=color, width=3),
            name=f'{label} ring',
            opacity=0.9,
            showlegend=True
        ))

    # Mark the fitted mean.
    fig.add_trace(go.Scatter(
        x=[x_mean],
        y=[y_mean],
        mode='markers',
        marker=dict(
            color='black',
            size=12,
            symbol='x',
            line=dict(color='white', width=2)
        ),
        name=f'Mean ({x_mean:.2f}, {y_mean:.2f})',
        showlegend=True
    ))

    # Fitted statistics in the top-left corner (paper coordinates).
    stats_text = f"μ_x = {x_mean:.5f}, σ_x = {x_std:.5f}<br>" + \
                 f"μ_y = {y_mean:.5f}, σ_y = {y_std:.5f}<br>" + \
                 f"ρ = {correlation:.3f}"

    fig.add_annotation(
        x=0.02, y=0.98,
        xref="paper", yref="paper",
        text=stats_text,
        showarrow=False,
        bgcolor="rgba(255,255,255,0.95)",
        bordercolor="black",
        borderwidth=1,
        font=dict(size=11, color='black')
    )

    return fig

# Plotting functions for the noisy data, with an optional normal-fit overlay
def plot_pdw_vs_freq(n, max_emitters=5):
    """
    Show a scatter plot of column_names[0] against column_names[1] for the noisy data.

    Reads the notebook globals `df_noisy`, `column_names` and `emitter_col`.
    When exactly one emitter is requested, the fitted normal-distribution
    rings are overlaid on the scatter.

    Args:
        n (int): Number of leading rows of `df_noisy` to consider.
        max_emitters (int): Number of most frequent emitter IDs to keep.
    """
    pw_col, freq_col = column_names[0], column_names[1]

    # Slice the leading rows first, then keep only the dominant emitters.
    subset_df = filter_df_by_emitters(df_noisy[:n], max_emitters, emitter_col)
    new_n = len(subset_df)

    scatter_options = {
        "x": pw_col,
        "y": freq_col,
        "color": emitter_col,
        "title": f"PW vs Frequency (First {new_n} Rows)",
        "labels": {pw_col: 'PW', freq_col: 'Frequency'},
        "height": 500,
    }
    fig = px.scatter(subset_df, **scatter_options)

    # The overlay is only drawn for a single emitter.
    single_emitter = max_emitters == 1
    if single_emitter:
        fig = add_normal_distribution_overlay(
            fig, subset_df[pw_col], subset_df[freq_col], pw_col, freq_col
        )

    fig.show()

def plot_az_vs_el(n, max_emitters=5):
    """
    Show a scatter plot of column_names[2] against column_names[3] for the noisy data.

    Reads the notebook globals `df_noisy`, `column_names` and `emitter_col`.
    When exactly one emitter is requested, the fitted normal-distribution
    rings are overlaid on the scatter.

    Args:
        n (int): Number of leading rows of `df_noisy` to consider.
        max_emitters (int): Number of most frequent emitter IDs to keep.
    """
    az_col, el_col = column_names[2], column_names[3]

    # Slice the leading rows first, then keep only the dominant emitters.
    subset_df = filter_df_by_emitters(df_noisy[:n], max_emitters, emitter_col)
    new_n = len(subset_df)

    scatter_options = {
        "x": az_col,
        "y": el_col,
        "color": emitter_col,
        "title": f"Azimuth vs Elevation (First {new_n} Rows)",
        "labels": {az_col: 'Azimuth', el_col: 'Elevation'},
        "height": 500,
    }
    fig = px.scatter(subset_df, **scatter_options)

    # The overlay is only drawn for a single emitter.
    single_emitter = max_emitters == 1
    if single_emitter:
        fig = add_normal_distribution_overlay(
            fig, subset_df[az_col], subset_df[el_col], az_col, el_col
        )

    fig.show()

In [None]:
# Load the pre-generated noisy PDW data; the plotting functions read it
# through the global name df_noisy.
df_noisy = pd.read_csv("./noisy_pdwInterns.csv")

print(df_noisy.head())
print("\n")
# DataFrame.info() prints its own report and returns None, so it is called
# directly instead of being wrapped in print() (which emitted a stray "None").
df_noisy.info()

# Same positional column selection as for the clean data.
column_names = df_noisy.columns[[1, 2, 4, 5, 6]]
print(column_names)

# The fifth selected column holds the emitter ID.
emitter_col = column_names[4]
print(emitter_col)

# Ensure emitter ID is treated as a categorical variable
df_noisy[emitter_col] = df_noisy[emitter_col].astype(str)

# Interactive PW vs Frequency plot. Each interact() call gets its own slider
# widgets so the two plots can be adjusted independently.
interact(
    plot_pdw_vs_freq,
    n=IntSlider(min=0, max=len(df_noisy), step=1000, value=10000),
    max_emitters=IntSlider(min=1, step=1, value=5)
)

# Interactive Azimuth vs Elevation plot.
interact(
    plot_az_vs_el,
    n=IntSlider(min=0, max=len(df_noisy), step=1000, value=10000),
    max_emitters=IntSlider(min=1, step=1, value=5)
)

        TOA(ns)  PW(microsec)    FREQ(MHz)  Amp_S0(dBm)  AZ_S0(deg)  \
0  2.997516e+06      1.449657  3036.342761   -94.942050    2.732830   
1  3.117654e+06      0.502970  3077.048626   -97.506206    3.285462   
2  3.126066e+06      0.341193  3024.423897   -96.659029    0.655522   
3  3.169086e+06      0.150025  3056.222258  -100.822499   -0.192982   
4  3.212527e+06      0.275062  3085.415236   -89.743904    0.461377   

   EL_S0(deg)  EmitterId  
0   10.589218       3604  
1   11.635548       3602  
2    2.699305       3581  
3    5.569267       3578  
4    8.053939       3592  


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1189617 entries, 0 to 1189616
Data columns (total 7 columns):
 #   Column        Non-Null Count    Dtype  
---  ------        --------------    -----  
 0   TOA(ns)       1189617 non-null  float64
 1   PW(microsec)  1189617 non-null  float64
 2   FREQ(MHz)     1189617 non-null  float64
 3   Amp_S0(dBm)   1189617 non-null  float64
 4   AZ_S0(deg)    1189617 

interactive(children=(IntSlider(value=10000, description='n', max=1189617, step=1000), IntSlider(value=5, desc…

interactive(children=(IntSlider(value=10000, description='n', max=1189617, step=1000), IntSlider(value=5, desc…

<function __main__.plot_az_vs_el(n, max_emitters=5)>

# Finding Optimal Min Cluster Size

In [46]:
import glob
import os
import pandas as pd

def find_min_points_per_emitter(df, emitter_column='EmitterID'):
    """
    Return the size of the smallest emitter group in a dataset.

    Counts how many rows each distinct value of `emitter_column` has, prints
    the number of distinct emitters together with the min / max / mean row
    count, and returns the minimum.

    Parameters:
    -----------
    df : pandas.DataFrame
        The dataset containing emitter data
    emitter_column : str, default='EmitterID'
        The column name that contains emitter identifiers

    Returns:
    --------
    int
        The minimum number of points any emitter has in the dataset

    Raises:
    -------
    ValueError
        If `emitter_column` is not a column of `df`
    """
    column_present = emitter_column in df.columns
    if not column_present:
        raise ValueError(f"Column '{emitter_column}' not found in dataset")

    # Rows per distinct emitter value.
    emitter_counts = df[emitter_column].value_counts()
    n_emitters = len(emitter_counts)
    largest = emitter_counts.max()
    average = emitter_counts.mean()
    min_points = int(emitter_counts.min())

    # Summary statistics for the reader.
    print(f"  Unique emitters: {n_emitters}")
    print(f"  Points per emitter - Min: {min_points}, Max: {largest}, Mean: {average:.1f}")

    return min_points

def find_all_optimal_min_cluster_size(data_path, emitter_column='EmitterID'):
    """
    Find the smallest per-emitter point count in every batch CSV.

    Looks for a `batch_data` folder next to `data_path` (or inside it, when
    `data_path` is itself a directory), reads every `*.csv` in it in sorted
    order, and records the minimum number of rows any emitter has in that
    batch. Progress and a final summary are printed.

    Parameters:
    -----------
    data_path : str
        Path to the main data file, or to the directory that contains the
        batch_data folder
    emitter_column : str, default='EmitterID'
        The column name that contains emitter identifiers

    Returns:
    --------
    dict
        Dictionary mapping batch names (file name without extension) to their
        minimum cluster sizes; a batch whose count could not be computed maps
        to None

    Raises:
    -------
    FileNotFoundError
        If the batch_data folder does not exist or contains no .csv files
    """
    # 1) Locate the batch folder. A directory is used as-is; for a file path
    #    its parent directory is used. (dirname() alone would resolve a
    #    directory given without a trailing slash to that directory's parent.)
    base_dir = data_path if os.path.isdir(data_path) else os.path.dirname(data_path)
    batch_dir = os.path.join(base_dir, "batch_data")
    if not os.path.isdir(batch_dir):
        raise FileNotFoundError(f"batch_data folder not found at {batch_dir}")

    # 2) Grab every CSV in there
    csv_paths = sorted(glob.glob(os.path.join(batch_dir, "*.csv")))
    if not csv_paths:
        raise FileNotFoundError("No .csv files found in batch_data/")

    # Dictionary to store results
    min_cluster_sizes = {}

    # 3) Loop over each batch file
    for csv_file in csv_paths:
        batch_name = os.path.splitext(os.path.basename(csv_file))[0]

        # Read once; the row count comes from the parsed frame rather than
        # from a separate pass over the file.
        batch_df = pd.read_csv(csv_file)
        print(f"\n=== Processing batch {batch_name} ({len(batch_df)} rows) ===")

        # Best effort per batch: a failing batch is reported and stored as
        # None so the remaining batches are still processed.
        try:
            min_points = find_min_points_per_emitter(batch_df, emitter_column)
            min_cluster_sizes[batch_name] = min_points
            print(f"  Min cluster size for {batch_name}: {min_points}")
        except Exception as e:
            print(f"  Error processing {batch_name}: {e}")
            min_cluster_sizes[batch_name] = None

    # Print summary
    print("\n=== SUMMARY ===")
    print("Min cluster sizes by batch:")
    for batch_name, min_size in min_cluster_sizes.items():
        print(f"  {batch_name}: {min_size}")

    return min_cluster_sizes

# Scan the batch_data folder under the current working directory and report
# the smallest per-emitter point count of each batch.
data_path = "./"
min_cluster_results = find_all_optimal_min_cluster_size(
    data_path,
    emitter_column='EmitterId',
)

print(min_cluster_results)


=== Processing batch Data_Batch_1 (67890 rows) ===
  Unique emitters: 30
  Points per emitter - Min: 43, Max: 4665, Mean: 2263.0
  Min cluster size for Data_Batch_1: 43

=== Processing batch Data_Batch_2 (77531 rows) ===
  Unique emitters: 49
  Points per emitter - Min: 7, Max: 4667, Mean: 1582.3
  Min cluster size for Data_Batch_2: 7

=== Processing batch Data_Batch_3 (122229 rows) ===
  Unique emitters: 77
  Points per emitter - Min: 105, Max: 4667, Mean: 1587.4
  Min cluster size for Data_Batch_3: 105

=== Processing batch Data_Batch_4 (186201 rows) ===
  Unique emitters: 100
  Points per emitter - Min: 433, Max: 4731, Mean: 1862.0
  Min cluster size for Data_Batch_4: 433

=== Processing batch Data_Batch_5 (191370 rows) ===
  Unique emitters: 100
  Points per emitter - Min: 1002, Max: 4729, Mean: 1913.7
  Min cluster size for Data_Batch_5: 1002

=== Processing batch Data_Batch_6 (158983 rows) ===
  Unique emitters: 96
  Points per emitter - Min: 6, Max: 4730, Mean: 1656.1
  Min clu

In [47]:
# Raw string so the Windows backslashes are taken literally; the previous
# literal relied on "\F" being an unrecognised (and deprecated) escape sequence.
# NOTE: this rebinds `df`, which earlier cells use for the PDW data.
df = pd.read_csv(r"D:\For Interns\3B\Waypoint.csv")

print(df.head())
print("\n")
# DataFrame.info() prints its own report and returns None, so it is called
# directly instead of being wrapped in print() (which emitted a stray "None").
df.info()

   SpecificEmitterId  RecordNumber  Latitude(deg)  Longitude(deg)  \
0               3713             1         5.0734        103.8373   
1               3714             1         5.5387        104.4141   
2               3715             1         5.5989        104.4855   
3               3716             1         5.2924        103.3649   
4               3717             1         5.1446        103.4198   

   Altitude(ft)  TimeAtWaypoint(s)   
0             0                   0  
1             0                   0  
2             0                   0  
3             0                   0  
4             0                   0  


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 6 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   SpecificEmitterId   100 non-null    int64  
 1   RecordNumber        100 non-null    int64  
 2   Latitude(deg)       100 non-null    float64
 3   Lo

In [None]:
import pandas as pd

# Offset between a PDW EmitterId and the matching waypoint SpecificEmitterId
# (waypoint ID = PDW ID + 135).
EMITTER_ID_OFFSET = 135
COORD_COLUMNS = ["Latitude(deg)", "Longitude(deg)"]

# Load PDW dataset
pdw_df = pd.read_csv(r"D:\For Interns\3B\pdwInterns.csv")

# Load Waypoint dataset
waypoint_df = pd.read_csv(r"D:\For Interns\3B\Waypoint.csv")

# Create a temporary column in pdw_df with the mapped waypoint emitter ID
pdw_df["WaypointEmitterId"] = pdw_df["EmitterId"] + EMITTER_ID_OFFSET

# Prepare waypoint DataFrame
waypoint_df = waypoint_df.rename(columns={"SpecificEmitterId": "WaypointEmitterId"})

# Merge using the adjusted WaypointEmitterId. validate="many_to_one" makes the
# merge fail loudly if a waypoint ID occurs more than once, instead of
# silently duplicating PDW rows.
merged_df = pd.merge(
    pdw_df,
    waypoint_df[["WaypointEmitterId", *COORD_COLUMNS]],
    on="WaypointEmitterId",
    how="left",
    validate="many_to_one",
)

# A left merge leaves NaN coordinates for emitters without a waypoint.
unmatched_rows = int(merged_df["Latitude(deg)"].isna().sum())
if unmatched_rows:
    print(f"Warning: {unmatched_rows} PDW rows have no matching waypoint")

# Drop the temporary merge key
merged_df = merged_df.drop(columns=["WaypointEmitterId"])

# Reorder columns: insert Latitude/Longitude just before EmitterId. The
# position is looked up after removing the coordinate columns, so it is
# correct wherever they currently sit.
cols = [col for col in merged_df.columns if col not in COORD_COLUMNS]
emitter_idx = cols.index("EmitterId")
cols[emitter_idx:emitter_idx] = COORD_COLUMNS

# Apply new column order
merged_df = merged_df[cols]

# Save to new CSV
merged_df.to_csv(r"./pdwInterns_with_latlng.csv", index=False)

print("Merged with offset mapping and saved to pdwInterns_with_latlng.csv")

Merged and saved to pdwInterns_with_latlng.csv


In [51]:
# Distinct emitter IDs in the PDW data, in ascending order, and their count.
distinct_ids = pdw_df["EmitterId"].unique()
unique_emitters = sorted(distinct_ids)
print(unique_emitters)
print(len(unique_emitters))

[np.int64(3578), np.int64(3579), np.int64(3580), np.int64(3581), np.int64(3582), np.int64(3583), np.int64(3584), np.int64(3585), np.int64(3586), np.int64(3587), np.int64(3588), np.int64(3589), np.int64(3590), np.int64(3591), np.int64(3592), np.int64(3593), np.int64(3594), np.int64(3595), np.int64(3596), np.int64(3597), np.int64(3598), np.int64(3599), np.int64(3600), np.int64(3601), np.int64(3602), np.int64(3603), np.int64(3604), np.int64(3605), np.int64(3606), np.int64(3607), np.int64(3608), np.int64(3609), np.int64(3610), np.int64(3611), np.int64(3612), np.int64(3613), np.int64(3614), np.int64(3615), np.int64(3616), np.int64(3617), np.int64(3618), np.int64(3619), np.int64(3620), np.int64(3621), np.int64(3622), np.int64(3623), np.int64(3624), np.int64(3625), np.int64(3626), np.int64(3627), np.int64(3628), np.int64(3629), np.int64(3630), np.int64(3631), np.int64(3632), np.int64(3633), np.int64(3634), np.int64(3635), np.int64(3636), np.int64(3637), np.int64(3638), np.int64(3639), np.int6