In [10]:
import pandas as pd

df=pd.read_csv("ais1-10May2023.csv")
df['time_diff_minutes'][0]=0
df

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  df['time_diff_minutes'][0]=0
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['time_diff_minutes'][0]=0


Unnamed: 0.1,Unnamed: 0,MMSI,BaseDateTime,LAT,LON,SOG,COG,VesselType,Date,Time,time_diff_minutes
0,1223442,209729000,2023-05-01 14:53:41,26.73888,-87.53270,14.1,33.1,70.0,2023-05-01,14:53:41,0.000000
1,1223443,209729000,2023-05-01 15:04:53,26.77623,-87.50645,14.2,32.6,70.0,2023-05-01,15:04:53,11.200000
2,1223444,209729000,2023-05-01 15:08:47,26.78922,-87.49723,14.3,33.5,70.0,2023-05-01,15:08:47,3.900000
3,1223445,209729000,2023-05-01 15:09:53,26.79294,-87.49456,14.3,33.4,70.0,2023-05-01,15:09:53,1.100000
4,1223446,209729000,2023-05-01 15:11:59,26.79996,-87.48955,14.3,33.3,70.0,2023-05-01,15:11:59,2.100000
...,...,...,...,...,...,...,...,...,...,...,...
35619,1914221,636015000,2023-05-09 11:45:17,27.34702,-87.32135,12.9,171.8,80.0,2023-05-09,11:45:17,4.333333
35620,1914222,636015000,2023-05-09 12:26:27,27.20284,-87.28991,12.7,163.7,80.0,2023-05-09,12:26:27,41.166667
35621,1914223,636015000,2023-05-09 12:36:09,27.17000,-87.27943,12.2,168.5,80.0,2023-05-09,12:36:09,9.700000
35622,1914224,636015000,2023-05-09 14:03:27,26.87173,-87.24769,13.0,169.6,80.0,2023-05-09,14:03:27,87.300000


In [32]:
import pandas as pd
import numpy as np
from scipy.interpolate import CubicSpline

# Interpolation function
def interpolate_data(df):
    interpolated_dfs = []
    for mmsi, group in df.groupby('MMSI'):
        group = group.sort_values('BaseDateTime')
        
        # Ensure that BaseDateTime is in datetime format
        group['BaseDateTime'] = pd.to_datetime(group['BaseDateTime'])
        
        # Initialize trajectory ID
        group['trajectory_id'] = (group['time_diff_minutes'] > 300).cumsum()
        
        for traj_id, traj_group in group.groupby('trajectory_id'):
            # Average broadcasting interval in minutes
            avg_broadcast_interval = traj_group['time_diff_minutes'].mean()
            
            interpolated_points = []  # Collect interpolated points here
            
            # Interpolate only between points where time difference is greater than 3 minutes but less than or equal to 300 minutes
            for idx in range(1, len(traj_group)):
                start_time = traj_group['BaseDateTime'].iloc[idx - 1]
                end_time = traj_group['BaseDateTime'].iloc[idx]
                time_diff = traj_group['time_diff_minutes'].iloc[idx]
                
                # Directly add the original points to the result list
                interpolated_points.append(traj_group.iloc[idx - 1].to_dict())
                
                if 3 < time_diff <= 300:
                    num_points = int((end_time - start_time).total_seconds() / 60 / avg_broadcast_interval)
                    
                    if num_points > 1:
                        timestamps = np.linspace(start_time.value, end_time.value, num=num_points)
                        
                        cs_lat = CubicSpline([start_time.value, end_time.value], [traj_group['LAT'].iloc[idx - 1], traj_group['LAT'].iloc[idx]])
                        cs_long = CubicSpline([start_time.value, end_time.value], [traj_group['LON'].iloc[idx - 1], traj_group['LON'].iloc[idx]])
                        
                        new_lats = cs_lat(timestamps)
                        new_longs = cs_long(timestamps)
                        
                        interpolated_df = pd.DataFrame({
                            'MMSI': mmsi,
                            'BaseDateTime': pd.to_datetime(timestamps),
                            'LAT': new_lats,
                            'LON': new_longs,
                            'trajectory_id': traj_id
                        })
                        
                        interpolated_points.extend(interpolated_df.iloc[1:-1].to_dict('records'))  # Avoid duplicates of start and end points
            
            # Add the last point of the trajectory segment
            interpolated_points.append(traj_group.iloc[-1].to_dict())
            
            interpolated_dfs.append(pd.DataFrame(interpolated_points))
    
    result_df = pd.concat(interpolated_dfs, ignore_index=True)
    
    # Ensure the final DataFrame is sorted by BaseDateTime
    result_df = result_df.sort_values(by='BaseDateTime').reset_index(drop=True)
    
    return result_df

# Example usage
# Assuming `df` is your sorted DataFrame with columns ['MMSI', 'BaseDateTime', 'LAT', 'LON', 'SOG', 'COG', 'VesselType', 'Date', 'Time', 'time_diff_minutes']
# df = pd.read_csv('your_ais_data.csv')
# df['BaseDateTime'] = pd.to_datetime(df['BaseDateTime'])

# Interpolate the data
interpolated_df = interpolate_data(df)

# Display the first few rows of the interpolated DataFrame
print(interpolated_df.head())


   Unnamed: 0       MMSI        BaseDateTime       LAT       LON   SOG    COG  \
0   3818013.0  309761000 2023-05-01 00:00:44  27.70921 -87.28288  10.3  349.5   
1   3818014.0  309761000 2023-05-01 00:02:07  27.71309 -87.28383  10.2  347.6   
2   3818015.0  309761000 2023-05-01 00:07:41  27.72871 -87.28752  10.0  347.9   
3   3818016.0  309761000 2023-05-01 00:12:27  27.74206 -87.29065  10.2  346.8   
4   3818017.0  309761000 2023-05-01 00:15:37  27.75092 -87.29277  10.2  348.5   

   VesselType        Date      Time  time_diff_minutes  trajectory_id  
0        70.0  2023-05-01  00:00:44        -917.766667              0  
1        70.0  2023-05-01  00:02:07           1.383333              0  
2        70.0  2023-05-01  00:07:41           5.566667              0  
3        70.0  2023-05-01  00:12:27           4.766667              0  
4        70.0  2023-05-01  00:15:37           3.166667              0  


In [62]:
interpolated_df.to_csv("interpolated_df.csv")

In [33]:
interpolated_df.shape

(36556, 12)

In [34]:
interpolated_df['MMSI'].nunique()

26

In [38]:
vessel1_ip=interpolated_df[interpolated_df['MMSI']==209729000]
interpolated_df[interpolated_df['MMSI']==209729000]

Unnamed: 0.1,Unnamed: 0,MMSI,BaseDateTime,LAT,LON,SOG,COG,VesselType,Date,Time,time_diff_minutes,trajectory_id
3078,1223442.0,209729000,2023-05-01 14:53:41,26.738880,-87.532700,14.1,33.1,70.0,2023-05-01,14:53:41,0.000000,0
3090,,209729000,2023-05-01 14:55:33,26.745105,-87.528325,,,,,,,0
3099,,209729000,2023-05-01 14:57:25,26.751330,-87.523950,,,,,,,0
3111,,209729000,2023-05-01 14:59:17,26.757555,-87.519575,,,,,,,0
3127,,209729000,2023-05-01 15:01:09,26.763780,-87.515200,,,,,,,0
...,...,...,...,...,...,...,...,...,...,...,...,...
36550,1171282.0,209729000,2023-05-10 23:54:37,29.759500,-85.916110,12.0,205.7,70.0,2023-05-10,23:54:37,1.016667,3
36551,1171283.0,209729000,2023-05-10 23:55:45,29.756030,-85.917970,12.0,205.3,70.0,2023-05-10,23:55:45,1.133333,3
36552,1171284.0,209729000,2023-05-10 23:56:57,29.752410,-85.919890,11.9,205.4,70.0,2023-05-10,23:56:57,1.200000,3
36553,1171285.0,209729000,2023-05-10 23:58:05,29.749010,-85.921730,12.0,205.2,70.0,2023-05-10,23:58:05,1.133333,3


In [42]:
vessel4=df[df['MMSI']==209729000]

In [64]:
import pandas as pd
import folium
from folium.plugins import PolyLineTextPath

# Function to plot trajectories on a map
def plot_trajectories(df, mmsi):
    # Filter data for the specified MMSI
    vessel_data = df[df['MMSI'] == mmsi]
    
    # Create a base map
    start_coords = [vessel_data['LAT'].iloc[0], vessel_data['LON'].iloc[0]]
    my_map = folium.Map(location=start_coords, zoom_start=10)
    
    # Group data by trajectory_id and plot each trajectory with a different color
    colors = ['red', 'blue', 'green', 'purple', 'orange', 'darkred', 'lightred', 
              'beige', 'darkblue', 'darkgreen', 'cadetblue', 'darkpurple', 'pink',
              'lightblue', 'lightgreen', 'gray', 'black']
    
    for idx, (traj_id, traj_group) in enumerate(vessel_data.groupby('trajectory_id')):
        color = colors[idx % len(colors)]
        
        trajectory = traj_group[['LAT', 'LON']].values.tolist()
        
        # Add polyline for the trajectory
        folium.PolyLine(trajectory, color=color, weight=2.5, opacity=1).add_to(my_map)
        
        # Add filled circles for each point in the trajectory
        for point in trajectory:
            folium.CircleMarker(location=point, radius=3, color=color, fill=True, fill_opacity=0.6).add_to(my_map)
        
        # Add starting marker
        start_point = trajectory[0]
        folium.Marker(start_point, popup=f'Start of Trajectory {traj_id}', icon=folium.Icon(color='green')).add_to(my_map)
        
        # Add ending marker
        end_point = trajectory[-1]
        folium.Marker(end_point, popup=f'End of Trajectory {traj_id}', icon=folium.Icon(color='red')).add_to(my_map)
    
    return my_map

# Example usage
# Assuming `interpolated_df` is your DataFrame with interpolated data
# Replace 'your_mmsi' with the actual MMSI you want to plot
mmsi_to_plot = 209729000  # Replace with the MMSI you want to plot
map_object = plot_trajectories(interpolated_df, mmsi_to_plot)

# Save map to an HTML file or
map_object


In [43]:
import folium

# Create a map centered around the first point
start_location = [vessel4['LAT'].iloc[0], vessel4['LON'].iloc[0]]
m = folium.Map(location=start_location, zoom_start=4)

# Add starting point marker
folium.Marker(location=start_location, popup="Start", icon=folium.Icon(color="green")).add_to(m)

# Add ending point marker
end_location = [vessel4['LAT'].iloc[-1], vessel4['LON'].iloc[-1]]
folium.Marker(location=end_location, popup="End", icon=folium.Icon(color="red")).add_to(m)


# Add circle markers for all points and connect them with a line
points = list(zip(vessel4['LAT'], vessel4['LON']))
for lat, lon in points:
    folium.CircleMarker(
        location=[lat, lon],
        radius=3,
        color="blue",
        fill=True,
        fill_color="blue"
    ).add_to(m)

# Add lines connecting all points
folium.PolyLine(points, color="blue").add_to(m)

# Save the map to an HTML file (uncomment if needed)
# m.save('map.html')

# Display the map in a Jupyter Notebook (if running in one)
m


In [44]:
vessel4[vessel4['time_diff_minutes']>300]

Unnamed: 0.1,Unnamed: 0,MMSI,BaseDateTime,LAT,LON,SOG,COG,VesselType,Date,Time,time_diff_minutes
727,1337095,209729000,2023-05-02 21:07:24,29.82119,-85.77062,13.6,179.4,70.0,2023-05-02,21:07:24,768.283333
1101,1337456,209729000,2023-05-08 01:32:07,27.54388,-86.86763,14.7,25.8,70.0,2023-05-08,01:32:07,6749.433333
1533,1171261,209729000,2023-05-10 23:29:56,29.83261,-85.87926,11.5,202.0,70.0,2023-05-10,23:29:56,9438.5


In [45]:
interpolated_df.columns

Index(['Unnamed: 0', 'MMSI', 'BaseDateTime', 'LAT', 'LON', 'SOG', 'COG',
       'VesselType', 'Date', 'Time', 'time_diff_minutes', 'trajectory_id'],
      dtype='object')

In [56]:
pip install -r requirements.txt --verbose

Using pip 24.0 from /Users/gagandeepmagan/Desktop/ais_cpa/aisenv/lib/python3.10/site-packages/pip (python 3.10)
Collecting numpy==1.21.0 (from -r requirements.txt (line 2))
  Using cached numpy-1.21.0.zip (10.3 MB)
  Installing build dependencies ... [?25l  Running command pip subprocess to install build dependencies
  Collecting packaging==20.5
    Using cached packaging-20.5-py2.py3-none-any.whl.metadata (3.3 kB)
  Collecting setuptools<49.2.0
    Using cached setuptools-49.1.3-py3-none-any.whl.metadata (4.9 kB)
  Collecting wheel==0.36.2
    Using cached wheel-0.36.2-py2.py3-none-any.whl.metadata (2.3 kB)
  Collecting Cython<3.0,>=0.29.21
    Using cached Cython-0.29.37-py2.py3-none-any.whl.metadata (3.1 kB)
  Collecting pyparsing>=2.0.2 (from packaging==20.5)
    Using cached pyparsing-3.1.2-py3-none-any.whl.metadata (5.1 kB)
  Using cached packaging-20.5-py2.py3-none-any.whl (35 kB)
  Using cached wheel-0.36.2-py2.py3-none-any.whl (35 kB)
  Using cached setuptools-49.1.3-py3-none

In [59]:
import pandas as pd
import numpy as np
import hdbscan
from sklearn.preprocessing import StandardScaler
from scipy.spatial.distance import pdist, squareform
from fastdtw import fastdtw

# Function to compute pairwise DTW distances
def compute_dtw_distance_matrix(trajectories):
    num_trajectories = len(trajectories)
    distance_matrix = np.zeros((num_trajectories, num_trajectories))
    
    for i in range(num_trajectories):
        for j in range(i + 1, num_trajectories):
            distance, _ = fastdtw(trajectories[i], trajectories[j])
            distance_matrix[i, j] = distance
            distance_matrix[j, i] = distance
    
    return distance_matrix

# Function to cluster trajectories
def cluster_trajectories(df, target_clusters=20, min_cluster_size=5):
    # Normalize the latitude and longitude
    scaler = StandardScaler()
    df[['LAT', 'LON']] = scaler.fit_transform(df[['LAT', 'LON']])
    
    # Group by trajectory id and collect latitude and longitude points as lists
    trajectories = df.groupby('trajectory_id')[['LAT', 'LON']].apply(lambda x: list(zip(x.LAT, x.LON))).tolist()
    
    # Compute the distance matrix using DTW
    distance_matrix = compute_dtw_distance_matrix(trajectories)
    
    # Convert distance matrix to condensed form for HDBSCAN
    condensed_distance_matrix = squareform(distance_matrix)
    
    # Initialize HDBSCAN
    clusterer = hdbscan.HDBSCAN(min_cluster_size=min_cluster_size, metric='precomputed')
    cluster_labels = clusterer.fit_predict(distance_matrix)
    
    # Assign cluster labels back to the original dataframe
    df['cluster'] = df['trajectory_id'].map(dict(enumerate(cluster_labels)))
    
    return df

# Example usage
# Assuming `interpolated_df` is your DataFrame with interpolated data and a 'trajectory_id' column
# 'trajectory_id' is a unique identifier for each trajectory
clustered_df = cluster_trajectories(interpolated_df)

# Display the first few rows of the clustered DataFrame
print(clustered_df.head())

# Save the clustered data to a CSV file if needed
#clustered_df.to_csv('clustered_trajectories.csv', index=False)


ValueError: numpy.dtype size changed, may indicate binary incompatibility. Expected 96 from C header, got 88 from PyObject