# imports

In [4]:
from datetime import datetime
import pandas as pd
import matplotlib.pyplot as plt
import geopandas as gpd
from shapely.geometry import Point
import copy
import numpy as np
import os
from display_aux import *
# from vessels import VESSELES


In [5]:
# Set display options to show all rows and columns
pd.set_option('display.max_rows', 50)
pd.set_option('display.max_columns', None)


# Run definitions

In [6]:

params = {}
# params['input_csv_file_name_full'] = 'C:\\gilad\\work\\tip_and_que\\data\\AIS\\TipandCue_DataSample_CSV\\exactEarth_historical_data_02_2023.csv'
params['input_csv_file_name_full'] = 'debug_data_base.csv'

params['min_date'] = None
params['max_date'] = None
params['columns_list_keep'] = ['Time','MMSI','IMO','Vessel_Name','Ship_Type','Longitude','Latitude','Message_ID','Accuracy','Heading','COG','Fixing_device','Destination_ID','offset1','Offset_2','Offset_3','Offset_4','ATON_type','ATON_name','GNSS_status']
params['filter_vessels_df_dic'] = {
          'max_time_diff[mins]':['<=',30]
          }
params['reload'] = False
params['export_to_excel'] = False


# Functions

## df aux functions

In [7]:
def filter_df(df, filter_dic):
    """
    Filters a DataFrame based on a dictionary of column filters.

    Parameters:
    df (pd.DataFrame): The DataFrame to be filtered.
    filter_dic (dict): A dictionary where keys are column names and values are tuples.
                       Each tuple contains an operator as the first element and the filter value(s) as the second element.
                       Supported operators: '==', '!=', '<', '<=', '>', '>=', 'between'.

    Returns:
    pd.DataFrame: The filtered DataFrame or an empty DataFrame if any column does not exist.

    Example Usage:
    inf_df = pd.DataFrame({
        'MMSI': [123456789, 987654321, 192837465],
        'Vessel_Name': ['Vessel A', 'Vessel B', 'Vessel C'],
        'Latitude': [34.5, 45.6, 56.7],
        'Longitude': [-123.4, -134.5, -145.6]
    })

    filter_dic = {
        'Latitude': ('between', (40.0, 50.0)),  # Applying a 'between' filter for Latitude
        'Longitude': ('<=', -134.5),  # Applying a '<=' filter for Longitude
        'Vessel_Name': ('==', ['Vessel A', 'Vessel C']),  # Applying an '==' filter for Vessel_Name
        'Nonexistent_Column': ('==', 'SomeValue')  # Nonexistent column
    }

    filtered_df = filter_df(inf_df, filter_dic)
    print(filtered_df)
    """
    for column, (operator, value) in filter_dic.items():
        if column not in df.columns:
            print(f"Error: Column '{column}' does not exist in the DataFrame. Existing columns: {list(df.columns)}")
            return pd.DataFrame()  # Return an empty DataFrame
        
        if operator == '==':
            if isinstance(value, list):
                df = df[df[column].isin(value)]
            else:
                df = df[df[column] == value]
        elif operator == '!=':
            df = df[df[column] != value]
        elif operator == '<':
            df = df[df[column] < value]
        elif operator == '<=':
            df = df[df[column] <= value]
        elif operator == '>':
            df = df[df[column] > value]
        elif operator == '>=':
            df = df[df[column] >= value]
        elif operator == 'between':
            if isinstance(value, tuple) and len(value) == 2:
                lower_bound, upper_bound = value
                df = df[(df[column] >= lower_bound) & (df[column] <= upper_bound)]
            else:
                raise ValueError(f"Value for 'between' must be a tuple of two elements: {value}")
            
            if (lower_bound>upper_bound):
                print(f'lower bound ({lower_bound}) is higher than upper bound ({upper_bound})')

        else:
            raise ValueError(f"Unsupported operator: {operator}")
        
    return df


def repeat_single_value_in_column (df,value,column_name,to_print=False):
    if not isinstance(value, list) and not isinstance(value, np.ndarray):
        value = [value]

    # print(value)        
    if (len(value) != 1):
        if (to_print):
            print(f'value is not unique:{value}')
        return pd.DataFrame()
    
    df[column_name] = np.repeat(value,df.shape[0])
    return df


## manage data functions

In [8]:

def get_vessel_data(vessel_data_dic,vessel_MMSI,to_print=False):
    if (1):
    # try:
        vessel_data = vessel_data_dic[vessel_MMSI]
        
        # repeat missing ID values
        ID_columns = ['IMO','Vessel_Name','Ship_Type']
        for ID_column in ID_columns:
            data = vessel_data[ID_column].loc[vessel_data[ID_column].notna()]
            data = data.unique()
            if (isinstance(data,str)):
                data = data.strip()
            # print(vessel_data.shape)
            vessel_data = repeat_single_value_in_column(vessel_data,data,ID_column)
            # print(vessel_data.shape)
            
            if (vessel_data.empty):
                if (to_print is True):
                    print(f'failed to create data base for vessel_MMSI={vessel_MMSI}')

                return vessel_data


        # take only the lines where there is a Longitude
        # print(vessel_data.shape)

        vessel_data = vessel_data[vessel_data['Longitude'].notna()]

# handle exponent represntations 
        vessel_data.loc[:, 'Latitude'] = vessel_data['Latitude'].apply(convert_to_float)
        vessel_data.loc[:, 'Longitude'] = vessel_data['Longitude'].apply(convert_to_float)
        
        # print(vessel_data.shape)
 
        # sort data by time
        vessel_data = vessel_data.sort_values(by='Time')
        # print('sucess')
    # except:
    #     vessel_data = pd.DataFrame()
    #     # sys.exit(1)   
    return vessel_data



# vessel_MMSI = vessel_data_info['single'][100]
# vesel_data = get_vessel_data(vessel_data_dic,vessel_MMSI)

# vessel_data.head()


def get_vessel_data_stats(vessel_data):
    stats_dic = {
        'len': [vessel_data.shape[0]],  # Scalar value wrapped in a list
        'min_time':get_min_max_dates(vessel_data)[0],
        'max_time':get_min_max_dates(vessel_data)[1],
        'total_time':max(vessel_data['Time'])- min(vessel_data['Time']),
        'min_time_diff[mins]': round(np.min(time_diff_convert(vessel_data['Time'].diff()))),
        'max_time_diff[mins]': round(np.max(time_diff_convert(vessel_data['Time'].diff()))),
        'mean_time_diff[mins]': round(np.mean(time_diff_convert(vessel_data['Time'].diff()))),
        'min_Longitude':(min(vessel_data['Longitude'])),
        'max_Longitude':(max(vessel_data['Longitude'])),
        'min_Latitude':(min(vessel_data['Latitude'])),
        'max_Latitude':(max(vessel_data['Latitude'])),
    }
    stats_dic['span_Longitude']  = stats_dic['max_Longitude']-stats_dic['min_Longitude']
    stats_dic['span_Latitude']  = stats_dic['max_Latitude']-stats_dic['min_Latitude']

        # 'diff_Latitude':max(vessel_data['Latitude'])-min(vessel_data['Latitude'])


    return stats_dic






In [9]:
# def create_vessels_df(vessel_data_dic,MMSI_list,min_data_len_thresh = 2):
#     inf_df = pd.DataFrame()
#     vessel_MMSI_prob = []

#     for i,vessel_MMSI in enumerate(MMSI_list):
#         if (i%1000==0):
#             print(f'proccessing MMSI {i} out of {len(vessel_data_info_list)}')
#         vessel_data = get_vessel_data(vessel_data_dic,vessel_MMSI)
#         if (vessel_data.shape[0]<min_data_len_thresh):
#             vessel_MMSI_prob.append(vessel_MMSI)
#         else:
#             vessels_df_line = pd.DataFrame(get_vessel_data_stats(vessel_data), index=[vessel_MMSI])  # Providing index explicitly
#             inf_df = pd.concat([inf_df,vessels_df_line])

#     inf_df = inf_df.sort_values(by='len',ascending=False)
#     print(inf_df)
#     return vessles_df


## File handling functions


In [10]:
def load_or_create_df(csv_file_path, save_path,reload = False):
    if os.path.exists(save_path) and reload==False:
        print(f"Loading DataFrame from {save_path}")
        df = pd.read_pickle(save_path)
    else:
        print(f"Reading CSV file from {csv_file_path}")
        df = pd.read_csv(csv_file_path, low_memory=False)
        print(f"Saving DataFrame to {save_path}")
        df.to_pickle(save_path)
    return df

In [11]:
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point

def save_vessel_data_to_geojson(vessel_data, file_path = './data', file_name=None):
    """
    Save latitude and longitude data from a DataFrame to a GeoJSON file.
    
    Parameters:
    - vessel_data: pandas DataFrame containing 'Latitude' and 'Longitude' columns.
    - file_path: Directory path where the GeoJSON file will be saved.
    - file_name: Name of the GeoJSON file (without the .geojson extension).
    """

    if (file_name is None):
        file_name = vessel_data['Vessel_Name'].iloc[0]

    file_name = file_name.rstrip()

    # Create a geometry column with Point objects
    geometry = [Point(lon, lat) for lon, lat in zip(vessel_data['Longitude'], vessel_data['Latitude'])]

    # Create a GeoDataFrame
    gdf = gpd.GeoDataFrame(vessel_data, geometry=geometry, crs='EPSG:4326')  # Assuming WGS84 projection

    # Save to GeoJSON file
    file_name_geojson = f"{file_path}/{file_name}.geojson"
    gdf.to_file(file_name_geojson, driver='GeoJSON')


def save_vessels_data_to_geojson(vessels_df_info,vessel_data_dic,file_path):
    for i in range(inf_df.shape[0]):
        try:
            if (i % 10==0):
                print(f'saving {i} files out of {inf_df.shape[0]}')
            vessel_data = get_vessel_data(vessel_data_dic,inf_df.index[i])
            save_vessel_data_to_geojson(vessel_data,file_path)
        except:
            print(f'could not export MMSI={inf_df.index[i]} to jason')
    print(f'saved {i} files in {file_path}')
    return   



## Time functions

In [12]:
import pandas as pd

def filter_df_by_date(df, min_date, max_date, time_column='Time', date_format='%Y-%m-%d %H:%M:%S'):
    """
    Function to filter a DataFrame based on a time column and specified date range.
    
    Parameters:
    
    
    - df (pd.DataFrame): The input DataFrame.
    - min_date (str): The minimum date as a string.
    - max_date (str): The maximum date as a string.
    - time_column (str): The name of the column containing time data in the specified format.
    - date_format (str): The format of the date strings in the time column and min_date, max_date.
    
    Returns:
    - filtered_df (pd.DataFrame): The DataFrame filtered by the specified date range.
    """
    # Convert the Time column to datetime
    df[time_column] = pd.to_datetime(df[time_column], format=date_format)

    if (min_date is None):
        min_date = min(df[time_column])

    if (max_date is None):
        max_date = max(df[time_column])


    
    # Convert min_date and max_date to datetime
    min_date = pd.to_datetime(min_date, format=date_format)
    max_date = pd.to_datetime(max_date, format=date_format)
    
    # Filter the DataFrame based on the date range
    filtered_df = df[(df[time_column] >= min_date) & (df[time_column] <= max_date)]
    
    return filtered_df


# Define the minimum and maximum dates
# min_date = '2023-02-01 00:00:01'
# max_date = '2023-02-02 00:00:01'

# # Filter the DataFrame based on the date range
# df = filter_df_by_date(df, min_date, max_date)

# get_min_max_dates(df)



def time_diff_convert(time_diff,units='mins',to_round=True):
    if (not isinstance(time_diff,pd.core.series.Series)):
        is_series = False
        time_diff = pd.Series(time_diff)
    else:
        is_series = True

    if (units == 'secs'):        
        time_diff_mod = time_diff.apply(lambda x: x.total_seconds()) 
    
    if (units == 'mins'):        
        time_diff_mod = time_diff.apply(lambda x: x.total_seconds() / 60) 

    if (units == 'hours'):        
        time_diff_mod = time_diff.apply(lambda x: x.total_seconds() / 3600) 

    if (to_round):
        time_diff_mod = round(time_diff_mod)

    if (not is_series):
        time_diff_mod = time_diff_mod.values[0]        
    return time_diff_mod
    


    
def convert_time_format(df, time_column, current_format, output_format):
    """
    Function to convert the time format of a specified column in a DataFrame.
    
    Parameters:
    - df (pd.DataFrame): The input DataFrame.
    - time_column (str): The name of the column containing time data.
    - current_format (str): The current format of the time data in the column.
    - output_format (str): The desired output format for the tim data.
    
    Returns:
    - df (pd.DataFrame): The DataFrame with the time column converted to the desired format.
    """
    # Convert the Time column to datetime using the current format
    df[time_column] = pd.to_datetime(df[time_column], format=current_format)
    
    # Convert the datetime to the desired output format
    df[time_column] = df[time_column].dt.strftime(output_format)
    
    return df




def get_min_max_dates(df, time_column='Time',input_format = '%Y-%m-%d %H:%M:%S',output_format='%Y-%m-%d %H:%M:%S'):
    """
    Function to get the minimum and maximum dates from a DataFrame's time column.
    
    Parameters:
    - df (pd.DataFrame): The input DataFrame.
    - time_column (str): The name of the column containing time data in '%Y-%m-%d %H:%M:%S' format.
    - output_format (str): The desired output datetime format.
    
    Returns:
    - min_date (str): The minimum date in the desired format.
    - max_date (str): The maximum date in the desired format.
    """
    # # Convert the Time column to datetime
    # df[time_column] = pd.to_datetime(df[time_column], format=input_format)
    
    # Get the minimum and maximum dates
    min_date = df[time_column].min().strftime(output_format)
    max_date = df[time_column].max().strftime(output_format)
    
    return min_date, max_date



# # Get the minimum and maximum dates in the desired format
# min_date, max_date = get_min_max_dates(df)

# print("Min date:", min_date)
# print("Max date:", max_date)


## Plot Functions

In [13]:
import matplotlib.pyplot as plt
import pandas as pd
from matplotlib.ticker import MaxNLocator
def plot_df_column_vs_time(df,column_name,time_column_name='Time'):

    # Sample data
    dates = df[time_column_name].dt.strftime('%Y-%m-%d %H:%M:%S').to_list()
    values = df[column_name]

    # Create a figure with a larger size
    plt.figure(figsize=(7, 4))

    # Create a line plot
    plt.plot(dates, values)

    # Rotate x-tick labels by 45 degrees and change their font size
    plt.xticks(rotation=45, fontsize=12, ha='right')


    # Use MaxNLocator to reduce the number of ticks
    ax = plt.gca()
    ax.xaxis.set_major_locator(MaxNLocator(nbins=5))  # Adjust the number of bins as needed

    # Add labels and title
    plt.xlabel('Date and Time')
    plt.ylabel('Values')
    plt.title('Plot with Rotated and Formatted x-tick Labels')

    # Add grid
    plt.grid(True)

    # Adjust layout to prevent clipping of tick-labels
    plt.tight_layout()

    # Optionally adjust the subplots manually
    plt.subplots_adjust(bottom=0.2)

    # Show the plot
    plt.show()



## data converters

In [14]:
def convert_to_float(s):
    """
    Convert a string representation of a number in exponential format to a float.
    
    Parameters:
    - s: String containing the number in exponential format.
    
    Returns:
    - Float representation of the number.
    """
    try:
        # Check if the string contains a decimal point in the exponent part
        if 'E+' in s or 'E-' in s or 'e+' in s or 'e-' in s:
            parts = s.split('E') if 'E' in s else s.split('e')
            
            # Extract the base number and exponent
            base = float(parts[0])
            exponent = float(parts[1])
            
            # Adjust exponent if it contains a decimal point
            if '.' in parts[1]:
                exponent = int(float(parts[1]))  # Convert to int to remove decimal part
            
            # Calculate the final float value
            result = base * (10 ** exponent)
            
            return result
        
        # If no 'E' or 'e' found, convert directly to float
        return float(s)
    
    except ValueError as e:
        print(f"Error converting '{s}' to float: {e}")
        return None




In [15]:
# class FILE_AUX:
    # def 

# df_aux class

In [16]:
class DF_AUX:

    def __init__(self):
        k = 1

    def export_df(self,df, out_file_name, columns=None, start_line=0, num_lines=None):
        """
        Exports a subset of a DataFrame to an Excel file.

        Parameters:
        df (pd.DataFrame): The input DataFrame.
        out_file_name (str): The name of the output Excel file.
        columns (list): List of columns to include in the export.
        start_line (int): The starting line (index) from which to export.
        num_lines (int): The number of lines (rows) to export.

        Returns:
        None
        """

        if (columns==None):
            columns = df.columns

        if (num_lines==None):
            num_lines = df.shape[0]
        # Select the desired subset of the DataFrame
        subset_df = df[columns].iloc[start_line:start_line+num_lines-1]
        print(subset_df.shape)
        
        # Export the subset to an Excel file

        file_name, file_extension = os.path.splitext(out_file_name)

        print(f'exporting {num_lines} lines from df to {out_file_name}')

        if (file_extension=='.xlsx'):
            subset_df.to_excel(out_file_name, index=False)
            
        elif (file_extension=='.csv'):
            subset_df.to_csv(out_file_name, index=False)




df_aux = DF_AUX()

# display functions

In [17]:
def print_dict(dict):
    for key in dict.keys():
        print(f'{key}:{dict[key]}')

# vessels class

In [18]:
class VESSELES:
    # Class attribute
    vehicle_count = 0

    # Initializer / Instance attributes
    def __init__(self):
        self.data_dic = []  # It should be self.kuku to be an instance attribute
        self.info_df = []
        self.prob_MMSI = [];

    def load_data(self, input_csv_file_name_full, columns_list_keep, min_date=None, max_date=None, reload=False):
        # Convert the Time column from 'YYYYMMDD_HHMMSS' to 'YYYY-MM-DD HH:MM:SS'
        pkl_file_name_full = input_csv_file_name_full.replace(".csv", ".pkl")

        df = load_or_create_df(input_csv_file_name_full, pkl_file_name_full, reload=reload)

# in case the format has already changed
        try:
            df = convert_time_format(df, 'Time', '%Y%m%d_%H%M%S', '%Y-%m-%d %H:%M:%S')
        except:
            pass

        df = filter_df_by_date(df, min_date, max_date)

        # Get a list of interesting columns
        df = df[columns_list_keep]

        print(f'df has {df.shape[0]} lines./ncolumns are:{df.columns.to_list()}')
        return df

    def create_data_dic(self,df):
        print('creating data_dic')
        grouped = df.groupby('MMSI')

        # Create a dictionary to store each vessel's data
        self.data_dic = {MMSI: group for MMSI, group in grouped}
        return (self.data_dic)





    def create_info_df(self,data_dic=None, min_data_len_thresh=2,to_print = True,num_lines = None):
        print('create info_df')

        info_df = pd.DataFrame()
        prob_MMSI = []
        MMSI_list = list(self.data_dic.keys())

        if (num_lines != None):
            MMSI_list = MMSI_list[:num_lines]

            
        if (data_dic==None):
            data_dic = self.data_dic


        for i, vessel_MMSI in enumerate(MMSI_list):
            if (i % 1000 == 0):
                print(f'processing MMSI {i} out of {len(MMSI_list)}')
            vessel_data = get_vessel_data(data_dic, vessel_MMSI)  # Assuming get_vessel_data is defined elsewhere
            
            if (vessel_data.shape[0] < min_data_len_thresh):
                prob_MMSI.append(vessel_MMSI)
            else:
                vessels_df_line = pd.DataFrame(get_vessel_data_stats(vessel_data), index=[vessel_MMSI])
                info_df = pd.concat([info_df, vessels_df_line])

        info_df = info_df.sort_values(by='len', ascending=False)

        if (to_print):
            print (f"total number of MMSI:{len(MMSI_list)}")
            print (f"{info_df.shape[0]} MMSI's passed")
            print (f"{len(prob_MMSI)} MMSI's failed")


        self.prob_MMSI = prob_MMSI
        self.info_df=info_df

        return info_df,prob_MMSI  # Corrected return statement


    def get_info_df_summary(self):
        info_df_summary = {}

        for column in (vessels_info_df.columns):
            info_df_summary[column] = (self.info_df[column].min(),self.info_df[column].max())

        print_dict(info_df_summary)

        return 


    def get_vessel_data(self,vessel_MMSI,vessel_data_dic = None,to_print=False):
        
        if (vessel_data_dic==None):
            vessel_data_dic = self.data_dic

        if (1):
        # try:
            vessel_data = vessel_data_dic[vessel_MMSI]
            print(vessel_data.shape)
            # repeat missing ID values
            ID_columns = ['IMO','Vessel_Name','Ship_Type']
            for ID_column in ID_columns:
                data = vessel_data[ID_column].loc[vessel_data[ID_column].notna()]
                data = data.unique()
                if (isinstance(data,str)):
                    data = data.strip()
                # print(vessel_data.shape)
                vessel_data = repeat_single_value_in_column(vessel_data,data,ID_column)
                # print(vessel_data.shape)
                
                if (vessel_data.empty):
                    if (to_print is True):
                        print(f'failed to create data base for vessel_MMSI={vessel_MMSI}')

                    return vessel_data


            # take only the lines where there is a Longitude
            # print(vessel_data.shape)

            vessel_data = vessel_data[vessel_data['Longitude'].notna()]

    # handle exponent represntations 
            vessel_data.loc[:, 'Latitude'] = vessel_data['Latitude'].apply(convert_to_float)
            vessel_data.loc[:, 'Longitude'] = vessel_data['Longitude'].apply(convert_to_float)
            
            # print(vessel_data.shape)
    
            # sort data by time
            vessel_data = vessel_data.sort_values(by='Time')
            # print('sucess')
        # except:
        #     vessel_data = pd.DataFrame()
        #     # sys.exit(1)   
        return vessel_data


vessels = VESSELES()


# init vessles class

In [19]:
# Create an instance of the VESSELES class
vessels = VESSELES()


# Read the file

In [20]:
df = vessels.load_data(params['input_csv_file_name_full'],columns_list_keep=params['columns_list_keep'],reload=params['reload'])


NameError: name 'load_or_create_df' is not defined

## create debug data set

In [None]:
# get_min_max_dates(df)
# filter_dic = {'Time':['between',('2023-02-01 00:00:01','2023-02-01 12:00:01')]}

# df_filt = filter_df(df,filter_dic)


# get_min_max_dates(df_filt)
# print(df_filt.shape)
# df_aux.export_df(df_filt,'debug_data_base.csv') 
# df1 = vessels.load_data('debug_data_base.csv',columns_list_keep=params['columns_list_keep'],reload=True)


# Export to exell

In [None]:
if (params['export_to_excel']):
    df_aux.export_df(df, os.path.basename(params['input_csv_file_name_full'].replace('csv','xlsx')),num_lines=10000)


# create data_dic and info_df

In [None]:
vesseles_data_dic = vessels.create_data_dic(df)        
vessels_info_df,prob_MMSI = vessels.create_info_df(num_lines=100)
vessels.get_info_df_summary()

creating data_dic
create info_df
processing MMSI 0 out of 100
total number of MMSI:100
28 MMSI's passed
72 MMSI's failed
len:(8, 131)
min_time:('2023-02-01 00:00:11', '2023-02-01 06:04:40')
max_time:('2023-02-01 06:21:11', '2023-02-01 11:59:53')
total_time:(Timedelta('0 days 04:06:01'), Timedelta('0 days 11:54:32'))
min_time_diff[mins]:(0, 12)
max_time_diff[mins]:(10, 187)
mean_time_diff[mins]:(5, 35)
min_Longitude:(25.7783333333, 55.5016666667)
max_Longitude:(25.7796666667, 55.50321)
min_Latitude:(23.9133333333, 36.4483333333)
max_Latitude:(23.94, 36.4495)
span_Longitude:(4.166659999782496e-05, 2.3704833333999957)
span_Latitude:(6.49999999993156e-05, 1.2321999999999989)


In [None]:
dict = {'A':1}
print_dict(dict)

A:1


In [None]:
vessels_info_df

Unnamed: 0,len,min_time,max_time,total_time,min_time_diff[mins],max_time_diff[mins],mean_time_diff[mins],min_Longitude,max_Longitude,min_Latitude,max_Latitude,span_Longitude,span_Latitude
12345678,131,2023-02-01 00:05:31,2023-02-01 11:59:52,0 days 11:54:21,0,10,5,55.26525,55.265338,25.26201,25.262133,8.8e-05,0.000123
209489000,126,2023-02-01 00:05:19,2023-02-01 11:50:09,0 days 11:44:50,0,16,6,25.778333,25.779667,36.448333,36.4495,0.001333,0.001167
209700000,121,2023-02-01 00:03:36,2023-02-01 11:57:09,0 days 11:53:33,0,24,6,55.007097,55.327507,25.025,25.438465,0.32041,0.413465
210282000,90,2023-02-01 00:03:48,2023-02-01 11:52:58,0 days 11:49:10,0,30,8,33.01,33.183148,34.646667,34.688695,0.173148,0.042028
209492000,89,2023-02-01 00:05:59,2023-02-01 11:57:27,0 days 11:51:28,0,15,8,33.316667,33.31847,34.716667,34.718615,0.001803,0.001948
205231000,86,2023-02-01 00:08:20,2023-02-01 11:53:23,0 days 11:45:03,0,14,8,50.653333,50.653482,26.211667,26.212947,0.000148,0.00128
209815000,83,2023-02-01 00:04:20,2023-02-01 11:58:17,0 days 11:53:57,1,11,9,33.01331,33.01343,34.644947,34.645023,0.00012,7.7e-05
209511000,75,2023-02-01 02:02:00,2023-02-01 11:53:56,0 days 09:51:56,0,12,8,33.019828,33.730132,33.947252,34.657088,0.710303,0.709837
209343000,73,2023-02-01 00:00:33,2023-02-01 11:53:23,0 days 11:52:50,3,17,10,33.009005,33.00905,34.645058,34.645123,4.5e-05,6.5e-05
209773000,71,2023-02-01 00:00:11,2023-02-01 11:54:43,0 days 11:54:32,0,84,10,38.201667,38.248467,23.913333,23.94,0.0468,0.026667


# get a vessel data

In [None]:
# vessel_data = get_vessel_data(vessels.data_dic,422)
vessel_data = vessels.get_vessel_data(vessels_info_df.index[2])
vessel_data


(177, 20)


Unnamed: 0,Time,MMSI,IMO,Vessel_Name,Ship_Type,Longitude,Latitude,Message_ID,Accuracy,Heading,COG,Fixing_device,Destination_ID,offset1,Offset_2,Offset_3,Offset_4,ATON_type,ATON_name,GNSS_status
284911,2023-02-01 00:03:36,209700000,9134139.0,DUBAI ALLIANCE,70.0,55.068333,25.025,27,1.0,,21.0,,,,,,,,,0.0
9012,2023-02-01 00:23:38,209700000,9134139.0,DUBAI ALLIANCE,70.0,55.069923,25.026493,3,1.0,258.0,128.9,,,,,,,,,
142104,2023-02-01 00:24:31,209700000,9134139.0,DUBAI ALLIANCE,70.0,55.068333,25.025,27,1.0,,93.0,,,,,,,,,0.0
26086,2023-02-01 00:35:39,209700000,9134139.0,DUBAI ALLIANCE,70.0,55.069907,25.026482,3,1.0,259.0,,,,,,,,,,
288114,2023-02-01 00:42:35,209700000,9134139.0,DUBAI ALLIANCE,70.0,55.068333,25.025,27,1.0,,6.0,,,,,,,,,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
143619,2023-02-01 11:44:58,209700000,9134139.0,DUBAI ALLIANCE,70.0,55.287997,25.432198,1,1.0,81.0,80.0,,,,,,,,,
73938,2023-02-01 11:45:09,209700000,9134139.0,DUBAI ALLIANCE,70.0,55.288593,25.432292,3,1.0,81.0,79.8,,,,,,,,,
109270,2023-02-01 11:51:29,209700000,9134139.0,DUBAI ALLIANCE,70.0,55.309043,25.43556,11,1.0,,,15.0,,,,,,,,
143628,2023-02-01 11:55:09,209700000,9134139.0,DUBAI ALLIANCE,70.0,55.320997,25.437443,1,1.0,80.0,80.3,,,,,,,,,


## plot some statistis

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

def plot_vessels_df_statistics(df, columns, bins=10):
    """
    Plots histograms of specified columns in a DataFrame as subplots.

    df (pd.DataFrame): The DataFrame containing the data.
    Parameters:
    columns (list): List of column names to plot.
    bins (int): Number of bins for the histograms.
    """
    num_columns = len(columns)
    num_rows = (num_columns + 1) // 2  # Calculate number of rows needed for subplots

    fig, axes = plt.subplots(num_rows, 2, figsize=(12, num_rows * 4))
    axes = axes.flatten()  # Flatten the axes array to easily iterate over it

    for i, column in enumerate(columns):
        try:
            if column in df.columns:
                axes[i].hist(df[column], bins=bins, edgecolor='black')
                axes[i].set_title(f'Histogram of {column}')
                axes[i].set_xlabel(column)
                axes[i].set_ylabel('Frequency')
            else:
                axes[i].text(0.5, 0.5, f'Column {column} not found', ha='center', va='center')
                axes[i].set_title(f'Histogram of {column}')
                axes[i].set_xlabel(column)
                axes[i].set_ylabel('Frequency')
        except:
            print(f'could not plot {column}')
            
    # Remove any unused subplots
    for j in range(i + 1, len(axes)):
        fig.delaxes(axes[j])

    plt.tight_layout()
    plt.show()
    

columns_to_plot = ['field1', 'field2', 'field3']

plot_vessels_df_statistics(vessels_df, ['len','min_time_diff[mins]'], bins=100)


NameError: name 'vessels_df' is not defined

## filtering the vessles Df

In [None]:
filter_dic = {
        'len':['between',(700,800)],
        'min_Longitude': ('between', (40.0, 50.0)),        
      #   'mean_time_diff[mins]':['==',13],
        'min_time':['between',('2023-02-01 00:00:00','2023-02-01 00:01:39')]
              }

# filter_dic = {
#           'max_time_diff[mins]':['<=',30]
#           }

vessels_df_filt = filter_df(vessels_info_df, filter_dic)
# df_filt.shape
vessels_df_filt

# Save the vessel_data to a jason file

 

In [None]:
vessel_data_jason_file_path = '.\\data'

In [None]:

save_vessels_data_to_geojson(vessels_df_filt,vessel_data_dic,vessel_data_jason_file_path)



# plots of vessel data

In [None]:

vessel_data = get_vessel_data(vessel_data_dic,MMSI)
plot_df_column_vs_time(vessel_data,column_name='Longitude')


In [None]:
grouped = df.groupby('MMSI')

# Create a dictionary to store each vessel's data
vessel_data_dic = {vessel_MMSI: group for vessel_MMSI, group in grouped}
