# Machine Learning – TDT4173 Group Project 
### [47] Extreme Machine Learning
##### Alessandro Donadi - 133756
##### Gian Marco Miccio - 133705
##### Giulia Papalini - 133534

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import ExtraTreesRegressor

In [2]:
# Raw inputs: the training file and the port table are pipe-separated,
# while the test file is comma-separated.
X_train = pd.read_csv("ais_train.csv", sep="|")
X_test = pd.read_csv("ais_test.csv", sep=",")
ports = pd.read_csv("ports.csv", sep="|")

In [3]:
# Work on a copy so the frame loaded into X_train is not modified by the steps below.
df=X_train.copy()

In [4]:
# These raw AIS columns are not used by any later cell, so remove them up front.
unused_columns = ['cog', 'sog', 'rot', 'heading', 'navstat', 'etaRaw']
df = df.drop(columns=unused_columns)

In [5]:
# Give the port coordinates their own names so they do not clash with the
# vessel's `latitude` / `longitude` columns once both tables are joined.
ports = ports.rename(
    columns={'latitude': 'latitude_port', 'longitude': 'longitude_port'}
)

# Left join on portId: every row of df is kept; rows whose portId has no match
# in `ports` get NaN port coordinates.
df = df.merge(
    ports[['latitude_port', 'longitude_port', 'portId']],
    on='portId',
    how='left',
)
print(df.columns)

Index(['time', 'latitude', 'longitude', 'vesselId', 'portId', 'latitude_port',
       'longitude_port'],
      dtype='object')


In [6]:
def _epoch_seconds(timestamps):
    """Convert a timestamp-like Series to whole seconds since 1970-01-01.

    The original `astype('int64') // 10**9` is only correct when the parsed
    datetimes are stored with nanosecond resolution. Subtracting the epoch and
    floor-dividing by one second gives the same integers without depending on
    the storage unit chosen by `pd.to_datetime`.
    """
    parsed = pd.to_datetime(timestamps)
    if parsed.dt.tz is not None:
        # Timezone-aware input: express it in UTC and drop the tz so the
        # subtraction against the naive epoch below is well defined.
        parsed = parsed.dt.tz_convert("UTC").dt.tz_localize(None)
    return (parsed - pd.Timestamp("1970-01-01")) // pd.Timedelta("1s")


# Replace the `time` column with integer Unix seconds (`time_int`) in the
# training frame ...
df['time_int'] = _epoch_seconds(df['time'])
df = df.drop(columns=['time'])

# ... and apply exactly the same conversion to the test frame.
X_test['time_int'] = _epoch_seconds(X_test['time'])
X_test = X_test.drop(columns=['time'])

In [7]:
# The per-vessel lag features (groupby().shift()) and the "last known position"
# lookup used later both depend on rows being in chronological order, so the
# sorted frame has to be stored, not just displayed. A stable sort keeps the
# file order for rows that share the same timestamp.
df = df.sort_values(by="time_int", kind="stable").reset_index(drop=True)
df

Unnamed: 0,latitude,longitude,vesselId,portId,latitude_port,longitude_port,time_int
0,-34.74370,-57.85130,61e9f3a8b937134a3c4bfdf7,61d371c43aeaecc07011a37f,-33.587500,-71.618889,1704067225
1,8.89440,-79.47939,61e9f3d4b937134a3c4bff1f,634c4de270937fc01c3a7689,8.967000,-79.533000,1704067236
2,39.19065,-76.47567,61e9f436b937134a3c4c0131,61d3847bb7b7526e1adf3d19,39.232500,-76.558889,1704067305
3,-34.41189,151.02067,61e9f3b4b937134a3c4bfe77,61d36f770a1807568ff9a126,-34.462500,150.899444,1704067391
4,35.88379,-5.91636,61e9f41bb937134a3c4c0087,634c4de270937fc01c3a74f3,35.783000,-5.817000,1704067431
...,...,...,...,...,...,...,...
1522060,41.33699,2.15130,61e9f3a2b937134a3c4bfdd7,61d37f9c29b60f6113c89e65,41.340278,2.164722,1715126347
1522061,49.71372,-5.22042,61e9f43db937134a3c4c0169,634c4de270937fc01c3a787b,50.083000,-5.317000,1715126348
1522062,38.27895,10.78280,61e9f469b937134a3c4c029b,61d3781293c6feb83e5eb73b,42.098889,11.780833,1715126348
1522063,38.96142,-12.00502,61e9f3aeb937134a3c4bfe43,634c4de270937fc01c3a76a1,38.700000,-9.417000,1715126348


In [8]:
# Lags (in number of earlier rows of the same vessel) used to build the features.
N = np.array([1, 55, 72, 99, 124])

# Group once on a fixed slice of the columns we need; df itself gains columns
# inside the loop, so the groupby is kept independent of those additions.
by_vessel = df[['vesselId', 'longitude', 'latitude', 'time_int']].groupby('vesselId')

for position, lag in enumerate(N):
    lag = int(lag)
    # Column suffixes are '', '2', '3', '4', '5' for the five lags.
    suffix = '' if position == 0 else str(position + 1)

    df['lon_prev' + suffix] = by_vessel['longitude'].shift(lag)
    df['lat_prev' + suffix] = by_vessel['latitude'].shift(lag)

    lagged_time = by_vessel['time_int'].shift(lag)
    if position < 2:
        # Only the first two lags keep their raw lagged timestamp as a column
        # (`time_int_prev`, `time_int_prev2`); the remaining lags never had one.
        df['time_int_prev' + suffix] = lagged_time

    # Seconds elapsed since the lagged observation. This is the vectorised
    # equivalent of transform(lambda x: x - x.shift(lag)) and yields the same
    # values (NaN where the vessel has fewer than `lag` earlier rows).
    df['delta_time' + suffix] = df['time_int'] - lagged_time

vessel_encoder = LabelEncoder()

# Fit the encoder on the training vessel ids and replace the string id with
# its integer code. Only 'vesselId' is encoded here. Dropping the column and
# re-adding it keeps 'vesselId' as the last column, as before.
train_vessel_codes = vessel_encoder.fit_transform(df['vesselId'])
df = df.drop(columns=['vesselId']).assign(vesselId=train_vessel_codes)

# Reuse the fitted encoder for the test set; transform() raises if the test
# set contains a vessel id that was not present in the training data.
test_vessel_codes = vessel_encoder.transform(X_test['vesselId'])
X_test = X_test.drop(columns=['vesselId']).assign(vesselId=test_vessel_codes)

# Remove every row with a missing value in any column. This discards the first
# rows of each vessel (whose lag features are NaN) and also rows whose port
# columns are missing.
df = df.dropna()

In [9]:
# Keep only the rows whose portId is present.
df = df.dropna(subset=['portId'])

In [10]:
# The port identifier itself is not used as a model input; remove the column.
df = df.drop(columns=['portId'])

In [11]:
def cycle(model, X_test, df, N):
    """Predict latitude/longitude for each row of ``X_test``, one row at a time.

    For every test row the lag features (``lat_prev*``, ``lon_prev*``,
    ``delta_time*``) are rebuilt from that vessel's history, the model is
    queried, and the prediction is appended to the history so that later rows
    of the same vessel can use it as a "previous" position.

    Parameters
    ----------
    model : fitted regressor
        Must expose ``predict`` and either ``feature_names`` or
        ``feature_names_in_`` giving the expected column order. The first
        output column is treated as latitude and the second as longitude.
    X_test : pandas.DataFrame
        Needs ``vesselId`` and ``time_int`` columns; rows are processed in the
        frame's own order. NOTE: the frame is modified in place, gaining the
        15 lag-feature columns filled with NaN (as in the original code).
    df : pandas.DataFrame
        Training rows with ``vesselId``, ``latitude``, ``longitude`` and
        ``time_int``, in chronological order within each vessel.
    N : sequence of int
        The five lags (in rows) used when the training features were built.

    Returns
    -------
    (list, list)
        Predicted latitudes and predicted longitudes, in ``X_test`` row order.

    Raises
    ------
    KeyError
        If a test row refers to a vessel with no history in ``df``.
    IndexError
        If ``N`` has fewer than five entries.
    """
    lag_suffixes = ['', '2', '3', '4', '5']
    lags = [int(N[k]) for k in range(len(lag_suffixes))]

    # Per-vessel history of (latitude, longitude, time_int).
    # The actual positions are stored, not the already shifted lat_prev/lon_prev
    # columns: the training features pair latitude.shift(n) with
    # time_int.shift(n), so position and timestamp must come from the same row.
    # Predictions appended below are unshifted too, keeping the list consistent.
    known = df[['vesselId', 'latitude', 'longitude', 'time_int']].dropna()
    data = {
        vessel_id: list(zip(group['latitude'], group['longitude'], group['time_int']))
        for vessel_id, group in known.groupby('vesselId')
    }

    # Initialize the features in the test dataset with NaN
    for suffix in lag_suffixes:
        X_test['lat_prev' + suffix] = np.nan
        X_test['lon_prev' + suffix] = np.nan
        X_test['delta_time' + suffix] = np.nan

    # Use one source for the column order handed to the model.
    feature_order = getattr(model, 'feature_names', None)
    if feature_order is None:
        feature_order = model.feature_names_in_
    feature_order = list(feature_order)

    # Lists to store predictions
    predicted_lat = []
    predicted_lon = []

    total_rows = len(X_test)
    # Report roughly every 10%; at least 1 so tiny inputs do not divide by zero.
    increment = max(1, total_rows // 10)

    # Count rows by position rather than index label so the progress report
    # is correct for any index.
    for position, (_, row) in enumerate(X_test.iterrows()):
        if position % increment == 0 and position > 0:
            print(f"Processing: {position / total_rows * 100:.0f}% completed.")

        vessel_id = row['vesselId']
        if vessel_id not in data:
            raise KeyError(f"vessel {vessel_id!r} has no history in the training data")
        history = data[vessel_id]

        # The row being predicted sits at position len(history) in the vessel's
        # sequence, so a lag of n refers to history[len(history) - n].
        for suffix, lag in zip(lag_suffixes, lags):
            # Clamp to the oldest known point instead of letting a negative
            # index wrap around (or raise) when the history is shorter than the lag.
            anchor = max(len(history) - lag, 0)
            past_lat, past_lon, past_time = history[anchor]
            row['lat_prev' + suffix] = past_lat
            row['lon_prev' + suffix] = past_lon
            row['delta_time' + suffix] = row['time_int'] - past_time

        # Single-row frame in the column order expected by the model
        row_df = pd.DataFrame([row[feature_order]], columns=feature_order)

        # Predict latitude and longitude
        pred = model.predict(row_df)
        lat_hat, lon_hat = pred[0][0], pred[0][1]
        predicted_lat.append(lat_hat)
        predicted_lon.append(lon_hat)

        # Feed the prediction back so later rows of this vessel see it as history.
        history.append((lat_hat, lon_hat, row['time_int']))

    return predicted_lat, predicted_lon




In [12]:
# Build the list of model inputs (semi-automatically): start from every column
# of df and strip out the targets plus the columns that must not be features.
y_features = ['latitude', 'longitude']
features_to_remove_from_x = y_features + [
    'latitude_port',
    'longitude_port',
    'time_int',
    'time_int_prev',
    'time_int_prev2',
    'vesselId',
]  # if others need to be removed add them here

total_features = df.columns.to_list()
print(total_features)

excluded = set(features_to_remove_from_x)
total_features = [name for name in total_features if name not in excluded]
print(total_features)

# Targets and inputs; Xtrain / ytrain are the names used by the training cell.
y = df[y_features]
X = df.loc[:, total_features]

Xtrain, ytrain = X, y


['latitude', 'longitude', 'latitude_port', 'longitude_port', 'time_int', 'lon_prev', 'lat_prev', 'time_int_prev', 'delta_time', 'lon_prev2', 'lat_prev2', 'time_int_prev2', 'delta_time2', 'lon_prev3', 'lat_prev3', 'delta_time3', 'lon_prev4', 'lat_prev4', 'delta_time4', 'lon_prev5', 'lat_prev5', 'delta_time5', 'vesselId']
['lon_prev', 'lat_prev', 'delta_time', 'lon_prev2', 'lat_prev2', 'delta_time2', 'lon_prev3', 'lat_prev3', 'delta_time3', 'lon_prev4', 'lat_prev4', 'delta_time4', 'lon_prev5', 'lat_prev5', 'delta_time5']


In [13]:
def train_model_extratrees(X_train, y_train):
    """Fit an Extra Trees regressor on ``X_train`` / ``y_train`` and return it.

    The returned estimator also carries a ``feature_names`` attribute holding
    the training column names in order, which the prediction loop uses to
    arrange its inputs.
    """
    hyperparameters = dict(
        n_estimators=20,
        max_depth=80,
        min_samples_split=2,
        min_samples_leaf=1,
        min_weight_fraction_leaf=0.0,
        random_state=42,
        n_jobs=-1,
        verbose=0,
    )
    regressor = ExtraTreesRegressor(**hyperparameters)

    # Remember the column order before fitting (X_train must be a DataFrame).
    regressor.feature_names = list(X_train.columns.values)

    regressor.fit(X_train, y_train)
    return regressor

        

 


In [14]:
# Fit the Extra Trees regressor on the selected features (Xtrain) and the latitude/longitude targets (ytrain).
et_model = train_model_extratrees(Xtrain, ytrain)

In [15]:
# Run the row-by-row prediction loop over the test set; `lat` and `lon` are lists in X_test row order.
lat, lon = cycle(et_model, X_test, df, N)

Processing: 10% completed.
Processing: 20% completed.
Processing: 30% completed.
Processing: 40% completed.
Processing: 50% completed.
Processing: 60% completed.
Processing: 70% completed.
Processing: 80% completed.
Processing: 90% completed.
Processing: 100% completed.


In [16]:
# Load the empty submission template
predictions_df = pd.read_csv('ais_sample_submission.csv')

# The predictions are matched to the template purely by row position, so the
# counts must agree. Wrapping the lists in a DataFrame (as before) would align
# on the index and silently leave NaN in the output if they did not.
# NOTE(review): this also assumes the template rows are in the same order as
# the X_test rows - confirm against the ID column.
if len(lon) != len(predictions_df) or len(lat) != len(predictions_df):
    raise ValueError(
        f"expected {len(predictions_df)} predictions, "
        f"got {len(lat)} latitudes and {len(lon)} longitudes"
    )

# Fill the template with the predicted longitude and latitude
predictions_df['longitude_predicted'] = lon
predictions_df['latitude_predicted'] = lat

# Save the completed file with the forecasts
predictions_df.to_csv('predictions_filled_103.csv', index=False)