In [1]:
import pandas as pd
import numpy as np
import os
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import SimpleRNN, Dense
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.layers import LSTM, Dense
from sklearn.preprocessing import StandardScaler
import datetime
from tensorflow.keras import backend as K
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from keras.optimizers import SGD
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import GradientBoostingRegressor



In [2]:
def list_subdirectories(directory_path):
    subdirectories =  [subdir for subdir in os.listdir(directory_path) if os.path.isdir(os.path.join(directory_path, subdir))]
    return subdirectories

train_doc_list = list_subdirectories('train')

test_doc_list = list_subdirectories('test')


total_gnss_df = pd.DataFrame({})
total_imu_df = pd.DataFrame({})
total_gt_df = pd.DataFrame({})

def utc_to_unix_millis(utc_millis_array):
    # Unix epoch start time in milliseconds
    unix_epoch_start = np.datetime64('1970-01-01T00:00:00', 'ms').astype('datetime64[ms]').astype(int)
    # Convert UTC time array to Unix time array
    unix_millis_array = np.array(utc_millis_array) - unix_epoch_start
    return unix_millis_array

def unix_to_utc_millis(unix_millis_array):
    # Unix epoch start time in milliseconds
    unix_epoch_start = np.datetime64('1970-01-01T00:00:00', 'ms').astype('datetime64[ms]').astype(int)
    # Convert Unix time array to UTC time array
    utc_millis_array = np.array(unix_millis_array) + unix_epoch_start
    return utc_millis_array

scaler = StandardScaler()


Models:

RNN

In [3]:
# Model architecture
model_rnn = Sequential()
model_rnn.add(SimpleRNN(units=128, return_sequences=True, input_shape=(17,1 )))  
model_rnn.add(SimpleRNN(units=64, return_sequences=True, activation='sigmoid'))
model_rnn.add(SimpleRNN(units=32,activation='tanh'))  
model_rnn.add(Dense(2))

# Compile the model
optimizer = SGD(learning_rate=0.001)
model_rnn.compile(loss='mse', optimizer=optimizer)

CNN-GRU

In [3]:
# Define the combined CNN-GRU model
def create_cnn_gru_model(input_shape, l2_reg=0.01):
    model = tf.keras.Sequential([
        # CNN layers
        tf.keras.layers.Reshape((-1, 1), input_shape=input_shape),  # Reshape for Conv1D input
        tf.keras.layers.Conv1D(filters=128, kernel_size=3, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(l2_reg)),
        tf.keras.layers.MaxPooling1D(pool_size=2),
        tf.keras.layers.Conv1D(filters=64, kernel_size=3, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(l2_reg)),
        tf.keras.layers.MaxPooling1D(pool_size=2),
        tf.keras.layers.Flatten(),
        
        # GRU layer
        tf.keras.layers.Reshape((-1, 64)),  # Reshape for GRU input
        tf.keras.layers.GRU(48),
        
        # Dense output layer
        tf.keras.layers.Dense(2)  # Output layer for predicting 2 new features
    ])
    return model

# Define the model
input_shape = (None, 21)  # Adjust according to your data dimensions (21 features)
model_cnn_gru = create_cnn_gru_model(input_shape)

optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)  # Set the learning rate to 0.001

# Compile the model with the customized optimizer
model_cnn_gru.compile(optimizer=optimizer, loss='mae')


# Function to train the model on a single file with gradient clipping
def train_model_on_file(model, x_train, y_train, clip_norm=1.0):
    model.fit(x_train, y_train, epochs=10, batch_size=16)  # Train the model


Linear Regression (LR)

In [5]:
# Create a linear regression model
model_lr = LinearRegression()

Random Forest

In [6]:
model_rf = MultiOutputRegressor(RandomForestRegressor(n_estimators=150, random_state=42))

Support Vector Machine (SVM)

In [7]:
from sklearn.svm import SVR

In [8]:
# Multi-output SVR model creating
svr = SVR(kernel='poly', degree=5, C=1.0, epsilon=0.1)  
model_svm = MultiOutputRegressor(svr)

Gradient Boosting Machines (GBM)

In [50]:
gbr = GradientBoostingRegressor(n_estimators=80, learning_rate=0.01, max_depth=5, random_state=42)
model_gbm = MultiOutputRegressor(gbr)

K-Nearest Neighbourhood (KNN)

In [57]:
# Creating multi-output KNN model
knn = KNeighborsRegressor(n_neighbors=6, weights = 'distance')
model_knn = MultiOutputRegressor(knn)

## TRAIN

Reading the trainin data and preprocessing:

In [6]:
counter = 0
for main_doc_name in train_doc_list:
    sub_sub_list = list_subdirectories('train'+'/'+main_doc_name)
    
    for doc_name_list in sub_sub_list:
        
        #GNSS Data Part:

        #Reading the present gnss csv file
        df_gnss = pd.read_csv('train'+'/'+main_doc_name+'/'+doc_name_list+'/'+'device_gnss.csv', low_memory=False)

        data_gnss = pd.DataFrame()
        #Taking only the necessary columns
        data_gnss['utcTimeMillis'] =  df_gnss["utcTimeMillis"]
        data_gnss['SvVelocityYEcefMetersPerSecond'] = df_gnss["SvVelocityYEcefMetersPerSecond"]
        data_gnss['SvVelocityZEcefMetersPerSecond'] = df_gnss['SvVelocityZEcefMetersPerSecond']
        data_gnss['SvVelocityXEcefMetersPerSecond'] = df_gnss['SvVelocityXEcefMetersPerSecond']
        data_gnss['RawPseudorangeMeters'] = df_gnss['RawPseudorangeMeters']
        data_gnss['SvClockBiasMeters'] = df_gnss['SvClockBiasMeters']
        data_gnss['IsrbMeters'] = df_gnss['IsrbMeters']
        data_gnss['TroposphericDelayMeters'] = df_gnss['TroposphericDelayMeters']
        data_gnss['IonosphericDelayMeters'] = df_gnss['IonosphericDelayMeters']
        data_gnss['WlsPositionXEcefMeters'] = df_gnss['WlsPositionXEcefMeters']
        data_gnss['WlsPositionYEcefMeters'] = df_gnss['WlsPositionYEcefMeters']
        data_gnss['WlsPositionZEcefMeters'] = df_gnss['WlsPositionZEcefMeters']
               

        #If more than one measurement is taken in the same second, average the measurements taken for the same time period and create one line of data for each second
        ten_digit_gnss = data_gnss.copy()
        ten_digit_gnss['utcTimeMillis'] = data_gnss['utcTimeMillis'].astype(str).str[:10].astype(int)
        gnss_data = ten_digit_gnss.groupby('utcTimeMillis').mean().reset_index()
        gnss_data = gnss_data.fillna(method='ffill')


        #IMU Data Part:
        #Reading the present imu csv file
        df_imu = pd.read_csv('train'+'/'+main_doc_name+'/'+doc_name_list+'/'+'device_imu.csv', low_memory=False)

        # Transforming it to Pandas DataFrame to be able to make moves easily
        data_imu = pd.DataFrame()

        #Taking only the necessary columns
        data_imu['MessageType'] =  df_imu["MessageType"]
        data_imu['utcTimeMillis'] =  df_imu["utcTimeMillis"]
        data_imu['MeasurementX'] =  df_imu["MeasurementX"]
        data_imu['MeasurementY'] = df_imu["MeasurementY"]
        data_imu['MeasurementZ'] = df_imu['MeasurementZ']
        
        #Since the accelerometer, gyro and magnetometer are combined in the same sensor in the IMU, it can be seen that the data received from these sensors 
        #are stored in a mixed manner. To fix this and evaluate each sensor data on its own, three sensor data are transferred to three different dataframes
        accel_imu_data = data_imu.loc[data_imu['MessageType'] == 'UncalAccel']
        mag_imu_data = data_imu.loc[data_imu['MessageType'] == 'UncalMag']
        gyro_imu_data = data_imu.loc[data_imu['MessageType'] == 'UncalGyro']
        
        #For the next step empty DataFrames are created
        ten_digit_accel = accel_imu_data.copy()
        ten_digit_mag = mag_imu_data.copy()
        ten_digit_gyro = gyro_imu_data.copy()

        #Data is currently processed and stored with millisecond precision, removing the last three digits to convert it to seconds precision
        ten_digit_accel['utcTimeMillis'] = accel_imu_data['utcTimeMillis'].astype(str).str[:10].astype(int)
        ten_digit_mag['utcTimeMillis'] = mag_imu_data['utcTimeMillis'].astype(str).str[:10].astype(int)
        ten_digit_gyro['utcTimeMillis'] = gyro_imu_data['utcTimeMillis'].astype(str).str[:10].astype(int)
        
        #If more than one measurement is taken in the same second, average the measurements taken for the same time period and create one line of data for each second
        imu_accel_df = ten_digit_accel.groupby('utcTimeMillis').mean().reset_index()
        imu_mag_df = ten_digit_mag.groupby('utcTimeMillis').mean().reset_index()
        imu_gyro_df = ten_digit_gyro.groupby('utcTimeMillis').mean().reset_index()

        imu_accel_df = imu_accel_df.fillna(method='ffill')
        imu_mag_df = imu_mag_df.fillna(method='ffill')
        imu_gyro_df = imu_gyro_df.fillna(method='ffill')


        #Since measurements for each sensor are recorded with the same column name, column names are customized for each sensor type to avoid confusion when combining data
        imu_accel_df = imu_accel_df.rename(columns={'MeasurementX': 'Accel_MeasurementX'})
        imu_accel_df = imu_accel_df.rename(columns={'MeasurementY': 'Accel_MeasurementY'})
        imu_accel_df = imu_accel_df.rename(columns={'MeasurementZ': 'Accel_MeasurementZ'})
        imu_mag_df = imu_mag_df.rename(columns={'MeasurementX': 'Mag_MeasurementX'})
        imu_mag_df = imu_mag_df.rename(columns={'MeasurementY': 'Mag_MeasurementY'})
        imu_mag_df = imu_mag_df.rename(columns={'MeasurementZ': 'Mag_MeasurementZ'})
        imu_gyro_df = imu_gyro_df.rename(columns={'MeasurementX': 'Gyro_MeasurementX'})
        imu_gyro_df = imu_gyro_df.rename(columns={'MeasurementY': 'Gyro_MeasurementY'})
        imu_gyro_df = imu_gyro_df.rename(columns={'MeasurementZ': 'Gyro_MeasurementZ'})

        #Recombining the processed imu data. Here, the same rows are determined by time, rows with the same time value are combined and the columns of 
        #accel, mag, and gyro measurements are stored separately. 
        total_imu_df = pd.merge(imu_accel_df, imu_mag_df, on='utcTimeMillis', how='inner')
        total_imu_df = pd.merge(total_imu_df, imu_gyro_df, on='utcTimeMillis', how='inner')

        total_imu_df = total_imu_df.fillna(method='ffill')
        
        #Finally, gnss and imu data are paired to take time into account and a single data is created. Thus, the 'X' part of the 
        #data set that will be used later when training the model is created
        x_part = pd.merge(gnss_data, total_imu_df, on='utcTimeMillis', how='inner')
        x_part = x_part.fillna(method='ffill')

        
        #Ground_Truth Data Part:
        df_truth = pd.read_csv('train'+'/'+main_doc_name+'/'+doc_name_list+'/'+'ground_truth.csv')
        data_gt = {'UnixTimeMillis': df_truth["UnixTimeMillis"],
                'LatitudeDegrees': df_truth["LatitudeDegrees"],
                'LongitudeDegrees':df_truth["LongitudeDegrees"],
                'AltitudeMeters':df_truth['AltitudeMeters']
                }
        data_gt = pd.DataFrame(data_gt)
        
        #The time values are arranged according to the data that will be used as 'X' data when training the model. The values UnixTimeMillis column are arranged as to be 10 
        #digit numbers. 
        ten_digit_gt = data_gt.copy()
        ten_digit_gt['UnixTimeMillis'] = unix_to_utc_millis(ten_digit_gt['UnixTimeMillis'] )
        ten_digit_gt['UnixTimeMillis'] = data_gt['UnixTimeMillis'].astype(str).str[:10].astype(int)

        #The column names are set to be the same with the previous data
        ten_digit_gt = ten_digit_gt.rename(columns={'UnixTimeMillis': 'utcTimeMillis'})
        
        #The y part of the training data for training the model
        y_part = ten_digit_gt
                
        #Finally matching the 'x', and the 'y' parts of the data set according to time.
        merged_training_data = pd.merge(x_part, y_part, on='utcTimeMillis', how='inner')
        merged_training_data = merged_training_data.fillna(method='ffill')
                
        #saving the file
        file_path = main_doc_name
        merged_training_data.to_csv(f'train_data/{file_path}_{counter}.csv', index=False)
        counter = counter + 1
        print('counter:', counter)

counter: 1
counter: 2
counter: 3
counter: 4
counter: 5
counter: 6
counter: 7
counter: 8
counter: 9
counter: 10
counter: 11
counter: 12
counter: 13
counter: 14
counter: 15
counter: 16
counter: 17
counter: 18
counter: 19
counter: 20
counter: 21
counter: 22
counter: 23
counter: 24
counter: 25
counter: 26
counter: 27
counter: 28
counter: 29
counter: 30
counter: 31
counter: 32
counter: 33
counter: 34
counter: 35
counter: 36
counter: 37
counter: 38
counter: 39
counter: 40
counter: 41
counter: 42
counter: 43
counter: 44
counter: 45
counter: 46
counter: 47
counter: 48
counter: 49
counter: 50
counter: 51
counter: 52
counter: 53
counter: 54
counter: 55
counter: 56
counter: 57
counter: 58
counter: 59
counter: 60
counter: 61
counter: 62
counter: 63
counter: 64
counter: 65
counter: 66
counter: 67
counter: 68
counter: 69
counter: 70
counter: 71
counter: 72
counter: 73
counter: 74
counter: 75
counter: 76
counter: 77
counter: 78
counter: 79
counter: 80
counter: 81
counter: 82
counter: 83
counter: 84
c

In [3]:
# Directory containing the CSV files
folder_path = 'train_data'

# List all files in the directory
file_names = os.listdir(folder_path)

# Filter CSV files
csv_files = [file for file in file_names if file.endswith('.csv')]


- Train RNN model:

In [38]:
for doc_name in csv_files:
    #Reading the csv files
    data = pd.read_csv('train_data'+'/'+ doc_name, low_memory=False)
    
    x_data_to_train_wt_rnn = data.copy() 
    y_data_to_train_rnn = data.copy()

    x_data_to_train_wt_rnn = x_data_to_train_wt_rnn.drop(columns=['utcTimeMillis','LatitudeDegrees','LongitudeDegrees','AltitudeMeters','Mag_MeasurementX','Mag_MeasurementY','Mag_MeasurementZ'])
    #print('x_data_to_train_wt', x_data_to_train_wt)
    y_data_to_train_rnn = y_data_to_train_rnn.drop(columns=['utcTimeMillis','SvVelocityYEcefMetersPerSecond','SvVelocityZEcefMetersPerSecond','SvVelocityXEcefMetersPerSecond',
                                                    'RawPseudorangeMeters','SvClockBiasMeters','IsrbMeters','TroposphericDelayMeters','IonosphericDelayMeters',
                                                    'WlsPositionXEcefMeters','WlsPositionYEcefMeters','WlsPositionZEcefMeters','Accel_MeasurementX','Accel_MeasurementY',
                                                    'Accel_MeasurementZ','Mag_MeasurementX','Mag_MeasurementY','Mag_MeasurementZ','Gyro_MeasurementX','Gyro_MeasurementY',
                                                    'Gyro_MeasurementZ','AltitudeMeters'])
    #print('y_data_to_train',y_data_to_train)
    #Train the RNN model
    los=model_rnn.train_on_batch(x_data_to_train_wt_rnn, y_data_to_train_rnn)
    print(los)

8177.9482421875
7540.939453125
7078.5791015625
6673.61865234375
6267.8515625
5887.1376953125
5518.388671875
5179.44775390625
4846.26123046875
4533.8662109375
4228.4873046875
3954.07177734375
3698.1171875
3439.248046875
3216.1865234375
3007.465087890625
2812.27490234375
2629.7373046875
2482.174560546875
2321.064697265625
2170.398681640625
2029.520751953125
1897.790283203125
1774.5887451171875
1646.9561767578125
1540.04443359375
1440.049560546875
1346.5677490234375
1256.9632568359375
1175.3709716796875
1111.3712158203125
1039.22900390625
971.7642211914062
908.7311401367188
847.719970703125
792.6885986328125
741.2554931640625
681.9183959960938
637.6531982421875
596.26123046875
562.9334716796875
526.3915405273438
489.88873291015625
458.19012451171875
439.49200439453125
410.9648132324219
384.2987060546875
356.2351379394531
333.1123962402344
311.4902038574219
291.2737121582031
271.2371826171875
253.6396026611328
237.17608642578125
215.84744262695312
203.21327209472656
190.0224151611328
177.6

- Train LR model:

In [5]:
# NaN and infinity value problem solving
def check_and_clean_data(df):
    # Checking NAN values
    if df.isna().sum().sum() > 0:
        print("NaN değerler bulundu ve temizlendi.")
        df = df.fillna(method='ffill').fillna(method='bfill')  # filling the NaN values 

    # Checking infinity values
    if np.isinf(df.values).sum() > 0:
        print("Sonsuz değerler bulundu ve temizlendi.")
        df = df.replace([np.inf, -np.inf], np.nan).fillna(method='ffill').fillna(method='bfill')

    # Checking too big or small values
    max_val = np.finfo(np.float64).max
    min_val = np.finfo(np.float64).min
    if ((df.values > max_val).sum() > 0) or ((df.values < min_val).sum() > 0):
        print("Çok büyük veya çok küçük değerler bulundu ve temizlendi.")
        df = df.clip(lower=min_val, upper=max_val)

    return df


In [46]:
for doc_name in csv_files:
    #Reading the csv files
    data = pd.read_csv('train_data'+'/'+ doc_name, low_memory=False)
    
    x_data_to_train_wt_lr = data.copy() 
    y_data_to_train_lr = data.copy()

    x_data_to_train_wt_lr = x_data_to_train_wt_lr.drop(columns=['utcTimeMillis','LatitudeDegrees','LongitudeDegrees','AltitudeMeters','Mag_MeasurementX','Mag_MeasurementY','Mag_MeasurementZ'])
    #print('x_data_to_train_wt', x_data_to_train_wt)
    y_data_to_train_lr = y_data_to_train_lr.drop(columns=['utcTimeMillis','SvVelocityYEcefMetersPerSecond','SvVelocityZEcefMetersPerSecond','SvVelocityXEcefMetersPerSecond',
                                                    'RawPseudorangeMeters','SvClockBiasMeters','IsrbMeters','TroposphericDelayMeters','IonosphericDelayMeters',
                                                    'WlsPositionXEcefMeters','WlsPositionYEcefMeters','WlsPositionZEcefMeters','Accel_MeasurementX','Accel_MeasurementY',
                                                    'Accel_MeasurementZ','Mag_MeasurementX','Mag_MeasurementY','Mag_MeasurementZ','Gyro_MeasurementX','Gyro_MeasurementY',
                                                    'Gyro_MeasurementZ','AltitudeMeters'])
    #print('y_data_to_train',y_data_to_train_lr)
    x_data_to_train_wt_lr = check_and_clean_data(x_data_to_train_wt_lr)
    y_data_to_train_lr = check_and_clean_data(y_data_to_train_lr)
    #Train the LR model
    model_lr.fit(x_data_to_train_wt_lr, y_data_to_train_lr)
    y_train_pred = model_lr.predict(x_data_to_train_wt_lr)
    mse = mean_squared_error(y_data_to_train_lr, y_train_pred)
    print('training loss:', mse)
    

training loss: 2.2648445115422844e-10
training loss: 1.4874359290610522e-10
training loss: 1.2952709961436858e-09
training loss: 1.186900011175411e-09
training loss: 1.6946424401023714e-09
training loss: 2.369670378300152e-09
training loss: 2.7613491613168076e-09
training loss: 1.075735712017452e-09
training loss: 1.0008205151561183e-09
training loss: 2.614712060979713e-09
training loss: 1.941656545282127e-09
training loss: 1.5836270421597461e-09
training loss: 2.611867988138278e-09
training loss: 5.381994572979005e-09
training loss: 4.409298185320805e-10
training loss: 9.734733396241256e-10
training loss: 1.3833284288576936e-09
training loss: 2.3096411111285065e-09
training loss: 1.625683275262255e-10
training loss: 1.3501702456320376e-10
training loss: 1.2781008265836073e-10
training loss: 1.2848696721311636e-10
training loss: 1.9909050135389884e-10
training loss: 2.036808106585673e-10
training loss: 1.859704832914402e-10
training loss: 2.8661480086091974e-10
training loss: 1.6684550

- Train CNN-GRU model:

In [6]:
for doc_name in csv_files:
    #Reading the csv files
    data = pd.read_csv('train_data'+'/'+ doc_name, low_memory=False)
    
    x_data_to_train_cnn_gru = data.copy() 
    y_data_to_train_cnn_gru = data.copy()

    x_data_to_train_wt_cnn_gru = x_data_to_train_cnn_gru.drop(columns=['utcTimeMillis','LatitudeDegrees','LongitudeDegrees','AltitudeMeters','Mag_MeasurementX','Mag_MeasurementY','Mag_MeasurementZ'])
    y_data_to_train_cnn_gru = y_data_to_train_cnn_gru.drop(columns=['utcTimeMillis','SvVelocityYEcefMetersPerSecond','SvVelocityZEcefMetersPerSecond','SvVelocityXEcefMetersPerSecond',
                                                    'RawPseudorangeMeters','SvClockBiasMeters','IsrbMeters','TroposphericDelayMeters','IonosphericDelayMeters',
                                                    'WlsPositionXEcefMeters','WlsPositionYEcefMeters','WlsPositionZEcefMeters','Accel_MeasurementX','Accel_MeasurementY',
                                                    'Accel_MeasurementZ','Mag_MeasurementX','Mag_MeasurementY','Mag_MeasurementZ','Gyro_MeasurementX','Gyro_MeasurementY',
                                                    'Gyro_MeasurementZ','AltitudeMeters'])
    x_data_to_train_wt_cnn_gru = check_and_clean_data(x_data_to_train_wt_cnn_gru)
    y_data_to_train_cnn_gru = check_and_clean_data(y_data_to_train_cnn_gru)
    # Train the CNN-GRU MODEL:
    train_model_on_file(model_cnn_gru, x_data_to_train_wt_cnn_gru, y_data_to_train_cnn_gru)
    

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
E

- Train Random Forest

In [13]:
for doc_name in csv_files:
    #Reading the csv files
    data = pd.read_csv('train_data'+'/'+ doc_name, low_memory=False)
    
    x_data_to_train_rf = data.copy() 
    y_data_to_train_rf = data.copy()

    x_data_to_train_wt_rf = x_data_to_train_rf.drop(columns=['utcTimeMillis','LatitudeDegrees','LongitudeDegrees','AltitudeMeters','Mag_MeasurementX','Mag_MeasurementY','Mag_MeasurementZ'])
    y_data_to_train_rf = y_data_to_train_rf.drop(columns=['utcTimeMillis','SvVelocityYEcefMetersPerSecond','SvVelocityZEcefMetersPerSecond','SvVelocityXEcefMetersPerSecond',
                                                    'RawPseudorangeMeters','SvClockBiasMeters','IsrbMeters','TroposphericDelayMeters','IonosphericDelayMeters',
                                                    'WlsPositionXEcefMeters','WlsPositionYEcefMeters','WlsPositionZEcefMeters','Accel_MeasurementX','Accel_MeasurementY',
                                                    'Accel_MeasurementZ','Mag_MeasurementX','Mag_MeasurementY','Mag_MeasurementZ','Gyro_MeasurementX','Gyro_MeasurementY',
                                                    'Gyro_MeasurementZ','AltitudeMeters'])
    x_data_to_train_wt_rf = check_and_clean_data(x_data_to_train_wt_rf)
    y_data_to_train_rf= check_and_clean_data(y_data_to_train_rf)
    # Train the DT MODEL:
    model_rf.fit( x_data_to_train_wt_rf, y_data_to_train_rf)

NaN değerler bulundu ve temizlendi.
NaN değerler bulundu ve temizlendi.
NaN değerler bulundu ve temizlendi.
NaN değerler bulundu ve temizlendi.


- Train SVM

In [14]:
for doc_name in csv_files:
    #Reading the csv files
    data = pd.read_csv('train_data'+'/'+ doc_name, low_memory=False)
    
    x_data_to_train_svm = data.copy() 
    y_data_to_train_svm = data.copy()

    x_data_to_train_wt_svm = x_data_to_train_svm.drop(columns=['utcTimeMillis','LatitudeDegrees','LongitudeDegrees','AltitudeMeters','Mag_MeasurementX','Mag_MeasurementY','Mag_MeasurementZ'])
    y_data_to_train_svm = y_data_to_train_svm.drop(columns=['utcTimeMillis','SvVelocityYEcefMetersPerSecond','SvVelocityZEcefMetersPerSecond','SvVelocityXEcefMetersPerSecond',
                                                    'RawPseudorangeMeters','SvClockBiasMeters','IsrbMeters','TroposphericDelayMeters','IonosphericDelayMeters',
                                                    'WlsPositionXEcefMeters','WlsPositionYEcefMeters','WlsPositionZEcefMeters','Accel_MeasurementX','Accel_MeasurementY',
                                                    'Accel_MeasurementZ','Mag_MeasurementX','Mag_MeasurementY','Mag_MeasurementZ','Gyro_MeasurementX','Gyro_MeasurementY',
                                                    'Gyro_MeasurementZ','AltitudeMeters'])
    x_data_to_train_wt_svm = check_and_clean_data(x_data_to_train_wt_svm)
    y_data_to_train_svm = check_and_clean_data(y_data_to_train_svm)
    # Train the SVM MODEL:
    model_svm.fit( x_data_to_train_wt_svm, y_data_to_train_svm)

NaN değerler bulundu ve temizlendi.
NaN değerler bulundu ve temizlendi.
NaN değerler bulundu ve temizlendi.
NaN değerler bulundu ve temizlendi.


- Train GBM

In [51]:
for doc_name in csv_files:
    #Reading the csv files
    data = pd.read_csv('train_data'+'/'+ doc_name, low_memory=False)
    
    x_data_to_train_gbm = data.copy() 
    y_data_to_train_gbm = data.copy()

    x_data_to_train_wt_gbm = x_data_to_train_gbm.drop(columns=['utcTimeMillis','LatitudeDegrees','LongitudeDegrees','AltitudeMeters','Mag_MeasurementX','Mag_MeasurementY','Mag_MeasurementZ'])
    y_data_to_train_gbm = y_data_to_train_gbm.drop(columns=['utcTimeMillis','SvVelocityYEcefMetersPerSecond','SvVelocityZEcefMetersPerSecond','SvVelocityXEcefMetersPerSecond',
                                                    'RawPseudorangeMeters','SvClockBiasMeters','IsrbMeters','TroposphericDelayMeters','IonosphericDelayMeters',
                                                    'WlsPositionXEcefMeters','WlsPositionYEcefMeters','WlsPositionZEcefMeters','Accel_MeasurementX','Accel_MeasurementY',
                                                    'Accel_MeasurementZ','Mag_MeasurementX','Mag_MeasurementY','Mag_MeasurementZ','Gyro_MeasurementX','Gyro_MeasurementY',
                                                    'Gyro_MeasurementZ','AltitudeMeters'])
    x_data_to_train_wt_gbm = check_and_clean_data(x_data_to_train_wt_gbm)
    y_data_to_train_gbm = check_and_clean_data(y_data_to_train_gbm)
    # Train the SVM MODEL:
    model_gbm.fit( x_data_to_train_wt_gbm, y_data_to_train_gbm)

NaN değerler bulundu ve temizlendi.
NaN değerler bulundu ve temizlendi.
NaN değerler bulundu ve temizlendi.
NaN değerler bulundu ve temizlendi.


- Train KNN model

In [58]:
for doc_name in csv_files:
    #Reading the csv files
    data = pd.read_csv('train_data'+'/'+ doc_name, low_memory=False)
    
    x_data_to_train_knn = data.copy() 
    y_data_to_train_knn = data.copy()

    x_data_to_train_wt_knn = x_data_to_train_knn.drop(columns=['utcTimeMillis','LatitudeDegrees','LongitudeDegrees','AltitudeMeters','Mag_MeasurementX','Mag_MeasurementY','Mag_MeasurementZ'])
    y_data_to_train_knn = y_data_to_train_knn.drop(columns=['utcTimeMillis','SvVelocityYEcefMetersPerSecond','SvVelocityZEcefMetersPerSecond','SvVelocityXEcefMetersPerSecond',
                                                    'RawPseudorangeMeters','SvClockBiasMeters','IsrbMeters','TroposphericDelayMeters','IonosphericDelayMeters',
                                                    'WlsPositionXEcefMeters','WlsPositionYEcefMeters','WlsPositionZEcefMeters','Accel_MeasurementX','Accel_MeasurementY',
                                                    'Accel_MeasurementZ','Mag_MeasurementX','Mag_MeasurementY','Mag_MeasurementZ','Gyro_MeasurementX','Gyro_MeasurementY',
                                                    'Gyro_MeasurementZ','AltitudeMeters'])
    x_data_to_train_wt_knn = check_and_clean_data(x_data_to_train_wt_knn)
    y_data_to_train_knn = check_and_clean_data(y_data_to_train_knn)
    # Train the SVM MODEL:
    model_knn.fit( x_data_to_train_wt_knn, y_data_to_train_knn)

NaN değerler bulundu ve temizlendi.
NaN değerler bulundu ve temizlendi.
NaN değerler bulundu ve temizlendi.
NaN değerler bulundu ve temizlendi.


## TEST

In [4]:
samp_sum = pd.read_csv('sample_submission.csv')
ten_digit_reference_df = pd.DataFrame(columns=samp_sum.columns)
ten_digit_reference_df['tripId'] =  samp_sum['tripId']
ten_digit_reference_df['UnixTimeMillis'] =  samp_sum['UnixTimeMillis']
ten_digit_reference_df['UnixTimeMillis'] = ten_digit_reference_df['UnixTimeMillis'].astype(str).str[:10].astype(int)

In [8]:
ten_digit_reference_df.shape

(71936, 4)

Creating empty table

In [5]:
total_x_test = pd.DataFrame(index=range(71936), columns=['tripId', 'UnixTimeMillis', 'SvVelocityYEcefMetersPerSecond', 'SvVelocityZEcefMetersPerSecond',
                                                   'SvVelocityXEcefMetersPerSecond', 'RawPseudorangeMeters', 'SvClockBiasMeters', 'IsrbMeters', 'TroposphericDelayMeters',
                                                   'IonosphericDelayMeters', 'WlsPositionXEcefMeters', 'WlsPositionYEcefMeters', 'WlsPositionZEcefMeters','Accel_MeasurementX',
                                                     'Accel_MeasurementY', 'Accel_MeasurementZ','Mag_MeasurementX', 'Mag_MeasurementY','Mag_MeasurementZ',
                                                       'Gyro_MeasurementX', 'Gyro_MeasurementY', 'Gyro_MeasurementZ'])
total_x_test['tripId'] = ten_digit_reference_df['tripId'] 
total_x_test['UnixTimeMillis'] = ten_digit_reference_df['UnixTimeMillis'] 

In [47]:
counter = 0
for main_doc_name in test_doc_list:
    sub_sub_list = list_subdirectories('test'+'/'+main_doc_name)
    for doc_name_list in sub_sub_list:

        #GNSS Data Part:

        #Reading the present gnss csv file
        df_gnss = pd.read_csv('test'+'/'+main_doc_name+'/'+doc_name_list+'/'+'device_gnss.csv', low_memory=False)

        data_gnss = pd.DataFrame()
        #Taking only the necessary columns
        data_gnss['utcTimeMillis'] = df_gnss["utcTimeMillis"]
        data_gnss['SvVelocityYEcefMetersPerSecond']= df_gnss["SvVelocityYEcefMetersPerSecond"]
        data_gnss['SvVelocityZEcefMetersPerSecond'] = df_gnss['SvVelocityZEcefMetersPerSecond']
        data_gnss['SvVelocityXEcefMetersPerSecond'] = df_gnss['SvVelocityXEcefMetersPerSecond']
        data_gnss['RawPseudorangeMeters'] = df_gnss['RawPseudorangeMeters']
        data_gnss['SvClockBiasMeters'] = df_gnss['SvClockBiasMeters']
        data_gnss['IsrbMeters'] = df_gnss['IsrbMeters']
        data_gnss['TroposphericDelayMeters'] = df_gnss['TroposphericDelayMeters']
        data_gnss['IonosphericDelayMeters'] = df_gnss['IonosphericDelayMeters']
        data_gnss['WlsPositionXEcefMeters'] = df_gnss['WlsPositionXEcefMeters']
        data_gnss['WlsPositionYEcefMeters'] = df_gnss['WlsPositionYEcefMeters']
        data_gnss['WlsPositionZEcefMeters'] = df_gnss['WlsPositionZEcefMeters']
                    

        #If more than one measurement is taken in the same second, average the measurements taken for the same time period and create one line of data for each second
        ten_digit_gnss = data_gnss.copy()
        ten_digit_gnss['utcTimeMillis'] = utc_to_unix_millis(ten_digit_gnss['utcTimeMillis'])
        ten_digit_gnss['utcTimeMillis'] = data_gnss['utcTimeMillis'].astype(str).str[:10].astype(int)
        gnss_data = ten_digit_gnss.groupby('utcTimeMillis').mean().reset_index()
        gnss_data.rename(columns={'utcTimeMillis': 'UnixTimeMillis'}, inplace=True)
        gnss_data = gnss_data.fillna(method='ffill')


        #IMU Data Part:
        #Reading the present imu csv file
        df_imu = pd.read_csv('test'+'/'+main_doc_name+'/'+doc_name_list+'/'+'device_imu.csv', low_memory=False)

        #Taking only the necessary columns
        data_imu = pd.DataFrame()
        data_imu['MessageType'] = df_imu["MessageType"]
        data_imu['utcTimeMillis'] = df_imu["utcTimeMillis"]
        data_imu['MeasurementX'] = df_imu["MeasurementX"]
        data_imu['MeasurementY'] =df_imu["MeasurementY"]
        data_imu['MeasurementZ'] =df_imu['MeasurementZ']
        
        
        #Since the accelerometer, gyro and magnetometer are combined in the same sensor in the IMU, it can be seen that the data received from these sensors 
        #are stored in a mixed manner. To fix this and evaluate each sensor data on its own, three sensor data are transferred to three different dataframes
        accel_imu_data = data_imu.loc[data_imu['MessageType'] == 'UncalAccel']
        gyro_imu_data = data_imu.loc[data_imu['MessageType'] == 'UncalGyro']
        
        #For the next step empty DataFrames are created
        ten_digit_accel = accel_imu_data.copy()
        ten_digit_accel['utcTimeMillis'] = utc_to_unix_millis(ten_digit_accel['utcTimeMillis'])

        ten_digit_gyro = gyro_imu_data.copy()
        ten_digit_gyro['utcTimeMillis'] = utc_to_unix_millis(ten_digit_gyro['utcTimeMillis'])

        #Data is currently processed and stored with millisecond precision, removing the last three digits to convert it to seconds precision
        ten_digit_accel['utcTimeMillis'] = accel_imu_data['utcTimeMillis'].astype(str).str[:10].astype(int)
        ten_digit_gyro['utcTimeMillis'] = gyro_imu_data['utcTimeMillis'].astype(str).str[:10].astype(int)
        
        #If more than one measurement is taken in the same second, average the measurements taken for the same time period and create one line of data for each second
        imu_accel_df = ten_digit_accel.groupby('utcTimeMillis').mean().reset_index()
        imu_gyro_df = ten_digit_gyro.groupby('utcTimeMillis').mean().reset_index()

        imu_accel_df.rename(columns={'utcTimeMillis': 'UnixTimeMillis'}, inplace=True)
        imu_gyro_df.rename(columns={'utcTimeMillis': 'UnixTimeMillis'}, inplace=True)

        imu_accel_df = imu_accel_df.fillna(method='ffill')
        imu_gyro_df = imu_gyro_df.fillna(method='ffill')


        #Since measurements for each sensor are recorded with the same column name, column names are customized for each sensor type to avoid confusion when combining data
        imu_accel_df = imu_accel_df.rename(columns={'MeasurementX': 'Accel_MeasurementX'})
        imu_accel_df = imu_accel_df.rename(columns={'MeasurementY': 'Accel_MeasurementY'})
        imu_accel_df = imu_accel_df.rename(columns={'MeasurementZ': 'Accel_MeasurementZ'})
        imu_gyro_df = imu_gyro_df.rename(columns={'MeasurementX': 'Gyro_MeasurementX'})
        imu_gyro_df = imu_gyro_df.rename(columns={'MeasurementY': 'Gyro_MeasurementY'})
        imu_gyro_df = imu_gyro_df.rename(columns={'MeasurementZ': 'Gyro_MeasurementZ'})

        

        #Recombining the processed imu data. Here, the same rows are determined by time, rows with the same time value are combined and the columns of 
        #accel, mag, and gyro measurements are stored separately. 
        total_imu_df = pd.merge(imu_accel_df, imu_gyro_df, on='UnixTimeMillis', how='inner')

        total_imu_df = total_imu_df.fillna(method='ffill')
        
        #Finally, gnss and imu data are paired to take time into account and a single data is created. Thus, the 'X' part of the 
        #data set that will be used later when training the model is created
        x_part = pd.merge(gnss_data, total_imu_df, on='UnixTimeMillis', how='inner')
        
        #Last check for NAN values
        x_part = x_part.fillna(method='ffill')
        
        
        file_name = f'test_data/{counter}.csv'
        file_name = file_name[:-4] + "_1.csv"
        # Save the DataFrame to CSV
        x_part.to_csv(file_name, index=False)
        print("file number : {}".format(counter))
        counter = 1+ counter


file number : 0
file number : 1
file number : 2
file number : 3
file number : 4
file number : 5
file number : 6
file number : 7
file number : 8
file number : 9
file number : 10
file number : 11
file number : 12
file number : 13
file number : 14
file number : 15
file number : 16
file number : 17
file number : 18
file number : 19
file number : 20
file number : 21
file number : 22
file number : 23
file number : 24
file number : 25
file number : 26
file number : 27
file number : 28
file number : 29
file number : 30
file number : 31
file number : 32
file number : 33
file number : 34
file number : 35
file number : 36
file number : 37
file number : 38
file number : 39


In [6]:
# List all files in the directory
file_names = os.listdir('test_data')

# Filter CSV files
doc_list = [file for file in file_names if file.endswith('.csv')]

Predict CNN-GRU

In [18]:
predictions_cnn_gru = pd.DataFrame(columns=['tripId', 'utcTimeMillis', 'LatitudeDegrees', 'LongitudeDegrees'])

for doc_name in doc_list:
    #Reading the csv files
    data = pd.read_csv('test_data'+'/'+ doc_name, low_memory=False)
    
    x_data_to_test_wt_cnn_gru = data.copy() 

    x_data_to_test_wt_cnn_gru = x_data_to_test_wt_cnn_gru.drop(columns=['utcTimeMillis', 'tripId', 'utcTimeSeconds'])
    #print('x_data_to_train_wt', x_data_to_test_wt_cnn_gru)
    x_data_to_test_wt_cnn_gru.rename(columns={'acc_measurementX': 'Accel_MeasurementX'}, inplace=True)
    x_data_to_test_wt_cnn_gru.rename(columns={'acc_measurementY': 'Accel_MeasurementY'}, inplace=True)
    x_data_to_test_wt_cnn_gru.rename(columns={'acc_measurementZ': 'Accel_MeasurementZ'}, inplace=True)
    x_data_to_test_wt_cnn_gru.rename(columns={'gyr_MeasurementX': 'Gyro_MeasurementX'}, inplace=True)
    x_data_to_test_wt_cnn_gru.rename(columns={'gyr_MeasurementY': 'Gyro_MeasurementY'}, inplace=True)
    x_data_to_test_wt_cnn_gru.rename(columns={'gyr_MeasurementZ': 'Gyro_MeasurementZ'}, inplace=True)

    # Predict with the RNN MODEL:
    predicted_cnn_gru = model_cnn_gru.predict(x_data_to_test_wt_cnn_gru)
    #print(predicted_cnn_gru)
    # Create a DataFrame with predicted values and add tripId and utcTimeMillis from original data
    predicted_cnn_gru = pd.DataFrame(predicted_cnn_gru, columns=['LatitudeDegrees', 'LongitudeDegrees'])
    predicted_cnn_gru['tripId'] = data['tripId'].iloc[0]  # Assuming each document has the same tripId
    
    # Concatenate the DataFrame with predictions to predictions_lr DataFrame
    predictions_cnn_gru = pd.concat([predictions_cnn_gru, predicted_cnn_gru], ignore_index=True)



In [20]:
predictions_cnn_gru.rename(columns={'utcTimeMillis': 'UnixTimeMillis'}, inplace=True)
predictions_cnn_gru['UnixTimeMillis'] = samp_sum['UnixTimeMillis']
predictions_cnn_gru['tripId'] = samp_sum['tripId']

In [21]:
predictions_cnn_gru=pd.DataFrame(predictions_cnn_gru)
predictions_cnn_gru=predictions_cnn_gru.sort_values(by='UnixTimeMillis')
predictions_cnn_gru.to_csv('cnn_gru_predictions.csv', index=False)

Predict with RNN

In [43]:
predictions_rnn = pd.DataFrame(columns=['tripId', 'utcTimeMillis', 'LatitudeDegrees', 'LongitudeDegrees'])

for doc_name in doc_list:
    #Reading the csv files
    data = pd.read_csv('test_data'+'/'+ doc_name, low_memory=False)
    
    x_data_to_test_wt_rnn = data.copy() 

    x_data_to_test_wt_rnn = x_data_to_test_wt_rnn.drop(columns=['utcTimeMillis', 'tripId', 'utcTimeSeconds'])
    #print('x_data_to_train_wt', x_data_to_test_wt_rnn)
    x_data_to_test_wt_rnn.rename(columns={'acc_measurementX': 'Accel_MeasurementX'}, inplace=True)
    x_data_to_test_wt_rnn.rename(columns={'acc_measurementY': 'Accel_MeasurementY'}, inplace=True)
    x_data_to_test_wt_rnn.rename(columns={'acc_measurementZ': 'Accel_MeasurementZ'}, inplace=True)
    x_data_to_test_wt_rnn.rename(columns={'gyr_MeasurementX': 'Gyro_MeasurementX'}, inplace=True)
    x_data_to_test_wt_rnn.rename(columns={'gyr_MeasurementY': 'Gyro_MeasurementY'}, inplace=True)
    x_data_to_test_wt_rnn.rename(columns={'gyr_MeasurementZ': 'Gyro_MeasurementZ'}, inplace=True)

    # Predict with the RNN MODEL:
    predicted_rnn = model_rnn.predict(x_data_to_test_wt_rnn)
    print(predicted_rnn)
    # Create a DataFrame with predicted values and add tripId and utcTimeMillis from original data
    predicted_rnn = pd.DataFrame(predicted_rnn, columns=['LatitudeDegrees', 'LongitudeDegrees'])
    predicted_rnn['tripId'] = data['tripId'].iloc[0]  # Assuming each document has the same tripId
    
    # Concatenate the DataFrame with predictions to predictions_lr DataFrame
    predictions_rnn = pd.concat([predictions_rnn, predicted_rnn], ignore_index=True)

[[ 1.2754658  -0.7616583 ]
 [ 1.2789261  -0.73985445]
 [ 1.2832357  -0.7167431 ]
 ...
 [ 1.2800423  -0.7399106 ]
 [ 1.2925808  -0.71307355]
 [ 1.273687   -0.7353715 ]]
[[ 1.3375036  -0.7982865 ]
 [ 1.3367231  -0.7946862 ]
 [ 1.3197876  -0.83250505]
 ...
 [ 1.3209842  -0.82180184]
 [ 1.3243902  -0.8241152 ]
 [ 1.3028429  -0.81178606]]
[[ 1.3604819  -0.75259525]
 [ 1.3536344  -0.77845526]
 [ 1.3715229  -0.74586505]
 ...
 [ 1.3552103  -0.78050876]
 [ 1.3541467  -0.7806422 ]
 [ 1.3549595  -0.7779005 ]]
[[ 1.3146343  -0.7648975 ]
 [ 1.329616   -0.75186026]
 [ 1.3145576  -0.7607625 ]
 ...
 [ 0.8359213  -0.55747026]
 [ 0.86065906 -0.57752186]
 [ 0.8389469  -0.57362086]]
[[ 1.3482311  -0.77002424]
 [ 1.342923   -0.7819521 ]
 [ 1.3422307  -0.7784969 ]
 ...
 [ 1.3378108  -0.77931225]
 [ 1.3374561  -0.7770502 ]
 [ 1.3537819  -0.7832607 ]]
[[ 1.3718989  -0.7682093 ]
 [ 1.3766505  -0.765824  ]
 [ 1.3842522  -0.76147175]
 ...
 [ 1.3654577  -0.79034084]
 [ 1.3632629  -0.77121484]
 [ 1.3638178  -0.770

In [44]:
predictions_rnn.rename(columns={'utcTimeMillis': 'UnixTimeMillis'}, inplace=True)
predictions_rnn['UnixTimeMillis'] = samp_sum['UnixTimeMillis']
predictions_rnn['tripId'] = samp_sum['tripId']

In [45]:
predictions_rnn=pd.DataFrame(predictions_rnn)
predictions_rnn=predictions_rnn.sort_values(by='UnixTimeMillis')
predictions_rnn.to_csv('rnn_predictions.csv', index=False)

Predict with LR

In [73]:
predictions_lr = pd.DataFrame(columns=['tripId', 'utcTimeMillis', 'LatitudeDegrees', 'LongitudeDegrees'])

for doc_name in doc_list:
    # Reading the csv files
    data = pd.read_csv('test_data'+'/'+ doc_name, low_memory=False)
    
    x_data_to_test_lr = data.copy() 

    x_data_to_test_wt_lr = x_data_to_test_lr.drop(columns=['utcTimeMillis', 'tripId', 'utcTimeSeconds'])
    rename_dict = {'acc_measurementX': 'Accel_MeasurementX',
                    'acc_measurementY': 'Accel_MeasurementY',
                    'acc_measurementZ': 'Accel_MeasurementZ',
                    'gyr_MeasurementX': 'Gyro_MeasurementX',
                    'gyr_MeasurementY': 'Gyro_MeasurementY',
                    'gyr_MeasurementZ': 'Gyro_MeasurementZ'}
    x_data_to_test_wt_lr.rename(columns=rename_dict, inplace=True)

    # Predict with the RNN MODEL:
    predicted_lr = model_lr.predict(x_data_to_test_wt_lr)
    
    # Create a DataFrame with predicted values and add tripId and utcTimeMillis from original data
    predicted_df = pd.DataFrame(predicted_lr, columns=['LatitudeDegrees', 'LongitudeDegrees'])
    predicted_df['tripId'] = data['tripId'].iloc[0]  # Assuming each document has the same tripId
    
    # Concatenate the DataFrame with predictions to predictions_lr DataFrame
    predictions_lr = pd.concat([predictions_lr, predicted_df], ignore_index=True)


Feature names unseen at fit time:
- acc_MeasurementX
- acc_MeasurementY
- acc_MeasurementZ
Feature names seen at fit time, yet now missing:
- Accel_MeasurementX
- Accel_MeasurementY
- Accel_MeasurementZ

Feature names unseen at fit time:
- acc_MeasurementX
- acc_MeasurementY
- acc_MeasurementZ
Feature names seen at fit time, yet now missing:
- Accel_MeasurementX
- Accel_MeasurementY
- Accel_MeasurementZ

Feature names unseen at fit time:
- acc_MeasurementX
- acc_MeasurementY
- acc_MeasurementZ
Feature names seen at fit time, yet now missing:
- Accel_MeasurementX
- Accel_MeasurementY
- Accel_MeasurementZ

Feature names unseen at fit time:
- acc_MeasurementX
- acc_MeasurementY
- acc_MeasurementZ
Feature names seen at fit time, yet now missing:
- Accel_MeasurementX
- Accel_MeasurementY
- Accel_MeasurementZ

Feature names unseen at fit time:
- acc_MeasurementX
- acc_MeasurementY
- acc_MeasurementZ
Feature names seen at fit time, yet now missing:
- Accel_MeasurementX
- Accel_MeasurementY
- 

In [81]:
predictions_lr.rename(columns={'utcTimeMillis': 'UnixTimeMillis'}, inplace=True)
predictions_lr['UnixTimeMillis'] = samp_sum['UnixTimeMillis']
predictions_lr['tripId'] = samp_sum['tripId']

In [83]:
predictions_lr=pd.DataFrame(predictions_lr)
predictions_lr=predictions_lr.sort_values(by='UnixTimeMillis')
predictions_lr.to_csv('lr_predictions.csv', index=False)

Predict with Random Forest

In [21]:
predictions_rf = pd.DataFrame(columns=['tripId', 'utcTimeMillis', 'LatitudeDegrees', 'LongitudeDegrees'])

for doc_name in doc_list:
    # Reading the csv files
    data = pd.read_csv('test_data'+'/'+ doc_name, low_memory=False)
    
    x_data_to_test_rf = data.copy() 

    x_data_to_test_wt_rf = x_data_to_test_rf.drop(columns=['utcTimeMillis', 'tripId', 'utcTimeSeconds'])
    rename_dict = {'acc_measurementX': 'Accel_MeasurementX',
                    'acc_measurementY': 'Accel_MeasurementY',
                    'acc_measurementZ': 'Accel_MeasurementZ',
                    'gyr_MeasurementX': 'Gyro_MeasurementX',
                    'gyr_MeasurementY': 'Gyro_MeasurementY',
                    'gyr_MeasurementZ': 'Gyro_MeasurementZ'}
    x_data_to_test_wt_rf.rename(columns=rename_dict, inplace=True)

    # Predict with the RNN MODEL:
    predicted_rf = model_rf.predict(x_data_to_test_wt_rf)
    
    # Create a DataFrame with predicted values and add tripId and utcTimeMillis from original data
    predicted_rf = pd.DataFrame(predicted_rf, columns=['LatitudeDegrees', 'LongitudeDegrees'])
    predicted_rf['tripId'] = data['tripId'].iloc[0]  # Assuming each document has the same tripId
    
    # Concatenate the DataFrame with predictions to predictions_lr DataFrame
    predictions_rf = pd.concat([predictions_rf, predicted_rf], ignore_index=True)


Feature names unseen at fit time:
- acc_MeasurementX
- acc_MeasurementY
- acc_MeasurementZ
Feature names seen at fit time, yet now missing:
- Accel_MeasurementX
- Accel_MeasurementY
- Accel_MeasurementZ

Feature names unseen at fit time:
- acc_MeasurementX
- acc_MeasurementY
- acc_MeasurementZ
Feature names seen at fit time, yet now missing:
- Accel_MeasurementX
- Accel_MeasurementY
- Accel_MeasurementZ

Feature names unseen at fit time:
- acc_MeasurementX
- acc_MeasurementY
- acc_MeasurementZ
Feature names seen at fit time, yet now missing:
- Accel_MeasurementX
- Accel_MeasurementY
- Accel_MeasurementZ

Feature names unseen at fit time:
- acc_MeasurementX
- acc_MeasurementY
- acc_MeasurementZ
Feature names seen at fit time, yet now missing:
- Accel_MeasurementX
- Accel_MeasurementY
- Accel_MeasurementZ

Feature names unseen at fit time:
- acc_MeasurementX
- acc_MeasurementY
- acc_MeasurementZ
Feature names seen at fit time, yet now missing:
- Accel_MeasurementX
- Accel_MeasurementY
- 

In [30]:
predictions_rf.rename(columns={'utcTimeMillis': 'UnixTimeMillis'}, inplace=True)
predictions_rf['UnixTimeMillis'] = samp_sum['UnixTimeMillis']
predictions_rf['tripId'] = samp_sum['tripId']

In [31]:
predictions_rf=pd.DataFrame(predictions_rf)
predictions_rf=predictions_rf.sort_values(by='UnixTimeMillis')
predictions_rf.to_csv('rf_predictions.csv', index=False)

Predict with SVM

In [25]:
predictions_svm = pd.DataFrame(columns=['tripId', 'utcTimeMillis', 'LatitudeDegrees', 'LongitudeDegrees'])

for doc_name in doc_list:
    # Reading the csv files
    data = pd.read_csv('test_data'+'/'+ doc_name, low_memory=False)
    
    x_data_to_test_svm = data.copy() 

    x_data_to_test_wt_svm = x_data_to_test_svm.drop(columns=['utcTimeMillis', 'tripId', 'utcTimeSeconds'])
    rename_dict = {'acc_measurementX': 'Accel_MeasurementX',
                    'acc_measurementY': 'Accel_MeasurementY',
                    'acc_measurementZ': 'Accel_MeasurementZ',
                    'gyr_MeasurementX': 'Gyro_MeasurementX',
                    'gyr_MeasurementY': 'Gyro_MeasurementY',
                    'gyr_MeasurementZ': 'Gyro_MeasurementZ'}
    x_data_to_test_wt_svm.rename(columns=rename_dict, inplace=True)

    # Predict with the RNN MODEL:
    predicted_svm = model_svm.predict(x_data_to_test_wt_svm)
    
    # Create a DataFrame with predicted values and add tripId and utcTimeMillis from original data
    predicted_svm = pd.DataFrame(predicted_svm, columns=['LatitudeDegrees', 'LongitudeDegrees'])
    predicted_svm['tripId'] = data['tripId'].iloc[0]  # Assuming each document has the same tripId
    
    # Concatenate the DataFrame with predictions to predictions_lr DataFrame
    predictions_svm = pd.concat([predictions_svm, predicted_svm], ignore_index=True)


Feature names unseen at fit time:
- acc_MeasurementX
- acc_MeasurementY
- acc_MeasurementZ
Feature names seen at fit time, yet now missing:
- Accel_MeasurementX
- Accel_MeasurementY
- Accel_MeasurementZ

Feature names unseen at fit time:
- acc_MeasurementX
- acc_MeasurementY
- acc_MeasurementZ
Feature names seen at fit time, yet now missing:
- Accel_MeasurementX
- Accel_MeasurementY
- Accel_MeasurementZ

Feature names unseen at fit time:
- acc_MeasurementX
- acc_MeasurementY
- acc_MeasurementZ
Feature names seen at fit time, yet now missing:
- Accel_MeasurementX
- Accel_MeasurementY
- Accel_MeasurementZ

Feature names unseen at fit time:
- acc_MeasurementX
- acc_MeasurementY
- acc_MeasurementZ
Feature names seen at fit time, yet now missing:
- Accel_MeasurementX
- Accel_MeasurementY
- Accel_MeasurementZ

Feature names unseen at fit time:
- acc_MeasurementX
- acc_MeasurementY
- acc_MeasurementZ
Feature names seen at fit time, yet now missing:
- Accel_MeasurementX
- Accel_MeasurementY
- 

In [28]:
predictions_svm.rename(columns={'utcTimeMillis': 'UnixTimeMillis'}, inplace=True)
predictions_svm['UnixTimeMillis'] = samp_sum['UnixTimeMillis']
predictions_svm['tripId'] = samp_sum['tripId']

In [29]:
predictions_svm=pd.DataFrame(predictions_svm)
predictions_svm=predictions_svm.sort_values(by='UnixTimeMillis')
predictions_svm.to_csv('svm_predictions.csv', index=False)

Prdict GBM

In [52]:
predictions_gbm = pd.DataFrame(columns=['tripId', 'utcTimeMillis', 'LatitudeDegrees', 'LongitudeDegrees'])

for doc_name in doc_list:
    # Reading the csv files
    data = pd.read_csv('test_data'+'/'+ doc_name, low_memory=False)
    
    x_data_to_test_gbm = data.copy() 

    x_data_to_test_wt_gbm = x_data_to_test_gbm.drop(columns=['utcTimeMillis', 'tripId', 'utcTimeSeconds'])
    rename_dict = {'acc_measurementX': 'Accel_MeasurementX',
                    'acc_measurementY': 'Accel_MeasurementY',
                    'acc_measurementZ': 'Accel_MeasurementZ',
                    'gyr_MeasurementX': 'Gyro_MeasurementX',
                    'gyr_MeasurementY': 'Gyro_MeasurementY',
                    'gyr_MeasurementZ': 'Gyro_MeasurementZ'}
    x_data_to_test_wt_gbm.rename(columns=rename_dict, inplace=True)

    # Predict with the RNN MODEL:
    predicted_gbm= model_gbm.predict(x_data_to_test_wt_gbm)
    
    # Create a DataFrame with predicted values and add tripId and utcTimeMillis from original data
    predicted_gbm= pd.DataFrame(predicted_gbm, columns=['LatitudeDegrees', 'LongitudeDegrees'])
    predicted_gbm['tripId'] = data['tripId'].iloc[0]  # Assuming each document has the same tripId
    
    # Concatenate the DataFrame with predictions to predictions_lr DataFrame
    predictions_gbm = pd.concat([predictions_gbm, predicted_gbm], ignore_index=True)


Feature names unseen at fit time:
- acc_MeasurementX
- acc_MeasurementY
- acc_MeasurementZ
Feature names seen at fit time, yet now missing:
- Accel_MeasurementX
- Accel_MeasurementY
- Accel_MeasurementZ

Feature names unseen at fit time:
- acc_MeasurementX
- acc_MeasurementY
- acc_MeasurementZ
Feature names seen at fit time, yet now missing:
- Accel_MeasurementX
- Accel_MeasurementY
- Accel_MeasurementZ

Feature names unseen at fit time:
- acc_MeasurementX
- acc_MeasurementY
- acc_MeasurementZ
Feature names seen at fit time, yet now missing:
- Accel_MeasurementX
- Accel_MeasurementY
- Accel_MeasurementZ

Feature names unseen at fit time:
- acc_MeasurementX
- acc_MeasurementY
- acc_MeasurementZ
Feature names seen at fit time, yet now missing:
- Accel_MeasurementX
- Accel_MeasurementY
- Accel_MeasurementZ

Feature names unseen at fit time:
- acc_MeasurementX
- acc_MeasurementY
- acc_MeasurementZ
Feature names seen at fit time, yet now missing:
- Accel_MeasurementX
- Accel_MeasurementY
- 

In [53]:
predictions_gbm.rename(columns={'utcTimeMillis': 'UnixTimeMillis'}, inplace=True)
predictions_gbm['UnixTimeMillis'] = samp_sum['UnixTimeMillis']
predictions_gbm['tripId'] = samp_sum['tripId']

In [54]:
predictions_gbm=pd.DataFrame(predictions_gbm)
predictions_gbm=predictions_gbm.sort_values(by='UnixTimeMillis')
predictions_gbm.to_csv('gbm_predictions.csv', index=False)

Predict with KNN

In [59]:
predictions_knn = pd.DataFrame(columns=['tripId', 'utcTimeMillis', 'LatitudeDegrees', 'LongitudeDegrees'])

for doc_name in doc_list:
    # Reading the csv files
    data = pd.read_csv('test_data'+'/'+ doc_name, low_memory=False)
    
    x_data_to_test_knn = data.copy() 

    x_data_to_test_wt_knn = x_data_to_test_knn.drop(columns=['utcTimeMillis', 'tripId', 'utcTimeSeconds'])
    rename_dict = {'acc_measurementX': 'Accel_MeasurementX',
                    'acc_measurementY': 'Accel_MeasurementY',
                    'acc_measurementZ': 'Accel_MeasurementZ',
                    'gyr_MeasurementX': 'Gyro_MeasurementX',
                    'gyr_MeasurementY': 'Gyro_MeasurementY',
                    'gyr_MeasurementZ': 'Gyro_MeasurementZ'}
    x_data_to_test_wt_knn.rename(columns=rename_dict, inplace=True)

    # Predict with the RNN MODEL:
    predicted_knn= model_knn.predict(x_data_to_test_wt_knn)
    
    # Create a DataFrame with predicted values and add tripId and utcTimeMillis from original data
    predicted_knn= pd.DataFrame(predicted_knn, columns=['LatitudeDegrees', 'LongitudeDegrees'])
    predicted_knn['tripId'] = data['tripId'].iloc[0]  # Assuming each document has the same tripId
    
    # Concatenate the DataFrame with predictions to predictions_lr DataFrame
    predictions_knn = pd.concat([predictions_knn, predicted_knn], ignore_index=True)


Feature names unseen at fit time:
- acc_MeasurementX
- acc_MeasurementY
- acc_MeasurementZ
Feature names seen at fit time, yet now missing:
- Accel_MeasurementX
- Accel_MeasurementY
- Accel_MeasurementZ

Feature names unseen at fit time:
- acc_MeasurementX
- acc_MeasurementY
- acc_MeasurementZ
Feature names seen at fit time, yet now missing:
- Accel_MeasurementX
- Accel_MeasurementY
- Accel_MeasurementZ

Feature names unseen at fit time:
- acc_MeasurementX
- acc_MeasurementY
- acc_MeasurementZ
Feature names seen at fit time, yet now missing:
- Accel_MeasurementX
- Accel_MeasurementY
- Accel_MeasurementZ

Feature names unseen at fit time:
- acc_MeasurementX
- acc_MeasurementY
- acc_MeasurementZ
Feature names seen at fit time, yet now missing:
- Accel_MeasurementX
- Accel_MeasurementY
- Accel_MeasurementZ

Feature names unseen at fit time:
- acc_MeasurementX
- acc_MeasurementY
- acc_MeasurementZ
Feature names seen at fit time, yet now missing:
- Accel_MeasurementX
- Accel_MeasurementY
- 

In [60]:
predictions_knn.rename(columns={'utcTimeMillis': 'UnixTimeMillis'}, inplace=True)
predictions_knn['UnixTimeMillis'] = samp_sum['UnixTimeMillis']
predictions_knn['tripId'] = samp_sum['tripId']

In [61]:
predictions_knn=pd.DataFrame(predictions_knn)
predictions_knn=predictions_knn.sort_values(by='UnixTimeMillis')
predictions_knn.to_csv('knn_predictions.csv', index=False)

Reference values, x,y,z to latitude and longitude

In [7]:
# WGS84 ellipsoid constants
a = 6378137.0  # semi-major axis in meters
b = 6356752.314245  # semi-minor axis in meters
e2 = 1 - (b**2 / a**2)  # first eccentricity squared
e2_prime = (a**2 - b**2) / b**2  # second eccentricity squared

def ecef_to_geodetic(x, y, z):
    p = np.sqrt(x**2 + y**2)
    theta = np.arctan2(z * a, p * b)
    
    # Calculate latitude
    lat = np.arctan2(z + e2_prime * b * np.sin(theta)**3, p - e2 * a * np.cos(theta)**3)
    
    # Calculate longitude
    lon = np.arctan2(y, x)
    
    # Convert from radians to degrees
    lat = np.degrees(lat)
    lon = np.degrees(lon)
    
    return lat, lon

In [8]:
reference = pd.DataFrame(columns=['tripId', 'utcTimeMillis', 'LatitudeDegrees', 'LongitudeDegrees'])
for doc_name in doc_list:
    # Reading the csv files
    data = pd.read_csv('test_data'+'/'+ doc_name, low_memory=False)
    
    X = data.copy() 
    X_x = X['WlsPositionXEcefMeters']
    X_y = X['WlsPositionYEcefMeters']
    X_z = X['WlsPositionZEcefMeters']
    latitude, longitude = ecef_to_geodetic(X_x, X_y, X_z)
    data = {
    'LatitudeDegrees': latitude,
    'LongitudeDegrees': longitude
    }
    calculated_lat_long = pd.DataFrame(data)
    # Create a DataFrame with predicted values and add tripId and utcTimeMillis from original data
    calculated_lat_long= pd.DataFrame(calculated_lat_long, columns=['LatitudeDegrees', 'LongitudeDegrees'])
    # Concatenate the DataFrame with predictions to predictions_lr DataFrame
    reference = pd.concat([reference, calculated_lat_long], ignore_index=True)

In [9]:
reference.rename(columns={'utcTimeMillis': 'UnixTimeMillis'}, inplace=True)
reference['UnixTimeMillis'] = samp_sum['UnixTimeMillis']
reference['tripId'] = samp_sum['tripId']

In [10]:
reference=pd.DataFrame(reference)
reference=reference.sort_values(by='UnixTimeMillis')
reference.to_csv('reference.csv', index=False)