<h1> Import Libraries </h1>

In [1]:
import pandas as pd
import os

<h1> Initialize functions and objects </h1>

In [2]:
# Per-trip summary: every sensor channel is collapsed to its mean, duration to its maximum.
class trip:
    """Summary of a single booking built from its per-reading sample lists.

    Each sensor argument is a non-empty sequence of numbers; the instance stores
    the arithmetic mean of that sequence under the same name. ``second`` is
    stored as the largest value in its sequence rather than as a mean.
    """

    def __init__(self,bookingID,Accuracy,Bearing,acceleration_x,acceleration_y,acceleration_z,gyro_x,gyro_y,gyro_z,second,Speed):
        def average(samples):
            # Plain arithmetic mean; an empty sequence raises ZeroDivisionError.
            return sum(samples)/len(samples)

        self.bookingID = bookingID
        # Averaged in the same order as the constructor arguments.
        channels_before_second = (
            ("Accuracy", Accuracy),
            ("Bearing", Bearing),
            ("acceleration_x", acceleration_x),
            ("acceleration_y", acceleration_y),
            ("acceleration_z", acceleration_z),
            ("gyro_x", gyro_x),
            ("gyro_y", gyro_y),
            ("gyro_z", gyro_z),
        )
        for attribute, samples in channels_before_second:
            setattr(self, attribute, average(samples))
        self.second = max(second)
        self.Speed = average(Speed)

# Euclidean distance between two trips
def euc_dist(trip1,trip2):
    """Return the Euclidean distance between two trip summaries.

    Only the three acceleration means, the three gyro means and the mean Speed
    take part; Accuracy, Bearing and second are not compared.
    """
    compared_fields = (
        "acceleration_x",
        "acceleration_y",
        "acceleration_z",
        "gyro_x",
        "gyro_y",
        "gyro_z",
        "Speed",
    )
    squared_total = 0
    for field in compared_fields:
        gap = getattr(trip1, field) - getattr(trip2, field)
        squared_total = squared_total + gap**2
    return squared_total ** 0.5
   
# Similarity score derived from distance: 1 for identical trips, approaching 0 as they diverge
def find_similarity(trip1,trip2):
    """Map the distance between two trips to a score of 1/(1 + distance)."""
    distance = euc_dist(trip1,trip2)
    return 1/(1+distance)

#function to retrieve highest similarity value
def find_most_similar(trip,trained_data):
    """Find the entry of ``trained_data`` that is most similar to ``trip``.

    Parameters:
        trip: the trip summary to compare.
        trained_data: mapping of bookingID -> trip summary to search.

    Returns:
        dict with two keys:
          "highest_similarity": the best score found (0 when nothing matched).
          "most_similar": the bookingID holding that score, or None when
              ``trained_data`` is empty or no score exceeded 0.

    The "no match" marker is None rather than 0 because 0 can be a real
    bookingID, which made a miss indistinguishable from a match on booking 0.
    """
    highest_similarity = 0
    most_similar = None
    for bookingID, trained_trip in trained_data.items():
        similarity = find_similarity(trip,trained_trip)
        # Strict comparison: on a tie the first booking encountered is kept.
        if similarity > highest_similarity:
            highest_similarity = similarity
            most_similar = bookingID
    return {"highest_similarity": highest_similarity,
           "most_similar": most_similar}

<h1> Create dataframe of training data </h1>

In [3]:
# Load the labelled bookings and keep only the rows whose label is 1.
training_df = (
    pd.read_csv("Training File/part-00000-e9445087-aa0a-433b-a7f6-7f4c19d78ad6-c000.csv")
    .loc[lambda labels: labels["label"] == 1]
)

In [4]:
# Preview the first ten rows of the label-filtered training table.
training_df.head(10)

Unnamed: 0,bookingID,label
1,335007449205,1
9,274877906944,1
13,1357209665706,1
19,841813590179,1
20,1700807049377,1
29,678604832930,1
31,1348619730972,1
35,506806141023,1
38,420906795145,1
39,1271310319684,1


<h1> Compile all data </h1>

In [5]:
# Read every CSV part in "Raw Data" and stack them into one master dataframe.
# The parts are collected in a list and concatenated once: DataFrame.append was
# removed in pandas 2.0, and appending inside the loop re-copied the growing
# frame on every iteration.
raw_data_dir = "Raw Data"
raw_frames = []

# sorted() makes the load order deterministic; os.listdir returns entries in arbitrary order.
for filename in sorted(os.listdir(raw_data_dir)):
    if filename.endswith(".csv"):
        print("Adding " + filename)
        raw_frames.append(pd.read_csv(os.path.join(raw_data_dir, filename)))

if not raw_frames:
    # Fail here with a clear message instead of a NameError on master_df in a later cell.
    raise FileNotFoundError("No .csv files found in " + raw_data_dir)

# Row indices of the individual parts are kept as-is, matching what append produced.
master_df = pd.concat(raw_frames)
print("Files all added into master dataframe")

Adding part-00000-e6120af0-10c2-4248-97c4-81baf4304e5c-c000.csv
Adding part-00001-e6120af0-10c2-4248-97c4-81baf4304e5c-c000.csv
Adding part-00002-e6120af0-10c2-4248-97c4-81baf4304e5c-c000.csv
Adding part-00003-e6120af0-10c2-4248-97c4-81baf4304e5c-c000.csv
Adding part-00004-e6120af0-10c2-4248-97c4-81baf4304e5c-c000.csv
Adding part-00005-e6120af0-10c2-4248-97c4-81baf4304e5c-c000.csv
Adding part-00006-e6120af0-10c2-4248-97c4-81baf4304e5c-c000.csv
Adding part-00007-e6120af0-10c2-4248-97c4-81baf4304e5c-c000.csv
Adding part-00008-e6120af0-10c2-4248-97c4-81baf4304e5c-c000.csv
Adding part-00009-e6120af0-10c2-4248-97c4-81baf4304e5c-c000.csv
Files all added into master dataframe


In [6]:
# Order readings by booking, then by elapsed second, and renumber the rows from 0.
master_df = (
    master_df
    .sort_values(by=["bookingID", "second"], ascending=[True, True])
    .reset_index(drop=True)
)

In [7]:
# Preview the first ten rows of the sorted master dataframe.
master_df.head(10)

Unnamed: 0,bookingID,Accuracy,Bearing,acceleration_x,acceleration_y,acceleration_z,gyro_x,gyro_y,gyro_z,second,Speed
0,0,12.0,143.298294,0.818112,-9.941461,-2.014999,-0.016245,-0.09404,0.070732,0.0,3.442991
1,0,8.0,143.298294,0.546405,-9.83559,-2.038925,-0.047092,-0.078874,0.043187,1.0,0.228454
2,0,8.0,143.298294,-1.706207,-9.270792,-1.209448,-0.028965,-0.032652,0.01539,2.0,0.228454
3,0,8.0,143.298294,-1.416705,-9.548032,-1.860977,-0.022413,0.005049,-0.025753,3.0,0.228454
4,0,8.0,143.298294,-0.598145,-9.853534,-1.378574,-0.014297,-0.046206,0.021902,4.0,0.228454
5,0,8.0,143.298294,-0.608313,-9.539658,-1.794583,-0.007538,-0.023838,0.018068,5.0,0.228454
6,0,8.0,143.298294,-0.867758,-9.698615,-1.615439,0.022728,-0.012178,0.005982,6.0,0.228454
7,0,8.0,143.298294,-1.05079,-9.74527,-1.411771,0.027603,0.001841,0.000904,7.0,0.228454
8,0,8.0,143.298294,-0.721213,-9.960004,-1.202271,0.001864,-0.007702,0.014018,8.0,0.228454
9,0,8.0,143.298294,-0.346924,-9.532629,-1.204663,0.014962,-0.050033,0.025118,9.0,0.228454


<h1> Get properties of dangerous vehicles in training data</h1>

In [8]:
# Keep only the sensor rows whose bookingID appears in the label-1 training table.
dangerous_df = master_df.loc[
    master_df["bookingID"].isin(training_df["bookingID"].tolist())
]

In [9]:
# Preview the first ten sensor rows that belong to dangerous bookings.
dangerous_df.head(10)

Unnamed: 0,bookingID,Accuracy,Bearing,acceleration_x,acceleration_y,acceleration_z,gyro_x,gyro_y,gyro_z,second,Speed
1004,1,4.0,252.0,-1.394632,10.174141,-2.50377,0.016127,0.212284,-0.039776,0.0,4.756427
1005,1,4.0,236.0,-0.840489,8.749026,-3.274607,0.037472,0.134862,-0.011083,1.0,4.937211
1006,1,4.0,228.0,-1.944456,8.98903,-2.978358,-0.024487,0.102224,-0.01487,2.0,5.54421
1007,1,4.0,222.0,-0.820675,9.282243,-2.126961,0.007522,0.089431,0.006004,3.0,6.56403
1008,1,4.0,216.0,-1.020284,9.382215,-2.618797,-0.019007,0.002409,-0.014382,4.0,7.358976
1009,1,4.0,214.0,-0.745468,9.504243,-1.587384,0.006283,0.055763,-0.00672,5.0,8.297936
1010,1,4.0,212.0,-0.485889,9.466089,-1.315164,0.001012,0.000541,0.010455,6.0,8.477888
1011,1,4.0,212.0,-1.118102,9.704149,-1.796915,-0.012619,-0.011938,-0.029374,7.0,7.977206
1012,1,4.0,213.0,-1.050183,9.387703,-1.199332,-0.035727,-0.013491,-0.022881,8.0,7.732447
1013,1,4.0,211.0,-1.055,9.894306,-1.811855,-0.003787,0.002496,-0.024627,9.0,7.140307


In [10]:
# One group of sensor rows per dangerous bookingID.
grouped_dangerous_df = dangerous_df.groupby(by="bookingID")

<h1> Create Dictionary of training data </h1>

In [11]:
# Lookup table: bookingID -> trip summary, for the dangerous bookings.
# Column names are listed in the positional order the trip constructor expects.
sensor_columns = (
    "Accuracy",
    "Bearing",
    "acceleration_x",
    "acceleration_y",
    "acceleration_z",
    "gyro_x",
    "gyro_y",
    "gyro_z",
    "second",
    "Speed",
)
trained_data = {}
for bookingID, properties in grouped_dangerous_df:
    # One plain Python list of readings per column, passed straight to the constructor.
    readings = [properties[column].tolist() for column in sensor_columns]
    new_trip = trip(bookingID, *readings)
    trained_data[bookingID] = new_trip

<h1> Group Master Dataframe by bookingID </h1>

In [12]:
# One group of sensor rows per bookingID across the whole master dataframe.
grouped_master_df = master_df.groupby(by="bookingID")

In [13]:
# Lookup table: bookingID -> trip summary, for every booking in the master dataframe.
# Column names are listed in the positional order the trip constructor expects.
sensor_columns = (
    "Accuracy",
    "Bearing",
    "acceleration_x",
    "acceleration_y",
    "acceleration_z",
    "gyro_x",
    "gyro_y",
    "gyro_z",
    "second",
    "Speed",
)
raw_data = {}
for bookingID, properties in grouped_master_df:
    # One plain Python list of readings per column, passed straight to the constructor.
    readings = [properties[column].tolist() for column in sensor_columns]
    new_trip = trip(bookingID, *readings)
    raw_data[bookingID] = new_trip

<h1> Iterate through Raw Data and check if there are trips in training data that are similar </h1>

In [14]:
#threshold of 0.90 is set arbitrally
%%time
threshold = 0.90
dangerous_trips = []
for bookingID, trip in raw_data.items():
    similarity_dict = find_most_similar(trip,trained_data)
    if similarity_dict["highest_similarity"] >= threshold:
        dangerous_trips.append(bookingID)

Wall time: 3min 30s


<h1> Final Output </h1>

In [15]:
# One row per booking (its first reading), with every sensor column removed.
final_df = (
    master_df
    .drop_duplicates(subset="bookingID", keep="first")
    .drop(
        columns=[
            "Accuracy",
            "Bearing",
            "acceleration_x",
            "acceleration_y",
            "acceleration_z",
            "gyro_x",
            "gyro_y",
            "gyro_z",
            "second",
            "Speed",
        ]
    )
)

In [16]:
def set_label(row,dangerous_trips):
    """Return 1 when ``row.bookingID`` is contained in ``dangerous_trips``, else 0."""
    return 1 if row.bookingID in dangerous_trips else 0

In [17]:
# Label each booking 1 when its bookingID was flagged in dangerous_trips, otherwise 0.
# Vectorised membership test: avoids calling a Python function for every row and
# scanning the flagged list once per booking.
final_df["label"] = final_df["bookingID"].isin(dangerous_trips).astype(int)
final_df.to_csv("Dangerous Trips.csv",index=False)

In [18]:
# NOTE(review): writes final_df to "Dangerous Trips.csv" again; the previous cell
# already ends with this same export, so this cell looks redundant.
final_df.to_csv("Dangerous Trips.csv",index=False)