In [1]:
# import packages
import pandas as pd
import os
import sys
import mysql.connector
import numpy as np

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score 
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestRegressor

import joblib
from joblib import load, dump
import pickle
import json

# Import Data and Investigate

In [2]:
# Load the raw vehicles export: a text file whose fields are separated by ';'
read_vehicles = pd.read_csv(
    '/Users/danhowes1/Desktop/UCD/Research_Project/tmp/data/rt_vehicles_DB_2018.txt',
    sep=';',
)


In [3]:
# Persist the raw data as a comma-separated CSV.
# index=False is the documented way to omit the row index from the file
# (the previous index=None only worked because None is falsy).
read_vehicles.to_csv(
    '/Users/danhowes1/Desktop/UCD/Research_Project/rt_vehicles_DB_2018.csv',
    index=False,
)

In [4]:
# Read the CSV written above back in as the working DataFrame
df_vehicles = pd.read_csv(
    filepath_or_buffer='/Users/danhowes1/Desktop/UCD/Research_Project/rt_vehicles_DB_2018.csv'
)

In [5]:
# Check how many rows and columns this dataframe has.
# .shape is a (number_of_rows, number_of_columns) tuple.
df_vehicles.shape

(272622, 7)

In [6]:
# Preview the first ten records to eyeball the column contents
df_vehicles.head(n=10)

Unnamed: 0,DATASOURCE,DAYOFSERVICE,VEHICLEID,DISTANCE,MINUTES,LASTUPDATE,NOTE
0,DB,23-NOV-18 00:00:00,3303848,286166,58849,04-DEC-18 08:03:09,
1,DB,23-NOV-18 00:00:00,3303847,259545,56828,04-DEC-18 08:03:09,
2,DB,28-FEB-18 00:00:00,2868329,103096,40967,08-MAR-18 10:35:59,
3,DB,28-FEB-18 00:00:00,2868330,147277,43599,08-MAR-18 10:35:59,
4,DB,28-FEB-18 00:00:00,2868331,224682,40447,08-MAR-18 10:35:59,
5,DB,28-FEB-18 00:00:00,2868332,19499,6289,08-MAR-18 10:35:59,
6,DB,28-FEB-18 00:00:00,2868333,133014,43647,08-MAR-18 10:35:59,
7,DB,28-FEB-18 00:00:00,2868334,168964,47167,08-MAR-18 10:35:59,
8,DB,28-FEB-18 00:00:00,2868335,360842,55477,08-MAR-18 10:35:59,
9,DB,28-FEB-18 00:00:00,2868336,139959,39599,08-MAR-18 10:35:59,


In [7]:
# Inspect the dtype pandas inferred for each column.
# NOTE(review): DAYOFSERVICE and LASTUPDATE come back as object (strings), not
# datetimes - parse them with pd.to_datetime if date arithmetic is needed later.
df_vehicles.dtypes

DATASOURCE       object
DAYOFSERVICE     object
VEHICLEID         int64
DISTANCE          int64
MINUTES           int64
LASTUPDATE       object
NOTE            float64
dtype: object

<b> - Look for duplicate rows and columns. Consider whether it makes sense to keep them or drop them. </b>

In [8]:
# Count rows that repeat an earlier row; the first occurrence of each
# duplicated row is not included in this count.
print('Number of duplicate (excluding first) rows in the table is: ', df_vehicles.duplicated(keep='first').sum())

# Count every row that belongs to a duplicate group.
# keep=False flags the first occurrence as well as its repeats, so summing
# the boolean mask gives the total number of rows involved.
print('Number of duplicate rows (including first) in the table is:', df_vehicles.duplicated(keep=False).sum())


Number of duplicate (excluding first) rows in the table is:  0
Number of duplicate rows (including first) in the table is: 0


In [9]:
# Count the missing (NaN) values in each column
df_vehicles.isna().sum()

DATASOURCE           0
DAYOFSERVICE         0
VEHICLEID            0
DISTANCE             0
MINUTES              0
LASTUPDATE           0
NOTE            272622
dtype: int64

In [10]:
# Number of distinct non-null values per column (column-wise cardinality)
df_vehicles.nunique(axis=0)

DATASOURCE           1
DAYOFSERVICE       360
VEHICLEID         1152
DISTANCE        170498
MINUTES          57523
LASTUPDATE         360
NOTE                 0
dtype: int64

# Cleaning Dataset

<b> - Drop constant, null and redundant columns </b>

- LASTUPDATE is irrelevant so we can drop that feature

In [11]:
# Drop the columns that carry no usable information for the analysis:
#   DATASOURCE - constant (cardinality of 1)
#   NOTE       - entirely null (cardinality of 0)
#   LASTUPDATE - not constant (360 distinct values) but judged irrelevant
constant_columns = ['DATASOURCE', 'LASTUPDATE', 'NOTE']

# Drop them in a single call using the `columns=` keyword. Passing the axis
# positionally (drop(c, 1)) is deprecated, emits a FutureWarning on
# pandas 1.3+, and raises a TypeError on pandas >= 2.0.
df_vehicles = df_vehicles.drop(columns=constant_columns)

df_vehicles.head(10)

  df_vehicles = df_vehicles.drop(c, 1)


Unnamed: 0,DAYOFSERVICE,VEHICLEID,DISTANCE,MINUTES
0,23-NOV-18 00:00:00,3303848,286166,58849
1,23-NOV-18 00:00:00,3303847,259545,56828
2,28-FEB-18 00:00:00,2868329,103096,40967
3,28-FEB-18 00:00:00,2868330,147277,43599
4,28-FEB-18 00:00:00,2868331,224682,40447
5,28-FEB-18 00:00:00,2868332,19499,6289
6,28-FEB-18 00:00:00,2868333,133014,43647
7,28-FEB-18 00:00:00,2868334,168964,47167
8,28-FEB-18 00:00:00,2868335,360842,55477
9,28-FEB-18 00:00:00,2868336,139959,39599


In [12]:
# Re-count missing (NaN) values per column now that columns have been dropped
df_vehicles.isna().sum()

DAYOFSERVICE    0
VEHICLEID       0
DISTANCE        0
MINUTES         0
dtype: int64

In [13]:
# Distinct non-null values per remaining column
df_vehicles.nunique(axis=0)

DAYOFSERVICE       360
VEHICLEID         1152
DISTANCE        170498
MINUTES          57523
dtype: int64

<b> - Logic check </b>

In [14]:
# Look at the first ten rows again before running the logic checks
df_vehicles.head(n=10)

Unnamed: 0,DAYOFSERVICE,VEHICLEID,DISTANCE,MINUTES
0,23-NOV-18 00:00:00,3303848,286166,58849
1,23-NOV-18 00:00:00,3303847,259545,56828
2,28-FEB-18 00:00:00,2868329,103096,40967
3,28-FEB-18 00:00:00,2868330,147277,43599
4,28-FEB-18 00:00:00,2868331,224682,40447
5,28-FEB-18 00:00:00,2868332,19499,6289
6,28-FEB-18 00:00:00,2868333,133014,43647
7,28-FEB-18 00:00:00,2868334,168964,47167
8,28-FEB-18 00:00:00,2868335,360842,55477
9,28-FEB-18 00:00:00,2868336,139959,39599


In [18]:
# Logic check: list any rows whose MINUTES value is below zero
df_vehicles.loc[df_vehicles['MINUTES'].lt(0)]

Unnamed: 0,DAYOFSERVICE,VEHICLEID,DISTANCE,MINUTES
35337,16-JAN-18 00:00:00,1000438,19438,-63660
63322,17-MAR-18 00:00:00,2406892,66354,-49317
73317,18-MAY-18 00:00:00,1001231,76,-26906
74817,17-APR-18 00:00:00,1000257,42,-56723
75516,28-MAR-18 00:00:00,2534833,3209,-56686
86099,22-MAY-18 00:00:00,2693217,7331,-53576
155328,29-JUN-18 00:00:00,2172270,603,-55863
160494,21-NOV-18 00:00:00,1000495,102280,-55245
160954,28-NOV-18 00:00:00,1001115,64237,-58114
225571,07-DEC-18 00:00:00,1000211,88,-24695


<b> - We will drop rows with negative minutes since it is impossible for a vehicle to operate for a duration less than 0  </b>

In [21]:
# Keep only the rows whose MINUTES value is zero or positive ...
df_vehicles = df_vehicles.loc[df_vehicles['MINUTES'].ge(0)]
# ... then display the negative-MINUTES rows again to confirm none remain
df_vehicles.loc[df_vehicles['MINUTES'].lt(0)]

Unnamed: 0,DAYOFSERVICE,VEHICLEID,DISTANCE,MINUTES


In [22]:
# Write the cleaned data to a new CSV file.
# index=False is the documented way to leave the row index out of the file
# (the previous index=None only worked because None is falsy).
df_vehicles.to_csv(
    '/Users/danhowes1/Desktop/UCD/Research_Project/rt_vehicles_DB_2018_CLEANED.csv',
    index=False,
)