In [0]:
def getMissingValuesStats(data):
    """Summarise the missing values of every column that has at least one.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        Two rows, 'Total_Missing' (number of nulls) and 'Percent_Missing'
        (nulls as a percentage of the row count), with one column per input
        column that contains nulls. Columns without nulls are left out.
    """
    nullCounts = data.isnull().sum()
    rowCount = data.shape[0]

    # Both series are ordered from most to least missing before being combined.
    totalMissing = nullCounts.sort_values(ascending=False)
    percentMissing = (nullCounts / rowCount * 100).sort_values(ascending=False)

    stats = pd.concat(
        [totalMissing, percentMissing],
        axis=1,
        keys=['Total_Missing', 'Percent_Missing'],
    )
    hasMissing = stats['Total_Missing'] > 0
    return stats[hasMissing].T

In [0]:
def plotCorrelationMatrix(data,w,h):
    """Show the correlation matrix of ``data`` as an annotated heat map.

    Only the part of the matrix strictly below the diagonal is drawn; the
    diagonal and the upper triangle are masked out.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose pairwise column correlations (``data.corr()``) are plotted.
    w, h : number
        Figure width and height passed to ``plt.figure(figsize=(w, h))``.

    Returns
    -------
    None
    """
    corr = data.corr()

    # Use the builtin ``bool``: the ``np.bool`` alias was deprecated in
    # NumPy 1.20 and removed in 1.24, where it raises AttributeError.
    mask = np.zeros_like(corr, dtype=bool)
    mask[np.triu_indices_from(mask)] = True

    plt.figure(figsize=(w, h))
    sns.heatmap(corr, annot=True, cmap='YlGnBu', mask=mask)
    plt.show()
    return None

In [0]:
def engineCapacityNormalizer(data, placeholderTypes=('Mobility scooter', 'Ridden horse', 'Pedal cycle', 'Tram'), placeholderCapacity=10):
    """Fill missing ``Engine_Capacity_CC`` values from the row's ``Vehicle_Type``.

    A missing capacity is replaced by the mean capacity of all rows with the
    same vehicle type. For the types in ``placeholderTypes`` the fixed
    ``placeholderCapacity`` is used instead of the mean. Capacities that are
    already present are never changed.

    Parameters
    ----------
    data : pandas.DataFrame
        Needs the columns 'Vehicle_Type' and 'Engine_Capacity_CC'.
        The frame is modified in place.
    placeholderTypes : iterable of str, optional
        Vehicle types whose missing capacities get ``placeholderCapacity``.
    placeholderCapacity : number, optional
        Fill value for ``placeholderTypes`` (default 10).

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, returned for convenience.
    """
    # Per-row mean capacity of the row's vehicle type, computed in one pass
    # instead of re-filtering the frame once per vehicle type.
    fillValues = data.groupby('Vehicle_Type')['Engine_Capacity_CC'].transform('mean')

    # The placeholder types get the fixed value whatever their group mean is.
    usesPlaceholder = data['Vehicle_Type'].isin(list(placeholderTypes))
    fillValues = fillValues.mask(usesPlaceholder, placeholderCapacity)

    # Only missing entries are filled; a type whose mean is itself NaN stays NaN.
    data['Engine_Capacity_CC'] = data['Engine_Capacity_CC'].fillna(fillValues)
    return data

In [0]:
def TestLogisticRegForPropulsionCode(data):
    """Check how well ``Propulsion_Code`` can be predicted from ``Vehicle_Type``.

    Rows where either column is missing are dropped, the vehicle type is
    one-hot encoded, and a class-balanced logistic regression is fitted on a
    train split and scored on the held-out split. The accuracy and the
    classification report are printed.

    Parameters
    ----------
    data : pandas.DataFrame
        Needs the columns 'Propulsion_Code' and 'Vehicle_Type'. Not modified.

    Returns
    -------
    pandas.DataFrame
        Cross-tabulation of actual (rows) against predicted (columns)
        propulsion codes on the test split, including the 'All' margins.
    """
    # 'Engine_Capacity_CC' and 'Age_of_Vehicle' are not used as predictors
    # because they also contain NaN.
    known = data[['Propulsion_Code', 'Vehicle_Type']].dropna()

    y = known['Propulsion_Code']
    X = pd.get_dummies(known.drop('Propulsion_Code', axis=1))
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    lr = LogisticRegression(class_weight='balanced', n_jobs=-1)
    lr.fit(X_train, y_train)
    y_pred = lr.predict(X_test)

    print("Propulsion Code Regression Accuracy", round(accuracy_score(y_test, y_pred)*100,2))
    print(classification_report(digits=2,y_true=y_test,y_pred=y_pred))

    # Return the table: as a bare expression inside the function it was
    # computed and thrown away, so it never showed up in the notebook.
    return pd.crosstab(y_test, y_pred, rownames=['Actual'], colnames=['Predicted'], margins=True)

In [0]:
def isWeekendNight(datetime):
    """Tell whether a timestamp falls within a weekend night.

    A weekend night starts at 18:00 on Friday or Saturday and lasts until the
    end of the 06:00 hour on the following morning (Saturday or Sunday).

    Friday 00:00-06:59 is not a weekend night: by the same "night runs into
    the next morning" rule those hours belong to Thursday night. Earlier
    versions counted them as weekend.

    Parameters
    ----------
    datetime : object with ``dayofweek`` and ``hour`` attributes
        ``dayofweek`` is read as 4 = Friday, 5 = Saturday, 6 = Sunday.
        NOTE(review): presumably a pandas Timestamp - confirm against callers.
        The name shadows the stdlib ``datetime`` module inside this function;
        it is kept so keyword callers continue to work.

    Returns
    -------
    bool
    """
    FRIDAY, SATURDAY, SUNDAY = 4, 5, 6

    dayOfWeek = datetime.dayofweek
    isEvening = datetime.hour >= 18
    isEarlyMorning = datetime.hour <= 6

    if dayOfWeek == FRIDAY:
        # Friday counts only from the evening on
        return bool(isEvening)
    if dayOfWeek == SATURDAY:
        # Tail of Friday night plus the start of Saturday night
        return bool(isEarlyMorning or isEvening)
    if dayOfWeek == SUNDAY:
        # Saturday night to Sunday morning
        return bool(isEarlyMorning)
    return False

In [0]:
#Create another column for the numerical Accident Severity 
def accidentSeverityTransformation(x):
    """Map an accident-severity label to its numeric value.

    'Slight' -> 1, 'Serious' -> 2, 'Fatal' -> 3. Any other value, including
    NaN and other non-strings, prints a warning and yields 'Other'.

    The label has to match exactly. The earlier ``x in 'Slight'`` was a
    substring test, so '' or 'S' were read as 'Slight' and a NaN raised
    TypeError.

    Parameters
    ----------
    x : object
        Severity label to convert.

    Returns
    -------
    int or str
        1, 2 or 3 for a known label, otherwise the string 'Other'.
    """
    severityValues = {'Slight': 1, 'Serious': 2, 'Fatal': 3}

    # The isinstance guard keeps unhashable or non-string inputs on the warning path.
    if isinstance(x, str) and x in severityValues:
        return severityValues[x]

    print(f'WARN: No identified accident severity: {x}')
    return 'Other'

In [0]:
def hourOfDayNormalizer(data):
    """Add a time-of-day bin and its description for the ``hour`` column.

    Bins (upper edges inclusive):

    ==== ============ ================
    bin  hour         description
    ==== ============ ================
    0    <= 4         'Overnight'
    1    5 - 10       'Morning rush'
    2    11 - 14      'Office hours'
    3    15 - 18      'Afternoon rush'
    4    > 18         'Evening'
    -1   missing      'Unknown'
    ==== ============ ================

    Parameters
    ----------
    data : pandas.DataFrame
        Needs an 'hour' column. The frame is modified in place: the columns
        'hour_bin' and 'hour_bin_desc' are added or overwritten.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object.
    """
    binDescriptions = np.array(
        ['Overnight', 'Morning rush', 'Office hours', 'Afternoon rush', 'Evening'],
        dtype=object)

    hourBins = np.digitize(data['hour'], bins=[4,10,14,18], right=True)
    data['hour_bin'] = hourBins
    data['hour_bin_desc'] = binDescriptions[hourBins]

    # np.digitize puts NaN into the last bin, which would label a missing hour
    # as 'Evening'; mark it as unknown instead.
    missingHour = data['hour'].isnull()
    data.loc[missingHour, 'hour_bin'] = -1
    data.loc[missingHour, 'hour_bin_desc'] = 'Unknown'
    return data

In [0]:
def ageVehicleNormalizer(data):
    """Add an age bin and its description for the ``Age_of_Vehicle`` column.

    Bins (upper edges inclusive):

    ==== ============ =============
    bin  age          description
    ==== ============ =============
    0    <= 5         '0-5 years'
    1    6 - 10       '6-10 years'
    2    11 - 15      '11-15 years'
    3    16 - 20      '16-20 years'
    4    > 20         '21+ years'
    -1   missing      'Unknown'
    ==== ============ =============

    The last two descriptions used to read '16-30 years' and '31+ years',
    which did not match the bin edges 15 and 20 actually applied. The numeric
    bins are unchanged; only the descriptions were corrected.
    NOTE(review): if the 30-year split was the intended one, change the last
    bin edge to 30 and restore the old descriptions instead.

    Parameters
    ----------
    data : pandas.DataFrame
        Needs an 'Age_of_Vehicle' column. The frame is modified in place: the
        columns 'Age_of_Vehicle_bin' and 'Age_of_Vehicle_desc' are added or
        overwritten.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object.
    """
    binDescriptions = np.array(
        ['0-5 years', '6-10 years', '11-15 years', '16-20 years', '21+ years'],
        dtype=object)

    ageBins = np.digitize(data['Age_of_Vehicle'], bins=[5,10,15,20], right=True)
    data['Age_of_Vehicle_bin'] = ageBins
    data['Age_of_Vehicle_desc'] = binDescriptions[ageBins]

    # np.digitize puts NaN into the last bin; flag missing ages explicitly.
    missingAge = data['Age_of_Vehicle'].isnull()
    data.loc[missingAge, 'Age_of_Vehicle_bin'] = -1
    data.loc[missingAge, 'Age_of_Vehicle_desc'] = 'Unknown'
    return data

In [0]:
def regressionReports(y_test,y_test_preds,methodName='',featImportances=None):
    """Print a classification report and plot the confusion matrix of a model.

    Parameters
    ----------
    y_test : array-like
        True class of each test sample.
    y_test_preds : array-like
        Predicted class of each test sample.
    methodName : str, optional
        Model name used in the printed header and in the plot title.
    featImportances : pandas.Series or None, optional
        Feature importances indexed by feature name. When given and not
        empty, the 30 largest are drawn as a horizontal bar chart.

    Returns
    -------
    None
    """
    print(f'\n {methodName} Report: \n', classification_report(y_test, y_test_preds))

    # NOTE(review): confusion_matrix orders the classes by sorted label value.
    # These names fit string labels 'Fatal' < 'Serious' < 'Slight'; confirm the
    # target is not numerically encoded, and that all three classes occur.
    classNames = ['Fatal', 'Serious', 'Slight']
    matrix = pd.DataFrame(confusion_matrix(y_test, y_test_preds),
                          index=classNames,
                          columns=classNames)

    sns.heatmap(matrix, annot=True, cbar=None, cmap='Blues', fmt='g')
    plt.title(f'Confusion Matrix: {methodName}')
    # Rows of the confusion matrix (heat-map y axis) are the true classes and
    # columns (x axis) the predictions; the two labels used to be swapped.
    plt.xlabel('Predicted Class')
    plt.ylabel('Real Class')
    # Lay out only after title and labels exist so that they are accounted for.
    plt.tight_layout()
    plt.show()

    if featImportances is not None and len(featImportances) > 0:
        plt.figure(figsize=(20,10))
        featImportances.nlargest(30).plot(kind='barh')
        plt.title("Top 30 most important features")
        plt.show()
    return None

In [0]:
def ageDriverNormalizer(data):
    """Drop implausible driver ages and add an age bin with its description.

    Rows with ``Age_of_Driver <= 1`` are removed (a few rows carry an
    out-of-range age such as -1). Rows with a missing age are kept.

    Bins for the remaining rows (upper edges inclusive):

    ==== ============ =============
    bin  age          description
    ==== ============ =============
    1    2 - 16       '1-16 years'
    2    17 - 25      '17-25 years'
    3    26 - 42      '25-42 years'
    4    43 - 60      '42-60 years'
    5    > 60         '60+ years'
    -1   missing      'Unknown'
    ==== ============ =============

    Bin 0 (age <= 1) cannot occur because those rows are dropped first. The
    descriptions used to be attached to bins 0-4, so every group carried the
    description of the next older group and drivers over 60 got an empty
    description. The numeric bins themselves are unchanged.

    Parameters
    ----------
    data : pandas.DataFrame
        Needs an 'Age_of_Driver' column. Not modified.

    Returns
    -------
    pandas.DataFrame
        A filtered copy of ``data`` with the added columns
        'Age_of_Driver_bin' and 'Age_of_Driver_desc'.
    """
    # NaN <= 1 is False, so missing ages survive this filter. A boolean filter
    # (rather than drop-by-label) removes exactly the offending rows even when
    # index labels repeat.
    outOfRange = data['Age_of_Driver'] <= 1
    data = data.loc[~outOfRange].copy()

    # Position i holds the description of bin i; bin 0 is unreachable here.
    binDescriptions = np.array(
        ['', '1-16 years', '17-25 years', '25-42 years', '42-60 years', '60+ years'],
        dtype=object)

    ageBins = np.digitize(data['Age_of_Driver'], bins=[1,16,25,42,60], right=True)
    data['Age_of_Driver_bin'] = ageBins
    data['Age_of_Driver_desc'] = binDescriptions[ageBins]

    # np.digitize puts NaN into the last bin; flag missing ages explicitly.
    missingAge = data['Age_of_Driver'].isnull()
    data.loc[missingAge, 'Age_of_Driver_bin'] = -1
    data.loc[missingAge, 'Age_of_Driver_desc'] = 'Unknown'
    return data

In [0]:
def normalizePropulsionCode(propCode):
    """Collapse a propulsion code to 'Petrol', 'Heavy oil' or 'Other'.

    Missing values (None or NaN) are returned unchanged so that they can still
    be recognised as missing afterwards.

    The code has to match exactly. The earlier ``propCode in 'Petrol'`` was a
    substring test, so '' or 'oil' were read as a known code and non-string
    values raised TypeError.

    Parameters
    ----------
    propCode : object
        Raw propulsion code.

    Returns
    -------
    object
        'Petrol', 'Heavy oil', 'Other', or the input itself when it is missing.
    """
    # NaN is the only value that differs from itself.
    if propCode is None or propCode != propCode:
        return propCode

    for knownCode in ('Petrol', 'Heavy oil'):
        if propCode == knownCode:
            return knownCode

    return 'Other'

In [0]:
def plotSeverityByIndependentVariable(data,dependentVariable,labelColumn='',yscale='log',tickStart = 0,setTicks=True):
    """Plot, per severity value, how its accidents are spread over a variable.

    Three bar charts are drawn side by side, one for each
    ``Accident_Severity_Value`` 1, 2 and 3. Each bar is the share of that
    severity's accidents that falls into one value of ``dependentVariable``.

    Parameters
    ----------
    data : pandas.DataFrame
        Needs the columns ``dependentVariable`` and 'Accident_Severity_Value',
        plus ``labelColumn`` when one is given.
    dependentVariable : str
        Column whose values form the x axis.
    labelColumn : str, optional
        Column holding the tick label of each x value. With '' (default) the
        x values themselves are used as labels.
    yscale : str, optional
        Scale of the shared y axis (default 'log').
    tickStart : int, optional
        Position of the first x tick. Ticks are placed at consecutive integers
        from ``tickStart``, one per label.
        NOTE(review): assumes the plotted x values are consecutive integers
        starting at ``tickStart`` (or strings with ``tickStart=0``) - confirm.
    setTicks : bool, optional
        When True (default) ticks and labels are set explicitly.

    Returns
    -------
    None
    """
    groupKeys = [dependentVariable, 'Accident_Severity_Value']
    if labelColumn != '':
        groupKeys.append(labelColumn)

    severityByVariable = data.groupby(groupKeys).size().to_frame('Count').reset_index()
    severityByVariable = severityByVariable.sort_values(
        by=[dependentVariable, 'Accident_Severity_Value'], ascending=True)

    fig, axs = plt.subplots(1, 3, figsize=(25, 2), sharey=True)
    plt.subplots_adjust(wspace=0.1, hspace=0.1)
    fig.suptitle(f'% accidents by severity and {dependentVariable}', y=1.2, fontsize=15)

    for severity, ax in zip(range(1, 4), axs):
        x = severityByVariable.loc[severityByVariable['Accident_Severity_Value'] == severity]
        x = x.sort_values(by=dependentVariable)
        # NOTE(review): this is a 0-1 fraction although the axis label says '%'.
        y = x['Count'] / x['Count'].sum()

        ax.bar(x[dependentVariable], y)
        ax.set_yscale(yscale)

        if labelColumn == '':
            labels = x[dependentVariable].unique().astype('str')
        else:
            labels = x[labelColumn]

        ax.grid(True)
        ax.set(ylabel='% accidents', title=f'Severity {severity}')

        if setTicks == True:
            # One tick per label. The old upper bound 1+len(labels) gave one
            # tick too many with tickStart=0, which Matplotlib rejects when
            # the labels are applied.
            ax.set_xticks(range(tickStart, tickStart + len(labels)))
            ax.set_xticklabels(labels, rotation=30, ha='right')
        else:
            # Rotate the automatic tick labels on every subplot, not only on
            # the last one as the pyplot-level call did.
            ax.tick_params(axis='x', labelrotation=60)

    return None


In [0]:
def clusterBySeverityUsingKMeans(data,independentVar,nClustersFinal=5,nClustersElbow=8):
    """Cluster (variable, severity) pairs with K-means and plot the outcome.

    Two charts are drawn side by side:

    * left - elbow curve: within-cluster sum of squares (``inertia_``) for
      1 to ``nClustersElbow - 1`` clusters;
    * right - scatter of ``independentVar`` against 'Accident_Severity_Value',
      coloured by the cluster assigned by a ``nClustersFinal``-cluster model.

    Parameters
    ----------
    data : pandas.DataFrame
        Needs the columns ``independentVar`` and 'Accident_Severity_Value'.
    independentVar : str
        Column clustered together with the severity value.
    nClustersFinal : int, optional
        Number of clusters of the model shown in the scatter plot (default 5).
    nClustersElbow : int, optional
        Exclusive upper bound of the cluster counts tried for the elbow curve
        (default 8).

    Returns
    -------
    None
    """
    X_kmeans = data[[independentVar,'Accident_Severity_Value']]

    # Train and plot elbow
    candidateCounts = range(1, nClustersElbow)
    wcss = []
    for nClusters in candidateCounts:
        model = KMeans(n_clusters=nClusters, init='k-means++', random_state=0)
        model.fit(X_kmeans)
        wcss.append(model.inertia_)

    fig, (ax1,ax2) = plt.subplots(1,2,figsize=(14, 2),sharey=False)
    ax1.plot(candidateCounts, wcss)
    ax1.set_xticks(range(0,nClustersElbow))
    ax1.set(ylabel='WCSS', title=f'K-means Elbow: {independentVar} vs Severity')

    # Train and plot clusters
    kmeans = KMeans(n_clusters=nClustersFinal, init='k-means++', random_state=0)
    kmeans.fit(X_kmeans)

    ax2.scatter(X_kmeans[independentVar], X_kmeans['Accident_Severity_Value'], c=kmeans.labels_)
    # The cluster title goes on the scatter plot; it used to be set on ax1,
    # overwriting the elbow title and leaving this chart untitled.
    ax2.set(title=f'K-means clusters: {independentVar} vs Severity')

    plt.show()
    return None


In [0]:

def plotSeverityRateByIndependentVariable(validData,independentVar,labelColumn='',yscale='log',tickStart=0,h=3,asp=2,fullData=None):
    """Plot the severity mix within each value of a variable.

    For every value of ``independentVar`` the share of its accidents that has
    each 'Accident_Severity_Value' is computed (the shares of one value sum to
    1). The result is drawn twice as a grouped bar chart: first on a log y
    axis, then on a linear one.

    Parameters
    ----------
    validData : pandas.DataFrame
        Rows to plot. Needs the columns ``independentVar`` and
        'Accident_Severity_Value', plus ``labelColumn`` when one is given.
    independentVar : str
        Column whose values form the x axis.
    labelColumn : str, optional
        Column holding the tick label of each x value. With '' (default) the
        x values themselves are used as labels.
    yscale : str, optional
        Unused - both the 'log' and the 'linear' chart are always drawn.
        Kept so that existing calls keep working.
    tickStart : int, optional
        Unused - the bars of a categorical plot sit at positions 0..n-1, so
        ticks are always placed there. Kept so that existing calls keep working.
    h, asp : number, optional
        Height and aspect ratio handed to ``sns.catplot``.
    fullData : pandas.DataFrame or None, optional
        Frame in which missing values of ``independentVar`` are counted for
        the warning. When None, a notebook-level variable named ``data`` is
        used if there is one (what earlier versions always read), otherwise
        ``validData``.

    Returns
    -------
    None
    """
    if fullData is None:
        # Backward compatibility with the former hard-wired global ``data``;
        # falls back to the argument instead of failing on a fresh kernel.
        fullData = globals().get('data', validData)

    totalNaN = fullData[independentVar].isnull().sum()
    if totalNaN > 0:
        # Percentage of all rows (the old denominator was the non-null count).
        totalRows = len(fullData)
        print(f'WARN: {independentVar} has {totalNaN} NaN values. Percentage: {totalNaN/totalRows*100:0.1f}')

    groupKeys = [independentVar, 'Accident_Severity_Value']
    if labelColumn != '':
        groupKeys.append(labelColumn)

    independentVarData = validData[groupKeys].groupby(groupKeys).size().to_frame('Count').reset_index()
    independentVarData = independentVarData.sort_values(by=[independentVar, 'Accident_Severity_Value'])

    # Accidents per x value, broadcast back to every row of that value.
    independentVarData['Total'] = independentVarData.groupby(independentVar)['Count'].transform('sum')
    independentVarData['SeverityRate'] = independentVarData['Count'] / independentVarData['Total']

    labelSource = independentVar if labelColumn == '' else labelColumn
    labels = independentVarData[labelSource].unique()

    for scale in ('log', 'linear'):
        g = sns.catplot(x=independentVar, y="SeverityRate", hue="Accident_Severity_Value",
                        data=independentVarData, height=h, aspect=asp, kind="bar", palette="muted")
        g.fig.suptitle(f'Accidents rate by severity and {independentVar} - {scale} scale', y=1, fontsize=15)

        axs = g.fig.get_axes()[0]
        axs.set_yscale(scale)
        axs.grid(False)
        # Exactly one tick per label, at the category positions 0..n-1.
        axs.set_xticks(range(len(labels)))
        axs.set_xticklabels(labels, rotation=30, ha='right')
        plt.show()

    return None