## Part 1: Load and examine the data

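To run the code in a notebook cell, click into the cell and press Ctrl+Enter. Before running, change the name of the data_bucket from 'machliba' to the name of the data_bucket holding your data. Note that the cells below read the file named by `dataset` from the notebook's working directory, so make sure that file is present there.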

In [None]:
# Name of the CSV file that the loading cell reads from the working directory.
dataset = "activities.csv"

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [None]:
# Load the CSV named by `dataset` from the current directory, then preview
# three rows (positions 5, 6 and 7).
csv_path = f'./{dataset}'
df = pd.read_csv(csv_path)
df.iloc[5:8]

In [None]:
# Count the rows per value of the 'Error' flag (False = no error, True = error).
df.loc[:, 'Error'].value_counts()

In [None]:
# Print the row count, then a frequency table for each categorical column,
# each table preceded by a blank line and its heading.
print(f'Number of rows in dataset: {df.shape[0]}')
for heading, column in (
    ('Matter types:', 'Matter Type'),
    ('Resources:', 'Resource'),
    ('Activities:', 'Activity'),
):
    print()
    print(heading)
    print(df[column].value_counts())

## Part 2: Get the data into the right shape

In [None]:
# One-hot encode the three categorical columns; only the listed columns are
# expanded into indicator columns.
categorical_columns = ['Matter Type', 'Resource', 'Activity']
encoded_df = pd.get_dummies(df, columns=categorical_columns)
encoded_df.head(n=3)

## Part 3: Create training and validation datasets

In [None]:
# Hold out 20% of the rows for validation. random_state pins the shuffle so
# the same rows land in each set on every run.
train_df, val_df = train_test_split(encoded_df, test_size=0.2, random_state=0)

# Feature frames for the model: the 'Error' flag and 'Firm Name' are removed.
non_feature_columns = ['Error', 'Firm Name']
train_df_no_result = train_df.drop(columns=non_feature_columns)
val_df_no_result = val_df.drop(columns=non_feature_columns)

print(f'{train_df.shape[0]} rows in training data')
print(f'{val_df.shape[0]} rows in validation data')

## Part 4: Train the model



In [None]:
from sklearn.ensemble import IsolationForest
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Isolation forest with 50 trees. random_state is pinned so that re-running
# the notebook builds the same trees and therefore produces the same scores;
# without it every run trains a different model and the results further down
# cannot be reproduced.
isolationforest = IsolationForest(n_estimators=50, random_state=0)

In [None]:
# Fit the forest on the training features, passed as a plain NumPy array.
isolationforest.fit(train_df_no_result.to_numpy())

## Part 6: Test the model

In [None]:
# Inlier/outlier label for each validation row (scikit-learn returns 1 for
# inliers and -1 for outliers).
predictions = isolationforest.predict(val_df_no_result.to_numpy())

In [None]:
# scikit-learn's decision_function is oriented "lower = more abnormal"
# (negative values are outliers). The cells that follow treat a HIGHER score
# as more likely to be an error (they flag rows with score > cutoff), so the
# value is negated here to give "higher = more anomalous". Without the
# negation the cutoff comparison selects the most normal rows instead.
scores = -isolationforest.decision_function(val_df_no_result.values)

In [None]:
# Renumber the validation rows from 0 so they align with the positional scores.
val_df = val_df.reset_index(drop=True)
scores_df = pd.DataFrame({"score": scores})

# Attach the score column, then show how many validation rows are errors.
results_df = pd.concat([val_df, scores_df], axis='columns')
results_df['Error'].value_counts()

In [None]:
# Use the median score of the rows marked as errors as the decision threshold.
error_scores = results_df.loc[results_df['Error'] == True, 'score']
score_cutoff = error_scores.median()
print(f'Score cutoff: {score_cutoff}')

# Keep the rows scoring strictly above the threshold and count the errors
# among them.
results_above_cutoff = results_df[results_df['score'].gt(score_cutoff)]
results_above_cutoff['Error'].value_counts()

In [None]:
# Boolean 'Prediction' column: True where the row's score is strictly above
# the cutoff. The first five rows are shown as a preview.
results_df['Prediction'] = results_df['score'].gt(score_cutoff)
results_df.head(5)