## Part 1: Load and examine the data

To run the code in the notebook cell, change the name of the data_bucket from 'machliba' to the data_bucket holding your data and click into the cell and press Ctrl+Enter.

In [26]:
dataset = 'activities.csv' 

In [27]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [28]:
df = pd.read_csv(f's3://{data_bucket}/{subfolder}/{dataset}')
df[5:8]

Unnamed: 0,Matter Number,Firm Name,Matter Type,Resource,Activity,Minutes,Fee,Total,Error
5,0,Cox Group,Antitrust,Paralegal,Attend Court,110,50,91.67,False
6,0,Cox Group,Antitrust,Junior,Attend Court,505,150,1262.5,True
7,0,Cox Group,Antitrust,Paralegal,Attend Meeting,60,50,50.0,False


In [29]:
df['Error'].value_counts() # Display the number of error lines: False = no error. True = error.

False    103935
True       2030
Name: Error, dtype: int64

In [30]:
print(f'Number of rows in dataset: {df.shape[0]}')
print()
print('Matter types:')
print(df['Matter Type'].value_counts())
print()
print('Resources:')
print(df['Resource'].value_counts())
print()
print('Activities:')
print(df['Activity'].value_counts())

Number of rows in dataset: 105965

Matter types:
Antitrust                 23922
Insolvency                16499
IPO                       14236
Commercial arbitration    12927
Project finance           11776
M&A                        6460
Structured finance         5498
Asset recovery             4913
Tax planning               4871
Securities litigation      4863
Name: Matter Type, dtype: int64

Resources:
Partner      26587
Junior       26543
Paralegal    26519
Senior       26316
Name: Resource, dtype: int64

Activities:
Prepare Opinion    26605
Phone Call         26586
Attend Court       26405
Attend Meeting     26369
Name: Activity, dtype: int64


## Part 2: Get the data into the right shape

In [31]:
encoded_df = pd.get_dummies(df, columns=['Matter Type','Resource','Activity']) 
encoded_df.head(3)

Unnamed: 0,Matter Number,Firm Name,Minutes,Fee,Total,Error,Matter Type_Antitrust,Matter Type_Asset recovery,Matter Type_Commercial arbitration,Matter Type_IPO,...,Matter Type_Structured finance,Matter Type_Tax planning,Resource_Junior,Resource_Paralegal,Resource_Partner,Resource_Senior,Activity_Attend Court,Activity_Attend Meeting,Activity_Phone Call,Activity_Prepare Opinion
0,0,Cox Group,85,70,99.17,False,1,0,0,0,...,0,0,0,1,0,0,0,0,0,1
1,0,Cox Group,505,150,1262.5,False,1,0,0,0,...,0,0,1,0,0,0,0,0,0,1
2,0,Cox Group,100,180,300.0,False,1,0,0,0,...,0,0,1,0,0,0,0,0,0,1


## Part 3: Create training and validation datasets

In [32]:
train_df, val_df, _, _ = train_test_split(encoded_df, encoded_df['Error'], test_size=0.2, random_state=0)
train_df_no_result = train_df.drop(['Error','Firm Name'], axis=1)
val_df_no_result = val_df.drop(['Error','Firm Name'], axis=1)
print(f'{train_df.shape[0]} rows in training data')
print(f'{val_df.shape[0]} rows in validation data')

84772 rows in training data
21193 rows in validation data


## Part 4: Train the model



In [33]:
from sklearn.ensemble import IsolationForest
import warnings
warnings.filterwarnings("ignore")

In [34]:
isolationforest = IsolationForest(n_estimators=50)

In [35]:
isolationforest.fit(train_df_no_result.values)

IsolationForest(behaviour='old', bootstrap=False, contamination='legacy',
                max_features=1.0, max_samples='auto', n_estimators=50,
                n_jobs=None, random_state=None, verbose=0, warm_start=False)

## Part 6: Test the model

In [36]:
predictions = isolationforest.predict(val_df_no_result.values)

In [37]:
scores = isolationforest.decision_function(val_df_no_result.values)

In [38]:
scores_df = pd.DataFrame({"score":scores})
val_df = val_df.reset_index(drop=True)
results_df = pd.concat([val_df, scores_df], axis=1)
results_df['Error'].value_counts()

False    20791
True       402
Name: Error, dtype: int64

In [39]:
score_cutoff = results_df[results_df['Error'] == True]['score'].median()
print(f'Score cutoff: {score_cutoff}')
results_above_cutoff = results_df[results_df['score'] > score_cutoff]
results_above_cutoff['Error'].value_counts()

Score cutoff: -0.03517825701605348


False    20721
True       201
Name: Error, dtype: int64

In [40]:
results_df['Prediction'] = results_df['score'] > score_cutoff
results_df.head()

Unnamed: 0,Matter Number,Firm Name,Minutes,Fee,Total,Error,Matter Type_Antitrust,Matter Type_Asset recovery,Matter Type_Commercial arbitration,Matter Type_IPO,...,Resource_Junior,Resource_Paralegal,Resource_Partner,Resource_Senior,Activity_Attend Court,Activity_Attend Meeting,Activity_Phone Call,Activity_Prepare Opinion,score,Prediction
0,0,Cox Group,100,50,83.33,False,1,0,0,0,...,0,1,0,0,0,0,0,1,0.015254,True
1,1986,Russo Ltd,20,80,26.67,False,0,0,0,0,...,0,1,0,0,0,0,1,0,0.018495,True
2,1254,"Peters, Nelson and Moyer",120,70,140.0,False,1,0,0,0,...,0,1,0,0,1,0,0,0,0.050401,True
3,1777,"Jackson, Pollard and Levy",25,180,75.0,False,0,0,0,0,...,1,0,0,0,0,0,1,0,0.015483,True
4,113,"Solis, Adams and Cooper",25,600,250.0,False,0,1,0,0,...,0,0,1,0,0,0,1,0,-0.001701,True
