In [190]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import tree
from sklearn import ensemble
from sklearn.model_selection import cross_val_score
import time

Now that you've learned about random forests and decision trees let's do an exercise in accuracy. You know that random forests are basically a collection of decision trees. But how do the accuracies of the two models compare?

So here's what you should do. Pick a dataset. It could be one you've worked with before or it could be a new one. Then build the best decision tree you can.

Now try to match that with the simplest random forest you can. For our purposes measure simplicity with runtime. Compare that to the runtime of the decision tree. This is imperfect but just go with it.

Hopefully out of this you'll see the power of random forests, but also their potential costs. Remember, in the real world you won't necessarily be dealing with thousands of rows. It could be millions, billions, or even more.

Submit a link to your models below.

## Load dataset

In [191]:
# Location of the zipped Baltimore crime CSV that this notebook analyses.
# NOTE(review): this is an absolute path on one machine; point it at your own copy.
DATA_PATH = '/Users/Stephanie/Downloads/crime-in-baltimore.zip'
df = pd.read_csv(DATA_PATH)

In [192]:
# Preview the first five rows to check which columns were loaded.
df.head()

Unnamed: 0,CrimeDate,CrimeTime,CrimeCode,Location,Description,Inside/Outside,Weapon,Post,District,Neighborhood,Longitude,Latitude,Location 1,Premise,Total Incidents
0,09/02/2017,23:30:00,3JK,4200 AUDREY AVE,ROBBERY - RESIDENCE,I,KNIFE,913.0,SOUTHERN,Brooklyn,-76.60541,39.22951,"(39.2295100000, -76.6054100000)",ROW/TOWNHO,1
1,09/02/2017,23:00:00,7A,800 NEWINGTON AVE,AUTO THEFT,O,,133.0,CENTRAL,Reservoir Hill,-76.63217,39.3136,"(39.3136000000, -76.6321700000)",STREET,1
2,09/02/2017,22:53:00,9S,600 RADNOR AV,SHOOTING,Outside,FIREARM,524.0,NORTHERN,Winston-Govans,-76.60697,39.34768,"(39.3476800000, -76.6069700000)",Street,1
3,09/02/2017,22:50:00,4C,1800 RAMSAY ST,AGG. ASSAULT,I,OTHER,934.0,SOUTHERN,Carrollton Ridge,-76.64526,39.28315,"(39.2831500000, -76.6452600000)",ROW/TOWNHO,1
4,09/02/2017,22:31:00,4E,100 LIGHT ST,COMMON ASSAULT,O,HANDS,113.0,CENTRAL,Downtown West,-76.61365,39.28756,"(39.2875600000, -76.6136500000)",STREET,1


## Clean dataset

In [193]:
# Number of rows before any cleaning.
len(df)

276529

In [194]:
# Number of missing (NaN) values in each column.
df.isna().sum()

CrimeDate               0
CrimeTime               0
CrimeCode               0
Location             2207
Description             0
Inside/Outside      10279
Weapon             180952
Post                  224
District               80
Neighborhood         2740
Longitude            2204
Latitude             2204
Location 1           2204
Premise             10757
Total Incidents         0
dtype: int64

In [195]:
# Drop the Weapon column: it is missing in 180,952 of the 276,529 rows, so
# keeping it would make a later dropna() discard most of the dataset.
# `columns=` replaces the positional axis argument (`drop('Weapon', 1)`),
# which newer pandas releases no longer accept.
df = df.drop(columns='Weapon')

In [196]:
# Discard every row that still contains a missing value, then show how many rows are left.
df = df.dropna()
df.shape[0]

263118

We're going to use only the 4 neighborhoods with the most crime

In [197]:
# Count the recorded crimes per neighbourhood, rank them from most to least,
# and keep the names of the four busiest neighbourhoods.
crimes_per_neighborhood = df['CrimeDate'].groupby(df['Neighborhood']).count()
ranked_neighborhoods = crimes_per_neighborhood.sort_values(ascending=False)
top4 = ranked_neighborhoods.head(4).index

In [198]:
# Keep only the crimes that happened in one of the top-4 neighbourhoods.
in_top4 = df['Neighborhood'].isin(top4)
sample = df[in_top4]

In [199]:
def _to_lower_text(column):
    """Return the column converted to lower-case strings."""
    return column.astype(str).str.lower()


# Normalise every column to lower-case text so that spelling variants
# (e.g. 'STREET' vs 'Street') collapse into one category.
# Note: numeric columns are turned into strings by this step as well.
sample = sample.apply(_to_lower_text)

In [200]:
# Print each object-typed column's name followed by its number of distinct
# values, to spot high-cardinality columns.
categorical = sample.select_dtypes(include=['object'])
for name, values in categorical.items():
    print(name)
    print(values.nunique())

CrimeDate
2071
CrimeTime
1543
CrimeCode
72
Location
2055
Description
15
Inside/Outside
4
Post
14
District
3
Neighborhood
4
Longitude
4356
Latitude
3546
Location 1
7566
Premise
97
Total Incidents
1


We're going to drop some variables and reduce Crime Date and Crime Time to day-of-the-week and day/night options

In [201]:
# Drop the columns that will not be used as features.
# `columns=` replaces the positional axis argument (`drop([...], 1)`),
# which newer pandas releases no longer accept.
sample = sample.drop(columns=['Location', 'Location 1', 'Premise', 'Total Incidents'])

In [202]:
# Reduce the date to its day of the week (0 = Monday ... 6 = Sunday) and the
# time to its hour of the day.
# `infer_datetime_format` is omitted: it is deprecated in pandas 2.x and only
# affected parsing speed, not the parsed values.
# NOTE: this cell overwrites its own input columns, so run it only once per
# kernel session; re-running it would parse the already-reduced values.
sample['CrimeDate'] = pd.to_datetime(sample['CrimeDate']).dt.dayofweek
sample['CrimeTime'] = pd.to_datetime(sample['CrimeTime']).dt.hour

In [203]:
# Bucket the hour of day: hours 8 through 18 count as 'Day', all other hours as 'Night'.
is_daytime = sample['CrimeTime'].gt(7) & sample['CrimeTime'].lt(19)
sample['CrimeTime'] = is_daytime.map({True: 'Day', False: 'Night'})

Let's unify Inside/Outside values

In [204]:
# List the distinct target labels to see which spellings are present.
sample['Inside/Outside'].unique()

array(['i', 'o', 'outside', 'inside'], dtype=object)

In [205]:
# Map the one-letter abbreviations onto the full labels so that only two classes remain.
label_aliases = {'i': 'inside', 'o': 'outside'}
sample['Inside/Outside'] = sample['Inside/Outside'].replace(label_aliases)

In [206]:
# Show the distinct target labels again to verify the unification.
sample['Inside/Outside'].unique()

array(['inside', 'outside'], dtype=object)

## Classify Crime Location (Inside or Outside) with Decision Tree
We will use Decision Trees and Random Forests to build a model that classifies each crime as having happened inside or outside

In [207]:
# Split the data into features (X) and the target label (Y).
# `columns=` replaces the positional axis argument (`drop('Inside/Outside', 1)`),
# which newer pandas releases no longer accept.
X = sample.drop(columns='Inside/Outside')
Y = sample['Inside/Outside']

In [208]:
# One-hot encode the feature columns so the tree models receive numeric input.
# NOTE(review): the earlier lower-casing step turned every column into strings,
# so coordinate columns such as Longitude/Latitude are presumably expanded into
# one indicator column per distinct value here. Confirm this is intended.
X = pd.get_dummies(X)

In [209]:
# Initialize and train our tree.
# NOTE(review): max_features=1 makes the tree evaluate a single randomly chosen
# feature at each split; with one-hot encoded inputs this may be why the score
# is close to chance. Consider tuning it.
d_tree = tree.DecisionTreeClassifier(
    criterion='entropy',
    max_features=1,
    max_depth=5,
    random_state=42,  # the feature sampling is random; fix the seed so re-runs give the same score
)

# Time the 5-fold cross-validation. perf_counter() is the clock intended for
# measuring elapsed intervals, unlike time.time(), which can jump if the
# system clock is adjusted.
start_time = time.perf_counter()
score_dt = cross_val_score(d_tree, X, Y, cv=5).mean()
print("--- %s seconds ---" % (time.perf_counter() - start_time))

# Mean accuracy over the five folds.
print(score_dt)

--- 47.03683304786682 seconds ---
0.5311280357183078


## Classify Crime Location (Inside or Outside) with Random Forest

In [211]:
# Random forest with library-default hyperparameters (the "simplest" forest).
# NOTE(review): the default number of trees depends on the installed
# scikit-learn version, so the runtime may differ between environments.
# random_state fixes the bootstrap and feature sampling so re-runs give the same score.
rfc = ensemble.RandomForestClassifier(random_state=42)

# Time the 5-fold cross-validation. perf_counter() is the clock intended for
# measuring elapsed intervals.
start_time = time.perf_counter()
score_rf = cross_val_score(rfc, X, Y, cv=5).mean()
print("--- %s seconds ---" % (time.perf_counter() - start_time))

# Mean accuracy over the five folds.
print(score_rf)

--- 139.59245920181274 seconds ---
0.8216022739292794


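Random Forest gets much better results :O In the run shown above its mean cross-validated accuracy is about 0.82 versus 0.53 for the decision tree, but it took roughly three times as long to run (about 140 s versus 47 s).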