In [25]:
# Update sklearn to prevent version mismatches
!pip install sklearn --upgrade

Requirement already up-to-date: sklearn in /Users/beastmode/opt/anaconda3/lib/python3.7/site-packages (0.0)


In [26]:
# install joblib. This will be used to save your model. 
# Restart your kernel after installing 
!pip install joblib



In [1]:
import pandas as pd

# Read the CSV and Perform Basic Data Cleaning

In [2]:
# Load the crime records and drop the two columns that are not used as
# features ('Unnamed: 0' and 'City') in a single chained expression.
cdf = pd.read_csv('mach_learn_df.csv').drop(columns=['Unnamed: 0','City'])

# Preview the first rows of the cleaned frame.
cdf.head()

Unnamed: 0,Hour,Offense_Type,Premise,Tract,Month,Day_of_Week,Temperature,Weather
0,0,Burglary/Robbery,Residence or House,313100,1,Monday,46.56,Clouds
1,0,Burglary/Robbery,Apartment,321300,1,Monday,46.56,Clouds
2,0,Burglary/Robbery,"Road, Street, or Sidewalk",432801,1,Monday,46.56,Clouds
3,0,Assault,"Road, Street, or Sidewalk",330700,1,Monday,46.56,Clouds
4,0,Theft,"Church, Synagogue, or Temple Parking Lot",312800,1,Monday,46.56,Clouds


# Select your features (columns)

In [3]:
# The target is the offense category; every remaining column is a feature.
y = cdf["Offense_Type"]
X = cdf.drop(columns=["Offense_Type"])
print(X.shape, y.shape)

(309959, 7) (309959,)


In [4]:
from sklearn.preprocessing import LabelEncoder

# Encode the offense labels as integers.
# `fit` returns the encoder itself, so `encoded_y` refers to the same fitted
# LabelEncoder object as `label_encodery` (it is not an encoded array).
label_encodery = LabelEncoder()
encoded_y = label_encodery.fit(y)
transformed_y = encoded_y.transform(y)

In [5]:
# Inspect the integer-encoded target produced by `label_encodery`.
transformed_y

array([2, 2, 2, ..., 7, 3, 7])

In [6]:
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline

class MultiColumnLabelEncoder:
    '''
    Label-encode several DataFrame columns at once.

    One LabelEncoder is fitted per target column in `fit` and reused in
    `transform`, so the same label always maps to the same integer code on
    every frame transformed by a fitted instance.
    '''

    def __init__(self,columns = None):
        self.columns = columns # array of column names to encode

    def _target_columns(self,X):
        '''Return the column names to encode: `self.columns`, or every column of X.'''
        if self.columns is not None:
            return list(self.columns)
        return list(X.columns)

    def fit(self,X,y=None):
        '''
        Fit one LabelEncoder per target column of X.

        `y` is accepted for API compatibility and ignored. Returns self.
        '''
        self.encoders_ = {
            col: LabelEncoder().fit(X[col]) for col in self._target_columns(X)
        }
        return self

    def transform(self,X):
        '''
        Transforms columns of X specified in self.columns using
        LabelEncoder(). If no columns specified, transforms all
        columns in X.

        Works on a copy; X itself is not modified. Columns for which `fit`
        stored an encoder are encoded with that encoder. Columns without a
        stored encoder (for example when `transform` is called on an
        instance that was never fitted) are encoded with a fresh
        LabelEncoder fitted on the column itself.
        '''
        output = X.copy()
        fitted = getattr(self, 'encoders_', {})
        for col in self._target_columns(output):
            if col in fitted:
                output[col] = fitted[col].transform(output[col])
            else:
                output[col] = LabelEncoder().fit_transform(output[col])
        return output

    def fit_transform(self,X,y=None):
        '''Fit the per-column encoders on X and return the encoded copy of X.'''
        return self.fit(X,y).transform(X)
    
# Columns replaced by integer label codes; the remaining feature columns are
# carried over unchanged.
label_encoded_columns = ['Day_of_Week', 'Premise', 'Tract', 'Weather']
feature_encoder = MultiColumnLabelEncoder(columns=label_encoded_columns)
encoded_X = feature_encoder.fit_transform(X)

encoded_X

Unnamed: 0,Hour,Premise,Tract,Month,Day_of_Week,Temperature,Weather
0,0,125,156,1,1,46.56,1
1,0,9,180,1,1,46.56,1
2,0,131,385,1,1,46.56,1
3,0,131,204,1,1,46.56,1
4,0,33,153,1,1,46.56,1
...,...,...,...,...,...,...,...
309954,23,126,491,5,3,74.26,6
309955,23,126,486,5,3,74.26,6
309956,23,18,486,5,3,74.26,6
309957,23,126,361,5,3,74.26,6


# Create a Train Test Split

Use the label-encoded `Offense_Type` column for the y values

In [7]:
from sklearn.model_selection import train_test_split

# Split the encoded features and the integer target with a fixed seed,
# stratifying on the original offense labels.
X_train, X_test, y_train, y_test = train_test_split(
    encoded_X,
    transformed_y,
    random_state=1,
    stratify=y,
)

In [8]:
# Inspect the encoded target values of the test split.
y_test

array([7, 7, 1, ..., 3, 1, 2])

In [9]:
# Preview the first rows of the training features (still unscaled at this point).
X_train.head()

Unnamed: 0,Hour,Premise,Tract,Month,Day_of_Week,Temperature,Weather
103959,22,140,275,12,4,72.07,1
277814,17,126,317,3,5,77.31,1
252929,17,76,228,12,1,56.98,0
292483,14,126,400,4,1,53.78,0
41013,16,115,399,6,6,86.13,4


# Pre-processing

Scale the data using the `StandardScaler`, fitted on the training split only (no feature selection is performed in this notebook)

In [10]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training split only; the result is applied to both
# splits in the next cell.
X_scaler = StandardScaler()
X_scaler = X_scaler.fit(X_train)

In [11]:
# Apply the already-fitted scaler to the training split, then the test split.
X_train_scaled, X_test_scaled = map(X_scaler.transform, (X_train, X_test))

# Train the Model



In [12]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Create a random forest classifier.
# random_state is fixed (same seed as the train/test split) so that the fitted
# trees, the scores and the feature importances are reproducible across runs.
rf = RandomForestClassifier(n_estimators=100, random_state=1)
rf = rf.fit(X_train_scaled, y_train)

In [None]:
# Evaluate the forest on the held-out test split.
# The model was fit on scaled features, so any score (including the optional
# training score below) must be computed on scaled features as well.
# print(f"Training Data Score: {rf.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {rf.score(X_test_scaled, y_test)}")

In [44]:
# The fitted forest exposes one importance value per input feature.
importances = rf.feature_importances_

# Pair each importance with its column name and list the pairs from the most
# to the least important feature.
sorted(zip(importances, X.columns), reverse=True)

[(0.30743217849066057, 'Tract'),
 (0.23254328351527553, 'Temperature'),
 (0.12727757314869925, 'Premise'),
 (0.12123794119853906, 'Hour'),
 (0.08545669663493666, 'Month'),
 (0.07302530121211499, 'Day_of_Week'),
 (0.053027025799773995, 'Weather')]

In [50]:
# Predict the first five scaled test rows, then map the integer classes back
# to offense names with the encoder that was fitted on y.
first_five_scaled = X_test_scaled[:5]
encoded_predictions = rf.predict(first_five_scaled)
prediction_labels = label_encodery.inverse_transform(encoded_predictions)

In [51]:
# Show the decoded offense names for the five sample predictions.
prediction_labels

array(['Theft', 'Theft', 'Vandalism', 'Vandalism', 'Vandalism'],
      dtype=object)

# Hyperparameter Tuning

Use `GridSearchCV` to tune the model's parameters

In [26]:
from sklearn.model_selection import GridSearchCV

# Candidate forest sizes to compare; the search uses `rf` as its base estimator
# and logs progress for every fit (verbose=3).
param_grid = dict(n_estimators=[10, 50, 100, 150, 200, 250])
grid = GridSearchCV(estimator=rf, param_grid=param_grid, verbose=3)

In [27]:
# Train the model with GridSearch.
# Fit the grid search estimator, which tries each combination of parameters.
# The scaled training features are used so that the tuned model is trained on
# the same representation as `rf` and can be evaluated on X_test_scaled.
grid.fit(X_train_scaled, y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 3 folds for each of 6 candidates, totalling 18 fits
[CV] n_estimators=10 .................................................
[CV] ..................... n_estimators=10, score=0.425, total=   6.8s
[CV] n_estimators=10 .................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    6.8s remaining:    0.0s


[CV] ..................... n_estimators=10, score=0.427, total=   7.5s
[CV] n_estimators=10 .................................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   14.4s remaining:    0.0s


[CV] ..................... n_estimators=10, score=0.423, total=   6.4s
[CV] n_estimators=50 .................................................
[CV] ..................... n_estimators=50, score=0.448, total=  38.0s
[CV] n_estimators=50 .................................................
[CV] ..................... n_estimators=50, score=0.446, total=  33.9s
[CV] n_estimators=50 .................................................
[CV] ..................... n_estimators=50, score=0.445, total=  36.3s
[CV] n_estimators=100 ................................................
[CV] .................... n_estimators=100, score=0.450, total= 1.3min
[CV] n_estimators=100 ................................................
[CV] .................... n_estimators=100, score=0.449, total= 1.1min
[CV] n_estimators=100 ................................................
[CV] .................... n_estimators=100, score=0.448, total= 1.1min
[CV] n_estimators=150 ................................................
[CV] .

KeyboardInterrupt: 

In [46]:
# Report the winning parameter set and its cross-validated score, one per line.
print(grid.best_params_, grid.best_score_, sep="\n")

{'n_estimators': 150}
0.8958611481975968


# Save the Model

In [19]:
# Save the fitted random forest to disk with joblib.
# joblib is imported here because no other active cell imports it; without
# this import the dump call raises NameError on a fresh kernel.
# If joblib fails to import, try running the command to install in terminal/git-bash.
import pickle
import joblib
filename = 'crime_rand_forest.sav'
joblib.dump(rf, filename)

['crime_rand_forest.sav']

In [9]:
# NOTE: every line of this cell is commented out, so running it defines
# nothing; in particular `rand_model` is not created here. The recorded
# output of this cell is a 403 (Forbidden) ClientError.
# import joblib
# from io import BytesIO
# import boto3
# import boto3.session
# from config import ACCESS_KEY, SECRET_KEY

# def read_joblib(path):
#     ''' 
#        Function to load a joblib file from an s3 bucket or local directory.
#        Arguments:
#        * path: an s3 bucket or local directory path where the file is stored
#        Outputs:
#        * file: Joblib file loaded
#     '''
    
# #     credentials = boto3.Session().get_credentials(),
# #     ACCESS_KEY = credentials.access_key,
# #     SECRET_KEY = credentials.secret_key

#     s3client = boto3.client('s3',
#                         aws_access_key_id = ACCESS_KEY, 
#                         aws_secret_access_key = SECRET_KEY 
# #                         aws_session_token = SESSION_TOKEN
#                        )
    
#     # Path is an s3 bucket
#     if path[:5] == 's3://':
#         s3_bucket, s3_key = path.split('/')[2], path.split('/')[3:]
#         s3_key = '/'.join(s3_key)
#         with BytesIO() as f:
#             s3client.download_fileobj(Bucket=s3_bucket, Key=s3_key, Fileobj=f)
#             f.seek(0)
#             file = joblib.load(f)
    
#     # Path is a local directory 
#     else:
#         with open(path, 'rb') as f:
#             file = joblib.load(f)
    
#     return file

# rand_model = read_joblib('s3://rand-forest-model/crime_rand_forest.sav')

ClientError: An error occurred (403) when calling the HeadObject operation: Forbidden

In [28]:
# No active cell defines `rand_model` (the S3 loader is commented out), so
# reload the forest from the local file written by the save step.
# Only load model files you created yourself: joblib files are pickle-based.
import joblib
rand_model = joblib.load('crime_rand_forest.sav')

# The forest was fit on scaled features, so score it on the scaled test rows.
print(f"Testing Data Score: {rand_model.score(X_test_scaled[:100], y_test[:100])}")

Testing Data Score: 0.43
