In [87]:
import pandas as pd
import numpy as np 
import seaborn as sns 
import matplotlib.pyplot as plt

In [88]:
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans, MiniBatchKMeans

In [89]:
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler
import sklearn.metrics as metrics
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE, SMOTENC
from imblearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.utils import class_weight

In [90]:
df = pd.read_csv('../data/master_modeling_years.csv')
mb_stations = pd.read_csv('../data/estaciones-metrobus0.csv')
df.head(3)

Unnamed: 0,mes_hechos,fecha_hechos,delito,categoria_delito,colonia_hechos,alcaldia_hechos,fecha_inicio,calle_hechos,longitud,latitud,...,rob_victim,event_time,event_date,Linea,Afluencia,target_crimes,dist_km,manhattan_dist,geo_hash_crime,geo_hash_mb
0,Mayo,2018-05-21 20:40:00,ROBO A TRANSEUNTE DE CELULAR CON VIOLENCIA,DELITO DE BAJO IMPACTO,INFONAVIT IZTACALCO,IZTACALCO,2018-05-22 10:52:18,FRANCISCO DEL PASO Y TRONCOSO,-99.111563,19.386594,...,0,20:40:00,2018-05-21,5,89793,0,0.03,0.040434,9g3w29,9g3w29
1,Enero,2016-01-02 21:20:00,ROBO DE DINERO,DELITO DE BAJO IMPACTO,LOS REYES,IZTACALCO,2016-01-05 19:37:09,AV PLUTARCO ELIAS CALLES,-99.117183,19.39804,...,0,21:20:00,2016-01-02,2,76570,0,0.05,0.073859,9g3w2e,9g3w2e
2,Octubre,2017-10-15 12:00:00,ROBO DE OBJETOS,DELITO DE BAJO IMPACTO,MERCED BALBUENA,VENUSTIANO CARRANZA,2017-11-01 22:12:42,ANILLO DE CIRCUNVALACION,-99.124966,19.428301,...,0,12:00:00,2017-10-15,4,31720,0,0.13,0.134643,9g3w82,9g3w82


In [91]:
len(df.columns)

23

In [92]:
df.dtypes

mes_hechos           object
fecha_hechos         object
delito               object
categoria_delito     object
colonia_hechos       object
alcaldia_hechos      object
fecha_inicio         object
calle_hechos         object
longitud            float64
latitud             float64
Geopoint             object
nearest_mb           object
geo_mb               object
rob_victim            int64
event_time           object
event_date           object
Linea                 int64
Afluencia             int64
target_crimes         int64
dist_km             float64
manhattan_dist      float64
geo_hash_crime       object
geo_hash_mb          object
dtype: object

In [93]:
df = df[['fecha_hechos','longitud', 'latitud',  'delito','nearest_mb']]
df.rename(columns={"nearest_mb": "nombre"}, inplace=True)
df.shape

(25586, 5)

In [94]:
mb_stations.dtypes

Geo Point    object
Geo Shape    object
nombre       object
linea         int64
dtype: object

In [95]:
df = pd.merge(df, mb_stations)
df.shape

(29828, 8)

In [96]:
# Repeated instances because stations share lines
df.head()

Unnamed: 0,fecha_hechos,longitud,latitud,delito,nombre,Geo Point,Geo Shape,linea
0,2018-05-21 20:40:00,-99.111563,19.386594,ROBO A TRANSEUNTE DE CELULAR CON VIOLENCIA,Colegio de Bachilleres,"19.38632392,-99.1116619153","{""type"": ""Point"", ""coordinates"": [-99.11166191...",5
1,2017-08-22 15:00:00,-99.111824,19.387331,ROBO A TRANSEUNTE DE CELULAR CON VIOLENCIA,Colegio de Bachilleres,"19.38632392,-99.1116619153","{""type"": ""Point"", ""coordinates"": [-99.11166191...",5
2,2017-09-12 20:55:00,-99.11167,19.386555,ROBO A TRANSEUNTE DE CELULAR CON VIOLENCIA,Colegio de Bachilleres,"19.38632392,-99.1116619153","{""type"": ""Point"", ""coordinates"": [-99.11166191...",5
3,2018-12-29 19:50:00,-99.112752,19.386661,ROBO A TRANSEUNTE DE CELULAR CON VIOLENCIA,Colegio de Bachilleres,"19.38632392,-99.1116619153","{""type"": ""Point"", ""coordinates"": [-99.11166191...",5
4,2018-11-28 17:30:00,-99.113661,19.386001,ROBO A TRANSEUNTE DE CELULAR CON VIOLENCIA,Colegio de Bachilleres,"19.38632392,-99.1116619153","{""type"": ""Point"", ""coordinates"": [-99.11166191...",5


In [97]:
df['latitud_mb'] = df['Geo Point'].apply(lambda x: x.split(',')[0])
df['longitud_mb'] = df['Geo Point'].apply(lambda x: x.split(',')[1])
df.head(3)

Unnamed: 0,fecha_hechos,longitud,latitud,delito,nombre,Geo Point,Geo Shape,linea,latitud_mb,longitud_mb
0,2018-05-21 20:40:00,-99.111563,19.386594,ROBO A TRANSEUNTE DE CELULAR CON VIOLENCIA,Colegio de Bachilleres,"19.38632392,-99.1116619153","{""type"": ""Point"", ""coordinates"": [-99.11166191...",5,19.38632392,-99.1116619153
1,2017-08-22 15:00:00,-99.111824,19.387331,ROBO A TRANSEUNTE DE CELULAR CON VIOLENCIA,Colegio de Bachilleres,"19.38632392,-99.1116619153","{""type"": ""Point"", ""coordinates"": [-99.11166191...",5,19.38632392,-99.1116619153
2,2017-09-12 20:55:00,-99.11167,19.386555,ROBO A TRANSEUNTE DE CELULAR CON VIOLENCIA,Colegio de Bachilleres,"19.38632392,-99.1116619153","{""type"": ""Point"", ""coordinates"": [-99.11166191...",5,19.38632392,-99.1116619153


In [98]:
df = df.drop(columns=['Geo Shape', 'nombre', 'Geo Point', 'linea'])
df.head(3)

Unnamed: 0,fecha_hechos,longitud,latitud,delito,latitud_mb,longitud_mb
0,2018-05-21 20:40:00,-99.111563,19.386594,ROBO A TRANSEUNTE DE CELULAR CON VIOLENCIA,19.38632392,-99.1116619153
1,2017-08-22 15:00:00,-99.111824,19.387331,ROBO A TRANSEUNTE DE CELULAR CON VIOLENCIA,19.38632392,-99.1116619153
2,2017-09-12 20:55:00,-99.11167,19.386555,ROBO A TRANSEUNTE DE CELULAR CON VIOLENCIA,19.38632392,-99.1116619153


## Pre-processing

In [99]:
# Setting GLOBAL var: other_dic 
# Keeps track of all the list of non_significant categories
other_dic = {}

In [100]:
# Function taken from the following resource [URL]: 
# https://stackoverflow.com/questions/18016495/get-subset-of-most-frequent-dummy-variables-in-pandas
def dummy_best(dummy_col, threshold=0.0001):
        '''
            Function return a dummifed dataframe of significant dummies
            in a given column
        '''
        dummy_columns = dummy_col.copy()
        # What is the ratio of a dummy in whole column
        count = pd.value_counts(dummy_columns) / len(dummy_columns)
        # filtering based on the ratios according to a threshold
        mask = dummy_columns.isin(count[count>threshold].index)
        # replace the ones which ratio is lower than the threshold 
        other_dic[dummy_columns.name] = list(dummy_columns[~mask])
        dummy_columns[~mask] = 'others'
        return pd.get_dummies(dummy_columns, prefix=dummy_columns.name)    

In [101]:
def data_merger(main_df, features):
    merged_df = main_df.copy()
    for element in features:
        merged_df = pd.concat([merged_df, dummy_best(main_df[element])], axis=1)
    return merged_df

In [102]:
df['delito'].value_counts(normalize=True)

ROBO DE OBJETOS                                                0.189486
ROBO A NEGOCIO SIN VIOLENCIA                                   0.169807
ROBO A TRANSEUNTE DE CELULAR SIN VIOLENCIA                     0.147613
ROBO A TRANSEUNTE DE CELULAR CON VIOLENCIA                     0.110366
ROBO DE ACCESORIOS DE AUTO                                     0.090385
ROBO DE OBJETOS DEL INTERIOR DE UN VEHICULO                    0.071812
ROBO A CASA HABITACION SIN VIOLENCIA                           0.041974
ROBO DE VEHICULO DE PEDALES                                    0.033257
ROBO A PASAJERO A BORDO DE METROBUS SIN VIOLENCIA              0.030844
ROBO A PASAJERO / CONDUCTOR DE VEHICULO CON VIOLENCIA          0.025479
ROBO DE DINERO                                                 0.016159
ROBO DE DOCUMENTOS                                             0.015355
ROBO A PASAJERO A BORDO DE TRANSPORTE PÚBLICO CON VIOLENCIA    0.012874
ROBO A PASAJERO A BORDO DE PESERO COLECTIVO CON VIOLENCIA      0

In [103]:
def significance_filter(zip_element):
    significant = []
    not_significant = []
    for i in zip_element:
        if i[1] >=.1:
            significant.append(i[0])
        else:
            not_significant.append(i[0])
    return significant, not_significant

In [104]:
df.shape

(29828, 6)

In [105]:
count = pd.value_counts(df['delito']) / len(df['delito'])
sig, n_sig = significance_filter(list(zip(count.index.values, count)))

In [106]:
df['delito'] = df['delito'].apply(lambda x: None if x in n_sig else x)
df = df.dropna()
df.shape

(18412, 6)

In [107]:
# Label Ecoder for any feature with imporance of order
# based of the percentage of presence of category in column.
def feature_encoder(zip_element, var_dict):
    val_econded = len(zip_element)
    for i in zip_element:
        var_dict[i[0]] = val_econded
        val_econded -= 1

In [108]:
df['delito'].value_counts(normalize=True)

ROBO DE OBJETOS                               0.306974
ROBO A NEGOCIO SIN VIOLENCIA                  0.275092
ROBO A TRANSEUNTE DE CELULAR SIN VIOLENCIA    0.239138
ROBO A TRANSEUNTE DE CELULAR CON VIOLENCIA    0.178796
Name: delito, dtype: float64

In [109]:
# Fruther cleaning filtering
df['delito'] = df['delito'].apply(lambda x: None if x in ['ROBO A NEGOCIO SIN VIOLENCIA','ROBO DE ACCESORIOS DE AUTO'] else x)
df = df.dropna()
df.shape

(13347, 6)

In [110]:
count = pd.value_counts(df['delito']) / len(df['delito'])
list(zip(count.index.values, count))

[('ROBO DE OBJETOS', 0.42346594740391097),
 ('ROBO A TRANSEUNTE DE CELULAR SIN VIOLENCIA', 0.32988686596238853),
 ('ROBO A TRANSEUNTE DE CELULAR CON VIOLENCIA', 0.24664718663370044)]

In [111]:
encoder_robo_dict = {}
feature_encoder((list(zip(count.index.values, count))), encoder_robo_dict)
encoder_robo_dict

{'ROBO DE OBJETOS': 3,
 'ROBO A TRANSEUNTE DE CELULAR SIN VIOLENCIA': 2,
 'ROBO A TRANSEUNTE DE CELULAR CON VIOLENCIA': 1}

In [112]:
df['delito_enco'] = df['delito'].apply(lambda x: encoder_robo_dict[x]).copy()

In [113]:
df['delito'].value_counts(normalize=True)

ROBO DE OBJETOS                               0.423466
ROBO A TRANSEUNTE DE CELULAR SIN VIOLENCIA    0.329887
ROBO A TRANSEUNTE DE CELULAR CON VIOLENCIA    0.246647
Name: delito, dtype: float64

In [114]:
df = df.drop(columns=['delito', 'fecha_hechos'])

In [115]:
df.head()

Unnamed: 0,longitud,latitud,latitud_mb,longitud_mb,delito_enco
0,-99.111563,19.386594,19.38632392,-99.1116619153,1
1,-99.111824,19.387331,19.38632392,-99.1116619153,1
2,-99.11167,19.386555,19.38632392,-99.1116619153,1
3,-99.112752,19.386661,19.38632392,-99.1116619153,1
4,-99.113661,19.386001,19.38632392,-99.1116619153,1


### K-means

In [116]:
X = df.drop(columns=['delito_enco'])
y = df['delito_enco']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y)

In [161]:
coords = np.vstack((X_train[['latitud', 'longitud']].values,
                    X_train[['latitud_mb', 'longitud_mb']].values,
                    X_test[['latitud', 'longitud']].values,
                    X_test[['latitud_mb', 'longitud_mb']].values))
sample_ind = np.random.permutation(len(coords))[:500000]
kmeans = MiniBatchKMeans(n_clusters=100, batch_size=10000).fit(coords[sample_ind])

In [162]:
X_train.loc[:, 'la_c'] = kmeans.predict(X_train[['latitud', 'longitud']])
X_train.loc[:, 'lon_c'] = kmeans.predict(X_train[['latitud_mb', 'longitud_mb']])
X_test.loc[:, 'la_c'] = kmeans.predict(X_test[['latitud', 'longitud']])
X_test.loc[:, 'lon_c'] = kmeans.predict(X_test[['latitud_mb', 'longitud_mb']])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the d

In [163]:
X_train.head()

Unnamed: 0,longitud,latitud,latitud_mb,longitud_mb,la_c,lon_c
5984,-99.113681,19.419295,19.4183741045,-99.1122187196,89,89
18661,-99.147815,19.438779,19.4360208606,-99.147228265,3,3
20473,-99.11041,19.381415,19.3828869839,-99.1108887675,56,56
27860,-99.162306,19.519108,19.5196884339,-99.1637719724,99,99
27347,-99.094937,19.477645,19.4769982194,-99.0948748843,1,1


In [164]:
X_train.sort_index()

Unnamed: 0,longitud,latitud,latitud_mb,longitud_mb,la_c,lon_c
0,-99.111563,19.386594,19.38632392,-99.1116619153,56,56
2,-99.111670,19.386555,19.38632392,-99.1116619153,56,56
3,-99.112752,19.386661,19.38632392,-99.1116619153,56,56
5,-99.113596,19.386798,19.38632392,-99.1116619153,56,56
7,-99.111478,19.386590,19.38632392,-99.1116619153,56,56
...,...,...,...,...,...,...
29805,-99.152312,19.463312,19.4630960371,-99.1526627713,94,94
29808,-99.150763,19.465095,19.4630960371,-99.1526627713,94,94
29810,-99.150412,19.462736,19.4630960371,-99.1526627713,94,94
29814,-99.191824,19.508668,19.5109448168,-99.1926053403,15,15


In [165]:
df.head()

Unnamed: 0,longitud,latitud,latitud_mb,longitud_mb,delito_enco
0,-99.111563,19.386594,19.38632392,-99.1116619153,1
1,-99.111824,19.387331,19.38632392,-99.1116619153,1
2,-99.11167,19.386555,19.38632392,-99.1116619153,1
3,-99.112752,19.386661,19.38632392,-99.1116619153,1
4,-99.113661,19.386001,19.38632392,-99.1116619153,1


In [166]:
class_weights = class_weight.compute_class_weight('balanced', np.unique(y),y)
class_weights_dic = { i+1:class_weights[i] for i in range(len(class_weights))}
class_weights_dic
    

1        1
2        1
3        1
4        1
        ..
29808    3
29810    2
29813    2
29814    3
29821    3
Name: delito_enco, Length: 13347, dtype: int64 as keyword args. From version 0.25 passing these as positional arguments will result in an error


{1: 1.3514580801944107, 2: 1.010447422212128, 3: 0.7871549893842887}

In [167]:
pipe = Pipeline([
        ('scale', StandardScaler()),
        ('lr', LogisticRegression(max_iter=1000, class_weight=class_weights_dic))
    ])

In [180]:
pipe_params = {
    'lr__C': [0.1, 0.3, 0.5, 1]
}
grid = GridSearchCV(pipe, pipe_params, scoring = 'recall_micro')

In [181]:
grid.fit(X_train, y_train)

GridSearchCV(estimator=Pipeline(steps=[('scale', StandardScaler()),
                                       ('lr',
                                        LogisticRegression(class_weight={1: 1.3514580801944107,
                                                                         2: 1.010447422212128,
                                                                         3: 0.7871549893842887},
                                                           max_iter=1000))]),
             param_grid={'lr__C': [0.1, 0.3, 0.5, 1]}, scoring='recall_micro')

In [182]:
grid.score(X_train, y_train)

0.41228771228771227

In [183]:
grid.score(X_test, y_test)

0.41624213365298174