<a href="https://colab.research.google.com/github/giobritos/soul_code_projeto_final/blob/main/Import_Originais_NY.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

![NY](https://storage.googleapis.com/projeto-final-agsw/Imagens/novayorkpd.jpeg)

# 👮 **Importação da Base Original de Nova York**

Os dados escolhidos estão disponíveis no site **NYC Open Data**, administrado pelos órgãos públicos da cidade de **Nova York**, nos **Estados Unidos da América**. Eles estão disponíveis publicamente para consulta e download [neste link](https://data.cityofnewyork.us/Public-Safety/NYPD-Complaint-Data-Historic/qgea-i56i).

Este conjunto de dados inclui todos os crimes válidos, contravenções e violações relatados ao **Departamento de Polícia da Cidade de Nova York** (NYPD) de 2006 até o final de 2021. Para o nosso estudo, utilizaremos os registros de **2018** até **2021**.

Link para a documentação completa do projeto: [clique aqui](https://www.notion.so/Mindful-Data-Consultoria-d495964059c34acb9c8fe3d21dbb5cf6)

## **📚 1 - Instalação e importação das bibliotecas**

In [None]:
# Instalador gcs
%%capture
%pip install gcsfs

# Conector MySQL
%pip install mysql-connector-python
%pip install PyMySQL

In [None]:
# install dependencies
import sys
!{sys.executable} -m pip install cloud-sql-python-connector["pymysql"] SQLAlchemy

In [None]:
# Conexão do Storage
from google.cloud import storage

# Importa sistema operacional
import os

# Conector do MySQL
from google.cloud.sql.connector import Connector
import sqlalchemy

# Pandas
import pandas as pd

# Let pandas show up to 100 columns when rendering a DataFrame
pd.options.display.max_columns = 100

## 🔗 **2 - Conexão com o DataLake**

In [None]:
# Service-account key file used to authenticate against Google Cloud;
# its path is exported through GOOGLE_APPLICATION_CREDENTIALS.
serviceAccount = '/content/projeto-final-373521-25961e56ca37.json'
os.environ.update(GOOGLE_APPLICATION_CREDENTIALS=serviceAccount)

# Cloud Storage client, created after the credentials variable is set
client = storage.Client()

# Object returned by get_bucket for the project bucket
bucket = client.get_bucket('projeto-final-agsw')

## ⚒️📑 **3 - Extração e padronização das bases de dados**

In [None]:
# Load the NYPD complaint CSV from Cloud Storage into a pandas DataFrame
df = pd.read_csv('gs://datasets-originais-agsw/ny/NYPD_Complaint_Data_Historic.csv')

  exec(code_obj, self.user_global_ns, self.user_ns)


In [None]:
# Preview the first five rows of the frame (head() defaults to n=5)
df.head()

Unnamed: 0,CMPLNT_NUM,CMPLNT_FR_DT,CMPLNT_FR_TM,CMPLNT_TO_DT,CMPLNT_TO_TM,ADDR_PCT_CD,RPT_DT,KY_CD,OFNS_DESC,PD_CD,PD_DESC,CRM_ATPT_CPTD_CD,LAW_CAT_CD,BORO_NM,LOC_OF_OCCUR_DESC,PREM_TYP_DESC,JURIS_DESC,JURISDICTION_CODE,PARKS_NM,HADEVELOPT,HOUSING_PSA,X_COORD_CD,Y_COORD_CD,SUSP_AGE_GROUP,SUSP_RACE,SUSP_SEX,TRANSIT_DISTRICT,Latitude,Longitude,Lat_Lon,PATROL_BORO,STATION_NAME,VIC_AGE_GROUP,VIC_RACE,VIC_SEX
0,506547392,03/29/2018,20:30:00,,,32.0,03/30/2018,351,CRIMINAL MISCHIEF & RELATED OF,254.0,"MISCHIEF, CRIMINAL 4, OF MOTOR",COMPLETED,MISDEMEANOR,MANHATTAN,FRONT OF,PARKING LOT/GARAGE (PUBLIC),N.Y. POLICE DEPT,0.0,,,,1000565.0,234704.0,,,,,40.810877,-73.941064,"(40.810877241, -73.941064151)",PATROL BORO MAN NORTH,,25-44,WHITE,F
1,629632833,02/06/2018,23:15:00,,,52.0,02/07/2018,341,PETIT LARCENY,333.0,"LARCENY,PETIT FROM STORE-SHOPL",COMPLETED,MISDEMEANOR,BRONX,INSIDE,DEPARTMENT STORE,N.Y. POLICE DEPT,0.0,,,,1009690.0,257590.0,45-64,BLACK,F,,40.873671,-73.908014,"(40.873671035, -73.908013649)",PATROL BORO BRONX,,UNKNOWN,UNKNOWN,D
2,787203902,11/21/2018,00:15:00,11/21/2018,00:20:00,75.0,11/21/2018,341,PETIT LARCENY,321.0,"LARCENY,PETIT FROM AUTO",COMPLETED,MISDEMEANOR,BROOKLYN,FRONT OF,STREET,N.Y. POLICE DEPT,0.0,,,,1016034.0,176756.0,25-44,WHITE HISPANIC,F,,40.651782,-73.885457,"(40.651782232, -73.885456761)",PATROL BORO BKLYN NORTH,,UNKNOWN,UNKNOWN,D
3,280364018,06/09/2018,21:42:00,06/09/2018,21:43:00,10.0,06/10/2018,361,OFF. AGNST PUB ORD SENSBLTY &,639.0,AGGRAVATED HARASSMENT 2,COMPLETED,MISDEMEANOR,MANHATTAN,INSIDE,RESIDENCE - APT. HOUSE,N.Y. POLICE DEPT,0.0,,,,985717.0,215911.0,25-44,WHITE HISPANIC,M,,40.75931,-73.994706,"(40.759310399, -73.994706072)",PATROL BORO MAN SOUTH,,18-24,WHITE HISPANIC,F
4,985800320,11/10/2018,19:40:00,11/10/2018,19:45:00,19.0,11/10/2018,341,PETIT LARCENY,333.0,"LARCENY,PETIT FROM STORE-SHOPL",COMPLETED,MISDEMEANOR,MANHATTAN,INSIDE,CHAIN STORE,N.Y. POLICE DEPT,0.0,,,,992359.0,217816.0,<18,BLACK HISPANIC,F,,40.764536,-73.970728,"(40.764535539, -73.970728388)",PATROL BORO MAN NORTH,,UNKNOWN,UNKNOWN,D


In [None]:
# Inspect the dtype pandas assigned to each column
df.dtypes

CMPLNT_NUM             int64
CMPLNT_FR_DT          object
CMPLNT_FR_TM          object
CMPLNT_TO_DT          object
CMPLNT_TO_TM          object
ADDR_PCT_CD          float64
RPT_DT                object
KY_CD                  int64
OFNS_DESC             object
PD_CD                float64
PD_DESC               object
CRM_ATPT_CPTD_CD      object
LAW_CAT_CD            object
BORO_NM               object
LOC_OF_OCCUR_DESC     object
PREM_TYP_DESC         object
JURIS_DESC            object
JURISDICTION_CODE    float64
PARKS_NM              object
HADEVELOPT            object
HOUSING_PSA           object
X_COORD_CD           float64
Y_COORD_CD           float64
SUSP_AGE_GROUP        object
SUSP_RACE             object
SUSP_SEX              object
TRANSIT_DISTRICT     float64
Latitude             float64
Longitude            float64
Lat_Lon               object
PATROL_BORO           object
STATION_NAME          object
VIC_AGE_GROUP         object
VIC_RACE              object
VIC_SEX       

### ☑️ **Filtrando Data Frame**

In [None]:
# List the distinct offence descriptions present in the data
df['OFNS_DESC'].unique()

array(['CRIMINAL MISCHIEF & RELATED OF', 'PETIT LARCENY',
       'OFF. AGNST PUB ORD SENSBLTY &', 'FORGERY', 'DANGEROUS WEAPONS',
       'ASSAULT 3 & RELATED OFFENSES', 'FELONY ASSAULT', 'GRAND LARCENY',
       'OFFENSES AGAINST PUBLIC ADMINI', 'ROBBERY', 'HARRASSMENT 2',
       'DANGEROUS DRUGS', 'SEX CRIMES', 'VEHICLE AND TRAFFIC LAWS',
       'OTHER OFFENSES RELATED TO THEF', 'CRIMINAL TRESPASS',
       'OFFENSES AGAINST THE PERSON', 'BURGLARY',
       'POSSESSION OF STOLEN PROPERTY', 'MISCELLANEOUS PENAL LAW',
       'ARSON', 'GRAND LARCENY OF MOTOR VEHICLE', 'RAPE',
       'OFFENSES INVOLVING FRAUD', 'INTOXICATED & IMPAIRED DRIVING',
       'UNAUTHORIZED USE OF A VEHICLE', 'NYS LAWS-UNCLASSIFIED FELONY',
       'OTHER STATE LAWS (NON PENAL LA', 'FRAUDS', 'THEFT-FRAUD',
       'ADMINISTRATIVE CODE', nan, 'OFFENSES AGAINST PUBLIC SAFETY',
       'DISORDERLY CONDUCT', 'ANTICIPATORY OFFENSES', 'JOSTLING',
       'KIDNAPPING & RELATED OFFENSES', 'MURDER & NON-NEGL. MANSLAUGHTER',
     

**Tipos de crimes que iremos trabalhar:**


---


* MURDER & NON-NEGL. MANSLAUGHTER

* HOMICIDE-NEGLIGENT,UNCLASSIFIE

* HOMICIDE-NEGLIGENT-VEHICLE

In [None]:
# Drop rows whose OFNS_DESC (offence description) is missing
ft_null1 = df['OFNS_DESC'].notna()

df = df[ft_null1]
df

Unnamed: 0,CMPLNT_NUM,CMPLNT_FR_DT,CMPLNT_FR_TM,CMPLNT_TO_DT,CMPLNT_TO_TM,ADDR_PCT_CD,RPT_DT,KY_CD,OFNS_DESC,PD_CD,PD_DESC,CRM_ATPT_CPTD_CD,LAW_CAT_CD,BORO_NM,LOC_OF_OCCUR_DESC,PREM_TYP_DESC,JURIS_DESC,JURISDICTION_CODE,PARKS_NM,HADEVELOPT,HOUSING_PSA,X_COORD_CD,Y_COORD_CD,SUSP_AGE_GROUP,SUSP_RACE,SUSP_SEX,TRANSIT_DISTRICT,Latitude,Longitude,Lat_Lon,PATROL_BORO,STATION_NAME,VIC_AGE_GROUP,VIC_RACE,VIC_SEX
0,506547392,03/29/2018,20:30:00,,,32.0,03/30/2018,351,CRIMINAL MISCHIEF & RELATED OF,254.0,"MISCHIEF, CRIMINAL 4, OF MOTOR",COMPLETED,MISDEMEANOR,MANHATTAN,FRONT OF,PARKING LOT/GARAGE (PUBLIC),N.Y. POLICE DEPT,0.0,,,,1000565.0,234704.0,,,,,40.810877,-73.941064,"(40.810877241, -73.941064151)",PATROL BORO MAN NORTH,,25-44,WHITE,F
1,629632833,02/06/2018,23:15:00,,,52.0,02/07/2018,341,PETIT LARCENY,333.0,"LARCENY,PETIT FROM STORE-SHOPL",COMPLETED,MISDEMEANOR,BRONX,INSIDE,DEPARTMENT STORE,N.Y. POLICE DEPT,0.0,,,,1009690.0,257590.0,45-64,BLACK,F,,40.873671,-73.908014,"(40.873671035, -73.908013649)",PATROL BORO BRONX,,UNKNOWN,UNKNOWN,D
2,787203902,11/21/2018,00:15:00,11/21/2018,00:20:00,75.0,11/21/2018,341,PETIT LARCENY,321.0,"LARCENY,PETIT FROM AUTO",COMPLETED,MISDEMEANOR,BROOKLYN,FRONT OF,STREET,N.Y. POLICE DEPT,0.0,,,,1016034.0,176756.0,25-44,WHITE HISPANIC,F,,40.651782,-73.885457,"(40.651782232, -73.885456761)",PATROL BORO BKLYN NORTH,,UNKNOWN,UNKNOWN,D
3,280364018,06/09/2018,21:42:00,06/09/2018,21:43:00,10.0,06/10/2018,361,OFF. AGNST PUB ORD SENSBLTY &,639.0,AGGRAVATED HARASSMENT 2,COMPLETED,MISDEMEANOR,MANHATTAN,INSIDE,RESIDENCE - APT. HOUSE,N.Y. POLICE DEPT,0.0,,,,985717.0,215911.0,25-44,WHITE HISPANIC,M,,40.759310,-73.994706,"(40.759310399, -73.994706072)",PATROL BORO MAN SOUTH,,18-24,WHITE HISPANIC,F
4,985800320,11/10/2018,19:40:00,11/10/2018,19:45:00,19.0,11/10/2018,341,PETIT LARCENY,333.0,"LARCENY,PETIT FROM STORE-SHOPL",COMPLETED,MISDEMEANOR,MANHATTAN,INSIDE,CHAIN STORE,N.Y. POLICE DEPT,0.0,,,,992359.0,217816.0,<18,BLACK HISPANIC,F,,40.764536,-73.970728,"(40.764535539, -73.970728388)",PATROL BORO MAN NORTH,,UNKNOWN,UNKNOWN,D
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7825494,104527061,05/01/2008,18:55:00,05/01/2018,19:00:00,10.0,05/01/2018,235,DANGEROUS DRUGS,511.0,"CONTROLLED SUBSTANCE, POSSESSI",COMPLETED,MISDEMEANOR,MANHATTAN,FRONT OF,STREET,N.Y. POLICE DEPT,0.0,,,,985209.0,214992.0,45-64,WHITE HISPANIC,F,,40.756788,-73.996540,"(40.756788048, -73.996539912)",PATROL BORO MAN SOUTH,,UNKNOWN,UNKNOWN,E
7825495,284201488,05/18/2018,14:50:00,05/18/2018,14:55:00,122.0,05/22/2018,344,ASSAULT 3 & RELATED OFFENSES,101.0,ASSAULT 3,COMPLETED,MISDEMEANOR,STATEN ISLAND,,STREET,N.Y. POLICE DEPT,0.0,,,,955828.0,149854.0,<18,BLACK,M,,40.577953,-74.102317,"(40.577953265, -74.102316756)",PATROL BORO STATEN ISLAND,,<18,WHITE HISPANIC,M
7825496,808565901,11/19/2018,02:25:00,11/19/2018,02:30:00,110.0,11/19/2018,109,GRAND LARCENY,421.0,"LARCENY,GRAND FROM VEHICLE/MOTORCYCLE",COMPLETED,FELONY,QUEENS,FRONT OF,STREET,N.Y. POLICE DEPT,0.0,,,,1022372.0,210067.0,UNKNOWN,WHITE,M,,40.743188,-73.862427,"(40.743188053, -73.862427079)",PATROL BORO QUEENS NORTH,,25-44,ASIAN / PACIFIC ISLANDER,M
7825497,649441648,02/03/2018,10:02:00,02/03/2018,12:15:00,122.0,02/03/2018,578,HARRASSMENT 2,638.0,"HARASSMENT,SUBD 3,4,5",COMPLETED,VIOLATION,STATEN ISLAND,FRONT OF,RESIDENCE-HOUSE,N.Y. POLICE DEPT,0.0,,,,959533.0,155574.0,45-64,WHITE,F,,40.593665,-74.089000,"(40.593664627, -74.089000194)",PATROL BORO STATEN ISLAND,,45-64,WHITE,M


In [None]:
# Keep only the homicide cases used in the study: any offence description
# matching the pattern MURDER or HOMICIDE
ft_crime = df['OFNS_DESC'].str.contains('MURDER|HOMICIDE', regex=True)

df = df[ft_crime]
df

Unnamed: 0,CMPLNT_NUM,CMPLNT_FR_DT,CMPLNT_FR_TM,CMPLNT_TO_DT,CMPLNT_TO_TM,ADDR_PCT_CD,RPT_DT,KY_CD,OFNS_DESC,PD_CD,PD_DESC,CRM_ATPT_CPTD_CD,LAW_CAT_CD,BORO_NM,LOC_OF_OCCUR_DESC,PREM_TYP_DESC,JURIS_DESC,JURISDICTION_CODE,PARKS_NM,HADEVELOPT,HOUSING_PSA,X_COORD_CD,Y_COORD_CD,SUSP_AGE_GROUP,SUSP_RACE,SUSP_SEX,TRANSIT_DISTRICT,Latitude,Longitude,Lat_Lon,PATROL_BORO,STATION_NAME,VIC_AGE_GROUP,VIC_RACE,VIC_SEX
1034,589015329,07/09/2018,14:25:00,,,41.0,07/09/2018,101,MURDER & NON-NEGL. MANSLAUGHTER,,,COMPLETED,FELONY,,INSIDE,,OTHER,,,,,1017934.0,232221.0,25-44,WHITE HISPANIC,M,,40.804013,-73.878332,"(40.804012949, -73.878331833)",,,25-44,BLACK,M
2082,842779485,03/14/2018,05:00:00,,,73.0,03/14/2018,101,MURDER & NON-NEGL. MANSLAUGHTER,,,COMPLETED,FELONY,,INSIDE,,N.Y. POLICE DEPT,,,,,1009869.0,180290.0,25-44,BLACK,M,,40.661502,-73.907661,"(40.661502264, -73.907661037)",,,<18,BLACK,F
2220,886259402,04/18/2018,20:40:00,,,79.0,04/18/2018,101,MURDER & NON-NEGL. MANSLAUGHTER,,,COMPLETED,FELONY,,OUTSIDE,,N.Y. POLICE DEPT,,,,,999636.0,192000.0,,,,,40.693667,-73.944518,"(40.693667261, -73.94451783)",,,25-44,BLACK,M
2940,586251167,06/03/2013,21:45:00,,,104.0,06/03/2013,101,MURDER & NON-NEGL. MANSLAUGHTER,,,COMPLETED,FELONY,,INSIDE,,N.Y. POLICE DEPT,,,,,1006551.0,198590.0,,,,,40.711741,-73.919560,"(40.711740538, -73.919559767)",,,18-24,WHITE HISPANIC,M
3537,175402761,05/18/2018,08:05:00,,,18.0,05/18/2018,101,MURDER & NON-NEGL. MANSLAUGHTER,,,COMPLETED,FELONY,,OUTSIDE,,N.Y. POLICE DEPT,,,,,990284.0,214597.0,45-64,BLACK,F,,40.755702,-73.978222,"(40.755701867, -73.978221916)",,,<18,WHITE,M
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7814915,794560311,11/22/2018,03:43:00,11/22/2018,03:44:00,108.0,11/22/2018,103,"HOMICIDE-NEGLIGENT,UNCLASSIFIE",125.0,"HOMICIDE,NEGLIGENT,UNCLASSIFIE",COMPLETED,FELONY,QUEENS,FRONT OF,STREET,N.Y. POLICE DEPT,0.0,,,,1006145.0,210199.0,UNKNOWN,WHITE,M,,40.743605,-73.920986,"(40.74360542, -73.920986473)",PATROL BORO QUEENS NORTH,,18-24,WHITE,M
7820560,378477164,07/17/2018,19:58:00,,,32.0,07/17/2018,101,MURDER & NON-NEGL. MANSLAUGHTER,,,COMPLETED,FELONY,,OUTSIDE,,N.Y. POLICE DEPT,,,,,1001749.0,240971.0,25-44,BLACK,M,,40.828076,-73.936771,"(40.828076123, -73.936770657)",,,18-24,BLACK,M
7822010,166327108,10/19/2018,22:50:00,10/19/2018,23:48:00,32.0,10/19/2018,103,"HOMICIDE-NEGLIGENT,UNCLASSIFIE",125.0,"HOMICIDE,NEGLIGENT,UNCLASSIFIE",COMPLETED,FELONY,MANHATTAN,FRONT OF,STREET,N.Y. POLICE DEPT,0.0,,,,1001079.0,240747.0,UNKNOWN,UNKNOWN,U,,40.827463,-73.939192,"(40.827462608, -73.939192193)",PATROL BORO MAN NORTH,,25-44,BLACK,M
7823517,522352135,03/14/2018,05:00:00,,,73.0,03/14/2018,101,MURDER & NON-NEGL. MANSLAUGHTER,,,COMPLETED,FELONY,,INSIDE,,N.Y. POLICE DEPT,,,,,1009869.0,180290.0,25-44,BLACK,M,,40.661502,-73.907661,"(40.661502264, -73.907661037)",,,45-64,BLACK,M


In [None]:
# Drop rows whose CMPLNT_FR_DT is missing.
# NOTE(review): the original comment named RPT_DT (report date), but the
# column actually filtered here is CMPLNT_FR_DT.
ft_null2 = df['CMPLNT_FR_DT'].notna()

df = df[ft_null2]
df

Unnamed: 0,CMPLNT_NUM,CMPLNT_FR_DT,CMPLNT_FR_TM,CMPLNT_TO_DT,CMPLNT_TO_TM,ADDR_PCT_CD,RPT_DT,KY_CD,OFNS_DESC,PD_CD,PD_DESC,CRM_ATPT_CPTD_CD,LAW_CAT_CD,BORO_NM,LOC_OF_OCCUR_DESC,PREM_TYP_DESC,JURIS_DESC,JURISDICTION_CODE,PARKS_NM,HADEVELOPT,HOUSING_PSA,X_COORD_CD,Y_COORD_CD,SUSP_AGE_GROUP,SUSP_RACE,SUSP_SEX,TRANSIT_DISTRICT,Latitude,Longitude,Lat_Lon,PATROL_BORO,STATION_NAME,VIC_AGE_GROUP,VIC_RACE,VIC_SEX
1034,589015329,07/09/2018,14:25:00,,,41.0,07/09/2018,101,MURDER & NON-NEGL. MANSLAUGHTER,,,COMPLETED,FELONY,,INSIDE,,OTHER,,,,,1017934.0,232221.0,25-44,WHITE HISPANIC,M,,40.804013,-73.878332,"(40.804012949, -73.878331833)",,,25-44,BLACK,M
2082,842779485,03/14/2018,05:00:00,,,73.0,03/14/2018,101,MURDER & NON-NEGL. MANSLAUGHTER,,,COMPLETED,FELONY,,INSIDE,,N.Y. POLICE DEPT,,,,,1009869.0,180290.0,25-44,BLACK,M,,40.661502,-73.907661,"(40.661502264, -73.907661037)",,,<18,BLACK,F
2220,886259402,04/18/2018,20:40:00,,,79.0,04/18/2018,101,MURDER & NON-NEGL. MANSLAUGHTER,,,COMPLETED,FELONY,,OUTSIDE,,N.Y. POLICE DEPT,,,,,999636.0,192000.0,,,,,40.693667,-73.944518,"(40.693667261, -73.94451783)",,,25-44,BLACK,M
2940,586251167,06/03/2013,21:45:00,,,104.0,06/03/2013,101,MURDER & NON-NEGL. MANSLAUGHTER,,,COMPLETED,FELONY,,INSIDE,,N.Y. POLICE DEPT,,,,,1006551.0,198590.0,,,,,40.711741,-73.919560,"(40.711740538, -73.919559767)",,,18-24,WHITE HISPANIC,M
3537,175402761,05/18/2018,08:05:00,,,18.0,05/18/2018,101,MURDER & NON-NEGL. MANSLAUGHTER,,,COMPLETED,FELONY,,OUTSIDE,,N.Y. POLICE DEPT,,,,,990284.0,214597.0,45-64,BLACK,F,,40.755702,-73.978222,"(40.755701867, -73.978221916)",,,<18,WHITE,M
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7814915,794560311,11/22/2018,03:43:00,11/22/2018,03:44:00,108.0,11/22/2018,103,"HOMICIDE-NEGLIGENT,UNCLASSIFIE",125.0,"HOMICIDE,NEGLIGENT,UNCLASSIFIE",COMPLETED,FELONY,QUEENS,FRONT OF,STREET,N.Y. POLICE DEPT,0.0,,,,1006145.0,210199.0,UNKNOWN,WHITE,M,,40.743605,-73.920986,"(40.74360542, -73.920986473)",PATROL BORO QUEENS NORTH,,18-24,WHITE,M
7820560,378477164,07/17/2018,19:58:00,,,32.0,07/17/2018,101,MURDER & NON-NEGL. MANSLAUGHTER,,,COMPLETED,FELONY,,OUTSIDE,,N.Y. POLICE DEPT,,,,,1001749.0,240971.0,25-44,BLACK,M,,40.828076,-73.936771,"(40.828076123, -73.936770657)",,,18-24,BLACK,M
7822010,166327108,10/19/2018,22:50:00,10/19/2018,23:48:00,32.0,10/19/2018,103,"HOMICIDE-NEGLIGENT,UNCLASSIFIE",125.0,"HOMICIDE,NEGLIGENT,UNCLASSIFIE",COMPLETED,FELONY,MANHATTAN,FRONT OF,STREET,N.Y. POLICE DEPT,0.0,,,,1001079.0,240747.0,UNKNOWN,UNKNOWN,U,,40.827463,-73.939192,"(40.827462608, -73.939192193)",PATROL BORO MAN NORTH,,25-44,BLACK,M
7823517,522352135,03/14/2018,05:00:00,,,73.0,03/14/2018,101,MURDER & NON-NEGL. MANSLAUGHTER,,,COMPLETED,FELONY,,INSIDE,,N.Y. POLICE DEPT,,,,,1009869.0,180290.0,25-44,BLACK,M,,40.661502,-73.907661,"(40.661502264, -73.907661037)",,,45-64,BLACK,M


In [None]:
# Keep only complaints whose CMPLNT_FR_DT falls in the study window,
# 2018 through 2021 inclusive.
#
# The MM/DD/YYYY strings are parsed and the year is compared numerically,
# instead of searching for the digits "2018".."2021" anywhere in the text.
# Values that cannot be parsed become NaT; their year is NaN, which fails
# the range test, so those rows are dropped.
ano_ocorrencia = pd.to_datetime(
    df['CMPLNT_FR_DT'], format='%m/%d/%Y', errors='coerce'
).dt.year
ft_ano = ano_ocorrencia.between(2018, 2021)

df = df.loc[ft_ano]
df

Unnamed: 0,CMPLNT_NUM,CMPLNT_FR_DT,CMPLNT_FR_TM,CMPLNT_TO_DT,CMPLNT_TO_TM,ADDR_PCT_CD,RPT_DT,KY_CD,OFNS_DESC,PD_CD,PD_DESC,CRM_ATPT_CPTD_CD,LAW_CAT_CD,BORO_NM,LOC_OF_OCCUR_DESC,PREM_TYP_DESC,JURIS_DESC,JURISDICTION_CODE,PARKS_NM,HADEVELOPT,HOUSING_PSA,X_COORD_CD,Y_COORD_CD,SUSP_AGE_GROUP,SUSP_RACE,SUSP_SEX,TRANSIT_DISTRICT,Latitude,Longitude,Lat_Lon,PATROL_BORO,STATION_NAME,VIC_AGE_GROUP,VIC_RACE,VIC_SEX
1034,589015329,07/09/2018,14:25:00,,,41.0,07/09/2018,101,MURDER & NON-NEGL. MANSLAUGHTER,,,COMPLETED,FELONY,,INSIDE,,OTHER,,,,,1017934.0,232221.0,25-44,WHITE HISPANIC,M,,40.804013,-73.878332,"(40.804012949, -73.878331833)",,,25-44,BLACK,M
2082,842779485,03/14/2018,05:00:00,,,73.0,03/14/2018,101,MURDER & NON-NEGL. MANSLAUGHTER,,,COMPLETED,FELONY,,INSIDE,,N.Y. POLICE DEPT,,,,,1009869.0,180290.0,25-44,BLACK,M,,40.661502,-73.907661,"(40.661502264, -73.907661037)",,,<18,BLACK,F
2220,886259402,04/18/2018,20:40:00,,,79.0,04/18/2018,101,MURDER & NON-NEGL. MANSLAUGHTER,,,COMPLETED,FELONY,,OUTSIDE,,N.Y. POLICE DEPT,,,,,999636.0,192000.0,,,,,40.693667,-73.944518,"(40.693667261, -73.94451783)",,,25-44,BLACK,M
3537,175402761,05/18/2018,08:05:00,,,18.0,05/18/2018,101,MURDER & NON-NEGL. MANSLAUGHTER,,,COMPLETED,FELONY,,OUTSIDE,,N.Y. POLICE DEPT,,,,,990284.0,214597.0,45-64,BLACK,F,,40.755702,-73.978222,"(40.755701867, -73.978221916)",,,<18,WHITE,M
4155,321494527,09/10/2018,08:00:00,,,48.0,09/10/2018,101,MURDER & NON-NEGL. MANSLAUGHTER,,,COMPLETED,FELONY,,INSIDE,,N.Y. POLICE DEPT,,,,,1016309.0,247659.0,45-64,BLACK,M,,40.846392,-73.884128,"(40.846391886, -73.88412771)",,,45-64,BLACK,F
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7814915,794560311,11/22/2018,03:43:00,11/22/2018,03:44:00,108.0,11/22/2018,103,"HOMICIDE-NEGLIGENT,UNCLASSIFIE",125.0,"HOMICIDE,NEGLIGENT,UNCLASSIFIE",COMPLETED,FELONY,QUEENS,FRONT OF,STREET,N.Y. POLICE DEPT,0.0,,,,1006145.0,210199.0,UNKNOWN,WHITE,M,,40.743605,-73.920986,"(40.74360542, -73.920986473)",PATROL BORO QUEENS NORTH,,18-24,WHITE,M
7820560,378477164,07/17/2018,19:58:00,,,32.0,07/17/2018,101,MURDER & NON-NEGL. MANSLAUGHTER,,,COMPLETED,FELONY,,OUTSIDE,,N.Y. POLICE DEPT,,,,,1001749.0,240971.0,25-44,BLACK,M,,40.828076,-73.936771,"(40.828076123, -73.936770657)",,,18-24,BLACK,M
7822010,166327108,10/19/2018,22:50:00,10/19/2018,23:48:00,32.0,10/19/2018,103,"HOMICIDE-NEGLIGENT,UNCLASSIFIE",125.0,"HOMICIDE,NEGLIGENT,UNCLASSIFIE",COMPLETED,FELONY,MANHATTAN,FRONT OF,STREET,N.Y. POLICE DEPT,0.0,,,,1001079.0,240747.0,UNKNOWN,UNKNOWN,U,,40.827463,-73.939192,"(40.827462608, -73.939192193)",PATROL BORO MAN NORTH,,25-44,BLACK,M
7823517,522352135,03/14/2018,05:00:00,,,73.0,03/14/2018,101,MURDER & NON-NEGL. MANSLAUGHTER,,,COMPLETED,FELONY,,INSIDE,,N.Y. POLICE DEPT,,,,,1009869.0,180290.0,25-44,BLACK,M,,40.661502,-73.907661,"(40.661502264, -73.907661037)",,,45-64,BLACK,M


## 💿 **4 - Upload do arquivo original no Data Lake**

In [None]:
# Write the filtered frame to a local CSV (index column omitted) so it can be sent to GCS.
# The path is relative, while the upload cell reads '/content/ny_orig.csv' —
# this assumes the working directory is /content; confirm when running elsewhere.
df.to_csv('ny_orig.csv', index=False)

### ⬆️ **Envio ao Google Cloud Storage**

In [None]:
def upload_blob(bucket, arquivo, destino):
    """Upload a local file to a Cloud Storage bucket and report it on stdout.

    Args:
        bucket: Bucket name, handed to ``client.bucket``.
        arquivo: Path of the local file to upload.
        destino: Blob name the file receives inside the bucket.
    """
    alvo = storage.Client().bucket(bucket).blob(destino)
    alvo.upload_from_filename(arquivo)

    print(f"Arquivo {arquivo} enviado a {destino}.")

In [None]:
# Upload the exported CSV to the project bucket.
# Note: `bucket` is rebound here to the bucket *name* (a string).
bucket = 'projeto-final-agsw'
arquivo = '/content/ny_orig.csv'
destino = 'originais/ny_orig.csv'
upload_blob(bucket=bucket, arquivo=arquivo, destino=destino)

Arquivo /content/ny_orig.csv enviado a originais/ny_orig.csv.


In [None]:
# Read the uploaded object back from Cloud Storage to confirm it is available.
# Uses the public pd.read_csv entry point rather than the incidental
# `pd.pandas` self-reference.
df1 = pd.read_csv('gs://projeto-final-agsw/originais/ny_orig.csv')
df1

Unnamed: 0,CMPLNT_NUM,CMPLNT_FR_DT,CMPLNT_FR_TM,CMPLNT_TO_DT,CMPLNT_TO_TM,ADDR_PCT_CD,RPT_DT,KY_CD,OFNS_DESC,PD_CD,PD_DESC,CRM_ATPT_CPTD_CD,LAW_CAT_CD,BORO_NM,LOC_OF_OCCUR_DESC,PREM_TYP_DESC,JURIS_DESC,JURISDICTION_CODE,PARKS_NM,HADEVELOPT,HOUSING_PSA,X_COORD_CD,Y_COORD_CD,SUSP_AGE_GROUP,SUSP_RACE,SUSP_SEX,TRANSIT_DISTRICT,Latitude,Longitude,Lat_Lon,PATROL_BORO,STATION_NAME,VIC_AGE_GROUP,VIC_RACE,VIC_SEX
0,589015329,07/09/2018,14:25:00,,,41.0,07/09/2018,101,MURDER & NON-NEGL. MANSLAUGHTER,,,COMPLETED,FELONY,,INSIDE,,OTHER,,,,,1017934.0,232221.0,25-44,WHITE HISPANIC,M,,40.804013,-73.878332,"(40.804012949, -73.878331833)",,,25-44,BLACK,M
1,842779485,03/14/2018,05:00:00,,,73.0,03/14/2018,101,MURDER & NON-NEGL. MANSLAUGHTER,,,COMPLETED,FELONY,,INSIDE,,N.Y. POLICE DEPT,,,,,1009869.0,180290.0,25-44,BLACK,M,,40.661502,-73.907661,"(40.661502264, -73.907661037)",,,<18,BLACK,F
2,886259402,04/18/2018,20:40:00,,,79.0,04/18/2018,101,MURDER & NON-NEGL. MANSLAUGHTER,,,COMPLETED,FELONY,,OUTSIDE,,N.Y. POLICE DEPT,,,,,999636.0,192000.0,,,,,40.693667,-73.944518,"(40.693667261, -73.94451783)",,,25-44,BLACK,M
3,175402761,05/18/2018,08:05:00,,,18.0,05/18/2018,101,MURDER & NON-NEGL. MANSLAUGHTER,,,COMPLETED,FELONY,,OUTSIDE,,N.Y. POLICE DEPT,,,,,990284.0,214597.0,45-64,BLACK,F,,40.755702,-73.978222,"(40.755701867, -73.978221916)",,,<18,WHITE,M
4,321494527,09/10/2018,08:00:00,,,48.0,09/10/2018,101,MURDER & NON-NEGL. MANSLAUGHTER,,,COMPLETED,FELONY,,INSIDE,,N.Y. POLICE DEPT,,,,,1016309.0,247659.0,45-64,BLACK,M,,40.846392,-73.884128,"(40.846391886, -73.88412771)",,,45-64,BLACK,F
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1555,794560311,11/22/2018,03:43:00,11/22/2018,03:44:00,108.0,11/22/2018,103,"HOMICIDE-NEGLIGENT,UNCLASSIFIE",125.0,"HOMICIDE,NEGLIGENT,UNCLASSIFIE",COMPLETED,FELONY,QUEENS,FRONT OF,STREET,N.Y. POLICE DEPT,0.0,,,,1006145.0,210199.0,UNKNOWN,WHITE,M,,40.743605,-73.920986,"(40.74360542, -73.920986473)",PATROL BORO QUEENS NORTH,,18-24,WHITE,M
1556,378477164,07/17/2018,19:58:00,,,32.0,07/17/2018,101,MURDER & NON-NEGL. MANSLAUGHTER,,,COMPLETED,FELONY,,OUTSIDE,,N.Y. POLICE DEPT,,,,,1001749.0,240971.0,25-44,BLACK,M,,40.828076,-73.936771,"(40.828076123, -73.936770657)",,,18-24,BLACK,M
1557,166327108,10/19/2018,22:50:00,10/19/2018,23:48:00,32.0,10/19/2018,103,"HOMICIDE-NEGLIGENT,UNCLASSIFIE",125.0,"HOMICIDE,NEGLIGENT,UNCLASSIFIE",COMPLETED,FELONY,MANHATTAN,FRONT OF,STREET,N.Y. POLICE DEPT,0.0,,,,1001079.0,240747.0,UNKNOWN,UNKNOWN,U,,40.827463,-73.939192,"(40.827462608, -73.939192193)",PATROL BORO MAN NORTH,,25-44,BLACK,M
1558,522352135,03/14/2018,05:00:00,,,73.0,03/14/2018,101,MURDER & NON-NEGL. MANSLAUGHTER,,,COMPLETED,FELONY,,INSIDE,,N.Y. POLICE DEPT,,,,,1009869.0,180290.0,25-44,BLACK,M,,40.661502,-73.907661,"(40.661502264, -73.907661037)",,,45-64,BLACK,M


In [None]:
# Distinct offence descriptions in the file read back from the bucket
df1['OFNS_DESC'].unique()

array(['MURDER & NON-NEGL. MANSLAUGHTER',
       'HOMICIDE-NEGLIGENT,UNCLASSIFIE', 'HOMICIDE-NEGLIGENT-VEHICLE'],
      dtype=object)

### ⬆️ **Envio ao Cloud SQL (MySQL)**

In [None]:
# Cloud SQL connection parameters
INSTANCE_CONNECTION_NAME = "projeto-final-373521:southamerica-east1:mindfull-crime"
# This only echoes the configured instance name; no connection is opened in this cell.
print(f"Você se conectou a instância: {INSTANCE_CONNECTION_NAME}")
# Credentials can be supplied through the DB_USER / DB_PASS environment
# variables so they need not be edited into the notebook; the previous
# literals remain as defaults so existing runs keep working.
# NOTE(review): the default credentials are still committed in the notebook —
# consider removing the fallbacks once the environment variables are set.
DB_USER = os.environ.get("DB_USER", "root")
DB_PASS = os.environ.get("DB_PASS", "root")
DB_NAME = "dados-originais"

Você se conectou a instância: projeto-final-373521:southamerica-east1:mindfull-crime


In [None]:
# Cloud SQL Connector instance used by getconn() below
connector = Connector()


def getconn():
    """Open a connection to the configured instance through the connector,
    using the pymysql driver and the DB_* settings defined above."""
    return connector.connect(
        INSTANCE_CONNECTION_NAME,
        "pymysql",
        user=DB_USER,
        password=DB_PASS,
        db=DB_NAME,
    )


# SQLAlchemy engine that asks getconn() for each new DBAPI connection
pool = sqlalchemy.create_engine("mysql+pymysql://", creator=getconn)

In [None]:
# Send the filtered New York homicide frame to MySQL as table `dfny_orig`.
# if_exists='replace' lets the cell be re-run: with the default ('fail')
# pandas raises once the table already exists.
# NOTE(review): the DataFrame index is still written as a column (to_sql
# default), whereas the CSV export above used index=False — confirm which
# layout downstream consumers expect.
df.to_sql('dfny_orig', con=pool, if_exists='replace')

In [None]:
# List the tables in the database through the SQLAlchemy engine to confirm
# the upload. The statement is wrapped in sqlalchemy.text() because
# SQLAlchemy 2.x no longer accepts a plain string in execute(), and the
# connection is opened in a `with` block so it is released afterwards.
with pool.connect() as conn:
    tabelas = conn.execute(sqlalchemy.text("SHOW TABLES;")).fetchall()
tabelas

[('dfny_orig',),
 ('dfsp_homicidio_orig',),
 ('dfsp_latrocinio_orig',),
 ('dfsp_lesao_orig',),
 ('dfsp_policia_orig',)]

In [None]:
# Close the Cloud SQL Connector instance now that the uploads are done
connector.close()