In [1]:
import pandas as pd
import numpy as np

In [4]:
# Open-data endpoint (datos.gov.co) that serves the dataset as JSON.
source = "https://www.datos.gov.co/resource/vuyt-mqpw.json"

# `$limit` asks the endpoint for up to 600000 rows in a single request.
request_url = source + "?$limit=600000"
df = pd.read_json(request_url)

# Save the download as-is, before any cleaning, so the raw data stays on disk.
df.to_json('../data/raw_data/raw_data.json')

In [5]:
# Parse the event date with an explicit day/month/year format so pandas does not
# have to guess the order of day and month.
date_format = '%d/%m/%Y'
parsed_dates = pd.to_datetime(df['fecha_hecho'], format=date_format)
df['fecha_hecho'] = parsed_dates

# Adjustments for columns and nulls (cleaning process)

## Null data treatment

Renaming the columns to English and dropping the `codigo_dane` column

In [6]:
# Spanish -> English column names.
english_names = {
    'departamento': 'department',
    'municipio': 'municipality',
    'armas_medios': 'gun_type',
    'fecha_hecho': 'date',
    'genero': 'gender',
    'grupo_etario': 'age_group',
    'cantidad': 'cases',
}
df.rename(columns=english_names, inplace=True)

# Remove the `codigo_dane` column from the frame.
df.drop(columns='codigo_dane', inplace=True)


In [7]:
# info() prints the frame summary and returns None, so the value displayed for
# this cell is the tuple (None, <dtypes Series>).
(df.info(), df.dtypes)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 564113 entries, 0 to 564112
Data columns (total 7 columns):
 #   Column        Non-Null Count   Dtype         
---  ------        --------------   -----         
 0   department    564113 non-null  object        
 1   municipality  564113 non-null  object        
 2   gun_type      564113 non-null  object        
 3   date          564113 non-null  datetime64[ns]
 4   gender        564113 non-null  object        
 5   age_group     562502 non-null  object        
 6   cases         564113 non-null  int64         
dtypes: datetime64[ns](1), int64(1), object(5)
memory usage: 30.1+ MB


(None,
 department              object
 municipality            object
 gun_type                object
 date            datetime64[ns]
 gender                  object
 age_group               object
 cases                    int64
 dtype: object)

In [8]:
# Count the missing values per column once, then express them as a percentage
# of the total number of rows.
null_counts = df.isnull().sum()
empty_info = null_counts * 100 / len(df)

# Percentages first, absolute counts second.
print(f'{empty_info}\n\n{null_counts}')

department      0.000000
municipality    0.000000
gun_type        0.000000
date            0.000000
gender          0.000000
age_group       0.285581
cases           0.000000
dtype: float64

department         0
municipality       0
gun_type           0
date               0
gender             0
age_group       1611
cases              0
dtype: int64


In [9]:
# Drop the rows that have no age group. According to the null summary printed
# above, that is 1611 rows (about 0.29% of the data). `df` is modified in place.
df.dropna(subset = ['age_group'], inplace = True)

## Date dtype change

## Department column changes for feature graphs

In [10]:
# Strip accents from the department names: NFKD splits each accented letter into
# its base letter plus a combining mark, and the ASCII encode with
# errors='ignore' then discards every non-ASCII character before decoding back
# to text.
decomposed = df['department'].str.normalize('NFKD')
ascii_bytes = decomposed.str.encode('ascii', errors='ignore')
df.loc[:, 'department'] = ascii_bytes.str.decode('utf-8')

In [11]:
# Replace the short department names with their longer forms.
# 'NARINO' gets its 'Ñ' back on purpose: the ASCII conversion applied to this
# column earlier also removed it (presumably the longer names match the map
# data used later -- TODO confirm).
#
# The result is assigned back to the column instead of calling
# `df['department'].replace(..., inplace=True)`: that form is a chained in-place
# call, which raises a FutureWarning on pandas 2.2 and, with Copy-on-Write
# (the default from pandas 3.0), modifies a temporary object and leaves `df`
# unchanged.
department_names = {
    'SAN ANDRES': 'ARCHIPIELAGO DE SAN ANDRES PROVIDENCIA Y SANTA CATALINA',
    'VALLE': 'VALLE DEL CAUCA',
    'NARINO': 'NARIÑO',
    'GUAJIRA': 'LA GUAJIRA',
}
df['department'] = df['department'].replace(department_names)

# Show the resulting set of department names.
df['department'].unique()

array(['ATLANTICO', 'BOYACA', 'CAQUETA', 'CASANARE', 'CUNDINAMARCA',
       'SUCRE', 'VALLE DEL CAUCA', 'HUILA', 'ANTIOQUIA', 'ARAUCA',
       'BOLIVAR', 'CALDAS', 'CAUCA', 'CESAR', 'CHOCO', 'CORDOBA',
       'MAGDALENA', 'META', 'NARIÑO', 'NORTE DE SANTANDER', 'PUTUMAYO',
       'RISARALDA', 'SANTANDER', 'TOLIMA', 'VAUPES', 'GUAVIARE',
       'LA GUAJIRA', 'QUINDIO', 'AMAZONAS', 'VICHADA', 'GUAINIA',
       'ARCHIPIELAGO DE SAN ANDRES PROVIDENCIA Y SANTA CATALINA',
       'NO REPORTA'], dtype=object)

In [12]:
# Drop the rows without a department report: mark the 'NO REPORTA' placeholder
# as missing, then remove every row whose department is missing.
#
# The replacement is assigned back to the column instead of calling
# `df['department'].replace(..., inplace=True)`: that chained in-place call
# raises a FutureWarning on pandas 2.2 and, with Copy-on-Write (the default from
# pandas 3.0), would leave `df` unchanged, so nothing would be dropped.
df['department'] = df['department'].replace('NO REPORTA', np.nan)
df.dropna(subset=['department'], inplace=True)

Assigning the rows of the Bogotá municipality to their own department, `SANTAFE DE BOGOTA D.C`, so they can be plotted on maps later

In [13]:
# Select the rows whose municipality is Bogotá and collect their index labels.
is_bogota = df['municipality'] == 'BOGOTÁ D.C. (CT)'
bog_index = list(df.index[is_bogota])

# Give those rows their own department value.
df.loc[bog_index, 'department'] = 'SANTAFE DE BOGOTA D.C'

# Show the resulting set of department names.
df['department'].unique()

array(['ATLANTICO', 'BOYACA', 'CAQUETA', 'CASANARE',
       'SANTAFE DE BOGOTA D.C', 'SUCRE', 'VALLE DEL CAUCA', 'HUILA',
       'ANTIOQUIA', 'ARAUCA', 'BOLIVAR', 'CALDAS', 'CAUCA', 'CESAR',
       'CHOCO', 'CORDOBA', 'CUNDINAMARCA', 'MAGDALENA', 'META', 'NARIÑO',
       'NORTE DE SANTANDER', 'PUTUMAYO', 'RISARALDA', 'SANTANDER',
       'TOLIMA', 'VAUPES', 'GUAVIARE', 'LA GUAJIRA', 'QUINDIO',
       'AMAZONAS', 'VICHADA', 'GUAINIA',
       'ARCHIPIELAGO DE SAN ANDRES PROVIDENCIA Y SANTA CATALINA'],
      dtype=object)

## Minor column changes

In [14]:
# List the distinct values of each categorical column, one printed array per
# column, to see which labels need to be unified.
for column in ('gun_type', 'gender', 'age_group'):
    print(df[column].unique())

['ARMA BLANCA / CORTOPUNZANTE' 'ARMA DE FUEGO' 'CONTUNDENTES' 'CORTANTES'
 'CORTOPUNZANTES' 'NO REPORTADO' 'PUNZANTES' 'SIN EMPLEO DE ARMAS'
 'NO REPORTA' 'ESCOPOLAMINA' '-']
['MASCULINO' 'FEMENINO' 'NO REPORTA' 'NO REPORTADO']
['ADULTOS' 'ADOLESCENTES' 'MENORES' 'NO REPORTA' 'NO REPORTADO']


Artículo 365A del código penal colombiano

Parágrafo 1°. Para los efectos de esta ley, entiéndanse como arma blanca el objeto punzante, cortante, cortocontundente o cortopunzante apto para herir cortar, matar o dañar; que posea bordes filosos o punzantes, tales como navajas, puñales, puñaletas, punzones o cualquier objeto de similares características.

---

Article 365A of the Colombian Penal Code

Paragraph 1. For the purposes of this law, a bladed weapon ("arma blanca") is understood to be any pointed, cutting, cutting-and-blunt or cutting-and-piercing object capable of wounding, cutting, killing or causing damage, which has sharp or pointed edges, such as pocketknives, daggers, stilettos, awls or any object of similar characteristics.

In [15]:
#change on genre no reports
df['gender'].replace('NO REPORTADO', 'NO REPORTA', inplace = True)

#change on age_group no reports
df['age_group'].replace('NO REPORTADO', 'NO REPORTA', inplace = True)

#change on gun_type based on Penal Code
df['gun_type'].replace({'ARMA BLANCA / CORTOPUNZANTE':'ARMA BLANCA',
                            '-':'NO REPORTA',
                            'NO REPORTADO':'NO REPORTA',
                            'CORTOPUNZANTES':'ARMA BLANCA',
                            'CORTANTES':'ARMA BLANCA',
                            'CONTUNDENTES':'ARMA BLANCA',
                            'PUNZANTES':'ARMA BLANCA'}, inplace = True)

In [16]:
# Write the cleaned frame to disk; index=False keeps the row index out of the file.
cleaned_path = '../data/processed/data_cleaned.csv'
df.to_csv(cleaned_path, index=False)