In [1]:
#using python 3.8.10
#imports 
import pandas as pd
from datetime import datetime
import csv
import numpy as np

# Load the four input tables. Going by the file names these are the labelled
# heart table, the cleaned Acxiom and AGR demographic extracts, and the cleaned
# Appriss crime extract. The paths are read in the order of the names on the left.
PII_hearttable, acxioim_df, agr_df, appriss_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/dev_sample_HeartTable_10_17_2022.csv',
        "data/out/acxiom_demo_cleaned_v1.csv",
        "data/out/agr_demo_cleaned_v1.csv",
        "data/out/appriss_crime_cleaned_CFconfidenceapplied_v2.csv",
    )
)

In [2]:
# Build the target label from 'source_id': ids containing the substring "agr"
# get is_suicide = 0, every other id gets is_suicide = 1.
from_agr_source = PII_hearttable['source_id'].str.contains('agr')

# Keep only the join key and the label; no other heart-table column is used.
PII_hearttable = PII_hearttable.assign(
    is_suicide=np.where(from_agr_source, 0, 1)
)[['source_id', 'is_suicide']]


In [3]:
# Show every Appriss row whose 'source_id' occurs more than once
# (keep=False flags all members of a duplicate group, not just the repeats).
appriss_repeat_mask = appriss_df.duplicated(subset=['source_id'], keep=False)
appriss_df.loc[appriss_repeat_mask]

Unnamed: 0,source_id,999,1099,1199,1299,1399,2099,3806,5299,5401,...,7399_HasCrime,tier_1_HasCrime,tier_2_HasCrime,tier_3_HasCrime,tier_4_HasCrime,CDCcat_A_HasCrime,CDCcat_B_HasCrime,CDCcat_C_HasCrime,CDCcat_D_HasCrime,CDCcat_E_HasCrime


In [4]:
# Suffix every Appriss column with '_appriss' so its origin stays identifiable
# after the joins; 'source_id' is the join key and keeps its name.
appriss_df = appriss_df.rename(
    columns=lambda name: name if name == 'source_id' else name + '_appriss'
)

# Left-join the Appriss features onto the labelled heart table (PII_hearttable).
# Unlike the Acxiom and AGR frames, appriss_df is not de-duplicated before the
# join, so validate='many_to_one' makes pandas raise MergeError if a
# 'source_id' repeats in appriss_df instead of silently multiplying rows.
data_to_model_v1 = pd.merge(
    PII_hearttable,
    appriss_df,
    on='source_id',
    how='left',
    validate='many_to_one',
)

# Replace every remaining NaN (e.g. heart-table rows with no Appriss match) with 0.
data_to_model_v1 = data_to_model_v1.fillna(0)

In [5]:
# Cast every column except the 'source_id' key to int, then display the
# resulting dtypes as the cell output.
int_casts = {name: int for name in data_to_model_v1.columns if name != 'source_id'}
data_to_model_v1 = data_to_model_v1.astype(int_casts)
data_to_model_v1.dtypes

source_id                    object
is_suicide                    int32
999_appriss                   int32
1099_appriss                  int32
1199_appriss                  int32
                              ...  
CDCcat_A_HasCrime_appriss     int32
CDCcat_B_HasCrime_appriss     int32
CDCcat_C_HasCrime_appriss     int32
CDCcat_D_HasCrime_appriss     int32
CDCcat_E_HasCrime_appriss     int32
Length: 121, dtype: object

In [6]:
# Show every Acxiom row whose 'source_id' occurs more than once
# (keep=False flags all members of a duplicate group, not just the repeats).
acxiom_repeat_mask = acxioim_df.duplicated(subset=['source_id'], keep=False)
acxioim_df.loc[acxiom_repeat_mask]

Unnamed: 0,source_id,Single Parent,Senior Adult in Household,Young Adult in Household,Education - 1st Person in Household - 100%,Guns and Ammunition,Hunting,Underbanked,Economic Stability Indicator,Population Density,...,Generations in Household_2.0,Generations in Household_3.0,Marital Status in the Household - 100%_A,Marital Status in the Household - 100%_B,Marital Status in the Household - 100%_M,Marital Status in the Household - 100%_S,Presence of Children - 100%_N,Presence of Children - 100%_Y,Home Owner / Renter - 100%_O,Home Owner / Renter - 100%_R
19481,4407_(medical_source)index_2310,0.0,0,0,1.0,0.0,0.0,1,9,1.0,...,0,0,0,1,0,0,1,0,1,0
19482,4407_(medical_source)index_2310,0.0,0,0,1.0,0.0,0.0,1,9,1.0,...,0,0,0,1,0,0,1,0,1,0
19483,4408_(medical_source)index_6634,0.0,0,0,1.0,0.0,0.0,1,9,1.0,...,0,0,0,1,0,0,1,0,1,0
19484,4408_(medical_source)index_6634,0.0,0,0,1.0,0.0,0.0,1,9,1.0,...,0,0,0,1,0,0,1,0,1,0


In [7]:
# Keep the first row per 'source_id', then suffix every remaining column with
# '_acxiom' so the feature's origin stays identifiable after the join.
acxioim_df = (
    acxioim_df
    .drop_duplicates(subset=['source_id'], keep='first')
    .rename(columns=lambda name: name if name == 'source_id' else name + '_acxiom')
)

# Left-join the Acxiom features onto the running modelling table.
data_to_model_v1 = data_to_model_v1.merge(acxioim_df, on='source_id', how='left')

In [8]:
# Print the dtype of each column, one per line, in column order.
for column_dtype in data_to_model_v1.dtypes:
    print(column_dtype)

object
int32
int32
int32
int32
int32
int32
int32
int32
int32
int32
int32
int32
int32
int32
int32
int32
int32
int32
int32
int32
int32
int32
int32
int32
int32
int32
int32
int32
int32
int32
int32
int32
int32
int32
int32
int32
int32
int32
int32
int32
int32
int32
int32
int32
int32
int32
int32
int32
int32
int32
int32
int32
int32
int32
int32
int32
int32
int32
int32
int32
int32
int32
int32
int32
int32
int32
int32
int32
int32
int32
int32
int32
int32
int32
int32
int32
int32
int32
int32
int32
int32
int32
int32
int32
int32
int32
int32
int32
int32
int32
int32
int32
int32
int32
int32
int32
int32
int32
int32
int32
int32
int32
int32
int32
int32
int32
int32
int32
int32
int32
int32
int32
int32
int32
int32
int32
int32
int32
int32
int32
float64
int64
int64
float64
float64
float64
int64
int64
float64
float64
float64
float64
float64
object
float64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64


In [9]:
# Show every AGR row whose 'source_id' occurs more than once
# (keep=False flags all members of a duplicate group, not just the repeats).
agr_repeat_mask = agr_df.duplicated(subset=['source_id'], keep=False)
agr_df.loc[agr_repeat_mask]

Unnamed: 0,source_id,VACANT,NO STAT,RESIDENTIAL,DEMO_VETERAN PRESENT IN HH,MULTIPLE PROPERTIES COUNT,BANKRUPCY,DUI,SEX OFFENDER,DIGITAL FLAG,...,OWNER OCCUPIED STATUS_N,OWNER OCCUPIED STATUS_R,OWNER OCCUPIED STATUS_Y,OWNER STATUS TYPE_I,OWNER STATUS TYPE_O,OWNER STATUS TYPE_P,OWNER STATUS TYPE_T,PROPERTY TYPE DESC_A,PROPERTY TYPE DESC_C,PROPERTY TYPE DESC_R
8951,4407_(medical_source)index_2310,0.0,1.0,1,0.0,,0.0,0.0,0.0,,...,0,0,0,0,0,0,0,0,0,0
8952,4408_(medical_source)index_6634,0.0,1.0,1,0.0,,0.0,0.0,0.0,,...,0,0,0,0,0,0,0,0,0,0
8954,4407_(medical_source)index_2310,0.0,1.0,1,0.0,,0.0,0.0,0.0,,...,0,0,0,0,0,0,0,0,0,0
8955,4408_(medical_source)index_6634,0.0,1.0,1,0.0,,0.0,0.0,0.0,,...,0,0,0,0,0,0,0,0,0,0


In [10]:
# Keep the first row per 'source_id', then suffix every remaining column with
# '_agr' so the feature's origin stays identifiable after the join.
agr_df = (
    agr_df
    .drop_duplicates(subset=['source_id'], keep='first')
    .rename(columns=lambda name: name if name == 'source_id' else name + '_agr')
)

# Left-join the AGR features onto the running modelling table.
data_to_model_v1 = data_to_model_v1.merge(agr_df, on='source_id', how='left')


In [11]:
# Recode the two flag columns to 1/0: 'Y' becomes 1, anything else becomes 0.
# Re-running this on already-recoded data yields all zeros, because the values
# are then 0/1 and never equal 'Y'.
for flag_col in (
    'Suppression - Deceased, Bankruptcy, TLJ - Highest_acxiom',
    'DIGITAL FLAG_agr',
):
    data_to_model_v1[flag_col] = np.where(data_to_model_v1[flag_col] == 'Y', 1, 0)

# Missing property counts are replaced with 0.
data_to_model_v1['MULTIPLE PROPERTIES COUNT_agr'] = (
    data_to_model_v1['MULTIPLE PROPERTIES COUNT_agr'].fillna(0)
)

# Cast the remaining Acxiom / AGR columns listed here to int.
for int_col in (
    'Single Parent_acxiom',
    'Guns and Ammunition_acxiom',
    'Hunting_acxiom',
    'VACANT_agr',
    'NO STAT_agr',
    'RESIDENTIAL_agr',
    'DEMO_VETERAN PRESENT IN HH_agr',
    'BANKRUPCY_agr',
    'DUI_agr',
    'SEX OFFENDER_agr',
):
    data_to_model_v1[int_col] = data_to_model_v1[int_col].astype(int)


In [12]:
# Copy every column that is still float64 into a separate frame,
# data_to_model_v2; columns of any other dtype are left out.
data_to_model_v2 = pd.DataFrame()
for name, column_dtype in data_to_model_v1.dtypes.items():
    if column_dtype == 'float64':
        data_to_model_v2[name] = data_to_model_v1[name]

In [13]:
# Convert genuine 0/1 indicator columns to boolean dtype.
# A column qualifies only when it is int64/float64/int32, holds exactly two
# distinct values, and every entry is 0 or 1. Checking nunique() == 2 alone is
# not enough: a column holding e.g. {1, 3} would collapse to all-True, and
# because nunique() ignores NaN, a float column holding {0, 1, NaN} would have
# its missing values turned into True by astype('bool').
numeric_dtypes = ('int64', 'float64', 'int32')
for col in data_to_model_v1.columns:
    column_values = data_to_model_v1[col]
    if column_values.dtype not in numeric_dtypes:
        continue
    # isin() is False for NaN, so columns with missing values are skipped.
    if column_values.nunique() == 2 and column_values.isin([0, 1]).all():
        data_to_model_v1[col] = column_values.astype('bool')

In [14]:
# Print each column name (left-aligned, padded to 30 characters) followed by its dtype.
for col, col_dtype in data_to_model_v1.dtypes.items():
    print(f'{col:30} {col_dtype}')

source_id                      object
is_suicide                     bool
999_appriss                    int32
1099_appriss                   int32
1199_appriss                   int32
1299_appriss                   int32
1399_appriss                   int32
2099_appriss                   int32
3806_appriss                   int32
5299_appriss                   int32
5401_appriss                   int32
2299_appriss                   int32
2399_appriss                   int32
2411_appriss                   int32
2589_appriss                   int32
2604_appriss                   int32
2699_appriss                   int32
2799_appriss                   int32
2899_appriss                   int32
3605_appriss                   int32
5104_appriss                   bool
1316_appriss                   int32
3704_appriss                   int32
3802_appriss                   int32
3805_appriss                   int32
4004_appriss                   int32
4999_appriss                   int32
50

In [15]:
# Count the missing entries in 'DEMO_OWN OR RENT_O_agr'.
own_or_rent_missing = data_to_model_v1['DEMO_OWN OR RENT_O_agr'].isna()
own_or_rent_missing.sum()

0

In [15]:
# Persist the assembled modelling table as CSV, without the row index.
model_data_csv = 'data/out/model_data_v1.csv'
data_to_model_v1.to_csv(path_or_buf=model_data_csv, index=False)