In [2]:
# Optional: nothing below imports zipfile36 (pandas reads the zip archives directly).
# %pip install zipfile36==0.1.3

Collecting zipfile36
  Downloading zipfile36-0.1.3-py3-none-any.whl (20 kB)
Installing collected packages: zipfile36
Successfully installed zipfile36-0.1.3


In [1]:
import zipfile
import pandas as pd
from os.path import join
import datetime as dt

data_folder = "data/"

# Columns read from each state's CSV, and the column order of the cleaned frames
# (the parsed "date_dt" column sits right after the raw "date" string).
columns_file = ["date", "county_name", "subject_race", "subject_sex", "search_conducted", "search_basis", "outcome"]
columns_df = ["date", "date_dt", "county_name", "subject_race", "subject_sex", "search_conducted", "search_basis", "outcome"]
chunksize = 10 ** 6  # rows per chunk while streaming the CSVs


def load_state_stops(filename, dropna_subset=None, outcome_fill=None):
  """Stream one zipped state CSV in chunks, clean each chunk, and return a single frame.

  Parameters
  ----------
  filename : str
      File name inside ``data_folder``.
  dropna_subset : list of str, optional
      Columns that must be non-null for a row to be kept. ``None`` drops rows
      that have a null in any column.
  outcome_fill : str, optional
      Replacement for missing ``outcome`` values. ``None`` leaves them untouched.

  Returns
  -------
  pandas.DataFrame
      Cleaned rows with columns ordered as ``columns_df`` and a fresh RangeIndex.
      Column dtypes are inferred from the data (they are no longer forced to
      ``object`` by concatenating onto an empty seed frame).
  """
  cleaned_chunks = []
  reader = pd.read_csv(
      join(data_folder, filename),
      compression='zip',
      usecols=columns_file,
      chunksize=chunksize,
      low_memory=False,
  )
  for chunk in reader:
    # Label rows without a search BEFORE dropping nulls; otherwise a null
    # search_basis would cause those rows to be removed by a full dropna().
    # Element-wise comparison on purpose: null search_conducted values compare
    # unequal to False and are therefore left unlabeled.
    chunk.loc[chunk.search_conducted == False, "search_basis"] = "no_search"  # noqa: E712
    # .copy() gives an independent frame so the assignments below never write
    # into a view of the raw chunk (avoids SettingWithCopyWarning).
    chunk = chunk.dropna(subset=dropna_subset).copy()
    if outcome_fill is not None:
      chunk["outcome"] = chunk["outcome"].fillna(value=outcome_fill)
    chunk.insert(loc=1, column="date_dt", value=pd.to_datetime(chunk.date))
    cleaned_chunks.append(chunk[columns_df])

  if not cleaned_chunks:
    # Empty input file: still hand back a frame with the expected columns.
    return pd.DataFrame(columns=columns_df)
  # Concatenate once at the end instead of growing a frame inside the loop,
  # which re-copies every previously loaded row on each iteration.
  return pd.concat(cleaned_chunks, axis=0, ignore_index=True)


# CA: keep rows whose county is known; missing outcomes become "unknown".
# NOTE(review): the CA value counts further down list variant spellings such as
# "Riverside Coujnty"; county names are NOT normalized here — confirm whether they should be.
df_ca = load_state_stops("ca_statewide.csv.zip", dropna_subset=["county_name"], outcome_fill="unknown")

# TX: keep only rows with no missing value in any column.
df_tx = load_state_stops("tx_statewide.csv.zip")

## Pre-processing

In [2]:
# (rows, columns) of the cleaned frames: CA first, then TX.
shape_ca, shape_tx = df_ca.shape, df_tx.shape
print(shape_ca, shape_tx)

(31683533, 8) (25225803, 8)


### CA

In [3]:
# Summarize the CA frame: index range, per-column dtypes and memory usage.
df_ca.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31683533 entries, 0 to 31683532
Data columns (total 8 columns):
 #   Column            Dtype         
---  ------            -----         
 0   date              object        
 1   date_dt           datetime64[ns]
 2   county_name       object        
 3   subject_race      object        
 4   subject_sex       object        
 5   search_conducted  object        
 6   search_basis      object        
 7   outcome           object        
dtypes: datetime64[ns](1), object(7)
memory usage: 1.9+ GB


In [4]:
# Preview the first rows of the cleaned CA frame.
df_ca.head()

Unnamed: 0,date,date_dt,county_name,subject_race,subject_sex,search_conducted,search_basis,outcome
0,2009-07-01,2009-07-01,Stanislaus County,other,male,False,no_search,unknown
1,2009-07-01,2009-07-01,Stanislaus County,hispanic,female,False,no_search,summons
2,2009-07-01,2009-07-01,Stanislaus County,hispanic,female,True,other,summons
3,2009-07-01,2009-07-01,Stanislaus County,white,female,False,no_search,summons
4,2009-07-01,2009-07-01,Stanislaus County,hispanic,male,True,other,summons


In [5]:
# Frequency table for every CA column except the two date columns.
ca_categorical_cols = [name for name in df_ca.columns if name not in ("date", "date_dt")]
for name in ca_categorical_cols:
  counts = df_ca[name].value_counts()
  print(counts)
  print()

Los Angeles County        6008334
San Diego County          2628782
San Bernardino County     1974245
Alameda County            1721328
Orange County             1505389
Kern County               1411456
Santa Barbera County      1307485
Sacramento County         1208623
Riverside County          1143985
Santa Clara County         663582
Fresno County              626866
Solano County              569870
San Joaquin County         557653
Imperial County            535851
Monterey County            486202
San Mateo County           467642
Merced County              450324
San Benito County          427617
Riverside Coujnty          426655
San Francisco County       425715
Tulare County              418383
Sonoma County              407141
Contra Costa County        397764
Marin County               367679
Shasta County              331101
San Luis Obispo County     312748
Stanislaus County          309950
Santa Cruz County          298053
Siskyou County             291215
Mendocino Coun

In [6]:
# Non-null count of every column, grouped by the calendar year of the stop.
ca_stop_year = df_ca["date_dt"].dt.year
df_ca.groupby(ca_stop_year).count()

Unnamed: 0_level_0,date,date_dt,county_name,subject_race,subject_sex,search_conducted,search_basis,outcome
date_dt,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2009,2494343,2494343,2494343,2494343,2494303,2494343,2494343,2494343
2010,5060321,5060321,5060321,5060320,5060266,5060321,5060321,5060321
2011,5047002,5047002,5047002,5047002,5047002,5047002,5047002,5047002
2012,4588924,4588924,4588924,4588924,4588924,4588924,4588924,4588924
2013,4415601,4415601,4415601,4415601,4415601,4415601,4415601,4415601
2014,4135931,4135931,4135931,4135931,4135931,4135931,4135931,4135931
2015,4027143,4027143,4027143,4027143,4027143,4027143,4027143,4027143
2016,1914268,1914268,1914268,1914268,1914268,1914268,1914268,1914268


In [7]:
# Number of distinct county_name strings in the CA frame (shown as a 1-tuple).
# NOTE(review): the value counts above list "Riverside Coujnty" next to
# "Riverside County", so distinct strings may not map 1:1 to real counties — confirm.
df_ca.county_name.unique().shape

(58,)

### TX

In [8]:
# Summarize the TX frame: index range, per-column dtypes and memory usage.
df_tx.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25225803 entries, 0 to 25225802
Data columns (total 8 columns):
 #   Column            Dtype         
---  ------            -----         
 0   date              object        
 1   date_dt           datetime64[ns]
 2   county_name       object        
 3   subject_race      object        
 4   subject_sex       object        
 5   search_conducted  object        
 6   search_basis      object        
 7   outcome           object        
dtypes: datetime64[ns](1), object(7)
memory usage: 1.5+ GB


In [9]:
# Frequency table for every TX column except the two date columns.
tx_categorical_cols = [name for name in df_tx.columns if name not in ("date", "date_dt")]
for name in tx_categorical_cols:
  counts = df_tx[name].value_counts()
  print(counts)
  print()

Hidalgo County       906303
Harris County        667723
Montgomery County    579636
Cameron County       411815
El Paso County       385367
                      ...  
Foard County           2384
Lipscomb County        2219
Kent County            1846
Borden County          1333
Loving County           202
Name: county_name, Length: 254, dtype: int64

white                     14440205
hispanic                   7286405
black                      2488330
unknown                     597398
asian/pacific islander      377323
other                        36142
Name: subject_race, dtype: int64

male      17254281
female     7971522
Name: subject_sex, dtype: int64

False    24682337
True       543466
Name: search_conducted, dtype: int64

no_search         24682337
consent             255410
other               152179
probable cause      135877
Name: search_basis, dtype: int64

citation     9428392
Name: outcome, dtype: int64



In [10]:
# (rows, columns) of the TX frame before the year filter is applied.
df_tx.shape

(25225803, 8)

In [11]:
## Filter TX so that we have same time range as CA (2009 - 2016)
# Both bounds are enforced (inclusive) so the frame matches the stated range
# even if the source file contains later years. Rows keep their original
# index labels; rows with a missing date_dt are excluded by the comparison.
TX_YEAR_MIN, TX_YEAR_MAX = 2009, 2016
df_tx = df_tx[df_tx.date_dt.dt.year.between(TX_YEAR_MIN, TX_YEAR_MAX)]

In [12]:
# (rows, columns) of the TX frame after restricting it to the CA year range.
df_tx.shape

(17578339, 8)

In [13]:
# Non-null count of every column, grouped by the calendar year of the stop.
tx_stop_year = df_tx["date_dt"].dt.year
df_tx.groupby(tx_stop_year).count()

Unnamed: 0_level_0,date,date_dt,county_name,subject_race,subject_sex,search_conducted,search_basis,outcome
date_dt,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2009,2441270,2441270,2441270,2441270,2441270,2441270,2441270,2441270
2010,2524704,2524704,2524704,2524704,2524704,2524704,2524704,2524704
2011,2587556,2587556,2587556,2587556,2587556,2587556,2587556,2587556
2012,2435070,2435070,2435070,2435070,2435070,2435070,2435070,2435070
2013,2133954,2133954,2133954,2133954,2133954,2133954,2133954,2133954
2014,1878458,1878458,1878458,1878458,1878458,1878458,1878458,1878458
2015,1745352,1745352,1745352,1745352,1745352,1745352,1745352,1745352
2016,1831975,1831975,1831975,1831975,1831975,1831975,1831975,1831975


In [14]:
# Number of distinct county_name strings in the filtered TX frame (shown as a 1-tuple).
df_tx.county_name.unique().shape

(254,)

## Saving

In [23]:
# Preview the first rows of the filtered TX frame (index labels are those from before the filter).
df_tx.head()

Unnamed: 0,date,date_dt,county_name,subject_race,subject_sex,search_conducted,search_basis,outcome
7647464,2009-01-01,2009-01-01,Bee County,white,female,False,no_search,warning
7647465,2009-01-01,2009-01-01,Harris County,white,male,False,no_search,warning
7647466,2009-01-01,2009-01-01,Zapata County,hispanic,male,False,no_search,citation
7647467,2009-01-01,2009-01-01,Val Verde County,hispanic,male,False,no_search,warning
7647468,2009-01-01,2009-01-01,Howard County,white,female,False,no_search,warning


In [26]:
# Draw a fixed-size random sample from each state's frame.
# A fixed random_state makes the saved samples reproducible across kernel
# restarts. The size is capped at the frame length because sampling without
# replacement raises when n exceeds the number of rows.
SAMPLE_SIZE = int(1e6)
SAMPLE_SEED = 42
df_ca_sample = df_ca.sample(n=min(SAMPLE_SIZE, len(df_ca)), random_state=SAMPLE_SEED)
df_tx_sample = df_tx.sample(n=min(SAMPLE_SIZE, len(df_tx)), random_state=SAMPLE_SEED)


In [27]:
# Persist both samples as gzip-compressed pickles.
# The ".pkl" suffix does not indicate gzip, so loaders must pass
# compression="gzip" to pd.read_pickle explicitly.
for sample_frame, target_path in (
    (df_ca_sample, "data/ca_cleaned_sample.pkl"),
    (df_tx_sample, "data/tx_cleaned_sample.pkl"),
):
  sample_frame.to_pickle(target_path, compression="gzip")
