In [12]:
import pandas as pd
import imodels
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

**This notebook fetches and cleans four high-stakes datasets: diabetes readmission, juvenile risk prediction, credit card default, and COMPAS recidivism.**

# diabetes readmission
https://archive.ics.uci.edu/ml/datasets/diabetes+130-us+hospitals+for+years+1999-2008

In [1]:
# Download the UCI diabetes archive, extract it (into dataset_diabetes/), then delete the zip.
!wget 'https://archive.ics.uci.edu/ml/machine-learning-databases/00296/dataset_diabetes.zip'
!unzip dataset_diabetes.zip
!rm dataset_diabetes.zip

--2021-04-28 11:33:27--  https://archive.ics.uci.edu/ml/machine-learning-databases/00296/dataset_diabetes.zip
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3347213 (3.2M) [application/x-httpd-php]
Saving to: ‘dataset_diabetes.zip’


2021-04-28 11:33:28 (3.18 MB/s) - ‘dataset_diabetes.zip’ saved [3347213/3347213]

Archive:  dataset_diabetes.zip
  inflating: dataset_diabetes/diabetic_data.csv  
  inflating: dataset_diabetes/IDs_mapping.csv  


# juvenile 
https://www.icpsr.umich.edu/web/NACJD/studies/3986

In [11]:
# !pip install gdown --user # might need to install gdown
# Fetch the ICPSR_03986 archive from Google Drive with gdown, extract it, then delete the zip.
# NOTE(review): the recorded output shows "gdown: not found" followed by a successful unzip,
# so the zip was presumably already on disk for that run — confirm gdown is installed
# before relying on this cell from a clean checkout.
!gdown 'https://drive.google.com/uc?id=1wEFXutadmevTt1PUpjaDv4XH9KkSMdbx'
!unzip ICPSR_03986.zip
!rm ICPSR_03986.zip

/bin/sh: 1: gdown: not found
Archive:  ICPSR_03986.zip
   creating: ICPSR_03986/
  inflating: ICPSR_03986/.DS_Store   
  inflating: ICPSR_03986/03986-manifest.txt  
   creating: ICPSR_03986/DS0001/
  inflating: ICPSR_03986/DS0001/03986-0001-User_guide.pdf  
   creating: ICPSR_03986/DS0001/03986-0001-Codebook/
  inflating: ICPSR_03986/DS0001/03986-0001-Codebook/Questionnaire.pdf  
  inflating: ICPSR_03986/DS0001/03986-0001-Data.txt  
  inflating: ICPSR_03986/03986-descriptioncitation.html  
  inflating: ICPSR_03986/03986-related_literature.txt  
  inflating: ICPSR_03986/TermsOfUse.html  
  inflating: ICPSR_03986/DS0001/feature_info.csv  


### create df from raw txt data   

In [13]:
# Read the raw data file as a list of lines. The context manager closes the file handle
# as soon as the text is read instead of leaving it open for the life of the kernel.
with open('ICPSR_03986/DS0001/03986-0001-Data.txt') as data_file:
    raw_rows = data_file.read().split('\n')

# consolidated info from 03986-0001-Codebook/Questionnaire.pdf and 03986-0001-User_guide.pdf
# (one row per feature; later cells read its start_ind, length, and feature_name columns)
metadata = pd.read_csv('ICPSR_03986/DS0001/feature_info.csv')

In [14]:
# Slice every raw line into its fields using the (start_ind, length) pairs from the
# metadata table, trimming surrounding whitespace from each field. The final element of
# raw_rows is skipped, exactly as len(raw_rows) - 1 dictates.
rows = [
    [
        raw_rows[row_idx][start:start + width].strip()
        for start, width in zip(metadata['start_ind'], metadata['length'])
    ]
    for row_idx in range(len(raw_rows) - 1)
]

In [15]:
# Assemble the parsed fields into a frame, labelling columns with the metadata feature names.
df = pd.DataFrame(rows, columns=metadata['feature_name'].values)
df.shape

(4023, 280)

### clean missing values

In [16]:
# Collect, per feature, the set of codes that mark a missing value. The three metadata
# columns are stringified and joined; 'nan' placeholders are dropped and the last two
# characters of every remaining code are trimmed off.
missing_codes_text = (
    metadata['missing_val'].astype(str)
    + ' ' + metadata['missing_val_2'].astype(str)
    + ' ' + metadata['missing_val_3'].astype(str)
)
metadata['missing_vals_set'] = missing_codes_text.apply(
    lambda text: {code[:-2] for code in text.split(' ') if code != 'nan'}
)

# Keep only the columns that are not flagged as over 10 percent missing.
df = df.loc[:, ~metadata['over_10_percent_missing'].values]
print('shape', df.shape)

# For each remaining column, drop the rows whose value is one of that column's missing codes.
rem_col_missing_val_sets = metadata[~metadata['over_10_percent_missing']]['missing_vals_set']
for col_pos, missing_codes in enumerate(rem_col_missing_val_sets):
    df = df[~df.iloc[:, col_pos].isin(missing_codes)]

### separate outcome variables

In [19]:
# Outcome variables are the features flagged as delinquent_behavior that survived cleaning.
is_outcome = metadata['delinquent_behavior'].astype(bool)
still_present = metadata['feature_name'].isin(df.columns)
outcome_variables = metadata['feature_name'][is_outcome & still_present]

# Drop every outcome column plus the identifier; 'any_deviance' becomes the target.
drop_variables = list(outcome_variables) + ['id', 'any_deviance']
y = df['any_deviance']
X_cat = df.drop(columns=drop_variables)

### encode categorical features

In [20]:
# One-hot encode the features marked categorical in the metadata (and still present in
# X_cat), then cast features and target to float32.
categorical_features = metadata['feature_name'][
    metadata['categorical'].astype(bool) & metadata['feature_name'].isin(X_cat.columns)
]
X = pd.get_dummies(X_cat, columns=categorical_features, prefix_sep=':').astype('float32')
y = y.astype('float32')

# Features first, target last.
df_clean = pd.concat([X, y], axis=1)

# Persist the encoded frame. Previously the un-encoded `df` was written here, so
# data_clean.csv did not contain the cleaned, encoded data built above.
df_clean.to_csv('ICPSR_03986/DS0001/data_clean.csv', index=False)

In [24]:
# peek at performance for this model
# A fixed random_state makes the train/test partition identical on every run, so the
# printed accuracy is comparable across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
m = imodels.BoostedRulesClassifier(n_estimators=10)
m.fit(X_train, y_train, feature_names=X_train.columns)
print('acc', accuracy_score(y_test, m.predict(X_test)))
# Display the fitted rules as the cell output.
m.rules_

# credit card default
https://www.kaggle.com/uciml/default-of-credit-card-clients-dataset

In [28]:
# Download the Kaggle credit-card archive through a signed storage URL, then extract and delete it.
# NOTE(review): the URL carries X-Goog-Date=20210420T202743Z and X-Goog-Expires=259199, so it is
# presumably time-limited and must be regenerated before re-running — confirm.
# The unzip/rm targets use the shortened file name that wget reports when the URL-derived
# name is too long.
!wget 'https://storage.googleapis.com/kaggle-data-sets/306/666/bundle/archive.zip?X-Goog-Algorithm=GOOG4-RSA-SHA256&X-Goog-Credential=gcp-kaggle-com%40kaggle-161607.iam.gserviceaccount.com%2F20210420%2Fauto%2Fstorage%2Fgoog4_request&X-Goog-Date=20210420T202743Z&X-Goog-Expires=259199&X-Goog-SignedHeaders=host&X-Goog-Signature=1fa7fdcbec2e4ed0a541f79bf45e85b2ce30455481d3d1886e3b4abe0e3d2e6404c748b6f37a9b4d1c0a39101a9a5c5c270059707283ce3486ee472eba3ffbfa871a8fe3da8a49cb8bae918ba9f19a3f21a4af0b0cd9c8b2afd55cd6a3d0638fec4ad1de2298dfe9c3cdf3ec3e36be662a83e0a3a412302eeba4b92e704bd4f91b519b4541d67f5e09d73616a171bc37a03e3f3f24beb128d6916ee3fab0aca26e9fff44617dd05c93b4b0bfe39592e31371afdc17eb4f436b6e8019de60b941d4a9cd3b2277901a2e693fe51021d31cdeb6ad55695a3cffa1ac146b17b8b5982e6b5aa0385e9723f35ce453a5f8f6a490896efc74513673479e4c0824b83899'
!unzip 'archive.zip?X-Goog-Algorithm=GOOG4-RSA-SHA256&X-Goog-Credential=gcp-kaggle-com@kaggle-161607.iam.gserviceaccount.com%2F20210420%2Fauto%2Fstorage%2Fgoog4_request&X-Goog-Date=20210420T202743Z&X-Goog-Expires=259199&X-Goog-SignedHeaders=hos'
!rm 'archive.zip?X-Goog-Algorithm=GOOG4-RSA-SHA256&X-Goog-Credential=gcp-kaggle-com@kaggle-161607.iam.gserviceaccount.com%2F20210420%2Fauto%2Fstorage%2Fgoog4_request&X-Goog-Date=20210420T202743Z&X-Goog-Expires=259199&X-Goog-SignedHeaders=hos'
!mkdir credit_card
!mv UCI_Credit_Card.csv credit_card

The name is too long, 767 chars total.
Trying to shorten...
New name is archive.zip?X-Goog-Algorithm=GOOG4-RSA-SHA256&X-Goog-Credential=gcp-kaggle-com@kaggle-161607.iam.gserviceaccount.com%2F20210420%2Fauto%2Fstorage%2Fgoog4_request&X-Goog-Date=20210420T202743Z&X-Goog-Expires=259199&X-Goog-SignedHeaders=hos.
--2021-04-28 11:36:33--  https://storage.googleapis.com/kaggle-data-sets/306/666/bundle/archive.zip?X-Goog-Algorithm=GOOG4-RSA-SHA256&X-Goog-Credential=gcp-kaggle-com%40kaggle-161607.iam.gserviceaccount.com%2F20210420%2Fauto%2Fstorage%2Fgoog4_request&X-Goog-Date=20210420T202743Z&X-Goog-Expires=259199&X-Goog-SignedHeaders=host&X-Goog-Signature=1fa7fdcbec2e4ed0a541f79bf45e85b2ce30455481d3d1886e3b4abe0e3d2e6404c748b6f37a9b4d1c0a39101a9a5c5c270059707283ce3486ee472eba3ffbfa871a8fe3da8a49cb8bae918ba9f19a3f21a4af0b0cd9c8b2afd55cd6a3d0638fec4ad1de2298dfe9c3cdf3ec3e36be662a83e0a3a412302eeba4b92e704bd4f91b519b4541d67f5e09d73616a171bc37a03e3f3f24beb128d6916ee3fab0aca26e9fff44617dd05c93b4b0bfe

# recidivism
https://github.com/propublica/compas-analysis

In [30]:
# Clone ProPublica's compas-analysis repository into ./compas-analysis (source of the CSV read below).
!git clone https://github.com/propublica/compas-analysis

Cloning into 'compas-analysis'...
remote: Enumerating objects: 31, done.[K
remote: Total 31 (delta 0), reused 0 (delta 0), pack-reused 31[K
Unpacking objects: 100% (31/31), 15.24 MiB | 3.73 MiB/s, done.
Updating files: 100% (12/12), done.


In [31]:
# Load the two-year COMPAS scores and derive jail time in whole days.
df_raw = pd.read_csv("compas-analysis/compas-scores-two-years.csv")
jail_in = pd.to_datetime(df_raw['c_jail_in'])
jail_out = pd.to_datetime(df_raw['c_jail_out'])
df_raw['c_jail_time'] = (jail_out - jail_in).dt.days

# Restrict to the columns used downstream.
cols_interest = [
    'id', 'age', 'c_charge_degree', 'race', 'age_cat', 'score_text', 'sex',
    'priors_count', 'days_b_screening_arrest', 'decile_score', 'is_recid',
    'c_jail_in', 'c_jail_out', 'c_jail_time', 'two_year_recid',
]
df = df_raw[cols_interest]

### follow the same filtering process as the ProPublica analysis

In [32]:
# Row filters applied before encoding:
#   - is_recid must not be -1
#   - c_charge_degree must not be 'O'
#   - score_text must be present
#   - |days_b_screening_arrest| must be at most 30 (NaN compares False and is dropped)
# pandas.read_csv treats the literal 'N/A' as NaN by default, so comparing against 'N/A'
# alone filters nothing; rows with a NaN score_text are excluded explicitly as well.
has_score = df['score_text'].notna() & (df['score_text'] != 'N/A')
keep_rows = (
    (df['is_recid'] != -1)
    & (df['c_charge_degree'] != 'O')
    & has_score
    & (df['days_b_screening_arrest'].abs() <= 30)
)
df = df[keep_rows]

In [33]:
# The raw jail timestamps are not needed once c_jail_time has been derived.
df = df.drop(columns=['c_jail_in', 'c_jail_out'])

# One-hot encode and replace spaces in the resulting column names with underscores.
df_enc = pd.get_dummies(df, prefix_sep=':')
df_enc = df_enc.rename(columns=lambda name: name.replace(' ', '_'))

# Target is is_recid; the identifier and both recidivism columns are removed from the features.
y = df_enc['is_recid']
X = df_enc.drop(columns=['id', 'two_year_recid', 'is_recid'])

# Features first, target as the final column.
df_tgt_last = pd.concat([X, y], axis=1)

In [34]:
# Write the encoded frame (features followed by the is_recid target) to CSV without the index.
df_tgt_last.to_csv('compas-analysis/compas_two_year_clean.csv', index=False)