In [84]:
import pandas as pd
import numpy as np

In [4]:
# Read the raw Aids2 dataset from the project's raw-data folder.
raw_csv_path = '../data/raw/Aids2.csv'
dat = pd.read_csv(raw_csv_path)

In [5]:
# Peek at the first five rows to check which columns were parsed.
dat.head(n=5)

Unnamed: 0.1,Unnamed: 0,state,sex,diag,death,status,T.categ,age
0,1,NSW,M,10905,11081,D,hs,35
1,2,NSW,M,11029,11096,D,hs,53
2,3,NSW,M,9551,9983,D,hs,42
3,4,NSW,M,9577,9654,D,haem,44
4,5,NSW,M,10015,10290,D,hs,39


In [37]:
# Remove the leftover 'Unnamed: 0' column that was read in from the CSV.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps this
# cell idempotent: re-running it once the column is already gone is a no-op
# rather than a KeyError.
dat = dat.drop(columns='Unnamed: 0', errors='ignore')

From: https://vincentarelbundock.github.io/Rdatasets/doc/MASS/Aids2.html

## Australian AIDS Survival Data
### Description
Data on patients diagnosed with AIDS in Australia before 1 July 1991.

### Usage
Aids2

### Format
This data frame contains 2843 rows and the following columns:

##### state
Grouped state of origin: "NSW" includes ACT and "Other" is WA, SA, NT and TAS.

##### sex
Sex of patient.

##### diag
(Julian) date of diagnosis.

##### death
(Julian) date of death or end of observation.

##### status
"A" (alive) or "D" (dead) at end of observation.

##### T.categ
Reported transmission category.

##### age
Age (years) at diagnosis.

### Note
This data set has been slightly jittered as a condition of its release, to ensure patient confidentiality.

### Source
Dr P. J. Solomon and the Australian National Centre in HIV Epidemiology and Clinical Research.

### References
Venables, W. N. and Ripley, B. D. (2002) Modern Applied Statistics with S. Fourth edition. Springer.

In [7]:
# Distinct values present in the `state` column.
dat['state'].unique()

array(['NSW', 'Other', 'QLD', 'VIC'], dtype=object)

In [8]:
# Distinct values present in the `status` column.
dat['status'].unique()

array(['D', 'A'], dtype=object)

In [9]:
# Distinct values present in the `T.categ` column
# (bracket/loc access is required because of the dot in the name).
dat.loc[:, 'T.categ'].unique()

array(['hs', 'haem', 'other', 'hsid', 'het', 'id', 'mother', 'blood'],
      dtype=object)

In [78]:
# Covariates: an independent copy of just the four predictor columns.
# Selecting first and copying second avoids duplicating the whole frame only to
# throw most of it away.
feature_cols = ['state', 'sex', 'T.categ', 'age']
X = dat[feature_cols].copy()

# Survival target:
#   tte   - difference between the `death` and `diag` date columns
#   event - 1 where `status` is 'D', 0 otherwise
# The indicator is computed with a vectorised comparison rather than a Python
# loop; the explicit int64 cast matches the dtype a list of Python ints produces.
y = pd.DataFrame({
    'tte': dat['death'] - dat['diag'],
    'event': (dat['status'] == 'D').astype('int64'),
})
X

Unnamed: 0,state,sex,T.categ,age
0,NSW,M,hs,35
1,NSW,M,hs,53
2,NSW,M,hs,42
3,NSW,M,haem,44
4,NSW,M,hs,39
...,...,...,...,...
2838,Other,M,het,46
2839,Other,F,het,34
2840,Other,M,haem,49
2841,Other,M,hs,55


In [79]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split

In [80]:
# Categorical boolean mask
# Boolean mask over X's columns: True for text-valued (categorical) columns.
# Comparing the dtypes against `object` misses columns held in pandas' dedicated
# string dtype, which would silently leave the categorical list empty, so each
# dtype is tested with is_string_dtype, which accepts both representations.
categorical_feature_mask = X.dtypes.map(pd.api.types.is_string_dtype).astype(bool)

In [91]:
# Split the column names into categorical and numeric groups by walking the
# columns alongside their mask entries.
cat_names = [col for col, is_cat in zip(X.columns, categorical_feature_mask) if is_cat]
num_names = [col for col, is_cat in zip(X.columns, categorical_feature_mask) if not is_cat]
print('Categoricals: ', cat_names, '\nNumerics: ', num_names)

Categoricals:  ['state', 'sex', 'T.categ'] 
Numerics:  ['age']


In [83]:
# Randomly partition features and targets into train and test sets.
# The fixed seed makes the partition reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=0,
)

In [63]:
# Fit the preprocessing transformers on the training split only.
# Dense output is needed because the encoded block is later concatenated with
# NumPy arrays. The keyword requesting it was renamed from `sparse` to
# `sparse_output` in scikit-learn 1.2 and the old spelling was removed in 1.4,
# so try the new name first and fall back for older installations.
try:
    ohe = OneHotEncoder(sparse_output=False)
except TypeError:
    ohe = OneHotEncoder(sparse=False)
ohe = ohe.fit(X_train[cat_names])
scaler = StandardScaler().fit(X_train[num_names])

In [89]:
# Assemble the training design matrix: one-hot encoded categoricals first,
# followed by the scaled numeric columns.
# NOTE: this rebinds X_train from a DataFrame to a NumPy array, so the cell
# cannot be re-run without first re-running the split cell.
train_cat_block = ohe.transform(X_train[cat_names])
train_num_block = scaler.transform(X_train[num_names])
X_train = np.concatenate((train_cat_block, train_num_block), axis=1)

In [90]:
# Assemble the test design matrix with the transformers fitted on the training
# split: one-hot encoded categoricals first, followed by the scaled numerics.
# NOTE: this rebinds X_test from a DataFrame to a NumPy array, so the cell
# cannot be re-run without first re-running the split cell.
test_cat_block = ohe.transform(X_test[cat_names])
test_num_block = scaler.transform(X_test[num_names])
X_test = np.concatenate((test_cat_block, test_num_block), axis=1)