In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the Aids2 data set (R package MASS) from the Rdatasets mirror.
# The first CSV column only carries R's row names. It is consumed as the index
# and then discarded by position, so this cell does not depend on the label the
# header (or pandas, for a blank header) gives that column. The result keeps a
# plain 0-based RangeIndex.
# The terse R column names are replaced by descriptive ones in the same chain,
# which avoids mutating the frame in place.
aasd = (
    pd.read_csv('https://vincentarelbundock.github.io/Rdatasets/csv/MASS/Aids2.csv',
                index_col=0)
    .reset_index(drop=True)
    .rename(columns={'diag': 'date of diagnosis',
                     'death': 'date of death',
                     'T.categ': 'mode of transmission',
                     'age': 'age at diagnosis'})
)
aasd.head(25)

Unnamed: 0,state,sex,date of diagnosis,date of death,status,mode of transmission,age at diagnosis
0,NSW,M,10905,11081,D,hs,35
1,NSW,M,11029,11096,D,hs,53
2,NSW,M,9551,9983,D,hs,42
3,NSW,M,9577,9654,D,haem,44
4,NSW,M,10015,10290,D,hs,39
5,NSW,M,9971,10344,D,hs,36
6,NSW,M,10746,11135,D,other,36
7,NSW,M,10042,11069,D,hs,31
8,NSW,M,10464,10956,D,hs,26
9,NSW,M,10439,10873,D,hsid,27


In [3]:
# Expand the abbreviated category codes into readable labels,
# e.g. "D" -> "deceased" and "hs" -> "male homosexual/bisexual contact".
#
# Series.replace is used rather than Series.map: values that are not keys of the
# dictionary (an unexpected code, or a label that was already expanded) are kept
# as they are instead of becoming NaN, so re-running this cell is harmless.

# codes listed by aasd['state'].unique()
deabbr_state = {'NSW': 'New South Wales',
                'Other': 'Others',
                'QLD': 'Queensland',
                'VIC': 'Victoria'}
aasd['state'] = aasd['state'].replace(deabbr_state)

# codes listed by aasd['status'].unique()
deabbr_status = {'A': 'alive',
                 'D': 'deceased'}
aasd['status'] = aasd['status'].replace(deabbr_status)

# codes listed by aasd['mode of transmission'].unique()
deabbr_mode = {'hs': 'male homosexual/bisexual contact',
               'haem': 'haemophilia/coagulation disorder',
               'other': 'other/unknown',
               'hsid': 'male homosexual/bisexual intravenous drug user',
               'het': 'heterosexual contact',
               'id': 'female or heterosexual male intravenous drug user',
               'mother': 'mother with or at risk of HIV infection',
               'blood': 'receipt of blood, blood components or tissue'}
aasd['mode of transmission'] = aasd['mode of transmission'].replace(deabbr_mode)

In [4]:
# Derived column: 'date of death' minus 'date of diagnosis', row by row.
aasd['days after diagnosis'] = aasd['date of death'].sub(aasd['date of diagnosis'])

In [5]:
# Reorder the columns so that 'days after diagnosis' sits right after the two date columns.
aasd = aasd.loc[:, ['state', 'sex',
                    'date of diagnosis', 'date of death', 'days after diagnosis',
                    'status', 'mode of transmission', 'age at diagnosis']]

In [6]:
# Summary statistics for every column; include='all' adds the non-numeric
# columns (count / unique / top / freq) next to the numeric ones.
aasd.describe(include='all')

Unnamed: 0,state,sex,date of diagnosis,date of death,days after diagnosis,status,mode of transmission,age at diagnosis
count,2843,2843,2843.0,2843.0,2843.0,2843,2843,2843.0
unique,4,2,,,,2,8,
top,New South Wales,M,,,,deceased,male homosexual/bisexual contact,
freq,1780,2754,,,,1761,2465,
mean,,,10584.331692,10990.258881,405.92719,,,37.409075
std,,,627.158933,625.369749,363.937949,,,10.063263
min,,,8302.0,8469.0,0.0,,,0.0
25%,,,10163.0,10671.5,128.0,,,30.0
50%,,,10665.0,11235.0,320.0,,,37.0
75%,,,11103.0,11504.0,583.0,,,43.0


In [7]:
# Youngest patients first (ascending order is the sort_values default).
aasd.sort_values('age at diagnosis')

Unnamed: 0,state,sex,date of diagnosis,date of death,days after diagnosis,status,mode of transmission,age at diagnosis
2021,Others,F,9447,9447,0,deceased,mother with or at risk of HIV infection,0
1793,Queensland,M,8815,8815,0,deceased,"receipt of blood, blood components or tissue",0
1789,Queensland,M,8963,8979,16,deceased,"receipt of blood, blood components or tissue",0
1788,Queensland,M,9023,9039,16,deceased,"receipt of blood, blood components or tissue",0
1790,Queensland,M,9199,9215,16,deceased,"receipt of blood, blood components or tissue",1
2614,Victoria,F,11312,11327,15,deceased,mother with or at risk of HIV infection,1
29,New South Wales,M,11289,11504,215,alive,mother with or at risk of HIV infection,1
1815,Queensland,M,9868,9881,13,deceased,"receipt of blood, blood components or tissue",3
430,New South Wales,M,9436,10938,1502,deceased,"receipt of blood, blood components or tissue",3
933,New South Wales,M,10767,11504,737,alive,mother with or at risk of HIV infection,3


In [8]:
# Does any column contain a missing value? (one boolean per column)
aasd.isna().any()

state                   False
sex                     False
date of diagnosis       False
date of death           False
days after diagnosis    False
status                  False
mode of transmission    False
age at diagnosis        False
dtype: bool

In [9]:
# Subset of the rows whose status is 'alive'.
aasd_alive = aasd.loc[aasd['status'].eq('alive')]
aasd_alive.head(25)

Unnamed: 0,state,sex,date of diagnosis,date of death,days after diagnosis,status,mode of transmission,age at diagnosis
14,New South Wales,M,10452,11504,1052,alive,male homosexual/bisexual contact,30
17,New South Wales,M,10923,11504,581,alive,haemophilia/coagulation disorder,21
18,New South Wales,M,10993,11504,511,alive,male homosexual/bisexual contact,56
21,New South Wales,M,10996,11504,508,alive,male homosexual/bisexual contact,38
22,New South Wales,M,10738,11504,766,alive,heterosexual contact,26
23,New South Wales,M,11063,11504,441,alive,female or heterosexual male intravenous drug user,39
25,New South Wales,M,11056,11504,448,alive,haemophilia/coagulation disorder,13
26,New South Wales,M,11283,11504,221,alive,male homosexual/bisexual contact,34
27,New South Wales,M,11195,11504,309,alive,heterosexual contact,39
28,New South Wales,M,10848,11504,656,alive,male homosexual/bisexual contact,31


In [10]:
# Number of patients recorded as deceased during the study.
# Counted directly from the status column: a row with any other (or a missing)
# status is not mistaken for a death, and the result does not depend on
# `aasd_alive` having been built beforehand.
num_dec = int(aasd['status'].eq('deceased').sum())
num_dec

1761

In [11]:
# Patients who are recorded as deceased with a date of death of 11504, i.e. at the end of the study.
# For every other row, the value 11504 in 'date of death' means the patient was
# still alive when the study ended.
aasd.loc[aasd['date of death'].eq(11504) & aasd['status'].eq('deceased')]

Unnamed: 0,state,sex,date of diagnosis,date of death,days after diagnosis,status,mode of transmission,age at diagnosis
965,New South Wales,M,10959,11504,545,deceased,male homosexual/bisexual contact,47
