# Dataframe indexing
- Indexing to select or filter parts of the data
- row index manipulation
- https://pandas.pydata.org/docs/user_guide/indexing.html

In [2]:
import pandas as pd

In [128]:
# Load the stroke dataset from a CSV file in the working directory. No index
# column is specified, so the frame gets pandas' default integer row index.
df = pd.read_csv("healthcare-dataset-stroke-data.csv") 
# Preview the first five rows.
df.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [18]:
# Column labels of the frame (an Index of strings).
df.columns

Index(['id', 'gender', 'age', 'hypertension', 'heart_disease', 'ever_married',
       'work_type', 'Residence_type', 'avg_glucose_level', 'bmi',
       'smoking_status', 'stroke'],
      dtype='object')

## The dataframe has a native numerical index
- the native index is just a range index
- the data seems to have its own `id` column, but we shall ignore that


In [20]:
# Row index of the frame; a RangeIndex because no index column was set on load.
df.index

RangeIndex(start=0, stop=5110, step=1)

## `iloc` does numerical indexing of the rows
- take the first 5 rows

In [77]:
# Positional slice: rows at positions 0 through 4 (the stop position is excluded).
df.iloc[:5]

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [90]:
df.iloc[100:200] # rows at positions 100 to 199 (the stop position, 200, is excluded)

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
100,12363,Male,64.0,0,1,Yes,Govt_job,Urban,74.10,28.8,Unknown,1
101,63973,Female,77.0,0,0,Yes,Govt_job,Rural,190.32,31.4,never smoked,1
102,45277,Female,74.0,0,0,Yes,Private,Rural,231.61,34.6,formerly smoked,1
103,4712,Female,81.0,0,1,Yes,Self-employed,Rural,78.70,19.4,Unknown,1
104,33175,Female,57.0,0,0,Yes,Govt_job,Urban,110.52,28.5,Unknown,1
...,...,...,...,...,...,...,...,...,...,...,...,...
195,64373,Male,59.0,0,0,Yes,Private,Urban,200.62,35.8,formerly smoked,1
196,58267,Male,70.0,1,0,Yes,Private,Rural,242.52,45.5,formerly smoked,1
197,35684,Male,69.0,0,0,Yes,Private,Rural,93.81,28.5,Unknown,1
198,18937,Male,79.0,0,0,Yes,Private,Rural,114.77,,formerly smoked,1


In [144]:
df.iloc[300:].head() # open-ended slice: position 300 through the last row; head() previews the first five of them

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
300,65199,Female,53.0,0,0,Yes,Self-employed,Urban,81.51,28.5,Unknown,0
301,43454,Female,78.0,0,0,No,Self-employed,Urban,137.74,34.9,formerly smoked,0
302,7282,Male,44.0,0,0,Yes,Private,Rural,81.84,25.1,never smoked,0
303,18518,Male,66.0,0,0,Yes,Private,Rural,242.3,35.3,smokes,0
304,41648,Male,27.0,0,0,Yes,Private,Rural,102.64,26.4,smokes,0


## Get a selection of rows and columns

In [146]:
# Select rows by position and columns by label in ONE .loc call instead of
# chaining two indexers (.iloc[...] followed by [[...]]).
# df.index[10:15] translates positions 10-14 into their row labels; this relies
# on the row labels being unique, which holds for df's RangeIndex.
df.loc[df.index[10:15], ['heart_disease', 'Residence_type']]

Unnamed: 0,heart_disease,Residence_type
10,0,Rural
11,1,Rural
12,0,Urban
13,1,Urban
14,1,Urban


## Change the range index to a category-based index

In [129]:
# Use the `gender` column as the row index. set_index returns a new frame, so
# `df` keeps its RangeIndex; `gender` becomes the (non-unique) index of df_ger
# and no longer appears among its columns.
df_ger = df.set_index('gender')
df_ger.head()

Unnamed: 0_level_0,id,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Male,9046,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
Female,51676,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
Male,31112,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
Female,60182,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
Female,1665,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1



## Get males only


In [133]:
# Label-based lookup on the index: all rows whose index label is 'Male'.
df_ger.loc['Male'].head()

Unnamed: 0_level_0,id,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Male,9046,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
Male,31112,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
Male,56669,81.0,0,0,Yes,Private,Urban,186.21,29.0,formerly smoked,1
Male,53882,74.0,1,1,Yes,Private,Rural,70.09,27.4,never smoked,1
Male,8213,78.0,0,1,Yes,Private,Urban,219.84,,Unknown,1


In [147]:
# reset_index does not discard the index values: `gender` is moved back into a
# regular column and the default integer index is restored. It returns a new
# frame; df_ger itself is left unchanged.
df_ger.reset_index().head()

Unnamed: 0,gender,id,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,9046,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,Female,51676,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,Male,31112,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,Female,60182,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,Female,1665,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


## Can still use position-based indexing (`iloc`) after changing the index

In [150]:
# iloc is purely positional, so it ignores the `gender` labels:
# rows at positions 100 through 104.
df_ger.iloc[100:105]

Unnamed: 0_level_0,id,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Male,12363,64.0,0,1,Yes,Govt_job,Urban,74.1,28.8,Unknown,1
Female,63973,77.0,0,0,Yes,Govt_job,Rural,190.32,31.4,never smoked,1
Female,45277,74.0,0,0,Yes,Private,Rural,231.61,34.6,formerly smoked,1
Female,4712,81.0,0,1,Yes,Self-employed,Rural,78.7,19.4,Unknown,1
Female,33175,57.0,0,0,Yes,Govt_job,Urban,110.52,28.5,Unknown,1


## Use another index
- select `Private` and `Self-employed`

In [136]:
# Index the rows by `work_type` so that work categories can be looked up with
# .loc; `df` itself is not modified.
df_ind = df.set_index('work_type')
df_ind.head()

Unnamed: 0_level_0,id,gender,age,hypertension,heart_disease,ever_married,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
work_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Private,9046,Male,67.0,0,1,Yes,Urban,228.69,36.6,formerly smoked,1
Self-employed,51676,Female,61.0,0,0,Yes,Rural,202.21,,never smoked,1
Private,31112,Male,80.0,0,1,Yes,Rural,105.92,32.5,never smoked,1
Private,60182,Female,49.0,0,0,Yes,Urban,171.23,34.4,smokes,1
Self-employed,1665,Female,79.0,1,0,Yes,Rural,174.12,24.0,never smoked,1


## Subsetting the old way

In [135]:
# Boolean-mask filter: keep only the rows whose work_type is one of the two
# listed categories, then preview the first five matches.
df.loc[df['work_type'].isin(['Private', 'Self-employed'])].head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


## Subsetting using indexing

In [138]:
# Label-based subset on the work_type index. The result follows the order of
# the requested labels (all 'Private' rows first, then 'Self-employed') rather
# than the original row order that the boolean-mask approach preserves.
df_ind.loc[['Private','Self-employed']].head()

Unnamed: 0_level_0,id,gender,age,hypertension,heart_disease,ever_married,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
work_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Private,9046,Male,67.0,0,1,Yes,Urban,228.69,36.6,formerly smoked,1
Private,31112,Male,80.0,0,1,Yes,Rural,105.92,32.5,never smoked,1
Private,60182,Female,49.0,0,0,Yes,Urban,171.23,34.4,smokes,1
Private,56669,Male,81.0,0,0,Yes,Urban,186.21,29.0,formerly smoked,1
Private,53882,Male,74.0,1,1,Yes,Rural,70.09,27.4,never smoked,1


## Set multilevel index
- set an index with multiple columns

In [153]:
# Passing a list of columns builds a two-level MultiIndex:
# `smoking_status` is the outer level and `work_type` the inner level.
df_ind2 = df.set_index(['smoking_status','work_type'])
df_ind2.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,id,gender,age,hypertension,heart_disease,ever_married,Residence_type,avg_glucose_level,bmi,stroke
smoking_status,work_type,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
formerly smoked,Private,9046,Male,67.0,0,1,Yes,Urban,228.69,36.6,1
never smoked,Self-employed,51676,Female,61.0,0,0,Yes,Rural,202.21,,1
never smoked,Private,31112,Male,80.0,0,1,Yes,Rural,105.92,32.5,1
smokes,Private,60182,Female,49.0,0,0,Yes,Urban,171.23,34.4,1
never smoked,Self-employed,1665,Female,79.0,1,0,Yes,Rural,174.12,24.0,1


## Index on the outer level

In [159]:
# A list of plain labels is matched against the outer level (smoking_status);
# every work_type under those labels is returned.
df_ind2.loc[['never smoked','formerly smoked']]

Unnamed: 0_level_0,Unnamed: 1_level_0,id,gender,age,hypertension,heart_disease,ever_married,Residence_type,avg_glucose_level,bmi,stroke
smoking_status,work_type,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
never smoked,Self-employed,51676,Female,61.0,0,0,Yes,Rural,202.21,,1
never smoked,Private,31112,Male,80.0,0,1,Yes,Rural,105.92,32.5,1
never smoked,Self-employed,1665,Female,79.0,1,0,Yes,Rural,174.12,24.0,1
never smoked,Private,53882,Male,74.0,1,1,Yes,Rural,70.09,27.4,1
never smoked,Private,10434,Female,69.0,0,0,No,Urban,94.39,22.8,1
...,...,...,...,...,...,...,...,...,...,...,...
formerly smoked,Self-employed,49598,Male,80.0,0,0,Yes,Urban,120.03,24.3,0
formerly smoked,Govt_job,37680,Male,55.0,0,0,Yes,Rural,108.35,40.8,0
formerly smoked,Private,53525,Female,72.0,0,0,Yes,Urban,83.89,33.1,0
formerly smoked,Self-employed,26214,Female,63.0,0,0,Yes,Rural,75.93,34.7,0


## Index on the outer level and inner level

In [160]:
# Each tuple is one full (smoking_status, work_type) key; rows matching either
# of the two pairs are returned.
df_ind2.loc[[('never smoked','Govt_job'),('smokes','Private')]]

Unnamed: 0_level_0,Unnamed: 1_level_0,id,gender,age,hypertension,heart_disease,ever_married,Residence_type,avg_glucose_level,bmi,stroke
smoking_status,work_type,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
never smoked,Govt_job,14248,Male,48.0,0,0,No,Urban,84.20,29.7,1
never smoked,Govt_job,8752,Female,63.0,0,0,Yes,Urban,197.54,,1
never smoked,Govt_job,5111,Female,54.0,1,0,Yes,Urban,180.93,27.7,1
never smoked,Govt_job,4639,Female,69.0,0,0,Yes,Urban,82.81,28.0,1
never smoked,Govt_job,63973,Female,77.0,0,0,Yes,Rural,190.32,31.4,1
...,...,...,...,...,...,...,...,...,...,...,...
smokes,Private,11630,Female,25.0,0,0,No,Urban,92.06,25.3,0
smokes,Private,71957,Female,35.0,0,0,Yes,Rural,58.72,40.0,0
smokes,Private,24552,Female,44.0,0,0,Yes,Rural,72.03,37.5,0
smokes,Private,29540,Male,67.0,0,0,Yes,Rural,97.04,26.9,0
