# Various Methods to Transform DataFrames

In [46]:
import pandas as pd
# Read the stroke dataset from the CSV file that sits next to this notebook.
df = pd.read_csv(filepath_or_buffer="healthcare-dataset-stroke-data.csv")

# List the column names, then preview the first five rows.
print(df.columns)
df.head(n=5)

Index(['id', 'gender', 'age', 'hypertension', 'heart_disease', 'ever_married',
       'work_type', 'Residence_type', 'avg_glucose_level', 'bmi',
       'smoking_status', 'stroke'],
      dtype='object')


Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


## Inspect Column types
- `object` is the dtype pandas reports here for the string columns

In [47]:
# One dtype entry per column of the frame.
df.dtypes

id                     int64
gender                object
age                  float64
hypertension           int64
heart_disease          int64
ever_married          object
work_type             object
Residence_type        object
avg_glucose_level    float64
bmi                  float64
smoking_status        object
stroke                 int64
dtype: object

## Transform dataframes with basic arithmetic
- works best if the columns are all numerical
- string columns can have interesting results

In [7]:
# Method form of `df * 2`: numeric columns are doubled and the text in the
# string columns is repeated ('Male' -> 'MaleMale').
df.mul(2).head(n=5)

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,18092,MaleMale,134.0,0,2,YesYes,PrivatePrivate,UrbanUrban,457.38,73.2,formerly smokedformerly smoked,2
1,103352,FemaleFemale,122.0,0,0,YesYes,Self-employedSelf-employed,RuralRural,404.42,,never smokednever smoked,2
2,62224,MaleMale,160.0,0,2,YesYes,PrivatePrivate,RuralRural,211.84,65.0,never smokednever smoked,2
3,120364,FemaleFemale,98.0,0,0,YesYes,PrivatePrivate,UrbanUrban,342.46,68.8,smokessmokes,2
4,3330,FemaleFemale,158.0,2,0,YesYes,Self-employedSelf-employed,RuralRural,348.24,48.0,never smokednever smoked,2


## Adding gives an error 
- because it doesn't know what to do with the string columns

In [None]:
# Adding an int to the whole frame fails on the string columns.
# The error is the point of this cell, so catch it and print it: the message
# is still shown, but the notebook keeps running under "Restart & Run All".
try:
    df + 2
except TypeError as err:
    print(f"{type(err).__name__}: {err}")

In [None]:
# Method forms of the /, * and - operators. Nothing is called here, and a
# notebook cell only displays its last expression, so only df.sub is shown.
df.div
df.mul
df.sub


## when we subset to numerical columns, the addition works

In [34]:
# Keep only two numeric columns, then add the scalar (method form of `+ 2`).
df[['age', 'hypertension']].add(2).head(n=5)

Unnamed: 0,age,hypertension
0,69.0,2
1,63.0,2
2,82.0,2
3,51.0,2
4,81.0,3


# Add new columns to a dataframe

In [10]:
# Boolean flag column: True where hypertension is greater than 0.5.
df['hypertension_above_05'] = df['hypertension'].gt(0.5)
df.head(n=5)

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke,hypertension_above_05
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1,False
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1,False
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1,False
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1,False
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1,True


## Add new columns using arithmetic operations

In [37]:
# Scale one column by a constant, and divide one column by another element-wise.
df['ageTimes10'] = 10 * df['age']
df['bmioverage'] = df['bmi'].div(df['age'])

# Show the two inputs next to the two derived columns.
df[['age', 'bmi', 'ageTimes10', 'bmioverage']].head(n=5)

Unnamed: 0,age,bmi,ageTimes10,bmioverage
0,67.0,36.6,670.0,0.546269
1,61.0,,610.0,
2,80.0,32.5,800.0,0.40625
3,49.0,34.4,490.0,0.702041
4,79.0,24.0,790.0,0.303797


## Create discrete bins from age column
- look up `cut` and `qcut`
- https://pandas.pydata.org/docs/reference/api/pandas.qcut.html

In [29]:
# Age groups. pd.cut needs one more edge than labels, and each label is
# attached to the interval between two consecutive edges, so the edges have
# to line up with the label text. The previous edges [18, 35, 40, 65, 200]
# labelled ages 18-35 as '0-18', ages 35-40 as '18-34', and turned every age
# below 18 into NaN.
# right=False makes each interval include its lower edge:
#   [0, 18) -> '0-18', [18, 35) -> '18-34', [35, 65) -> '35-65', [65, 200) -> '65+'
cut_labels_4 = ['0-18', '18-34', '35-65', '65+']
cut_bins = [0, 18, 35, 65, 200]
df['cut_'] = pd.cut(df['age'], bins=cut_bins, labels=cut_labels_4, right=False)
df[['age', 'cut_']].head()

Unnamed: 0,age,cut_
0,67.0,65+
1,61.0,35-65
2,80.0,65+
3,49.0,35-65
4,79.0,65+


### String columns can also be manipulated
- use the `.str` accessor to call string methods on a column
- see Pandas str methods https://pandas.pydata.org/pandas-docs/stable/user_guide/text.html

In [38]:
# True for every row whose gender value begins with 'M'.
df['gender'].str.startswith(pat='M').head(n=5)

0     True
1    False
2     True
3    False
4    False
Name: gender, dtype: bool

In [39]:
# Swap every literal 'e' for '3'; regex=False states that the pattern is plain text.
df['work_type'].str.replace('e', '3', regex=False).head(n=5)

0          Privat3
1    S3lf-3mploy3d
2          Privat3
3          Privat3
4    S3lf-3mploy3d
Name: work_type, dtype: object

In [40]:
# Split each value on '_' into a list of pieces (values without '_' give a one-item list).
df['work_type'].str.split(pat='_').head(n=5)

0          [Private]
1    [Self-employed]
2          [Private]
3          [Private]
4    [Self-employed]
Name: work_type, dtype: object

In [41]:
# Split on '_' and keep the last piece of each resulting list.
df['work_type'].str.split(pat='_').str.get(-1).head(n=5)

0          Private
1    Self-employed
2          Private
3          Private
4    Self-employed
Name: work_type, dtype: object

# Add columns using apply method

- functions to manipulate a column or multiple columns using lambda functions
- lambda functions allow us to define functions on the fly
- https://pandas.pydata.org/pandas-docs/stable/user_guide/basics.html#row-or-column-wise-function-application


In [42]:
# Series.apply calls the lambda once per value of the single column.
df['hypertensionLess02'] = df['hypertension'].apply(lambda value: value < 0.2)
df[['hypertension', 'hypertensionLess02']].head(n=5)

Unnamed: 0,hypertension,hypertensionLess02
0,0,True
1,0,True
2,0,True
3,0,True
4,1,False


### Lambda functions can have multiple arguments
- note the `axis` argument: `axis=1` tells `apply` to call the function row-wise, once per row of the dataframe.
- apply is used on the entire dataframe not single columns like in the previous example

In [43]:
# With axis=1 the lambda receives one row at a time, so it can read several columns.
df['ageTimesBmi'] = df.apply(lambda row: row['age'] * row['bmi'], axis=1)
df[['age', 'bmi', 'ageTimesBmi']].head(n=5)

Unnamed: 0,age,bmi,ageTimesBmi
0,67.0,36.6,2452.2
1,61.0,,
2,80.0,32.5,2600.0
3,49.0,34.4,1685.6
4,79.0,24.0,1896.0


## Some functions are too complex to be used in a lambda function
- we can define a function separately and apply it


In [44]:
def poly(df):
    """Return ``2 * age + 7 * bmi`` for the record passed in.

    ``df`` only needs to support lookup by the keys ``'age'`` and ``'bmi'``.
    When this function is used with ``DataFrame.apply(poly, axis=1)`` it is
    called once per row, so ``df`` is a single row rather than a whole frame.
    """
    age_term = 2 * df['age']
    bmi_term = 7 * df['bmi']
    return age_term + bmi_term

# Run poly once per row (axis=1) and store the result as a new column.
df['poly'] = df.apply(func=poly, axis=1)
df[['age', 'bmi', 'poly']].head(n=5)

Unnamed: 0,age,bmi,poly
0,67.0,36.6,390.2
1,61.0,,
2,80.0,32.5,387.5
3,49.0,34.4,338.8
4,79.0,24.0,326.0
