In [39]:
import pandas as pd
import numpy as np

In [40]:
# Build a small demo frame: one row per person, indexed by first name.
# Simone's weight is NaN, which exercises missing-value handling in later cells.
data = {
    'age': [35, 36, 1.8],
    'height': [180, 155, 83],
    'weight': [72.5, np.nan, 11.3],
}
df = pd.DataFrame(data=data, index=['Ryan', 'Simone', 'Johnny'])
df

Unnamed: 0,age,height,weight
Ryan,35.0,180,72.5
Simone,36.0,155,
Johnny,1.8,83,11.3


In [3]:
# Column-wise minimum: reduces over the rows and yields one value per column.
df.min(axis='index')

age        1.8
height    83.0
weight    11.3
dtype: float64

In [4]:
# Column-wise mean. Missing values are skipped, so the weight average
# is computed from the two non-null entries only.
df.mean(axis='index')

age        24.266667
height    139.333333
weight     41.900000
dtype: float64

### Dataframe Joins

In [41]:
# Build a Series of education levels keyed by person name.
# Note that 'Alisa' is not a label in df's index.
education = pd.Series(['Masters', 'Masters', None, 'Bachelors'],
                      index=['Ryan', 'Simone', 'Johnny', 'Alisa'],
                      name='education')
# DataFrame.join returns a new DataFrame (df itself is unchanged) and defaults to a
# LEFT join on the index: every row of df is kept, and Series labels with no match
# in df ('Alisa') are dropped. how='left' is spelled out to make the join type explicit.
df.join(education, how='left')

Unnamed: 0,age,height,weight,education
Ryan,35.0,180,72.5,Masters
Simone,36.0,155,,Masters
Johnny,1.8,83,11.3,


In [42]:
# Right join: keep every label of `education`, so 'Alisa' shows up
# with NaN in all of df's columns.
df.join(other=education, how='right')

Unnamed: 0,age,height,weight,education
Ryan,35.0,180.0,72.5,Masters
Simone,36.0,155.0,,Masters
Johnny,1.8,83.0,11.3,
Alisa,,,,Bachelors


In [43]:
# Reindex the rows against a longer label list. Labels that are not in df
# ('Alisa' and 'Kerry') come back as all-NaN rows; a new frame is returned.
df.reindex(index=['Ryan', 'Simone', 'Johnny', 'Alisa', 'Kerry'])

Unnamed: 0,age,height,weight
Ryan,35.0,180.0,72.5
Simone,36.0,155.0,
Johnny,1.8,83.0,11.3
Alisa,,,
Kerry,,,


### Dataframe Subsetting

In [16]:
# Keep only the adults, i.e. rows whose age is strictly greater than 18.
# NOTE(review): the saved output of this cell lists a height_in column that is only
# created in a later cell, so it appears to have been run out of order —
# re-run the notebook top to bottom to refresh it.
adults = df.loc[df['age'] > 18]
adults

Unnamed: 0,age,height,weight,height_in
Ryan,35.0,180,72.5,70.866142
Simone,36.0,155,,61.023622


In [44]:
# Flag each person as an adult (age > 18) in a new boolean column.
df['is_adult'] = df['age'].gt(18)
df

Unnamed: 0,age,height,weight,is_adult
Ryan,35.0,180,72.5,True
Simone,36.0,155,,True
Johnny,1.8,83,11.3,False


### Generating New Columns

In [45]:
# Derive a height-in-inches column by dividing height by 2.54 (cm per inch).
# NOTE(review): assumes `height` is stored in centimetres — confirm.
df['height_in'] = df['height'].div(2.54)
df

Unnamed: 0,age,height,weight,is_adult,height_in
Ryan,35.0,180,72.5,True,70.866142
Simone,36.0,155,,True,61.023622
Johnny,1.8,83,11.3,False,32.677165


In [46]:
# Add a column of pseudo-random 6-digit integers as placeholder phone numbers.
# A seeded Generator is used so that Restart & Run All reproduces the same values;
# an unseeded np.random.randint call would yield different numbers on every run.
# NOTE(review): the exclusive upper bound 888888 is kept as-is; confirm whether
# 1000000 (all 6-digit numbers) was intended.
rng = np.random.default_rng(seed=42)
df['phone_number'] = rng.integers(low=100000, high=888888, size=len(df))
df

Unnamed: 0,age,height,weight,is_adult,height_in,phone_number
Ryan,35.0,180,72.5,True,70.866142,147429
Simone,36.0,155,,True,61.023622,273512
Johnny,1.8,83,11.3,False,32.677165,625078


In [47]:
# Copy the row labels (the first names) into a regular column called person_name.
first_names = df.index
df['person_name'] = first_names
df

Unnamed: 0,age,height,weight,is_adult,height_in,phone_number,person_name
Ryan,35.0,180,72.5,True,70.866142,147429,Ryan
Simone,36.0,155,,True,61.023622,273512,Simone
Johnny,1.8,83,11.3,False,32.677165,625078,Johnny


In [48]:
# Attach last names. The Series is keyed by first name, so the assignment
# aligns on df's index labels rather than on row position.
last_names = pd.Series(
    {'Ryan': 'Gosling', 'Simone': 'De Beauvoir', 'Johnny': 'Walker'},
    name="person_last_name",
)
df['person_last_name'] = last_names
df

Unnamed: 0,age,height,weight,is_adult,height_in,phone_number,person_name,person_last_name
Ryan,35.0,180,72.5,True,70.866142,147429,Ryan,Gosling
Simone,36.0,155,,True,61.023622,273512,Simone,De Beauvoir
Johnny,1.8,83,11.3,False,32.677165,625078,Johnny,Walker


In [49]:
# Build person_fullname as "<person_name> <person_last_name>".
df['person_fullname'] = df['person_name'].str.cat(df['person_last_name'], sep=" ")
df

Unnamed: 0,age,height,weight,is_adult,height_in,phone_number,person_name,person_last_name,person_fullname
Ryan,35.0,180,72.5,True,70.866142,147429,Ryan,Gosling,Ryan Gosling
Simone,36.0,155,,True,61.023622,273512,Simone,De Beauvoir,Simone De Beauvoir
Johnny,1.8,83,11.3,False,32.677165,625078,Johnny,Walker,Johnny Walker


In [50]:
# person_name and person_last_name are now covered by person_fullname, so drop them.
df = df.drop(columns=['person_name', 'person_last_name'])
df

Unnamed: 0,age,height,weight,is_adult,height_in,phone_number,person_fullname
Ryan,35.0,180,72.5,True,70.866142,147429,Ryan Gosling
Simone,36.0,155,,True,61.023622,273512,Simone De Beauvoir
Johnny,1.8,83,11.3,False,32.677165,625078,Johnny Walker


In [51]:
# Replace the name index with a 1-based integer index.
# The labels are generated from len(df) instead of a hard-coded [1, 2, 3], so the cell
# keeps working if rows are added or removed upstream (set_index needs exactly one
# label per row and raises on a length mismatch).
s = pd.Series(range(1, len(df) + 1))
df = df.set_index(s)
df

In [54]:
# Reorder the columns so that person_fullname comes first.
# Note: height_in is not in the list, so it is dropped from df by this step.
column_order = ['person_fullname', 'age', 'height', 'weight', 'is_adult', 'phone_number']
df = df.reindex(columns=column_order)
df

Unnamed: 0,person_fullname,age,height,weight,is_adult,phone_number
1,Ryan Gosling,35.0,180,72.5,True,147429
2,Simone De Beauvoir,36.0,155,,True,273512
3,Johnny Walker,1.8,83,11.3,False,625078
