# Examples in pandas for building dataframes

## Setup lists

In [1]:
# Setup
import pandas as pd
import os
from collections import OrderedDict

# Build the example columns; the list comprehensions scale the base values 1..5
classes = list('abbaa')
weights = [10 * v for v in range(1, 6)]
heights = [100 * v for v in range(1, 6)]

# Show the starting data, one list per line
print("Initial lists:")
print(classes, weights, heights, sep="\n")

Initial lists:
['a', 'b', 'b', 'a', 'a']
[10, 20, 30, 40, 50]
[100, 200, 300, 400, 500]


## Build a dataframe from the lists

In [2]:
# Pair each column name with its list of values
column_names = ('class', 'weight', 'height')
di_all = dict(zip(column_names, (classes, weights, heights)))

# Wrap the mapping in an OrderedDict before handing it to the DataFrame constructor
df_all = pd.DataFrame(OrderedDict(di_all))
print(df_all)

  class  weight  height
0     a      10     100
1     b      20     200
2     b      30     300
3     a      40     400
4     a      50     500


## Cumulative counts and sort by values

In [0]:
# Number the rows within each class.
# cumcount() is zero-based, so add 1 to make the numbering start at 1.
class_position = df_all.groupby('class').cumcount() + 1

# assign() returns a new frame, so df_all is left untouched
df = df_all.assign(class_cumcount=class_position)
print(df)

In [0]:
# Order rows by class (ascending), then by weight (descending) within each class
sort_keys = ['class', 'weight']
sort_ascending = [True, False]
df = df.sort_values(by=sort_keys, ascending=sort_ascending)
print(df)

  class  weight  height  class_cumcount
4     a      50     500               3
3     a      40     400               2
0     a      10     100               1
2     b      30     300               2
1     b      20     200               1


In [0]:
# Show the frame with a fresh 0..n-1 index (the old index is dropped, not kept as a column).
# The result is only printed; df itself keeps its current index.
df_reindexed = df.reset_index(drop=True)
print(df_reindexed)

  class  weight  height  class_cumcount
0     a      50     500               3
1     a      40     400               2
2     a      10     100               1
3     b      30     300               2
4     b      20     200               1


## Do explicit type conversion (from int to float)

In [0]:
print('Before:')
print(df.head(1))
print("Type of item in height column: ", df['height'].dtype)

# Collect the int64 columns, then cast them all to float in one astype() call.
# astype() returns a new frame, so df keeps its integer columns.
int_columns = [col for col in df.columns if df[col].dtype == 'int64']
df2 = df.astype({col: float for col in int_columns})

print("------------")
print('After:')
print(df2.head(1))
print("Type of item in height column: ", df2['height'].dtype)

Before:
  class  weight  height  class_cumcount
4     a      50     500               3
Type of item in height column:  int64
------------
After:
  class  weight  height  class_cumcount
4     a    50.0   500.0             3.0
Type of item in height column:  float64


In [0]:
# Count the rows in each class and print the counts from smallest to largest.
# The counts get their own name so that df2 stays a DataFrame: rebinding df2 to the
# resulting Series would make this cell fail when it is run a second time.
class_counts = df2['class'].value_counts()
print(class_counts.sort_values(ascending=True))

b    2
a    3
Name: class, dtype: int64


## Calculate aggregated statistics on a dataframe grouped by each class

In [0]:
# Show the untouched input frame again before aggregating it
print("Original classes:", df_all, sep="\n")

Original classes:
  class  weight  height
0     a      10     100
1     b      20     200
2     b      30     300
3     a      40     400
4     a      50     500


### Using agg method (the short way - with multi index columns):

In [0]:
# Passing a list of statistics to agg() produces two-level (column, statistic) columns
stats = ['mean', 'std']
dfg = df_all.groupby('class').agg(stats)

print(dfg.columns, '---', 'Multi index dataframe:', dfg, sep='\n')

MultiIndex(levels=[['weight', 'height'], ['mean', 'std']],
           labels=[[0, 0, 1, 1], [0, 1, 0, 1]])
---
Multi index dataframe:
          weight                 height            
            mean        std        mean         std
class                                              
a      33.333333  20.816660  333.333333  208.166600
b      25.000000   7.071068  250.000000   70.710678


### Using agg method (the short way - with single index columns):

In [0]:
stats = ['mean', 'std']
dfg = df_all.groupby('class').agg(stats)

# Collapse each (column, statistic) pair into a single "column_statistic" label
dfg.columns = ['{}_{}'.format(column, stat) for column, stat in dfg.columns]

print(dfg.columns, '---', 'Single index dataframe:', dfg, sep='\n')

Index(['weight_mean', 'weight_std', 'height_mean', 'height_std'], dtype='object')
---
Single index dataframe:
       weight_mean  weight_std  height_mean  height_std
class                                                  
a        33.333333   20.816660   333.333333  208.166600
b        25.000000    7.071068   250.000000   70.710678


### Using a for loop (the long way - if we also want to process each group):

In [0]:
li_grp = []
# Iterating over a groupby yields (group key, sub-DataFrame) pairs
for label, members in df_all.groupby('class'):
    # Show the rows that belong to this group
    print(label)
    print(members)
    print("---")

    # One summary row per group: key, then mean/std of weight, then mean/std of height
    weight_col = members['weight']
    height_col = members['height']
    li_grp.append([
        label,
        weight_col.mean(),
        weight_col.std(),
        height_col.mean(),
        height_col.std(),
    ])

# Column labels for the summary rows, in the same order as the values appended above
COLS = ['name', 'weight_mean', 'weight_std', 'height_mean', 'height_std']
dfg = pd.DataFrame(li_grp, columns=COLS)

print("Grouped classes:")
print(dfg)

a
  class  weight  height
0     a      10     100
3     a      40     400
4     a      50     500
---
b
  class  weight  height
1     b      20     200
2     b      30     300
---
Grouped classes:
  name  weight_mean  weight_std  height_mean  height_std
0    a    33.333333   20.816660   333.333333  208.166600
1    b    25.000000    7.071068   250.000000   70.710678


## Read & write dataframes using pickle

In [0]:
# Build the target path once so the write and the read cannot drift apart
PICKLE_DIR = 'test'
PICKLE_PATH = os.path.join(PICKLE_DIR, 'df_test.pkl')

# Create the output directory if it is missing; exist_ok=True makes this a no-op when
# the directory is already there and avoids a separate check-then-create step
os.makedirs(PICKLE_DIR, exist_ok=True)

print("Write dataframe as a pickle file:")
dfg.to_pickle(PICKLE_PATH)
print(dfg)

print("------------")
print("Read the pickle file and store as dataframe:")
# NOTE: unpickling can execute arbitrary code, so only read pickle files from a trusted
# source (here, the file written just above)
dfg2 = pd.read_pickle(PICKLE_PATH)
print(dfg2)

Write dataframe as a pickle file:
  name  weight_mean  weight_std  height_mean  height_std
0    a    33.333333   20.816660   333.333333  208.166600
1    b    25.000000    7.071068   250.000000   70.710678
------------
Read the pickle file and store as dataframe:
  name  weight_mean  weight_std  height_mean  height_std
0    a    33.333333   20.816660   333.333333  208.166600
1    b    25.000000    7.071068   250.000000   70.710678


## Convert to categories for one-hot encoding

In [0]:
# Do one-hot encoding of the class column:
# the column is split into one indicator column per distinct class value.
# Here we have just two class values ('a' and 'b'), giving class_a and class_b.
# The dtype of the indicator columns depends on the pandas version (bool in recent
# releases), so cast to int to always get 1/0 indicators rather than True/False.
df_1hot = pd.get_dummies(df_all['class'], prefix='class').astype(int)
print('1-hot encoded classes:')
print(df_1hot)

1-hot encoded classes:
   class_a  class_b
0        1        0
1        0        1
2        0        1
3        1        0
4        1        0


In [0]:
# The one-hot columns replace the original category column, so drop 'class' first
df_without_class = df_all.drop('class', axis='columns')
print('Original dataframe:')
print(df_without_class)
print('---')

# Attach the one-hot columns with a left join; rows are matched on the index
print('Final dataframe:')
df_final = df_without_class.join(df_1hot, how='left')
print(df_final)

Original dataframe:
   weight  height
0      10     100
1      20     200
2      30     300
3      40     400
4      50     500
---
Final dataframe:
   weight  height  class_a  class_b
0      10     100        1        0
1      20     200        0        1
2      30     300        0        1
3      40     400        1        0
4      50     500        1        0
