# Pandas examples: building DataFrames and inspecting basic info

In [1]:
# Setup
import pandas as pd
import os

# Sample data shared by the cells below: five consecutive integers per measurement
list_weight = list(range(1, 6))    # 1..5
list_height = list(range(6, 11))   # 6..10

## Build dataframe from list of lists using from_items

In [2]:
# Collect the columns as [name, values] pairs, in the order they should appear
list_all = []
list_all.append(["class", ['a','b','b','a','a']])
list_all.append(["weight", [10 * v for v in list_weight]])
list_all.append(["height", [100 * v for v in list_height]])

# DataFrame.from_items was deprecated in pandas 0.23 and removed in 1.0.
# Turning the [name, values] pairs into a dict gives the same frame; dicts keep
# insertion order, so the column order matches the order of the pairs.
df = pd.DataFrame(dict(list_all))

# Keep an untouched copy: later cells modify/rebind `df` and rebuild from `df_all`
df_all = df.copy()
print(df.to_string())

  class  weight  height
0     a      10     600
1     b      20     700
2     b      30     800
3     a      40     900
4     a      50    1000


In [3]:
# Get more detailed info, including column types.
# DataFrame.info() prints its report itself and returns None, which is why the
# printed output ends with a "None" line.
info_return = df.info()
print(info_return)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 3 columns):
class     5 non-null object
weight    5 non-null int64
height    5 non-null int64
dtypes: int64(2), object(1)
memory usage: 200.0+ bytes
None


In [4]:
# Get distribution summary (count, mean, std, min, quartiles, max) of the numeric columns
summary_stats = df.describe()
print(summary_stats)

          weight       height
count   5.000000     5.000000
mean   30.000000   800.000000
std    15.811388   158.113883
min    10.000000   600.000000
25%    20.000000   700.000000
50%    30.000000   800.000000
75%    40.000000   900.000000
max    50.000000  1000.000000


In [5]:
# Convert the column index to a plain Python list of column names
column_names = df.columns.tolist()
print(column_names)

['class', 'weight', 'height']


## Cumulative counts and sort by values

In [6]:
# Cumulative count of class column: number each row within its class group,
# starting at 0 and following the current row order
rows_by_class = df.groupby('class')
df['class_cumcount'] = rows_by_class.cumcount()
print(df)

  class  weight  height  class_cumcount
0     a      10     600               0
1     b      20     700               0
2     b      30     800               1
3     a      40     900               1
4     a      50    1000               2


In [7]:
# Sort by multiple values: class ascending, then weight descending within each class
sort_columns = ['class', 'weight']
sort_ascending = [True, False]
df = df.sort_values(by=sort_columns, ascending=sort_ascending)
print(df)

  class  weight  height  class_cumcount
4     a      50    1000               2
3     a      40     900               1
0     a      10     600               0
2     b      30     800               1
1     b      20     700               0


## Do explicit type conversion

In [8]:
# Show the first row and the element type before the conversion
print('Before:')
print(df.head(1))
print("Type of item in height column: ", type(df['height'].iloc[0]))

# Cast both measurement columns to float in one astype call; other columns are untouched
df = df.astype({'weight': float, 'height': float})

# Show the same row and element type after the conversion
print("------------")
print('After:')
print(df.head(1))
print("Type of item in height column: ", type(df['height'].iloc[0]))

Before:
  class  weight  height  class_cumcount
4     a      50    1000               2
Type of item in height column:  <class 'numpy.int64'>
------------
After:
  class  weight  height  class_cumcount
4     a    50.0  1000.0               2
Type of item in height column:  <class 'numpy.float64'>


In [9]:
# Print sorted value counts by class groups.
# The counts go into their own variable instead of overwriting `df`: rebinding
# `df` to the resulting Series discarded the DataFrame and made this cell fail
# when run a second time (the Series has no 'class' entry to look up).
class_counts = df['class'].value_counts()
print(class_counts.sort_values(ascending=True))

b    2
a    3
Name: class, dtype: int64


## Build dataframe from list of group values and sort

### Using a for loop:

In [10]:
# Collect one [class, mean weight, mean height] row per class group
list_grp = []
for class_name, class_rows in df_all.groupby('class'):
    list_grp.append([
        class_name,
        class_rows['weight'].mean(),
        class_rows['height'].mean(),
    ])

# Assemble the rows into a frame, order it by mean height and renumber the index from 0
summary_columns = ['class', 'weight', 'height']
df = (
    pd.DataFrame(list_grp, columns=summary_columns)
    .sort_values('height')
    .reset_index(drop=True)
)

# Show the input frame, a blank line, then the per-class summary
print(df_all)
print()
print(df)

  class  weight  height
0     a      10     600
1     b      20     700
2     b      30     800
3     a      40     900
4     a      50    1000

  class     weight      height
0     b  25.000000  750.000000
1     a  33.333333  833.333333


### Using agg method:

In [29]:
# Show the input frame first
print(df_all)

# Same per-class means as the loop version, computed with a single agg call;
# here the class labels end up as the index of the result
rows_by_class = df_all.groupby('class')
df = rows_by_class.agg('mean')
print()
print(df)

  class  weight  height
0     a      10     600
1     b      20     700
2     b      30     800
3     a      40     900
4     a      50    1000

          weight      height
class                       
a      33.333333  833.333333
b      25.000000  750.000000


## Read & write dataframes using pickle

In [30]:
# Make sure the test directory exists. exist_ok=True replaces the separate
# "check, then create" step and avoids failing if the directory appears in between.
os.makedirs('test', exist_ok=True)

# Build the file path once so the write and the read cannot drift apart
pickle_path = os.path.join('test', 'df_test.pkl')

print("Write dataframe as a pickle file:")
df.to_pickle(pickle_path)
print(df)

print("------------")
print("Read the pickle file and store as dataframe:")
# NOTE: unpickling can execute arbitrary code; only read pickle files from a trusted source
# (here it is the file written just above).
df2 = pd.read_pickle(pickle_path)
print(df2)

Write dataframe as a pickle file:
          weight      height
class                       
a      33.333333  833.333333
b      25.000000  750.000000
------------
Read the pickle file and store as dataframe:
          weight      height
class                       
a      33.333333  833.333333
b      25.000000  750.000000


## Convert a category column to one-hot encoded columns

In [36]:
# Do one-hot encoding: one indicator column per distinct class value,
# each named with the 'class' prefix (class_a, class_b)
class_column = df_all['class']
df_1hot = pd.get_dummies(class_column, prefix='class')
print(df_1hot)

   class_a  class_b
0      1.0      0.0
1      0.0      1.0
2      0.0      1.0
3      1.0      0.0
4      1.0      0.0


In [37]:
# Now that the one-hot columns are built, drop the original category column.
# drop() returns a new frame, so df_all itself keeps its 'class' column.
df = df_all.drop(columns=['class'])
print(df)

   weight  height
0      10     600
1      20     700
2      30     800
3      40     900
4      50    1000


In [35]:
# Join on index.
# The left side is rebuilt from df_all (minus the raw 'class' column) instead of
# reusing the current `df`, so this cell gives the same result every time it runs:
# joining df_1hot onto a `df` that already contains class_a/class_b raises an
# overlapping-columns error.
df = df_all.drop('class', axis=1).join(df_1hot, how='left')
print(df)

   weight  height  class_a  class_b
0      10     600      1.0      0.0
1      20     700      0.0      1.0
2      30     800      0.0      1.0
3      40     900      1.0      0.0
4      50    1000      1.0      0.0
