In [76]:
import numpy as np
import pandas as pd
import string

# shared constants reused by the cells below
newline = '\n'
# the 26 lowercase ASCII letters as a tuple of one-character strings
alphabet = tuple(string.ascii_lowercase)
print(alphabet[:10])

('a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j')


In [77]:
# Series basics: labelled values, addressable by label, with the labels
# exposed as .index and the raw data as .values
s = pd.Series(['a', 1], index=['key', 'val'])
(s['key'], s['val'], s.index, s.values)

('a', 1, Index(['key', 'val'], dtype='object'), array(['a', 1], dtype=object))

In [79]:
# Build a DataFrame from a dict mapping column name -> sequence of values
frame_spec = {'key': alphabet[:4], 'val': np.arange(1, 5)}
df = pd.DataFrame(frame_spec)
# a single column and a single row are both returned as pd.Series
val_column = df['val']
third_row = df.loc[2]
print(type(val_column), type(third_row))
# plain [] indexing on a DataFrame selects a column
print(val_column, newline)
# .loc indexing selects a row by its index label
print(third_row, newline)
# DataFrame.head() shows the first rows of the frame
df.head()

<class 'pandas.core.series.Series'> <class 'pandas.core.series.Series'>
0    1
1    2
2    3
3    4
Name: val, dtype: int64 

key    c
val    3
Name: 2, dtype: object 



Unnamed: 0,key,val
0,a,1
1,b,2
2,c,3
3,d,4


In [85]:
# A single cell can be reached row-first (.iloc[row][column]) or
# column-first ([column][row label]); both routes give the same value
df = pd.DataFrame({'key': alphabet[:4], 'val': np.arange(1, 5)})
second_row = df.iloc[1]
val_column = df['val']
df, second_row['val'], val_column[1]

(  key  val
 0   a    1
 1   b    2
 2   c    3
 3   d    4,
 2,
 2)

In [97]:
# A NumPy ufunc applied to a Series returns a Series (index and name kept);
# applied to the underlying .values it returns a plain ndarray.
# NOTE: despite its name, the 'deg' column holds radians (0 .. 2*pi).
angles = np.linspace(0, 2 * np.pi, 6)
df = pd.DataFrame({'key': alphabet[:6], 'deg': angles})
deg_column = df['deg']
np.sin(deg_column), np.sin(deg_column.values)

(0    0.000000e+00
 1    9.510565e-01
 2    5.877853e-01
 3   -5.877853e-01
 4   -9.510565e-01
 5   -2.449294e-16
 Name: deg, dtype: float64,
 array([ 0.00000000e+00,  9.51056516e-01,  5.87785252e-01, -5.87785252e-01,
        -9.51056516e-01, -2.44929360e-16]))

In [105]:
# Overwrite columns with transformed versions of themselves
df = pd.DataFrame({
    'key': alphabet[:3],
    'val1': np.arange(1, 4),
    'val2': np.arange(1, 4),
    'val3': np.arange(1, 4),
    'val4': np.arange(1, 4),
})
# true division: the integer column becomes a float column
df['val2'] = df['val2'] / 2
# several columns can be updated in one statement
doubled = ['val1', 'val3']
df[doubled] = df[doubled] * 2
# element-wise NumPy function written back into the same column
df['val4'] = np.exp(df['val4'])
df

Unnamed: 0,key,val1,val2,val3,val4
0,a,2,0.5,2,2.718282
1,b,4,1.0,4,7.389056
2,c,6,1.5,6,20.085537


In [110]:
# Overwrite columns by broadcasting a scalar or a single row of values
df = pd.DataFrame({
    'key': alphabet[:3],
    'val1': np.arange(1, 4),
    'val2': np.arange(1, 4),
    'val3': np.arange(2, 5),
    'val4': np.arange(2, 5),
})
# a scalar is broadcast to every cell of the selected columns
df[['val1', 'val3']] = 0
# row 0 of the two columns as an ndarray; assigning it back fills each
# column with its own row-0 value
pair = ['val2', 'val4']
first_row_values = df[pair].values[0]
df[pair] = first_row_values
df

Unnamed: 0,key,val1,val2,val3,val4
0,a,0,1,0,2
1,b,0,1,0,2
2,c,0,1,0,2


In [26]:
# Stack two data frames vertically; ignore_index=True renumbers the rows
# 0..n-1 instead of keeping each frame's own row labels
df1 = pd.DataFrame({'key': alphabet[:4], 'val': np.arange(1, 5)})
df2 = pd.DataFrame({'key': alphabet[4:8], 'val': np.arange(5, 9)})
pd.concat([df1, df2], ignore_index=True)

Unnamed: 0,key,val
0,a,1
1,b,2
2,c,3
3,d,4
4,e,5
5,f,6
6,g,7
7,h,8


In [74]:
# Walk over the rows: iterrows() yields (index label, row as a Series)
df = pd.DataFrame({'key': alphabet[:3], 'val': np.arange(1, 4)})
for row_label, row in df.iterrows():
    cells = [f"  {column}: {value}  " for column, value in row.items()]
    print(row_label, *cells)

0   key: a     val: 1  
1   key: b     val: 2  
2   key: c     val: 3  


In [28]:
# Start from a DataFrame that has column labels but no rows
columns = ['key', 'val1', 'val2']
df = pd.DataFrame(columns=columns)
print(df, newline)
# add rows one at a time: len(df) is the next unused integer label
for new_row in (['a', 1, 1], ['b', 2, 2]):
    df.loc[len(df)] = new_row
print(df, newline)
# attach a new column by assigning one value per existing row
df['str'] = ['abba', 'zetta']
print(df)

Empty DataFrame
Columns: [key, val1, val2]
Index: [] 

  key val1 val2
0   a    1    1
1   b    2    2 

  key val1 val2    str
0   a    1    1   abba
1   b    2    2  zetta


In [50]:
# Wrap a Series in a list to get a one-row DataFrame
# (the Series index becomes the column labels)
s = pd.Series(['a', 1], index=['key', 'val'])
pd.DataFrame([s])

Unnamed: 0,key,val
0,a,1


In [30]:
# Distinct values of one column, returned as an ndarray
df = pd.DataFrame({'key': alphabet[:8], 'val': np.arange(1, 9)})
val_column = df['val']
val_column.unique()

array([1, 2, 3, 4, 5, 6, 7, 8])

In [31]:
# Filter rows with a boolean mask built from a column comparison
data = {
    'key': alphabet[:8],
    'val': [1, 2, 2, 4, 2, 6, 7, 8],
}
df = pd.DataFrame(data)
is_two = df['val'] == 2
df[is_two]

Unnamed: 0,key,val
1,b,2
2,c,2
4,e,2


In [31]:
# Merge two data frames (the pandas counterpart of an SQL JOIN).
# Here the second frame acts as a filter: an inner join keeps only the rows
# of the first frame whose composite key (frame_id, node_id) also appears
# in the second frame.

# See:
# https://pandas.pydata.org/pandas-docs/stable/user_guide/merging.html

data1 = {
    'frame_id': [1, 1, 1, 2, 2, 2, 3, 3, 3],
    'node_id':  [1, 2, 3, 1, 2, 3, 1, 2, 3],
    'x':        [1, 0, 1, 0, 1, 0, 1, 0, 1],
    'y':        [1, 1, 0, 0, 1, 1, 0, 0, 1],
}
data2 = {
    'frame_id': [1, 2, 2, 2, 3, 3],
    'node_id':  [1, 1, 2, 3, 2, 3]
}
df1 = pd.DataFrame(data1)
df2 = pd.DataFrame(data2)
composite_key = ['frame_id', 'node_id']
df1.merge(df2, how='inner', on=composite_key)

Unnamed: 0,frame_id,node_id,x,y
0,1,1,1,1
1,2,1,0,0
2,2,2,1,1
3,2,3,0,1
4,3,2,0,0
5,3,3,1,1


In [20]:
# Look up a single record by a 'composite key' (frame_id, node_id)
data = {
    'frame_id': [1, 1, 1, 2, 2, 2, 3, 3, 3],
    'node_id':  [1, 2, 3, 1, 2, 3, 1, 2, 3],
    'x':        [1, 0, 1, 0, 1, 0, 1, 0, 1],
    'y':        [1, 1, 0, 0, 1, 1, 0, 0, 1],
}
df = pd.DataFrame(data)
# combine one condition per key part into a single boolean mask
key_matches = (df['frame_id'] == 1) & (df['node_id'] == 2)
# first matching row as a Series
s = df[key_matches].iloc[0]
s['x']

0

In [13]:
data = {
    'node_id':  [1, 2, 3],
    'x':        [1, 0, 1],
    'y':        [1, 1, 0],
}
df = pd.DataFrame(data)
# Selecting with a list of column names gives a sub-frame;
# its .values attribute is the same data as a 2-D ndarray.
xy = df[['x', 'y']]
print(xy.values)
xy

[[1 1]
 [0 1]
 [1 0]]


Unnamed: 0,x,y
0,1,1
1,0,1
2,1,0


In [59]:
# Build a long-format (key, variable) table from a dict whose values are
# iterables of differing lengths: one output row per item, labelled with
# the dict key it came from.
data = {
    'animal':  {'dog', 'cat', 'fish'},  # a set: item order is arbitrary
    'color':   ['red', 'blue', 'green', 'yellow'],
    'nothing': [],
    'vehicle': iter(['submarine', 'racecar', 'helicopter']),
}

# orient='index' turns every dict key into a row label; shorter rows are
# padded with None (see the printed frame)
df = pd.DataFrame.from_dict(data, orient='index')
print(df)
# stack() pivots the positional columns into a second index level
stacked = df.stack()
# keep the key level, discard the positional level
df = stacked.to_frame().reset_index().drop(columns='level_1')
df.columns = ['key', 'variable']
df

                 0        1           2       3
animal        fish      dog         cat    None
color          red     blue       green  yellow
nothing       None     None        None    None
vehicle  submarine  racecar  helicopter    None


Unnamed: 0,key,variable
0,animal,fish
1,animal,dog
2,animal,cat
3,color,red
4,color,blue
5,color,green
6,color,yellow
7,vehicle,submarine
8,vehicle,racecar
9,vehicle,helicopter


In [6]:
# Build a small DataFrame with named columns and the default integer row
# index; the key 'a' appears twice on purpose
columns = ['key', 'val1', 'val2']
data = [['a', 1, 1], ['b', 2, 2], ['a', 1, 1]]
df = pd.DataFrame(data, columns=columns)
print(df)
print()

# boolean-mask selection on a key that is present
rows_a = df[df['key'] == 'a']
print("Selecting rows where key column contains 'a'")
print(rows_a)
print("Number of rows:", len(rows_a))
print(isinstance(rows_a, pd.DataFrame))
# membership is tested against the column's values, not its index
print("'a' in df['key']?", 'a' in df['key'].values )
print()

# the same selection on a key that is absent: still a DataFrame, zero rows
rows_c = df[df['key'] == 'c']
print("Selecting rows where key column contains 'c'")
print("Gives empty data frame")
print(rows_c)
print("Number of rows:", len(rows_c))
print(isinstance(rows_c, pd.DataFrame))
print("'c' in df['key']?", 'c' in df['key'].values )
print()

# first matching row, returned as a Series
print(rows_a.iloc[0])

  key  val1  val2
0   a     1     1
1   b     2     2
2   a     1     1

Selecting rows where key column contains 'a'
  key  val1  val2
0   a     1     1
2   a     1     1
Number of rows: 2
True
'a' in df['key']? True

Selecting rows where key column contains 'c'
Gives empty data frame
Empty DataFrame
Columns: [key, val1, val2]
Index: []
Number of rows: 0
True
'c' in df['key']? False

key     a
val1    1
val2    1
Name: 0, dtype: object


In [91]:
# Start from an empty DataFrame that only declares its columns, then add
# rows under string index labels and look data up by label.
columns = ['val1', 'val2']
df = pd.DataFrame(columns=columns)
print(df)
# append a row to the end of DataFrame: assigning to a row label that does
# not exist yet creates that row
df.loc['a'] = [1, 1]
df.loc['b'] = [2, 2]
df['str'] = ['abba', 'zetta']
print(df)
print("check that ")
print("Index 'b' exists?", 'b' in df.index)
print('c' in df.index)
try:
    # Use .loc so the lookup targets the ROW label 'c'. Plain df['c'] is a
    # COLUMN lookup and would raise KeyError for an unrelated reason.
    df.loc['c']
except KeyError as e:
    print('KeyError', e)
print()

print("Locate a row in a data frame")
print(df.loc['a'])
print()

print("Locate a column in a data frame")
print(df['str'])
print(df['str'].at['a'])
print()

print("Locate a single cell in a data frame")
# The .at indexer is used like .loc, but it
# can only be used to access cells, not entire axes
print(df.at['a', 'val2'])
print(df.loc['a', 'val2'])
print(df.loc['a'].at['val1'])
print(df.loc['a'].loc['val1'])

Empty DataFrame
Columns: [val1, val2]
Index: []
  val1 val2    str
a    1    1   abba
b    2    2  zetta
check that 
Index 'b' exists? True
False
KeyError 'c'

Locate a row in a data frame
val1       1
val2       1
str     abba
Name: a, dtype: object

Locate a column in a data frame
a     abba
b    zetta
Name: str, dtype: object
abba
abba

Locate a single cell in a data frame
1
1
1
1


In [26]:
# Append a 'Total' row holding the per-column sums.
columns = ['val1', 'val2']
df = pd.DataFrame(columns=columns)
for label, row_values in (('a', [1, 1]), ('b', [2, 3]), ('c', [3, 9])):
    df.loc[label] = row_values
# DataFrame.sum() gives a pd.Series with one entry per column
totals = df.sum().astype(int)
print(totals)
print()
# assigning the Series to a new row label adds it as the last row
df.loc["Total"] = totals
print(df)

val1     6
val2    13
dtype: int64

      val1 val2
a        1    1
b        2    3
c        3    9
Total    6   13


In [108]:
# Build a DataFrame with named columns and str row labels, then add a new
# row one cell at a time with the .at indexer
columns = ['val1', 'val2', 'str']
data = [[1, 3, 'abba'], [2, 4, 'zetta']]
index = ['a', 'b']
df = pd.DataFrame(data, columns=columns, index=index)
print(df)
# Writing the first cell of the not-yet-existing row 'c' creates the row.
# Note in the second printout that the integer columns are shown as
# floats after the row has been added this way.
for column, value in zip(columns, [6, 7, 'qwerty']):
    df.at['c', column] = value
print(df)

   val1  val2    str
a     1     3   abba
b     2     4  zetta
   val1  val2     str
a   1.0   3.0    abba
b   2.0   4.0   zetta
c   6.0   7.0  qwerty
