In [None]:
import numpy as np
import pandas as pd

Helper Functions

In [None]:
def make_df(cols, ind):
    """Build a small labelled DataFrame for demonstrations.

    Every cell holds the string made of its column label followed by its
    row label, e.g. column ``'A'`` and row ``0`` give ``'A0'``.

    Parameters
    ----------
    cols : iterable
        Column labels; a string such as ``'ABC'`` yields one column per
        character.
    ind : iterable
        Row labels. Also handed to ``pd.DataFrame`` as the index.

    Returns
    -------
    pandas.DataFrame
        One column per entry of ``cols`` and one row per entry of ``ind``.
    """
    columns = {}
    for col in cols:
        columns[col] = [f"{col!s}{row!s}" for row in ind]
    return pd.DataFrame(columns, index=ind)

In [None]:
# Quick check of the helper: three columns (A, B, C) and three rows (0, 1, 2),
# each cell holding its column label followed by its row label.
make_df('ABC', range(3))

In [None]:
class display(object):
    """Display HTML representation of multiple objects.

    Each positional argument is a *string* containing a Python expression
    (usually just a variable name such as ``'df1'``). The expression text is
    shown as a caption and the object it evaluates to is rendered beside it,
    so several objects can be compared in a single output cell.

    Note: this name shadows ``IPython.display.display`` inside the notebook.

    Parameters
    ----------
    *args : str
        Expressions to evaluate and show, in the order given.
    """

    # One floated panel per expression: {0} is the (escaped) caption and {1}
    # the HTML of the evaluated object. Both <p> and <div> are closed so that
    # consecutive panels sit side by side instead of nesting inside each other.
    template = """<div style="float: left; padding: 10px;">
    <p style='font-family:"Courier New", Courier, monospace'>{0}</p>{1}
    </div>"""

    def __init__(self, *args):
        self.args = args

    @staticmethod
    def _evaluate(expr):
        """Evaluate ``expr`` against the globals of the defining module.

        Passing ``globals()`` explicitly keeps this helper's own local names
        from shadowing a notebook variable that happens to share a name.

        NOTE(review): ``eval`` executes arbitrary code; only pass expressions
        written by the notebook author, never external input.
        """
        return eval(expr, globals())

    def _repr_html_(self):
        """Return one HTML panel per expression, joined by newlines."""
        from html import escape  # local import keeps the class self-contained

        return '\n'.join(
            self.template.format(escape(expr, quote=False),
                                 self._evaluate(expr)._repr_html_())
            for expr in self.args)

    def __repr__(self):
        """Return the plain-text fallback: each expression above its repr."""
        return '\n\n'.join(expr + '\n' + repr(self._evaluate(expr))
                           for expr in self.args)

In [None]:
# Two Series with disjoint integer labels; pd.concat stacks them end to end.
ser1 = pd.Series(list('ABC'), index=[1, 2, 3])
ser2 = pd.Series(list('DEF'), index=[4, 5, 6])
pd.concat([ser1, ser2])

In [None]:
# By default pd.concat stacks DataFrames row-wise.
df1 = make_df(cols='AB', ind=[1, 2])
df2 = make_df(cols='CD', ind=[3, 4])
display('df1', 'df2', 'pd.concat([df1, df2])')

In [None]:
# Same row labels, different columns: axis='columns' concatenates side by side.
df3 = make_df(cols='AB', ind=[1, 2])
df4 = make_df(cols='CD', ind=[1, 2])
display('df3', 'df4', "pd.concat([df3, df4], axis='columns')")

Duplicate Indices

In [None]:
# Give y the same row labels as x so the inputs overlap; pd.concat keeps the
# duplicate index values in its result.
x = make_df(cols='AB', ind=[0, 1])
y = make_df(cols='AB', ind=[2, 3])
y.index = x.index
display('x', 'y', 'pd.concat([x, y])')

In [None]:
# Treat repeated indices as an error: with verify_integrity=True, pd.concat
# raises instead of returning a result with duplicate labels. The exception
# is the point of this cell, so it is caught and printed.
try:
    pd.concat([x, y], verify_integrity=True)
except ValueError as err:
    print("Value Error:", err)

In [None]:
# Ignore the index: ignore_index=True has pd.concat build a new integer index
# for the result instead of reusing the (duplicated) input labels.
display('x', 'y', 'pd.concat([x, y], ignore_index=True)')

In [None]:
# Adding MultiIndex keys: keys=['x', 'y'] labels the rows by the input they
# came from, so the result gets a hierarchical index.
display('x', 'y', "pd.concat([x, y], keys=['x', 'y'])")

Concatenation with Joins

In [None]:
# df5 and df6 share only columns 'B' and 'C'; concatenating them with the
# default settings shows how the non-shared columns are handled.
df5 = make_df(cols='ABC', ind=[1, 2])
df6 = make_df(cols='BCD', ind=[3, 4])
display('df5', 'df6', 'pd.concat([df5, df6])')

In [None]:
# join='inner' keeps only the columns present in both inputs.
display('df5', 'df6', "pd.concat([df5, df6], join='inner')")

In [None]:
# Keep exactly df5's columns: reindex df6 to them before concatenating.
pd.concat([df5, df6.reindex(df5.columns, axis=1)])

The append method

In [None]:
# display('df1', 'df2', 'df1.append(df2)')  # DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; use pd.concat([df1, df2]) instead

Types of Joins

In [None]:
# Two employee tables that share the 'employee' column; df2 lists the same
# employees in a different row order.
df1 = pd.DataFrame(
    {
        'employee': ['Bob', 'Jake', 'Lisa', 'Sue'],
        'group': ['Accounting', 'Engineering', 'Engineering', 'HR'],
    }
)
df2 = pd.DataFrame(
    {
        'employee': ['Lisa', 'Bob', 'Jake', 'Sue'],
        'hire_date': [2004, 2008, 2012, 2014],
    }
)
display('df1', 'df2')

In [None]:
# one-to-one joins: no key is given, so pd.merge joins on the column the two
# frames have in common ('employee')
df3 = pd.merge(df1, df2)
df3

In [None]:
# many-to-one joins: 'group' has repeated values in df3 but is unique in df4,
# so a supervisor is repeated for every df3 row in that group
df4 = pd.DataFrame(
    {
        'group': ['Accounting', 'Engineering', 'HR'],
        'supervisor': ['Carly', 'Guido', 'Steve'],
    }
)
display('df3', 'df4', 'pd.merge(df3, df4)')

In [None]:
# many-to-many joins: 'group' has repeated values in both df1 and df5
df5 = pd.DataFrame(
    {
        'group': ['Accounting', 'Accounting', 'Engineering', 'Engineering', 'HR', 'HR'],
        'skills': ['math', 'spreadsheets', 'software', 'math', 'spreadsheets', 'organization'],
    }
)
display('df1', 'df5', "pd.merge(df1, df5)")

Specification of the Merge Key

In [None]:
# the on keyword: names the key column explicitly. This option works only if
# both the left and right DataFrames have the specified column name.
display('df1', 'df2', "pd.merge(df1, df2, on='employee')")

In [None]:
# the left_on and right_on keywords: the key is called 'employee' in df1 but
# 'name' in df6, so each side names its own key column. The merged frame then
# carries both key columns; the redundant 'name' is dropped.
df6 = pd.DataFrame(
    {
        'name': ['Bob', 'Jake', 'Lisa', 'Sue'],
        'salary': [70000, 80000, 120000, 90000],
    }
)
display('df1', 'df6', "pd.merge(df1, df6, left_on='employee', right_on='name').drop('name', axis=1)")

In [None]:
# Show the two employee tables again before merging on an index instead of a
# column.
display('df1', 'df2')

In [None]:
# Move 'employee' from a column into the index of each frame; df1 and df2
# themselves are left unchanged.
df1a = df1.set_index('employee')
df2a = df2.set_index('employee')
display('df1a', 'df2a')

In [None]:
# the left_index and right_index keywords: use the index of each frame as the
# merge key instead of a column
display('df1a', 'df2a', "pd.merge(df1a, df2a, left_index=True, right_index=True)")

In [None]:
# the join method of the DataFrame: joins on the indices by default, so no
# key arguments are needed here
df1a.join(df2a)

In [None]:
# mixing left_index with right_on OR right_index with left_on: here the key is
# the index of df1a on the left and the 'name' column of df6 on the right
display('df1a', 'df6', "pd.merge(df1a, df6, left_index=True, right_on='name')")

Set Arithmetic for Joins

In [None]:
# Only 'Mary' appears in the 'name' column of both frames. The default merge
# keeps the intersection of the two sets of inputs.
df7 = pd.DataFrame(
    {
        'name': ['Peter', 'Paul', 'Mary'],
        'food': ['fish', 'beans', 'bread'],
    },
    columns=['name', 'food'],
)
df8 = pd.DataFrame(
    {
        'name': ['Mary', 'Joseph'],
        'drink': ['wine', 'beer'],
    },
    columns=['name', 'drink'],
)
display('df7', 'df8', 'pd.merge(df7, df8)')

In [None]:
# inner join: the how='inner' option spelled out explicitly; the result
# contains only the keys present in both inputs
pd.merge(df7, df8, how='inner')

In [None]:
# outer join: the result contains the union of the two sets of inputs, with
# NaN wherever one input has no value for a key
pd.merge(df7, df8, how='outer')

In [None]:
# left join: the result contains every entry of the left input; keys missing
# from the right input get NaN in the right input's columns
pd.merge(df7, df8, how='left')

In [None]:
# right join: the result contains every entry of the right input; keys missing
# from the left input get NaN in the left input's columns
pd.merge(df7, df8, how='right')

Overlapping Column Names: The suffixes keyword

In [None]:
# Both frames have a 'rank' column that is not the merge key; suffixes names
# the two conflicting columns in the result ('rank_L' and 'rank_R').
df9 = pd.DataFrame(
    {
        'name': ['Bob', 'Jake', 'Lisa', 'Sue'],
        'rank': [1, 2, 3, 4],
    }
)
df10 = pd.DataFrame(
    {
        'name': ['Jake', 'Lisa', 'Bob', 'Sue'],
        'rank': [3, 4, 2, 1],
    }
)
pd.merge(df9, df10, on='name', suffixes=['_L', '_R'])