# Data Wrangling: Merge, Join, Concatenate

In [19]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import nbconvert
from IPython.display import display_html

def display_side_by_side(*args):
    html_str=''
    for df in args:
        html_str+=df.to_html()
    display_html(html_str.replace('table','table style="display:inline"'),raw=True)

# Merge

* The merge method provides SQL-like capabilities to join two DataFrames together.

* The table shows different options for merging data (Reference: Python for Data Analysis)

![alt text](merge-commands.png "Title")

# Self Assessment

## The questions below are intended help you assess yourself.

## How many questions below can you answer correctly, without assistance? 

## Question 1:

In [2]:
## Question 1:
df1 = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'a', 'b'],
                    'data1': range(7)})
df2 = pd.DataFrame({'key': ['a', 'b', 'd'],
                    'data2': range(3)})

print( "\n\n df1:\t\tdf2:  "  + "\n")

question = "pd.merge(df1, df2, on='key')"

display_side_by_side(df1,df2)

print("Question: " + question )

#display_side_by_side(df1,df2,eval(question)) 



 df1:		df2:  



Unnamed: 0,key,data1
0,b,0
1,b,1
2,a,2
3,c,3
4,a,4
5,a,5
6,b,6

Unnamed: 0,key,data2
0,a,0
1,b,1
2,d,2


Question: pd.merge(df1, df2, on='key')


## Question 2:


In [3]:
# Question 2

df3 = pd.DataFrame({'lkey': ['b', 'b', 'a', 'c', 'a', 'a', 'b'],'data1': range(7)})
df4 = pd.DataFrame({'rkey': ['a', 'b', 'd'], 'data2': range(3)})

question = "pd.merge(df3, df4, left_on='lkey', right_on='rkey', sort='True')"

print("\n\n df3:\t\tdf4:  "  + "\n")

display_side_by_side(df3,df4)

print("Question: " + question )
#display_side_by_side(df3,df4,eval(question))



 df3:		df4:  



Unnamed: 0,lkey,data1
0,b,0
1,b,1
2,a,2
3,c,3
4,a,4
5,a,5
6,b,6

Unnamed: 0,rkey,data2
0,a,0
1,b,1
2,d,2


Question: pd.merge(df3, df4, left_on='lkey', right_on='rkey', sort='True')


## Question 3:


In [4]:
# Question 3

df1 = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'a', 'b'], 'data1': range(7)})
df2 = pd.DataFrame({'key': ['a', 'b', 'd'], 'data2': range(3)})

question = "pd.merge(df1, df2, how='outer', sort=True)"

print( "\n\n df1: \t\t df2:  "  + "\n")
display_side_by_side(df1,df2)

print("Question: " + question )

#display_side_by_side(df1,df2,eval(question))



 df1: 		 df2:  



Unnamed: 0,key,data1
0,b,0
1,b,1
2,a,2
3,c,3
4,a,4
5,a,5
6,b,6

Unnamed: 0,key,data2
0,a,0
1,b,1
2,d,2


Question: pd.merge(df1, df2, how='outer', sort=True)


##  Question 4:

In [5]:
# Question 4

df1 = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'b'],
                    'data1': range(6)})
df2 = pd.DataFrame({'key': ['a', 'b', 'a', 'b', 'd'],
                    'data2': range(5)})

question = "pd.merge(df1, df2, on='key', how='left', sort='True')"

print( "\n\n df1: \t\t df2:  "  + "\n")

display_side_by_side(df1,df2)

print("\n\n Question: " + question + "\n\n ")

#display_side_by_side(df1,df2, eval(question))



 df1: 		 df2:  



Unnamed: 0,key,data1
0,b,0
1,b,1
2,a,2
3,c,3
4,a,4
5,b,5

Unnamed: 0,key,data2
0,a,0
1,b,1
2,a,2
3,b,3
4,d,4




 Question: pd.merge(df1, df2, on='key', how='left', sort='True')

 


## Question 5:

In [6]:
# Question 5

question = "pd.merge(df1, df2, how='inner', sort='True')"

print( "\n\n df1: \t\t df2:  "  + "\n")

display_side_by_side(df1,df2)

print("\n\n Question: " + question + "\n\n ")

#display_side_by_side(df1,df2, eval(question))



 df1: 		 df2:  



Unnamed: 0,key,data1
0,b,0
1,b,1
2,a,2
3,c,3
4,a,4
5,b,5

Unnamed: 0,key,data2
0,a,0
1,b,1
2,a,2
3,b,3
4,d,4




 Question: pd.merge(df1, df2, how='inner', sort='True')

 


## Question 6:

In [7]:
# Question 6

left = pd.DataFrame({'key1': ['foo', 'foo', 'bar'],
                     'key2': ['one', 'two', 'one'],
                     'lval': [1, 2, 3]})

right = pd.DataFrame({'key1': ['foo', 'foo', 'bar', 'bar'],
                      'key2': ['one', 'one', 'one', 'two'],
                      'rval': [4, 5, 6, 7]})


question = "pd.merge(left, right, on='key1', suffixes=('_left', '_right'), sort= 'True')"

print("\n\n left: \t\t right:  "  + "\n")
display_side_by_side(left,right)

print("\n\n Question: " + question + "\n\n ")

#display_side_by_side(left,right, eval(question))



 left: 		 right:  



Unnamed: 0,key1,key2,lval
0,foo,one,1
1,foo,two,2
2,bar,one,3

Unnamed: 0,key1,key2,rval
0,foo,one,4
1,foo,one,5
2,bar,one,6
3,bar,two,7




 Question: pd.merge(left, right, on='key1', suffixes=('_left', '_right'), sort= 'True')

 


## Question 7:

In [8]:
# Question 7

left1 = pd.DataFrame({'key': ['a', 'b', 'a', 'a', 'b', 'c'],
                      'value': range(6)})

right1 = pd.DataFrame({'group_val': [3.5, 7]}, index=['a', 'b'])

question = "pd.merge(left1, right1, left_on='key', right_index=True, sort=True)"

print( "\n\n left1: \tright1:  "  + "\n")

display_side_by_side(left1,right1)

print("\n\n\n Question: " + question + "\n")

#display_side_by_side(left1,right1, eval(question))



 left1: 	right1:  



Unnamed: 0,key,value
0,a,0
1,b,1
2,a,2
3,a,3
4,b,4
5,c,5

Unnamed: 0,group_val
a,3.5
b,7.0





 Question: pd.merge(left1, right1, left_on='key', right_index=True, sort=True)



## Question 8:

In [9]:
# Question 8

left = pd.DataFrame({'key1': ['foo', 'foo', 'bar'],
                     'key2': ['one', 'two', 'one'],
                     'lval': [1, 2, 3]})

right = pd.DataFrame({'key1': ['foo', 'foo', 'bar', 'bar'],
                      'key2': ['one', 'one', 'one', 'two'],
                      'rval': [4, 5, 6, 7]})

question = "pd.merge(left, right, on=['key1', 'key2'], how='outer', sort='True')"

print( "\n\n left: \t\t right:  "  + "\n")

display_side_by_side(left,right)

question = "pd.merge(left, right, on=['key1', 'key2'], how='outer', sort='True')"

print("\n\n Question: " + question + "\n\n ")

#display_side_by_side(left,right,eval(question))



 left: 		 right:  



Unnamed: 0,key1,key2,lval
0,foo,one,1
1,foo,two,2
2,bar,one,3

Unnamed: 0,key1,key2,rval
0,foo,one,4
1,foo,one,5
2,bar,one,6
3,bar,two,7




 Question: pd.merge(left, right, on=['key1', 'key2'], how='outer', sort='True')

 


## Question 9:

In [10]:
 # Question 9
    
question = "pd.merge(left1, right1, left_on='key', right_index=True, how='outer', sort=True)"

print("\n\n left1: \t\t right1:  "  + "\n")

display_side_by_side(left1,right1)

print("\n\n Question: " + question + "\n\n ")

#display_side_by_side(left1,right1, eval(question))



 left1: 		 right1:  



Unnamed: 0,key,value
0,a,0
1,b,1
2,a,2
3,a,3
4,b,4
5,c,5

Unnamed: 0,group_val
a,3.5
b,7.0




 Question: pd.merge(left1, right1, left_on='key', right_index=True, how='outer', sort=True)

 


## Question 10:

In [11]:
# Question 10

lefth = pd.DataFrame({'key1': ['Ohio', 'Ohio', 'Ohio',
                               'Nevada', 'Nevada'],
                      'key2': [2000, 2001, 2002, 2001, 2002],
                      'data': np.arange(5.)})

righth = pd.DataFrame(np.arange(12).reshape((6, 2)),
                      index=[['Nevada', 'Nevada', 'Ohio', 'Ohio',
                              'Ohio', 'Ohio'],
                             [2001, 2000, 2000, 2000, 2001, 2002]],
                      columns=['event1', 'event2'])

question = "pd.merge(lefth, righth, left_on=['key1', 'key2'], right_index=True, sort=True)"

print( "\n\n lefth: \t\t righth:  "  + "\n")

display_side_by_side(lefth,righth)

print("\n\n Question: " + question + "\n\n ")

#display_side_by_side(lefth,righth,eval(question))



 lefth: 		 righth:  



Unnamed: 0,key1,key2,data
0,Ohio,2000,0.0
1,Ohio,2001,1.0
2,Ohio,2002,2.0
3,Nevada,2001,3.0
4,Nevada,2002,4.0

Unnamed: 0,Unnamed: 1,event1,event2
Nevada,2001,0,1
Nevada,2000,2,3
Ohio,2000,4,5
Ohio,2000,6,7
Ohio,2001,8,9
Ohio,2002,10,11




 Question: pd.merge(lefth, righth, left_on=['key1', 'key2'], right_index=True, sort=True)

 


## Queston 11:

In [12]:
# Question 11

question = "pd.merge(lefth, righth, left_on=['key1', 'key2'],right_index=True, how='outer', sort=True)"

print("\n\n lefth: \t\t righth:  "  + "\n")

display_side_by_side(lefth,righth)

print("\n\n Question: " + question + "\n\n ")

#display_side_by_side(lefth,righth,eval(question))



 lefth: 		 righth:  



Unnamed: 0,key1,key2,data
0,Ohio,2000,0.0
1,Ohio,2001,1.0
2,Ohio,2002,2.0
3,Nevada,2001,3.0
4,Nevada,2002,4.0

Unnamed: 0,Unnamed: 1,event1,event2
Nevada,2001,0,1
Nevada,2000,2,3
Ohio,2000,4,5
Ohio,2000,6,7
Ohio,2001,8,9
Ohio,2002,10,11




 Question: pd.merge(lefth, righth, left_on=['key1', 'key2'],right_index=True, how='outer', sort=True)

 


## Question 12:

In [13]:
# Question 12

left2 = pd.DataFrame([[1., 2.], [3., 4.], [5., 6.]],
                     index=['a', 'c', 'e'],
                     columns=['Ohio', 'Nevada'])

right2 = pd.DataFrame([[7., 8.], [9., 10.], [11., 12.], [13, 14]],
                      index=['b', 'c', 'd', 'e'],
                      columns=['Missouri', 'Alabama'])


question = "pd.merge(left2, right2, how='outer', left_index=True, right_index=True)"

print( "\n\n left2: \t\t  right2:  "  + "\n")

display_side_by_side(lefth,righth)

print("\n\n Question: " + question + "\n\n ")

#display_side_by_side(left2,right2,eval(question))



 left2: 		  right2:  



Unnamed: 0,key1,key2,data
0,Ohio,2000,0.0
1,Ohio,2001,1.0
2,Ohio,2002,2.0
3,Nevada,2001,3.0
4,Nevada,2002,4.0

Unnamed: 0,Unnamed: 1,event1,event2
Nevada,2001,0,1
Nevada,2000,2,3
Ohio,2000,4,5
Ohio,2000,6,7
Ohio,2001,8,9
Ohio,2002,10,11




 Question: pd.merge(left2, right2, how='outer', left_index=True, right_index=True)

 


# Join

The join aligns a column of one DataFrame to the index of others.

* The index of a DataFrame provides a unique label for each of the rows or columns. 

## Question 13:

In [14]:
# Question 13

left2 = pd.DataFrame([[1., 2.], [3., 4.], [5., 6.]],
                     index=['a', 'c', 'e'],
                     columns=['Ohio', 'Nevada'])

right2 = pd.DataFrame([[7., 8.], [9., 10.], [11., 12.], [13, 14]],
                      index=['b', 'c', 'd', 'e'],
                      columns=['Missouri', 'Alabama'])


question = "left2.join(right2, how='outer')"

print( "\n\n left2: \t\t right2:  "  + "\n")

display_side_by_side(left2,right2)

print("\n\n Question: " + question + "\n\n ")

#display_side_by_side(left2,right2, eval(question))



 left2: 		 right2:  



Unnamed: 0,Ohio,Nevada
a,1.0,2.0
c,3.0,4.0
e,5.0,6.0

Unnamed: 0,Missouri,Alabama
b,7.0,8.0
c,9.0,10.0
d,11.0,12.0
e,13.0,14.0




 Question: left2.join(right2, how='outer')

 


## Question 14:

In [15]:
# Question 14

print( "\n\n left2: \tright2:  "  + "\n")

display_side_by_side(left1,right1)

question = "left1.join(right1, on='key')"

print("\n\n Question: " + question + "\n\n ")

#display_side_by_side(left1,right1, eval(question))



 left2: 	right2:  



Unnamed: 0,key,value
0,a,0
1,b,1
2,a,2
3,a,3
4,b,4
5,c,5

Unnamed: 0,group_val
a,3.5
b,7.0




 Question: left1.join(right1, on='key')

 


## Question 15:

In [16]:
# Question 15

another = pd.DataFrame([[7., 8.], [9., 10.], [11., 12.], [16., 17.]],
                       index=['a', 'c', 'e', 'f'],
                       columns=['New York', 'Oregon'])


display_side_by_side(right2,another)

question = "left2.join([right2, another], sort=True)"

print("\n\n Question: " + question + "\n\n ")

#display_side_by_side(right2,another, eval(question))

Unnamed: 0,Missouri,Alabama
b,7.0,8.0
c,9.0,10.0
d,11.0,12.0
e,13.0,14.0

Unnamed: 0,New York,Oregon
a,7.0,8.0
c,9.0,10.0
e,11.0,12.0
f,16.0,17.0




 Question: left2.join([right2, another], sort=True)

 


# Concatenate  

The concat method can combine any number of DataFrames or Series on either axis.

## Question 16:

In [17]:
# Question 16:

df1 = pd.DataFrame(np.arange(6).reshape(3, 2), index=['a', 'b', 'c'],
                   columns=['one', 'two'])

df2 = pd.DataFrame(5 + np.arange(4).reshape(2, 2), index=['a', 'c'],
                   columns=['three', 'four'])


display_side_by_side(df1,df2)

question = "pd.concat([df1, df2], axis=1, keys=['level1', 'level2'],  sort=True)"

print("\n\n Question: " + question + "\n\n ")

#display_side_by_side(df1,df2,eval(question))

Unnamed: 0,one,two
a,0,1
b,2,3
c,4,5

Unnamed: 0,three,four
a,5,6
c,7,8




 Question: pd.concat([df1, df2], axis=1, keys=['level1', 'level2'],  sort=True)

 


## Question 17:

In [18]:
# Question 17:

display_side_by_side(df1,df2)

question = "pd.concat({'level1': df1, 'level2': df2}, axis=1,  sort=True)"

print("\n\n Question: " + question + "\n\n ")

#display_side_by_side(df1,df2, eval(question))

Unnamed: 0,one,two
a,0,1
b,2,3
c,4,5

Unnamed: 0,three,four
a,5,6
c,7,8




 Question: pd.concat({'level1': df1, 'level2': df2}, axis=1,  sort=True)

 
