
# Part 6

# Merging

In [1]:
import numpy as np
import pandas as pd
from pandas import Series,DataFrame

In [2]:
# Start from a plain dictionary: one entry per future column.
temp_data = dict(
    customer=['X', 'Z', 'Y', 'Z', 'X', 'X'],
    product_code=np.arange(6),  # integer codes 0..5, one per row
)
temp_data

{'customer': ['X', 'Z', 'Y', 'Z', 'X', 'X'],
 'product_code': array([0, 1, 2, 3, 4, 5])}

In [3]:
# Turn the dictionary into a data frame: keys become column names,
# values become the column contents.
dframe1 = pd.DataFrame(data=temp_data)
dframe1

Unnamed: 0,customer,product_code
0,X,0
1,Z,1
2,Y,2
3,Z,3
4,X,4
5,X,5


In [4]:
# Build the second data frame straight from a dict literal (no intermediate variable).
dframe2 = pd.DataFrame(
    {
        'cust': ['Q', 'Y', 'Z'],
        'state': ['TN', 'KA', 'MH'],
    }
)
dframe2

Unnamed: 0,cust,state
0,Q,TN
1,Y,KA
2,Z,MH


In [5]:
# Inner join (the default `how`): the key column is named differently on each side,
# so name it per side with left_on / right_on.
# R equivalents: merge(x=..., y=..., by=...) or dplyr::inner_join(x, y)
dframe1.merge(dframe2, left_on='customer', right_on='cust')

Unnamed: 0,customer,product_code,cust,state
0,Z,1,Z,MH
1,Z,3,Z,MH
2,Y,2,Y,KA


In [None]:
# Same inner join as above, stored in a variable; the key column of each
# frame is specified explicitly.
my_merged_dframe = dframe1.merge(
    dframe2,
    left_on='customer',
    right_on='cust',
)

In [None]:
# Display the merged frame (the variable is assigned in an earlier cell).
my_merged_dframe

In [6]:
# To join with a single `on=` argument the key column must have the same name
# in both frames, so rename dframe2's 'cust' column to 'customer'.
# The result of rename() is rebound to the name instead of using inplace=True,
# so the original object is not mutated behind the back of earlier cells.
# Re-running is harmless: rename() ignores a missing 'cust' label by default.
dframe2 = dframe2.rename(columns={'cust': 'customer'})

In [7]:
# Display dframe2 to check its current column names.
dframe2

Unnamed: 0,customer,state
0,Q,TN
1,Y,KA
2,Z,MH


In [8]:
# Left join: keep every row of dframe1; rows with no match in dframe2 get NaN.
dframe1.merge(dframe2, how='left', on='customer')

Unnamed: 0,customer,product_code,state
0,X,0,
1,Z,1,MH
2,Y,2,KA
3,Z,3,MH
4,X,4,
5,X,5,


In [None]:
# Right join: keep every row of dframe2, matched or not.
dframe1.merge(dframe2, how='right', on='customer')

In [None]:
# Outer join: keep the rows of both frames, matched or not.
dframe1.merge(dframe2, how='outer', on='customer')

### Many-to-many merge

In [9]:
# First frame for the many-to-many example: the key values repeat (X three times, Z twice).
dframe3 = pd.DataFrame(
    {
        'key': ['X', 'X', 'X', 'Y', 'Z', 'Z'],
        'data_set_3': range(6),
    }
)
dframe3

Unnamed: 0,data_set_3,key
0,0,X
1,1,X
2,2,X
3,3,Y
4,4,Z
5,5,Z


In [10]:
# Second frame for the many-to-many example: keys repeat here as well (Y twice, X twice).
dframe4 = pd.DataFrame(
    {
        'key': ['Y', 'Y', 'X', 'X', 'Z'],
        'data_set_4': range(5),
    }
)
dframe4

Unnamed: 0,data_set_4,key
0,0,Y
1,1,Y
2,2,X
3,3,X
4,4,Z


In [11]:
# No key given: merge uses the column name the two frames share ('key').
# Each key value yields every left/right row combination for that value.
dframe3.merge(dframe4)

Unnamed: 0,data_set_3,key,data_set_4
0,0,X,2
1,0,X,3
2,1,X,2
3,1,X,3
4,2,X,2
5,2,X,3
6,3,Y,0
7,3,Y,1
8,4,Z,4
9,5,Z,4


In [12]:
# Merging with multiple keys: the left-hand frame has two key columns and one data column.
df_left = pd.DataFrame(
    {
        'key1': ['SF', 'SF', 'LA'],
        'key2': ['one', 'two', 'one'],
        'left_data': [10, 20, 30],
    }
)
df_left

Unnamed: 0,key1,key2,left_data
0,SF,one,10
1,SF,two,20
2,LA,one,30


In [13]:
# Right-hand frame: the same two key columns and its own data column.
df_right = pd.DataFrame(
    {
        'key1': ['SF', 'SF', 'LA', 'LA'],
        'key2': ['one', 'one', 'one', 'two'],
        'right_data': [40, 50, 60, 70],
    }
)
df_right

Unnamed: 0,key1,key2,right_data
0,SF,one,40
1,SF,one,50
2,LA,one,60
3,LA,two,70


In [14]:
# Inner join with no key given: all columns common to both frames
# (key1 and key2) are used together as the join key.
df_left.merge(df_right, how='inner')

Unnamed: 0,key1,key2,left_data,right_data
0,SF,one,10,40
1,SF,one,10,50
2,LA,one,30,60


In [None]:
# Inner join with both key columns listed explicitly.
df_left.merge(df_right, how='inner', on=['key1', 'key2'])

In [None]:
# Merge on a single key even though the frames have more than one column
# in common; the other shared column (key2) appears once per side.
df_left.merge(df_right, on='key1')

pandas automatically renames the other columns that have the same name in both data frames, adding the suffixes `_x` and `_y` to show which data frame each one came from.

Or we can specify the suffixes we want:

In [15]:
# Same single-key merge, with custom suffixes for the overlapping column
# instead of the default ones.
df_left.merge(df_right, on='key1', suffixes=('_left', '_right'))

Unnamed: 0,key1,key2_left,left_data,key2_right,right_data
0,SF,one,10,one,40
1,SF,one,10,one,50
2,SF,two,20,one,40
3,SF,two,20,one,50
4,LA,one,30,one,60
5,LA,one,30,two,70


<a name="concatenate"></a>
# Concatenate

In [16]:
# 3x3 array holding 0..8, filled row by row.
arr1 = np.arange(9).reshape(3, 3)
arr1

array([[0, 1, 2],
       [3, 4, 5],
       [6, 7, 8]])

In [17]:
# 3x3 array holding 9..17, filled row by row.
arr2 = np.arange(9, 18).reshape(3, 3)
arr2

array([[ 9, 10, 11],
       [12, 13, 14],
       [15, 16, 17]])

### Concatenate with <i>numpy</i>

In [18]:
# axis=1 places the arrays side by side (column-wise), like cbind in R.
np.concatenate((arr1, arr2), axis=1)

array([[ 0,  1,  2,  9, 10, 11],
       [ 3,  4,  5, 12, 13, 14],
       [ 6,  7,  8, 15, 16, 17]])

In [19]:
# axis=0 stacks the arrays on top of each other (row-wise), like rbind in R.
np.concatenate((arr1, arr2), axis=0)

array([[ 0,  1,  2],
       [ 3,  4,  5],
       [ 6,  7,  8],
       [ 9, 10, 11],
       [12, 13, 14],
       [15, 16, 17]])

### Concatenate with <i>pandas</i>

In [20]:
# First series: values 0..2 labelled T, U, V.
ser1 = Series(data=[0, 1, 2], index=list('TUV'))
ser1

T    0
U    1
V    2
dtype: int64

In [29]:
# Second series: values 3..5; labels T and U overlap with ser1, X does not.
ser2 = Series(data=[3, 4, 5], index=list('TUX'))
ser2

T    3
U    4
X    5
dtype: int64

In [30]:
# Row-wise concatenation (axis=0 is the default): the result is a Series
# and repeated index labels are kept.
pd.concat([ser1, ser2], axis=0)

T    0
U    1
V    2
T    3
U    4
X    5
dtype: int64

In [31]:
# Column-wise concatenation: the result is a data frame aligned on the index
# labels, with NaN where a series has no value for a label.
pd.concat([ser1, ser2], axis='columns')

Unnamed: 0,0,1
T,0.0,3.0
U,1.0,4.0
V,2.0,
X,,5.0


In [32]:
# `keys` adds an outer (hierarchical) index level marking which input
# each row came from.
pd.concat([ser1, ser2], axis=0, keys=['cat1', 'cat2'])

cat1  T    0
      U    1
      V    2
cat2  T    3
      U    4
      X    5
dtype: int64

In [None]:
# When concatenating column-wise, the keys become the column names.
pd.concat([ser1, ser2], axis='columns', keys=['cat1', 'cat2'])

### Works the same way for data frames

In [33]:
from numpy.random import randn # if I don't do this, I can still use function randn by calling np.random.randn

In [34]:
# Rebind dframe1 to a 4x3 frame of random draws with columns X, Y, Z.
# NOTE(review): no random seed is set in the cells shown, so values differ between runs.
dframe1 = pd.DataFrame(data=randn(4, 3), columns=list('XYZ'))
dframe1

Unnamed: 0,X,Y,Z
0,-1.123813,1.39666,0.221301
1,0.76563,-1.148955,-0.153888
2,0.510349,0.833771,-0.941815
3,1.737759,-1.352708,1.748061


In [35]:
# Rebind dframe2 to a 3x3 frame of random draws; columns Y and X overlap
# with dframe1, Q does not.
dframe2 = pd.DataFrame(data=randn(3, 3), columns=list('YQX'))
dframe2

Unnamed: 0,Y,Q,X
0,1.389217,-0.373733,1.197274
1,-2.39179,-0.823822,0.397226
2,-0.260245,1.692877,1.841357


In [36]:
# Row-wise concatenation of data frames: each frame keeps its original
# index labels, and columns missing from one frame are filled with NaN.
pd.concat([dframe1, dframe2], axis=0)

Unnamed: 0,Q,X,Y,Z
0,,-1.123813,1.39666,0.221301
1,,0.76563,-1.148955,-0.153888
2,,0.510349,0.833771,-0.941815
3,,1.737759,-1.352708,1.748061
0,-0.373733,1.197274,1.389217,
1,-0.823822,0.397226,-2.39179,
2,1.692877,1.841357,-0.260245,


In [37]:
# ignore_index=True discards the original index labels and numbers the
# result rows continuously from 0.
pd.concat([dframe1, dframe2], axis=0, ignore_index=True)

Unnamed: 0,Q,X,Y,Z
0,,-1.123813,1.39666,0.221301
1,,0.76563,-1.148955,-0.153888
2,,0.510349,0.833771,-0.941815
3,,1.737759,-1.352708,1.748061
4,-0.373733,1.197274,1.389217,
5,-0.823822,0.397226,-2.39179,
6,1.692877,1.841357,-0.260245,


In [None]:
# Column-wise concatenation of data frames: rows are aligned on the index
# and the columns of both frames are placed side by side.
pd.concat([dframe1, dframe2], axis='columns')

## End of video 6