
# Part 6

# Merging

In [None]:
import numpy as np
import pandas as pd
from pandas import Series,DataFrame

In [None]:
# Start from a plain dict: one entry per future column.
temp_data = dict(
    customer=list('XZYZXX'),
    product_code=np.arange(6),
)
temp_data

In [None]:
# Turn the dict into a data frame: each key becomes a column.
dframe1 = pd.DataFrame(data=temp_data)
dframe1

In [None]:
# Same idea in a single step: build the dict inline and hand it straight to DataFrame.
dframe2 = pd.DataFrame(
    dict(
        cust=['Q', 'Y', 'Z'],
        state=['TN', 'KA', 'MH'],
    )
)
dframe2

In [None]:
# Inner join (the default `how`): keep only customers present in both frames.
# R equivalents: merge(x=..., y=..., by=...) or dplyr::inner_join(x, y)
dframe1.merge(dframe2, left_on='customer', right_on='cust')

In [None]:
# Keep the joined result; the key column is named explicitly for each side.
my_merged_dframe = dframe1.merge(
    dframe2,
    left_on='customer',
    right_on='cust',
)

In [None]:
# Display the merged frame stored in the previous cell.
my_merged_dframe

In [None]:
# Give both frames the same key name so the shorter `on='customer'` form can be used
# (left_on/right_on, as above, is the alternative when the names differ).
# rename() returns a new frame; rebinding it avoids mutating in place a frame that
# earlier cells have already displayed.
dframe2 = dframe2.rename(columns={'cust': 'customer'})

In [None]:
# Display dframe2 to check that its key column is now called 'customer'.
dframe2

In [None]:
# Left join: every row of dframe1 is kept; state is NaN where dframe2 has no match.
dframe1.merge(dframe2, on='customer', how='left')

In [None]:
# Right join: every row of dframe2 is kept, matched or not.
dframe1.merge(dframe2, on='customer', how='right')

In [None]:
# Outer join: union of the keys from both frames.
dframe1.merge(dframe2, on='customer', how='outer')

### Many-to-many merge

In [None]:
# Left side of the many-to-many example: keys 'X' and 'Z' occur more than once.
dframe3 = pd.DataFrame(
    dict(
        key=list('XXXYZZ'),
        data_set_3=range(6),
    )
)
dframe3

In [None]:
# Right side of the many-to-many example: keys 'Y' and 'X' occur more than once.
dframe4 = pd.DataFrame(
    dict(
        key=list('YYXXZ'),
        data_set_4=range(5),
    )
)
dframe4

In [None]:
# Many-to-many: each key yields every left/right row combination
# (e.g. 3 'X' rows on the left x 2 on the right = 6 rows).
# 'key' is the only column the two frames share, so it is the join key.
dframe3.merge(dframe4, on='key')

In [None]:
# Multi-key merging: the left frame carries two key columns.
df_left = pd.DataFrame(
    dict(
        key1=['SF', 'SF', 'LA'],
        key2=['one', 'two', 'one'],
        left_data=[10, 20, 30],
    )
)
df_left

In [None]:
# Right frame with the same two key columns; ('SF', 'one') appears twice.
df_right = pd.DataFrame(
    dict(
        key1=['SF', 'SF', 'LA', 'LA'],
        key2=['one', 'one', 'one', 'two'],
        right_data=[40, 50, 60, 70],
    )
)
df_right

In [None]:
# No `on` given: pandas joins on every column the frames share (key1 and key2 here).
df_left.merge(df_right, how='inner')

In [None]:
# The same join with the key columns spelled out.
df_left.merge(
    df_right,
    on=['key1', 'key2'],
    how='inner',
)

In [None]:
# Merge on just one key even though the frames have more than one column in common;
# the other shared column (key2) shows up once per side.
df_left.merge(df_right, on='key1')

pandas automatically renames the overlapping non-key columns (here `key2`) by adding the suffixes `_x` and `_y`, so you can tell which data frame each one came from.

<i>Or we can specify the suffixes we want:</i>

In [None]:
# Choose our own suffixes for the overlapping column instead of the default _x / _y.
df_left.merge(
    df_right,
    on='key1',
    suffixes=('_left', '_right'),
)

# Merge on index

In [None]:
# Left frame: the join key is an ordinary column.
df_left = pd.DataFrame(
    dict(
        key=list('XYZXY'),
        data=range(5),
    )
)
df_left

In [None]:
# Right frame: the join key is the index ('X', 'Y'), not a column.
df_right = pd.DataFrame(
    data=dict(group_data=[10, 20]),
    index=list('XY'),
)
df_right

In [None]:
# Match the 'key' column on the left against the index on the right.
df_left.merge(df_right, left_on='key', right_index=True)

In [None]:
# Left frame for the hierarchical example: two key columns plus a float data column.
df_left_hr = pd.DataFrame(
    dict(
        key1=['SF', 'SF', 'SF', 'LA', 'LA'],
        key2=[10, 20, 30, 20, 30],
        data_set=np.arange(5, dtype=float),
    )
)
df_left_hr

In [None]:
# Right frame with a two-level (hierarchical) index: city on the outer level, number on the inner.
df_right_hr = pd.DataFrame(
    data=np.arange(10).reshape(5, 2),
    index=[
        ['LA', 'LA', 'SF', 'SF', 'SF'],
        [20, 10, 10, 10, 20],
    ],
    columns=['col_1', 'col_2'],
)
df_right_hr

In [None]:
# Pair the two key columns on the left with the two index levels on the right.
df_left_hr.merge(
    df_right_hr,
    left_on=['key1', 'key2'],
    right_index=True,
)

Alternative to merge - join

In [None]:
# join() aligns on the *index* of both frames unless told otherwise. df_left's index is
# 0..4 while df_right's is 'X'/'Y', so a bare df_left.join(df_right) matches nothing and
# group_data comes back all NaN. Passing on='key' matches df_left's 'key' column against
# df_right's index; how='inner' (join defaults to 'left') reproduces
# pd.merge(df_left, df_right, left_on='key', right_index=True).
df_left.join(df_right, on='key', how='inner')

<a name="concatenate"></a>
# Concatenate

In [None]:
# 3x3 array holding 0..8
arr1 = np.arange(9).reshape(3, 3)
arr1

In [None]:
# 3x3 array holding 9..17
arr2 = np.arange(9, 18).reshape(3, 3)
arr2

### Concatenate with <i>numpy</i>

In [None]:
# Glue the arrays side by side (column-wise), like cbind in R.
np.concatenate((arr1, arr2), axis=1)

In [None]:
# Stack the arrays on top of each other (row-wise), like rbind in R.
np.concatenate((arr1, arr2), axis=0)

### Concatenate with <i>pandas</i>

In [None]:
# First series: three values labelled T, U, V.
ser1 = pd.Series([0, 1, 2], index=list('TUV'))
ser1

In [None]:
# Second series: two values labelled X, Y (no labels shared with ser1).
ser2 = pd.Series([3, 4], index=list('XY'))
ser2

In [None]:
# Row-wise is the default (axis=0); the result is still a Series.
pd.concat([ser1, ser2], axis=0)

In [None]:
# Column-wise (axis=1): each Series becomes a column, so the result is a data frame.
pd.concat(
    [ser1, ser2],
    axis=1,
)

In [None]:
# `keys` labels each input, adding an outer (hierarchical) index level.
pd.concat(
    [ser1, ser2],
    keys=['cat1', 'cat2'],
)

In [None]:
# When concatenating column-wise, the keys are used as the column names instead.
pd.concat(
    [ser1, ser2],
    axis=1,
    keys=['cat1', 'cat2'],
)

### Works the same way for data frames

In [None]:
from numpy.random import randn # if I don't do this, I can still use function randn by calling np.random.randn

In [None]:
# Seed the global NumPy generator (the one randn draws from) inside this cell, so the
# values shown are identical on every run, including a re-run of this cell alone.
np.random.seed(0)
# 4 rows x 3 columns of standard-normal draws.
dframe1 = pd.DataFrame(randn(4, 3), columns=['X', 'Y', 'Z'])
dframe1

In [None]:
# Seed inside the cell (a different seed from dframe1's) so the displayed values are
# reproducible on every run.
np.random.seed(1)
# 3 rows x 3 columns; shares columns X and Y with dframe1, while Q is new and Z is absent.
dframe2 = pd.DataFrame(randn(3, 3), columns=['Y', 'Q', 'X'])
dframe2

In [None]:
# Row-wise concat keeps each frame's original index labels, so labels repeat.
pd.concat([dframe1, dframe2], axis=0)

In [None]:
# Drop the original index labels and number the rows afresh (0, 1, 2, ...).
pd.concat(
    [dframe1, dframe2],
    ignore_index=True,
)

In [None]:
# Column-wise concat: the frames are placed side by side, with rows lined up on the index.
pd.concat(
    [dframe1, dframe2],
    axis=1,
)

## End of video 6