# Concatenation

In [None]:
import numpy as np
import pandas as pd

In [2]:
# Column-oriented source data for the first frame: two columns, four rows.
data_one = {
    'A': ['A0', 'A1', 'A2', 'A3'],
    'B': ['B0', 'B1', 'B2', 'B3'],
}

In [3]:
# Column-oriented source data for the second frame: two columns, four rows.
data_two = {
    'C': ['C0', 'C1', 'C2', 'C3'],
    'D': ['D0', 'D1', 'D2', 'D3'],
}

In [4]:
# Build the first frame; each dict key becomes a column.
one = pd.DataFrame(data=data_one)

In [5]:
# Build the second frame; each dict key becomes a column.
two = pd.DataFrame(data=data_two)

In [6]:
one

Unnamed: 0,A,B
0,A0,B0
1,A1,B1
2,A2,B2
3,A3,B3


In [7]:
two

Unnamed: 0,C,D
0,C0,D0
1,C1,D1
2,C2,D2
3,C3,D3


In [8]:
# Collect the frames to combine; this list is what gets passed to pd.concat.
frames = [one, two]

In [13]:
# Concatenate along the columns: the frames are placed side by side and
# aligned on their shared 0-3 index.
columns = pd.concat(frames, axis='columns')

In [14]:
columns

Unnamed: 0,A,B,C,D
0,A0,B0,C0,D0
1,A1,B1,C1,D1
2,A2,B2,C2,D2
3,A3,B3,C3,D3


In [15]:
# Relabel the columns of `two` as A/B so both frames share the same columns.
# This changes `two` in place, and `frames` holds that same object, so the
# row-wise concat that follows stacks the values under A and B instead of
# producing four separate columns.
two.columns = one.columns

In [16]:
two

Unnamed: 0,A,B
0,C0,D0
1,C1,D1
2,C2,D2
3,C3,D3


In [19]:
# Concatenate along the rows: the frames are stacked on top of each other.
# ignore_index=True discards the original (duplicated) 0-3 labels and numbers
# the result 0..n-1, so no separate index reassignment is needed afterwards.
rows = pd.concat(frames, axis=0, ignore_index=True)

In [21]:
rows

Unnamed: 0,A,B
0,A0,B0
1,A1,B1
2,A2,B2
3,A3,B3
4,C0,D0
5,C1,D1
6,C2,D2
7,C3,D3


# MERGE
- Often DataFrames are not in the exact same order or format, meaning we cannot simply concatenate them together.
- We need to merge the DataFrames.
- This is analogous to a JOIN command in SQL.
- The .merge() method takes a keyword argument named `how` that selects the type of merge.
- There are 3 main ways of merging tables together using the how parameter:
    - inner
    - outer
    - left or right

In [22]:
# People who registered for the event, keyed by a registration id.
registrations = pd.DataFrame(
    {
        'reg_id': [1, 2, 3, 4],
        'name': ['Andrew', 'Bobo', 'Claire', 'David'],
    }
)

# People who logged in to the event, keyed by a login id.
logins = pd.DataFrame(
    {
        'log_id': [1, 2, 3, 4],
        'name': ['Xavier', 'Andrew', 'Yolanda', 'Bobo'],
    }
)

In [23]:
registrations

Unnamed: 0,reg_id,name
0,1,Andrew
1,2,Bobo
2,3,Claire
3,4,David


In [24]:
logins

Unnamed: 0,log_id,name
0,1,Xavier
1,2,Andrew
2,3,Yolanda
3,4,Bobo


# INNER MERGE

In [25]:
# Inner merge: keep only the names that appear in both tables, i.e. those who
# registered and logged in to the event.
pd.merge(registrations, logins, on='name', how='inner')

Unnamed: 0,reg_id,name,log_id
0,1,Andrew,2
1,2,Bobo,4


# LEFT AND RIGHT MERGE
- The order of the tables (DataFrames) passed in as arguments does matter here!

In [28]:
# Left merge: keep every row of the left table (registrations); names with no
# match in logins get a missing log_id.
pd.merge(registrations, logins, on='name', how='left')

Unnamed: 0,reg_id,name,log_id
0,1,Andrew,2.0
1,2,Bobo,4.0
2,3,Claire,
3,4,David,


In [29]:
# Right merge: keep every row of the right table (logins); names with no
# match in registrations get a missing reg_id.
pd.merge(registrations, logins, on='name', how='right')

Unnamed: 0,reg_id,name,log_id
0,,Xavier,1
1,1.0,Andrew,2
2,,Yolanda,3
3,2.0,Bobo,4


In [32]:
# The same left merge, with the tables passed by keyword so that the
# left/right roles are spelled out explicitly.
pd.merge(
    left=registrations,
    right=logins,
    on='name',
    how='left',
)

Unnamed: 0,reg_id,name,log_id
0,1,Andrew,2.0
1,2,Bobo,4.0
2,3,Claire,
3,4,David,


# OUTER MERGE
- Setting how="outer" allows us to include every row from either table, whether or not it has a match in the other.
- The order of the tables does not change which rows are included.

In [33]:
# Outer merge: keep every name from either table; whichever id has no match
# on the other side is left missing.
pd.merge(registrations, logins, on='name', how='outer')

Unnamed: 0,reg_id,name,log_id
0,1.0,Andrew,2.0
1,2.0,Bobo,4.0
2,3.0,Claire,
3,4.0,David,
4,,Xavier,1.0
5,,Yolanda,3.0


## Joining on an index instead of a column

In [34]:
# Move 'name' out of the columns and into the index, so the merge below has
# to match an index on one side against a column on the other.
registrations = registrations.set_index('name')

In [35]:
registrations

Unnamed: 0_level_0,reg_id
name,Unnamed: 1_level_1
Andrew,1
Bobo,2
Claire,3
David,4


In [37]:
logins

Unnamed: 0,log_id,name
0,1,Xavier
1,2,Andrew
2,3,Yolanda
3,4,Bobo


In [39]:
# Match the index of registrations (the names) against the 'name' column of
# logins. The result carries the row labels of the matching logins rows.
pd.merge(
    registrations,
    logins,
    how='inner',
    left_index=True,
    right_on='name',
)

Unnamed: 0,reg_id,log_id,name
1,1,2,Andrew
3,2,4,Bobo


## Dealing with two different key column names

In [40]:
# Turn the 'name' index back into an ordinary column with a default integer index.
registrations = registrations.reset_index()

In [46]:
registrations

Unnamed: 0,reg_name,reg_id
0,Andrew,1
1,Bobo,2
2,Claire,3
3,David,4


In [42]:
# Rename the columns so the key column no longer has the same name as the key
# column in logins ('reg_name' vs 'name').
# NOTE(review): the display cell just above (In [46]) has a higher execution
# count than this cell (In [42]), so it was presumably run after this
# assignment -- that would explain why it already shows 'reg_name'.
registrations.columns = ['reg_name', 'reg_id']

In [47]:
registrations

Unnamed: 0,reg_name,reg_id
0,Andrew,1
1,Bobo,2
2,Claire,3
3,David,4


In [48]:
logins

Unnamed: 0,log_id,name
0,1,Xavier
1,2,Andrew
2,3,Yolanda
3,4,Bobo


In [52]:
# The key column has a different name in each table, so name it per side
# with left_on / right_on instead of a single `on`.
results = pd.merge(
    registrations,
    logins,
    left_on='reg_name',
    right_on='name',
    how='inner',
)

In [54]:
# Both key columns survive the merge; show the result without the duplicate
# 'reg_name' one. drop returns a new frame, so `results` keeps the column.
results.drop(columns='reg_name')

Unnamed: 0,reg_id,log_id,name
0,1,2,Andrew
1,2,4,Bobo


In [55]:
registrations

Unnamed: 0,reg_name,reg_id
0,Andrew,1
1,Bobo,2
2,Claire,3
3,David,4


In [56]:
# Rename the columns in place: the key goes back to 'name' and the id column
# gets the generic name 'id'.
registrations.columns = ['name', 'id']

In [57]:
registrations

Unnamed: 0,name,id
0,Andrew,1
1,Bobo,2
2,Claire,3
3,David,4


In [58]:
# Rename 'log_id' to the generic 'id' in place; the key column stays 'name'.
logins.columns = ['id', 'name']

In [59]:
logins

Unnamed: 0,id,name
0,1,Xavier
1,2,Andrew
2,3,Yolanda
3,4,Bobo


In [61]:
# Both tables have an 'id' column besides the key, so suffixes tell the two
# apart in the result: id_reg comes from registrations, id_log from logins.
pd.merge(
    registrations,
    logins,
    on='name',
    how='inner',
    suffixes=('_reg', '_log'),
)

Unnamed: 0,name,id_reg,id_log
0,Andrew,1,2
1,Bobo,2,4
