# Data Assembly

Objectives
* Concatenating Data
* Merging Data sets

## Concatenation

* Stacking the dataframes on top of each other.
* Dataframes are passed as a list

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns

import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Load the three example tables that are combined throughout this notebook.
df1 = pd.read_csv(filepath_or_buffer='../data/concat_1.csv')
df2 = pd.read_csv(filepath_or_buffer='../data/concat_2.csv')
df3 = pd.read_csv(filepath_or_buffer='../data/concat_3.csv')

In [None]:
# (number of rows, number of columns) of df1
df1.shape

In [None]:
# display the contents of df1
df1

In [None]:
# (number of rows, number of columns) of df2
df2.shape

In [None]:
# display the contents of df2
df2

In [None]:
# (number of rows, number of columns) of df3
df3.shape

In [None]:
# display the contents of df3
df3

In [None]:
# pd.concat takes the frames as a single list and, along the index axis
# (the default), stacks them on top of each other.
dfs_list = [df1, df2, df3]
row_concat = pd.concat(objs=dfs_list, axis='index')

In [None]:
# (number of rows, number of columns) of the row-wise concatenation
row_concat.shape

In [None]:
# display the concatenated dataframe
row_concat

In [None]:
# subset the fourth row of the concatenated dataframe
# 2 min

In [None]:
# iloc selects by position, so position 3 is the fourth row regardless of
# the index labels; a single row comes back as a Series.
row_concat.iloc[3]

In [None]:
# create a new_row_series and append to the dataframe
# 3 min

In [None]:
# A plain Series holding the four new values; no index labels or name are given.
new_row_series = pd.Series(data=['n1', 'n2', 'n3', 'n4'])

In [None]:
# check the type of new_row_series
type(new_row_series)

In [None]:
# display the series; no index was passed at creation, so pandas assigns a
# default integer index
new_row_series

In [None]:
# what happened?
# NOTE(review): the Series was created without a name or column labels, so
# concat presumably cannot line its values up with the dataframe's columns
# and adds them under a new column instead of as one new row -- confirm by
# inspecting the output.
pd.concat([row_concat, new_row_series])

In [None]:
# A one-row DataFrame whose column labels are given explicitly lets concat
# line the new values up with columns A-D, which the bare Series could not.
new_row_dataframe = pd.DataFrame(
    data=[['n1', 'n2', 'n3', 'n4']],
    columns=['A', 'B', 'C', 'D'],
)

In [None]:
# check the type of new_row_dataframe
type(new_row_dataframe)

In [None]:
# display the one-row dataframe
new_row_dataframe

In [None]:
# what happened to the index number?
# concat keeps each input's own index labels unless told otherwise, so the
# added row keeps the label it had in new_row_dataframe.
pd.concat([row_concat, new_row_dataframe])

In [None]:
# ignore_index=True drops the inputs' index labels and renumbers the rows
# of the result from 0.
pd.concat(
    objs=[row_concat, new_row_dataframe],
    ignore_index=True,
)

## Append

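To append a single object to a dataframe, the `append` method can handle the task. Note: `DataFrame.append` was deprecated in pandas 1.4 and removed in pandas 2.0; on current pandas versions use `pd.concat` instead.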

In [None]:
# using a DataFrame
# DataFrame.append was removed in pandas 2.0 (deprecated since 1.4);
# pd.concat gives the same result and works on old and new versions alike.
pd.concat([df1, df2], ignore_index=True)

In [None]:
# using a single-row DataFrame
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
# ignore_index is not set, so the added row keeps its own index label.
pd.concat([df1, new_row_dataframe])

In [None]:
# using Python dictionary: one value for each column label
data_dict = dict(zip(['A', 'B', 'C', 'D'], ['dd1', 'dd2', 'dd3', 'dd4']))

# df1.append(data_dict)

In [None]:
# Fix: wrap the dictionary in a one-row DataFrame and concatenate it.
# DataFrame.append was removed in pandas 2.0, so pd.concat is used instead;
# ignore_index=True renumbers the rows of the result from 0.
pd.concat([df1, pd.DataFrame([data_dict])], ignore_index=True)

## Adding Columns

Adding a single column to a dataframe can be done directly without using any specific Pandas function.

In [None]:
# Try to concatenate df1, df2, df3 column-wise (side by side)
# 3 min

In [None]:
# Place the three frames side by side; rows are aligned on their index labels.
col_concat = pd.concat(objs=[df1, df2, df3], axis='columns')

In [None]:
# display the column-wise concatenation result
col_concat

In [None]:
# When concatenating along the columns, ignore_index=True discards the
# original column labels and numbers the columns 0..n-1 instead.
col_concat = pd.concat(
    objs=[df1, df2, df3],
    axis='columns',
    ignore_index=True,
)
col_concat

In [None]:
# concat with join='inner'
# keep the results only when there are matching indices by using join='inner'
pd.concat([df1, df3], axis=1, join='inner')

In [None]:
# Now try to add a new column 'AA'
# 2 min

In [None]:
# Assigning to a new column label adds that column to col_concat in place.
# NOTE(review): the list length must equal the number of rows in col_concat
# (assumed to be 4 here -- confirm against the data).
col_concat['AA'] = ['aa1', 'aa2', 'aa3', 'aa4']
col_concat