# Data Wrangling

This template contains instructions for learning more about data wrangling.

This file also describes a machine learning template for data cleaning.

# Objective 

The following aspects of data wrangling are described in this template:
1)  Database-style DataFrame merges
2)  Merging of DataFrames on Index
3)  Concatenating, binding or stacking along an axis
4)  Combining Data with Overlap
5)  Reshaping and Pivoting
6)  Removing duplicates
7)  Transforming data using a Function or Mapping
8)  Replacing values

In [None]:
# Importing libraries: every dataset cell below refers to pandas as `pd`
# and NumPy as `np`, so both aliases must exist before those cells run.
import numpy as np
import pandas as pd

# Database-style DataFrame merges

In [None]:
# Merge or join operations combine data sets by linking rows using one or more keys. 
#These operations are central to relational databases.

In [None]:
# Dummy datasets for the key-based merge example.
# Both frames share a 'key' column: df1 repeats keys across rows, df2 has one
# row per key, and each side holds a key the other lacks ('c' only in df1,
# 'd' only in df2). They are bound to names so later cells can merge them.
df1 = pd.DataFrame({
    'key': ['b', 'b', 'a', 'c', 'a', 'a', 'b'],
    'data1': range(7),
})
df2 = pd.DataFrame({
    'key': ['a', 'b', 'd'],
    'data2': range(3),
})

In [None]:
# Merging two different dataframes on the basis of a key whose name is different in two dataframes

In [None]:
# Dummy datasets whose join column is named differently on each side
# ('lkey' in df3, 'rkey' in df4). They are bound to names so the merge in the
# following cell can reference them.
df3 = pd.DataFrame({
    'lkey': ['b', 'b', 'a', 'c', 'a', 'a', 'b'],
    'data1': range(7),
})
df4 = pd.DataFrame({
    'rkey': ['a', 'b', 'd'],
    'data2': range(3),
})

In [None]:
# Different types of join like inner, outer, left, right

In [None]:
# Dummy datasets for the join-type examples (inner, outer, left, right).
# Each frame carries two key columns, 'key1' and 'key2', plus one value
# column ('lval' on the left, 'rval' on the right).
left = pd.DataFrame({
    'key1': ['foo', 'foo', 'bar'],
    'key2': ['one', 'two', 'one'],
    'lval': [1, 2, 3],
})
right = pd.DataFrame({
    'key1': ['foo', 'foo', 'bar', 'bar'],
    'key2': ['one', 'one', 'one', 'two'],
    'rval': [4, 5, 6, 7],
})

# Merging of DataFrames on Index

In [None]:
# In some cases, the merge key or keys in a DataFrame will be found in its index. In this case, 
# you can pass left_index=True or right_index=True (or both) to indicate that the index should be used as the merge key

In [None]:
# Dummy datasets for merging a column against an index.
# left1 keeps its merge key in the 'key' column, while right1 keeps the
# matching labels ('a', 'b') in its index.
left1 = pd.DataFrame({
    'key': ['a', 'b', 'a', 'a', 'b', 'c'],
    'value': range(6),
})
right1 = pd.DataFrame({'group_val': [3.5, 7]}, index=['a', 'b'])

In [None]:
# Using the indexes of both sides of the merge is also not an issue:

In [None]:
# Dummy datasets for an index-on-index merge.
# The two indexes overlap only partly: 'c' and 'e' appear in both, 'a' only
# in left2, and 'b' and 'd' only in right2.
left2 = pd.DataFrame(
    [[1., 2.], [3., 4.], [5., 6.]],
    index=['a', 'c', 'e'],
    columns=['Ohio', 'Nevada'],
)
right2 = pd.DataFrame(
    [[7., 8.], [9., 10.], [11., 12.], [13, 14]],
    index=['b', 'c', 'd', 'e'],
    columns=['Missouri', 'Alabama'],
)

In [None]:
# For simple index-on-index merges, you can pass a list of DataFrames to join as an alternative to using the more general concat function

In [None]:
# Dummy datasets for joining several frames on their indexes at once.
# left2 and right2 hold the same data as in the index-on-index merge example;
# `another` is a third frame indexed by 'a', 'c', 'e', 'f'.
left2 = pd.DataFrame(
    [[1., 2.], [3., 4.], [5., 6.]],
    index=['a', 'c', 'e'],
    columns=['Ohio', 'Nevada'],
)
right2 = pd.DataFrame(
    [[7., 8.], [9., 10.], [11., 12.], [13, 14]],
    index=['b', 'c', 'd', 'e'],
    columns=['Missouri', 'Alabama'],
)
another = pd.DataFrame(
    [[7., 8.], [9., 10.], [11., 12.], [16., 17.]],
    index=['a', 'c', 'e', 'f'],
    columns=['New York', 'Oregon'],
)

# Concatenating, binding or stacking along an axis

In [None]:
# NumPy has a concatenate function for doing this with raw NumPy arrays

In [None]:
# Dummy dataset for the NumPy concatenate example: the integers 0..11 laid
# out as a 3x4 array, bound to a name so it can be passed to np.concatenate.
arr = np.arange(12).reshape((3, 4))

In [None]:
# Concatenation using the pandas concat function

In [None]:
# Dummy datasets for the Series concatenation example: three Series whose
# index labels ('a'..'g') do not overlap.
s1 = pd.Series([0, 1], index=['a', 'b'])
s2 = pd.Series([2, 3, 4], index=['c', 'd', 'e'])
s3 = pd.Series([5, 6], index=['f', 'g'])

In [None]:
# Applying the concatenation concept to DataFrames

In [None]:
# Dummy datasets for the DataFrame concatenation example.
# frame1 is indexed by 'a', 'b', 'c'; frame2 only by 'a' and 'c', and the two
# frames have no column names in common.
frame1 = pd.DataFrame(
    np.arange(6).reshape(3, 2),
    index=['a', 'b', 'c'],
    columns=['one', 'two'],
)
frame2 = pd.DataFrame(
    5 + np.arange(4).reshape(2, 2),
    index=['a', 'c'],
    columns=['three', 'four'],
)

# Combining Data with Overlap

In [None]:
# Another data combination situation can’t be expressed as either a merge or concatenation operation. 
# You may have two datasets whose indexes overlap in full or part.

In [None]:
# Dummy datasets for the overlapping-data example: two Series sharing the
# same index labels. `a` has NaN gaps; `b` is fully populated (0.0 .. 5.0).
# `a` must be bound to a name because `b` is sized with len(a); without the
# binding that expression raises NameError.
a = pd.Series(
    [np.nan, 2.5, np.nan, 3.5, 4.5, np.nan],
    index=['f', 'e', 'd', 'c', 'b', 'a'],
)
b = pd.Series(
    np.arange(len(a), dtype=np.float64),
    index=['f', 'e', 'd', 'c', 'b', 'a'],
)

# Reshaping and Pivoting

In [None]:
#Stacking & Unstacking a nested dataframe

In [1]:
# Dummy dataset for the stack/unstack example: a 2x3 frame holding 0..5 whose
# row index is named 'state' and whose column index is named 'number'.
state_data = pd.DataFrame(
    np.arange(6).reshape((2, 3)),
    index=pd.Index(['Ohio', 'Colorado'], name='state'),
    columns=pd.Index(['one', 'two', 'three'], name='number'),
)

# Removing duplicates

In [None]:
#Removing duplicates present in a DataFrame

In [None]:
# Dummy dataset for the duplicate-removal example: several (k1, k2) pairs
# occur more than once, e.g. ('one', 1) and ('two', 3).
dup_data = pd.DataFrame({
    'k1': ['one'] * 3 + ['two'] * 4,
    'k2': [1, 1, 2, 3, 3, 4, 4],
})

# Transforming data using a Function or Mapping

In [None]:
#We can add a new column in the existing DataFrame with the help of Mapping

In [None]:
# Dummy dataset for the mapping example: one row per food item with its
# weight in the 'ounces' column.
food_data = pd.DataFrame({
    'food': [
        'bacon', 'pulled pork', 'bacon', 'Pastrami', 'corned beef',
        'Bacon', 'pastrami', 'honey ham', 'nova lox',
    ],
    'ounces': [4, 3, 12, 6, 7.5, 8, 3, 5, 6],
})

# Lookup table from food name to source animal.
# NOTE: the keys are all lowercase, while the 'food' column also contains
# capitalised entries ('Pastrami', 'Bacon'), so the column needs its case
# normalised before these keys will match every row.
meat_to_animal = {
    'bacon': 'pig',
    'pulled pork': 'pig',
    'pastrami': 'cow',
    'corned beef': 'cow',
    'honey ham': 'pig',
    'nova lox': 'salmon',
}

# Replacing Values

In [None]:
# replace method provides a simpler and more flexible way to replace values

In [None]:
# Dummy dataset for the replace example: a float Series mixing small values
# with -999 and -1000.
# NOTE(review): -999 / -1000 are presumably sentinel codes for missing data
# that the replace step should target — confirm against the exercise text.
sentinel_data = pd.Series([1., -999., 2., -999., -1000., 3.])