# Mergin DataFrames

### Population DataFrame

In [1]:
import pandas as pd
population = pd.read_csv('pa_zipcode_population.csv')

population

Unnamed: 0,Zipcode,2010 Census Popuation
0,16855,282
1,15681,5241
2,18657,11985
3,17307,5899
4,15635,220


### Cities DataFrame


In [2]:
cities = pd.read_csv('pa_zipcode_city.csv') 

cities

Unnamed: 0,Zipcode,City,State
0,17545,MANHEIM,PA
1,18455,PRESTON PARK,PA
2,17307,BIGLERVILLE,PA
3,15705,INDIANA,PA
4,16833,CURWENSVILLE,PA
5,16220,CROWN,PA
6,18618,HARVEYS LAKE,PA
7,16855,MINERAL SPRINGS,PA
8,16623,CASSVILLE,PA
9,15635,HANNASTOWN,PA


### Merging

In [3]:
pd.merge(population, cities)

Unnamed: 0,Zipcode,2010 Census Popuation,City,State
0,16855,282,MINERAL SPRINGS,PA
1,15681,5241,SALTSBURG,PA
2,18657,11985,TUNKHANNOCK,PA
3,17307,5899,BIGLERVILLE,PA
4,15635,220,HANNASTOWN,PA


### Medal DataFrames


In [4]:
bronze = pd.read_csv('Summer Olympic medals/Bronze.csv')
gold = pd.read_csv('Summer Olympic medals/Gold.csv')
how
print(bronze.head(5))
print()
print(gold.head(5))

   NOC         Country   Total
0  USA   United States  1052.0
1  URS    Soviet Union   584.0
2  GBR  United Kingdom   505.0
3  FRA          France   475.0
4  GER         Germany   454.0

   NOC         Country   Total
0  USA   United States  2088.0
1  URS    Soviet Union   838.0
2  GBR  United Kingdom   498.0
3  FRA          France   378.0
4  GER         Germany   407.0


### Merging all columns


In [5]:
pd.merge(bronze,gold)

Unnamed: 0,NOC,Country,Total
0,ESP,Spain,92.0
1,IRL,Ireland,8.0
2,SYR,Syria,1.0
3,MOZ,Mozambique,1.0
4,SUR,Suriname,1.0
5,PAR,Paraguay,
6,SCG,Serbia,
7,NAM,Namibia,
8,SIN,Singapore,
9,SRI,Sri Lanka,


### Merging on

In [6]:
pd.merge(bronze, gold, on='NOC')

Unnamed: 0,NOC,Country_x,Total_x,Country_y,Total_y
0,USA,United States,1052.0,United States,2088.0
1,URS,Soviet Union,584.0,Soviet Union,838.0
2,GBR,United Kingdom,505.0,United Kingdom,498.0
3,FRA,France,475.0,France,378.0
4,GER,Germany,454.0,Germany,407.0
5,AUS,Australia,413.0,Australia,293.0
6,ITA,Italy,374.0,Italy,460.0
7,HUN,Hungary,345.0,Hungary,400.0
8,SWE,Sweden,325.0,Sweden,347.0
9,NED,Netherlands,320.0,Netherlands,212.0


### Merging on multiple columns

In [7]:
pd.merge(bronze, gold, on=['NOC', 'Country'])

Unnamed: 0,NOC,Country,Total_x,Total_y
0,USA,United States,1052.0,2088.0
1,URS,Soviet Union,584.0,838.0
2,GBR,United Kingdom,505.0,498.0
3,FRA,France,475.0,378.0
4,GER,Germany,454.0,407.0
5,AUS,Australia,413.0,293.0
6,ITA,Italy,374.0,460.0
7,HUN,Hungary,345.0,400.0
8,SWE,Sweden,325.0,347.0
9,NED,Netherlands,320.0,212.0


### Using suffixes


In [8]:
pd.merge(bronze, gold, on=['NOC', 'Country'], suffixes=['_bronze', '_gold'])

Unnamed: 0,NOC,Country,Total_bronze,Total_gold
0,USA,United States,1052.0,2088.0
1,URS,Soviet Union,584.0,838.0
2,GBR,United Kingdom,505.0,498.0
3,FRA,France,475.0,378.0
4,GER,Germany,454.0,407.0
5,AUS,Australia,413.0,293.0
6,ITA,Italy,374.0,460.0
7,HUN,Hungary,345.0,400.0
8,SWE,Sweden,325.0,347.0
9,NED,Netherlands,320.0,212.0


### Counties DataFrame


In [9]:
counties = pd.read_csv('pa_counties.csv') 

counties

Unnamed: 0,CITY NAME,COUNTY NAME
0,SALTSBURG,INDIANA
1,MINERAL SPRINGS,CLEARFIELD
2,BIGLERVILLE,ADAMS
3,HANNASTOWN,WESTMORELAND
4,TUNKHANNOCK,WYOMING


In [10]:
cities.tail()

Unnamed: 0,Zipcode,City,State
10,15681,SALTSBURG,PA
11,18657,TUNKHANNOCK,PA
12,15279,PITTSBURGH,PA
13,17231,LEMASTERS,PA
14,18821,GREAT BEND,PA


### Specifying columns to merge

In [11]:
pd.merge(counties, cities, left_on='CITY NAME', right_on='City')

Unnamed: 0,CITY NAME,COUNTY NAME,Zipcode,City,State
0,SALTSBURG,INDIANA,15681,SALTSBURG,PA
1,MINERAL SPRINGS,CLEARFIELD,16855,MINERAL SPRINGS,PA
2,BIGLERVILLE,ADAMS,17307,BIGLERVILLE,PA
3,HANNASTOWN,WESTMORELAND,15635,HANNASTOWN,PA
4,TUNKHANNOCK,WYOMING,18657,TUNKHANNOCK,PA


### Switching left/right DataFrames

In [12]:
pd.merge(cities, counties, left_on='City', right_on='CITY NAME')

Unnamed: 0,Zipcode,City,State,CITY NAME,COUNTY NAME
0,17307,BIGLERVILLE,PA,BIGLERVILLE,ADAMS
1,16855,MINERAL SPRINGS,PA,MINERAL SPRINGS,CLEARFIELD
2,15635,HANNASTOWN,PA,HANNASTOWN,WESTMORELAND
3,15681,SALTSBURG,PA,SALTSBURG,INDIANA
4,18657,TUNKHANNOCK,PA,TUNKHANNOCK,WYOMING


---
# Let’s practice!
---

In [42]:
revenue = pd.read_csv('revenue.csv')
managers = pd.read_csv('managers.csv')

In [35]:
# Merge revenue with managers on 'city': merge_by_city
merge_by_city = pd.merge(revenue,managers, on='city')

# Print merge_by_city
print(merge_by_city)

# Merge revenue with managers on 'branch_id': merge_by_id
merge_by_id = pd.merge(revenue,managers, on='branch_id')

# Print merge_by_id
print(merge_by_id)

   branch_id_x         city  revenue  branch_id_y   manager
0           10       Austin      100           10  Charlers
1           20       Denver       83           20      Joel
2           30  Springfield        4           31     Sally
3           47    Mendocino      200           47     Brett
   branch_id     city_x  revenue     city_y   manager
0         10     Austin      100     Austin  Charlers
1         20     Denver       83     Denver      Joel
2         47  Mendocino      200  Mendocino     Brett


---
```python
pd.merge(revenue, managers_b, on='city')

---------------------------------------------------------------------------
KeyError
KeyError: 'city'
```

In [38]:
managers = pd.read_csv('managers_b.csv')
# Merge revenue & managers on 'city' & 'branch': combined
combined = pd.merge(revenue,managers, left_on='city', right_on='branch')

# Print combined
print(combined)

   branch_id_x         city  revenue  branch_id_y       branch   manager
0           10       Austin      100           10       Austin  Charlers
1           20       Denver       83           20       Denver      Joel
2           30  Springfield        4           31  Springfield     Sally
3           47    Mendocino      200           47    Mendocino     Brett


In [45]:
# Add 'state' column to revenue: revenue['state']
revenue['state'] = ['TX', 'CO','IL','CA']

# Add 'state' column to managers: managers['state']
managers['state'] = ['TX', 'CO', 'CA', 'MO']

# Merge revenue & managers on 'branch_id', 'city', & 'state': combined
combined = pd.merge(revenue, managers, on=['branch_id','city', 'state'])

# Print combined
print(combined)

   branch_id       city  revenue state   manager
0         10     Austin      100    TX  Charlers
1         20     Denver       83    CO      Joel
2         47  Mendocino      200    CA     Brett


---
# Joining DataFrames
### Medal DataFrames


In [46]:
import pandas as pd
bronze = pd.read_csv('Summer Olympic medals/bronze_top5.csv')
gold = pd.read_csv('Summer Olympic medals/gold_top5.csv')

print(bronze)
print()
print(gold)

   NOC         Country  Total
0  USA   United States   1052
1  URS    Soviet Union    584
2  GBR  United Kingdom    505
3  FRA          France    475
4  GER         Germany    454

   NOC         Country  Total
0  USA   United States   2088
1  URS    Soviet Union    838
2  GBR  United Kingdom    498
3  ITA           Italy    460
4  GER         Germany    407


## Merging with inner join

- defaut: `how = 'inner'`
- extracts the rows in joining columns from both DF and it glues them together in the joined DF

In [47]:
pd.merge(bronze, gold, on=['NOC', 'Country'],suffixes=['_bronze', '_gold'], how='inner') 

Unnamed: 0,NOC,Country,Total_bronze,Total_gold
0,USA,United States,1052,2088
1,URS,Soviet Union,584,838
2,GBR,United Kingdom,505,498
3,GER,Germany,454,407


### Merging with left join
-  Keeps all rows of the left DF in the merged DF

#### For rows in the left DF with matches in the right DF:
    - Non-joining columns of right DF are appended to left DF
 #### For rows in the left DF with no matches in the right DF:
    - Non-joining columns are filled with nulls

## Merging with left join

- `how = 'left'` 
bronze on the left, gold on the right, the row for france is retain  and a null balue is sinserterf into the `total_gold` col bc it has no France row

In [48]:
pd.merge(bronze, gold, on=['NOC', 'Country'],suffixes=['_bronze', '_gold'], how='left')

Unnamed: 0,NOC,Country,Total_bronze,Total_gold
0,USA,United States,1052,2088.0
1,URS,Soviet Union,584,838.0
2,GBR,United Kingdom,505,498.0
3,FRA,France,475,
4,GER,Germany,454,407.0




## Merging with right join

`how = 'right`

- does a right join doing the same thing with the rols of left and right interchanged
- we retain all 5 rows from the gold DF 
- italy has a Nan  entry in the `total_bronze` col

In [49]:
pd.merge(bronze, gold, on=['NOC', 'Country'],suffixes=['_bronze', '_gold'], how='right')

Unnamed: 0,NOC,Country,Total_bronze,Total_gold
0,USA,United States,1052.0,2088
1,URS,Soviet Union,584.0,838
2,GBR,United Kingdom,505.0,498
3,GER,Germany,454.0,407
4,ITA,Italy,,460


## Merging with outer join
- `how='outer'`
- union of all rows from left and right DF
- shows FRA not in top 5 gold countries
- ITA not in top 5 bronze countries

In [50]:
pd.merge(bronze, gold, on=['NOC', 'Country'],suffixes=['_bronze', '_gold'], how='outer')

Unnamed: 0,NOC,Country,Total_bronze,Total_gold
0,USA,United States,1052.0,2088.0
1,URS,Soviet Union,584.0,838.0
2,GBR,United Kingdom,505.0,498.0
3,FRA,France,475.0,
4,GER,Germany,454.0,407.0
5,ITA,Italy,,460.0


### Population & unemployment data

- two 5 row DF, indexed by Zip Code
- population
- unemployment
- only common row label is 2060

In [18]:
population = pd.read_csv('population_00.csv', index_col=0)
unemployment = pd.read_csv('unemployment_00.csv', index_col=0)

print(population)
print()
print(unemployment)

               2010 Census Population
Zip Code ZCTA                        
57538                             322
59916                             130
37660                           40038
2860                            45199

       unemployment  participants
Zip                              
2860           0.11         34447
46167          0.02          4800
1097           0.33            42
80808          0.07          4310


## Using .join(how=‘left’)

 `population.join(unemployment)`
- computes a left join using the Index by default
- only the row 2860 is complete

In [19]:
population.join(unemployment)

Unnamed: 0_level_0,2010 Census Population,unemployment,participants
Zip Code ZCTA,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
57538,322,,
59916,130,,
37660,40038,,
2860,45199,0.11,34447.0


## Using .join(how=‘right’)
`population.join(unemployment, how='right')`

- 2860 row is preserved againg, but the other rows are extrated from the right `unemployment` DF with the left `population` DF values set to Nan


In [20]:
population.join(unemployment, how='right')

Unnamed: 0_level_0,2010 Census Population,unemployment,participants
Zip,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2860,45199.0,0.11,34447
46167,,0.02,4800
1097,,0.33,42
80808,,0.07,4310


## Using .join(how=‘inner’)

`population.join(unemployment, how='inner')`





In [21]:
population.join(unemployment, how='inner')

Unnamed: 0,2010 Census Population,unemployment,participants
2860,45199,0.11,34447


## Using .join(how=‘outer’)

`how='outer'` sorts the combined index


In [22]:
population.join(unemployment, how='outer')

Unnamed: 0,2010 Census Population,unemployment,participants
1097,,0.33,42.0
2860,45199.0,0.11,34447.0
37660,40038.0,,
46167,,0.02,4800.0
57538,322.0,,
59916,130.0,,
80808,,0.07,4310.0


### Which should you use?
- `df1.append(df2)`: stacking vertically
- `pd.concat([df1, df2])`:
    - stacking many horizontally or vertically
    - simple inner/outer joins on Indexes
- `df1.join(df2)`: inner/outer/le!/right joins on Indexes
- `pd.merge([df1, df2])`: many joins on multiple columns

---
# Let’s practice!

```python
# Merge revenue and sales: revenue_and_sales
revenue_and_sales = pd.merge(revenue, sales,on=['city', 'state'], how='right')

# Print revenue_and_sales
print(revenue_and_sales)

# Merge sales and managers: sales_and_managers
sales_and_managers = pd.merge(sales, managers,how='left', left_on=['city', 'state'], right_on=['branch', 'state'])

# Print sales_and_managers
print(sales_and_managers)
```

```python
# Perform the first merge: merge_default
merge_default = pd.merge(sales_and_managers, revenue_and_sales)

# Print merge_default
print(merge_default)

# Perform the second merge: merge_outer
merge_outer = pd.merge(sales_and_managers,revenue_and_sales, how='outer')

# Print merge_outer
print(merge_outer)

# Perform the third merge: merge_outer_on
merge_outer_on = pd.merge(sales_and_managers, revenue_and_sales, on=['city', 'state'], how='outer')

# Print merge_outer_on
print(merge_outer_on)
```

---
# Ordered merges

### Software & hardware sales


In [53]:
import pandas as pd

software = pd.read_csv('Sales/feb-sales-Software.csv', parse_dates=['Date']).sort_values('Date')

hardware = pd.read_csv('Sales/feb-sales-Hardware.csv', parse_dates=['Date']).sort_values('Date')

In [54]:
print(software)

                 Date          Company   Product  Units
2 2015-02-02 08:33:01            Hooli  Software      3
1 2015-02-03 14:14:18          Initech  Software     13
7 2015-02-04 15:36:29        Streeplex  Software     13
3 2015-02-05 01:53:06  Acme Coporation  Software     19
5 2015-02-09 13:09:55        Mediacore  Software      7
4 2015-02-11 20:03:08          Initech  Software      7
6 2015-02-11 22:50:44            Hooli  Software      4
0 2015-02-16 12:09:19            Hooli  Software     10
8 2015-02-21 05:01:26        Mediacore  Software      3


In [55]:
print(hardware)

                 Date          Company   Product  Units
3 2015-02-02 20:54:49        Mediacore  Hardware      9
0 2015-02-04 21:52:45  Acme Coporation  Hardware     14
1 2015-02-07 22:58:10  Acme Coporation  Hardware      1
2 2015-02-19 10:59:33        Mediacore  Hardware     16
4 2015-02-21 20:41:47            Hooli  Hardware      3


### Using merge()

- acctually doing an INNER join on all cols with matching names by default
- `Units` & `Date` cols have no overlapping values so the result is emplty

In [56]:
pd.merge(hardware, software)

Unnamed: 0,Date,Company,Product,Units


### Using merge(how=‘outer’)

no longer empty


In [57]:
pd.merge(hardware, software, how='outer')

Unnamed: 0,Date,Company,Product,Units
0,2015-02-02 20:54:49,Mediacore,Hardware,9
1,2015-02-04 21:52:45,Acme Coporation,Hardware,14
2,2015-02-07 22:58:10,Acme Coporation,Hardware,1
3,2015-02-19 10:59:33,Mediacore,Hardware,16
4,2015-02-21 20:41:47,Hooli,Hardware,3
5,2015-02-02 08:33:01,Hooli,Software,3
6,2015-02-03 14:14:18,Initech,Software,13
7,2015-02-04 15:36:29,Streeplex,Software,13
8,2015-02-05 01:53:06,Acme Coporation,Software,19
9,2015-02-09 13:09:55,Mediacore,Software,7


### Sorting merge(how='outer')

In [58]:
pd.merge(hardware, software, how='outer').sorted_values('Date')

AttributeError: 'DataFrame' object has no attribute 'sorted_values'

### Using merge_ordered()

what should've printed above 

- behaves like `.merge()` when cols can be ordered
- merge DF has rows sorted lexicographically according to the column orderings in the input DF
- the default join is an OUTER JOIN contrasting the default for `merge()` INNER JOIN

In [59]:
pd.merge_ordered(hardware, software)

Unnamed: 0,Date,Company,Product,Units
0,2015-02-02 08:33:01,Hooli,Software,3
1,2015-02-02 20:54:49,Mediacore,Hardware,9
2,2015-02-03 14:14:18,Initech,Software,13
3,2015-02-04 15:36:29,Streeplex,Software,13
4,2015-02-04 21:52:45,Acme Coporation,Hardware,14
5,2015-02-05 01:53:06,Acme Coporation,Software,19
6,2015-02-07 22:58:10,Acme Coporation,Hardware,1
7,2015-02-09 13:09:55,Mediacore,Software,7
8,2015-02-11 20:03:08,Initech,Software,7
9,2015-02-11 22:50:44,Hooli,Software,4


### Using on & suffixes

- accepts keyword arguments:
    - `on=[]`
    - `suffixes=[]`

In [61]:
pd.merge_ordered(hardware, software,
                 on=['Date', 'Company'],
                 suffixes=['_hardware', '_software']).head()

Unnamed: 0,Date,Company,Product_hardware,Units_hardware,Product_software,Units_software
0,2015-02-02 08:33:01,Hooli,,,Software,3.0
1,2015-02-02 20:54:49,Mediacore,Hardware,9.0,,
2,2015-02-03 14:14:18,Initech,,,Software,13.0
3,2015-02-04 15:36:29,Streeplex,,,Software,13.0
4,2015-02-04 21:52:45,Acme Coporation,Hardware,14.0,,


### Stocks data

In [62]:
stocks = pd.read_csv('stocks-2013.csv') 
stocks

Unnamed: 0,Date,AAPL,IBM,CSCO,MSFT
0,1/31/2013,497.822381,197.271905,20.699524,27.236667
1,2/28/2013,456.808953,200.735788,20.988947,27.704211
2,3/31/2013,441.840998,210.978001,21.335,28.141
3,4/30/2013,419.764998,204.733636,20.914545,29.870909
4,5/31/2013,446.45273,205.263639,22.386364,33.950909
5,6/30/2013,425.537999,200.85,24.3755,34.6325
6,7/31/2013,429.157272,194.354546,25.378636,33.650454
7,8/31/2013,484.843635,187.125,24.948636,32.485
8,9/30/2013,480.184499,188.767,24.08,32.5235
9,10/31/2013,504.744783,180.710002,22.847391,34.382174


### GDP data

In [67]:
gdp = pd.read_csv('gdp-2013.csv', parse_dates=True) 
gdp

Unnamed: 0,Date,GDP
0,3/31/2012,15973.9
1,6/30/2012,16121.9
2,9/30/2012,16227.9
3,12/31/2012,16297.3
4,3/31/2013,16475.4
5,6/30/2013,16541.4
6,9/30/2013,16749.3
7,12/31/2013,16999.9


### Ordered merge

In [64]:
 pd.merge_ordered(stocks, gdp, on='Date')

Unnamed: 0,Date,AAPL,IBM,CSCO,MSFT,GDP
0,1/31/2013,497.822381,197.271905,20.699524,27.236667,
1,10/31/2013,504.744783,180.710002,22.847391,34.382174,
2,11/30/2013,524.616499,181.333502,22.204,37.3625,
3,12/31/2012,,,,,16297.3
4,12/31/2013,559.657613,179.114763,21.257619,37.455715,16999.9
5,2/28/2013,456.808953,200.735788,20.988947,27.704211,
6,3/31/2012,,,,,15973.9
7,3/31/2013,441.840998,210.978001,21.335,28.141,16475.4
8,4/30/2013,419.764998,204.733636,20.914545,29.870909,
9,5/31/2013,446.45273,205.263639,22.386364,33.950909,


### Ordered merge with ffill

In [65]:
pd.merge_ordered(stocks, gdp, on='Date', fill_method='ffill') 

Unnamed: 0,Date,AAPL,IBM,CSCO,MSFT,GDP
0,1/31/2013,497.822381,197.271905,20.699524,27.236667,
1,10/31/2013,504.744783,180.710002,22.847391,34.382174,
2,11/30/2013,524.616499,181.333502,22.204,37.3625,
3,12/31/2012,524.616499,181.333502,22.204,37.3625,16297.3
4,12/31/2013,559.657613,179.114763,21.257619,37.455715,16999.9
5,2/28/2013,456.808953,200.735788,20.988947,27.704211,16999.9
6,3/31/2012,456.808953,200.735788,20.988947,27.704211,15973.9
7,3/31/2013,441.840998,210.978001,21.335,28.141,16475.4
8,4/30/2013,419.764998,204.733636,20.914545,29.870909,16475.4
9,5/31/2013,446.45273,205.263639,22.386364,33.950909,16475.4


---
# Let’s practice!

```python
# Perform the first ordered merge: tx_weather
tx_weather = pd.merge_ordered(austin,houston)

# Print tx_weather
print(tx_weather)

# Perform the second ordered merge: tx_weather_suff
tx_weather_suff = pd.merge_ordered(austin,houston,
on='date',suffixes=['_aus', '_hus'])

# Print tx_weather_suff
print(tx_weather_suff)

# Perform the third ordered merge: tx_weather_ffill
tx_weather_ffill = pd.merge_ordered(austin,houston,
on='date',suffixes=['_aus', '_hus'], fill_method='ffill')

# Print tx_weather_ffill
print(tx_weather_ffill)
```

```python
# Merge auto and oil: merged
merged = pd.merge_asof(auto,oil,
left_on='yr',
right_on='Date')

# Print the tail of merged
print(merged.tail())

# Resample merged: yearly
yearly = merged.resample('A', on='Date')[['mpg', 'Price']].mean() 

# Print yearly
print(yearly)

# print yearly.corr()
print(yearly.corr())
```