# Appending & concatenating Series

### append()
- `.append()`: Series & DataFrame method
-  Invocation:
    -  `s1.append(s2)`
    - Stacks rows of s2 below s1
- Method for Series & DataFrames

### concat()
-  `.concat()`: pandas module function
- Invocation:
    - `pd.concat([s1, s2, s3])`
    - Can stack row-wise or column-wise

### concat() & .append()
- Equivalence of concat() & .append():
- `result1 = pd.concat([s1, s2, s3])`
- `result2 = s1.append(s2).append(s3)`
- `result1` == `result2` elementwise

### Series of US states

In [1]:
import pandas as pd

northeast = pd.Series(['CT', 'ME', 'MA', 'NH', 'RI', 'VT','NJ', 'NY', 'PA'])

south = pd.Series(['DE', 'FL', 'GA', 'MD', 'NC', 'SC', 'VA', 'DC', 'WV', 'AL', 'KY', 'MS', 'TN', 'AR', 'LA', 'OK', 'TX'])

midwest = pd.Series(['IL', 'IN', 'MN', 'MO', 'NE', 'ND', 'SD', 'IA', 'KS', 'MI', 'OH', 'WI'])
    
west = pd.Series(['AZ', 'CO', 'ID', 'MT', 'NV', 'NM', 'UT', 'WY', 'AK', 'CA', 'HI', 'OR','WA'])

### Using .append()

In [2]:
print(northeast, south)

0    CT
1    ME
2    MA
3    NH
4    RI
5    VT
6    NJ
7    NY
8    PA
dtype: object 0     DE
1     FL
2     GA
3     MD
4     NC
5     SC
6     VA
7     DC
8     WV
9     AL
10    KY
11    MS
12    TN
13    AR
14    LA
15    OK
16    TX
dtype: object


In [3]:
east = northeast.append(south)

east

0     CT
1     ME
2     MA
3     NH
4     RI
5     VT
6     NJ
7     NY
8     PA
0     DE
1     FL
2     GA
3     MD
4     NC
5     SC
6     VA
7     DC
8     WV
9     AL
10    KY
11    MS
12    TN
13    AR
14    LA
15    OK
16    TX
dtype: object

### The appended Index

In [4]:
east.index

Int64Index([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  0,  1,  2,  3,  4,  5,  6,  7,
             8,  9, 10, 11, 12, 13, 14, 15, 16],
           dtype='int64')

In [5]:
east.loc[3]

3    NH
3    MD
dtype: object

### Using .reset_index()


In [6]:
new_east = northeast.append(south).reset_index(drop=True)

new_east.head(11)

0     CT
1     ME
2     MA
3     NH
4     RI
5     VT
6     NJ
7     NY
8     PA
9     DE
10    FL
dtype: object

## Using concat()


In [7]:
east = pd.concat([northeast, south])

east.head(11)

0    CT
1    ME
2    MA
3    NH
4    RI
5    VT
6    NJ
7    NY
8    PA
0    DE
1    FL
dtype: object

In [8]:
east.index

Int64Index([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  0,  1,  2,  3,  4,  5,  6,  7,
             8,  9, 10, 11, 12, 13, 14, 15, 16],
           dtype='int64')

### Using ignore_index


In [9]:
new_east = pd.concat([northeast, south],
                    ignore_index=True)

new_east.head(11)

0     CT
1     ME
2     MA
3     NH
4     RI
5     VT
6     NJ
7     NY
8     PA
9     DE
10    FL
dtype: object

In [10]:
new_east.index

RangeIndex(start=0, stop=26, step=1)

---
# Let’s practice!

In [11]:
# Import pandas
import pandas as pd

# Load 'sales-jan-2015.csv' into a DataFrame: jan
jan = pd.read_csv('Sales/sales-jan-2015.csv', index_col='Date', parse_dates=True)

# Load 'sales-feb-2015.csv' into a DataFrame: feb
feb = pd.read_csv('Sales/sales-feb-2015.csv', index_col='Date', parse_dates=True)

# Load 'sales-mar-2015.csv' into a DataFrame: mar
mar = pd.read_csv('Sales/sales-mar-2015.csv', index_col='Date',parse_dates=True)

# Extract the 'Units' column from jan: jan_units
jan_units = jan['Units']

# Extract the 'Units' column from feb: feb_units
feb_units = feb['Units']

# Extract the 'Units' column from mar: mar_units
mar_units = mar['Units']

# Append feb_units and then mar_units to jan_units: quarter1
quarter1 = jan_units.append(feb_units).append(mar_units)

# Print the first slice from quarter1
print(quarter1.loc['jan 27, 2015':'feb 2, 2015'])

# Print the second slice from quarter1
print(quarter1.loc['feb 26, 2015': 'mar 7, 2015'])

# Compute & print total sales in quarter1
print(quarter1.sum())

Date
2015-01-27 07:11:55    18
2015-02-02 08:33:01     3
2015-02-02 20:54:49     9
Name: Units, dtype: int64
Date
2015-02-26 08:57:45     4
2015-02-26 08:58:51     1
2015-03-06 10:11:45    17
2015-03-06 02:03:56    17
Name: Units, dtype: int64
642


In [12]:
# Initialize empty list: units
units = []

# Build the list of Series
for month in [jan, feb, mar]:
    units.append(month['Units'])

# Concatenate the list: quarter1
quarter1 = pd.concat(units, axis='rows')

# Print slices from quarter1
print(quarter1.loc['jan 27, 2015':'feb 2, 2015'])
print(quarter1.loc['feb 26, 2015':'mar 7, 2015'])

Date
2015-01-27 07:11:55    18
2015-02-02 08:33:01     3
2015-02-02 20:54:49     9
Name: Units, dtype: int64
Date
2015-02-26 08:57:45     4
2015-02-26 08:58:51     1
2015-03-06 10:11:45    17
2015-03-06 02:03:56    17
Name: Units, dtype: int64


---
# Appending & concatenating DataFrames

### Loading population data

In [13]:
import pandas as pd

pop1 = pd.read_csv('population_01.csv', index_col=0)
pop2 = pd.read_csv('population_02.csv', index_col=0)

print(type(pop1),pop1.shape)
print(type(pop2),pop2.shape)

<class 'pandas.core.frame.DataFrame'> (4, 1)
<class 'pandas.core.frame.DataFrame'> (4, 1)


In [14]:
pop1

Unnamed: 0_level_0,2010 Census Population
Zip Code ZCTA,Unnamed: 1_level_1
66407,479
72732,4716
50579,2405
46241,30670


In [15]:
pop2

Unnamed: 0_level_0,2010 Census Population
Zip Code ZCTA,Unnamed: 1_level_1
12776,2180
76092,26669
98360,12221
49464,27481


### Appending population DataFrames

In [16]:
pop1.append(pop2)

Unnamed: 0_level_0,2010 Census Population
Zip Code ZCTA,Unnamed: 1_level_1
66407,479
72732,4716
50579,2405
46241,30670
12776,2180
76092,26669
98360,12221
49464,27481


In [17]:
print(pop1.index.name, pop1.columns)
print(pop2.index.name, pop2.columns)

Zip Code ZCTA Index(['2010 Census Population'], dtype='object')
Zip Code ZCTA Index(['2010 Census Population'], dtype='object')


### Population & unemployment data

In [18]:
population =   pd.read_csv('population_00.csv', index_col=0)
unemployment = pd.read_csv('unemployment_00.csv', index_col=0)

print(population)
print(unemployment)

               2010 Census Population
Zip Code ZCTA                        
57538                             322
59916                             130
37660                           40038
2860                            45199
       unemployment  participants
Zip                              
2860           0.11         34447
46167          0.02          4800
1097           0.33            42
80808          0.07          4310


### Appending population & unemployment

```
population
appointment
```
Zipcode 2860 shows up twice

In [19]:
population.append(unemployment)

Unnamed: 0,2010 Census Population,participants,unemployment
57538,322.0,,
59916,130.0,,
37660,40038.0,,
2860,45199.0,,
2860,,34447.0,0.11
46167,,4800.0,0.02
1097,,42.0,0.33
80808,,4310.0,0.07


### Repeated index labels

### Concatenating rows

- `population.append(unemployment)` same as `pd.concat([population, unemployment], axis=0)`

`axis = 0`

In [20]:
pd.concat([population, unemployment], axis=0)

Unnamed: 0,2010 Census Population,participants,unemployment
57538,322.0,,
59916,130.0,,
37660,40038.0,,
2860,45199.0,,
2860,,34447.0,0.11
46167,,4800.0,0.02
1097,,42.0,0.33
80808,,4310.0,0.07


### Concatenating columns
-  concatenating by columns joins the duplicate data with missing rows.
`axis = 1`

In [21]:
pd.concat([population, unemployment], axis=1)

Unnamed: 0,2010 Census Population,unemployment,participants
1097,,0.33,42.0
2860,45199.0,0.11,34447.0
37660,40038.0,,
46167,,0.02,4800.0
57538,322.0,,
59916,130.0,,
80808,,0.07,4310.0


---
# Let’s practice!

In [22]:
columns= ['name','gender','count']
names_1881 = pd.read_csv('Baby names/names1881.csv',names=columns)
names_1981 = pd.read_csv('Baby names/names1981.csv', names=columns)

In [23]:
# Add 'year' column to names_1881 and names_1981
names_1881['year'] = 1881
names_1981['year'] = 1981

# Append names_1981 after names_1881 with ignore_index=True: combined_names
combined_names = names_1881.append(names_1981, ignore_index=True)

# Print shapes of names_1981, names_1881, and combined_names
print(names_1981.shape)
print(names_1881.shape)
print(combined_names.shape)

# Print all rows that contain the name 'Morgan'
print(combined_names.loc[combined_names['name'] == 'Morgan'])

(19455, 4)
(1935, 4)
(21390, 4)
         name gender  count  year
1283   Morgan      M     23  1881
2096   Morgan      F   1769  1981
14390  Morgan      M    766  1981


In [24]:
# # Concatenate weather_max and weather_mean horizontally: weather
# weather = pd.concat([weather_max, weather_mean], axis=1)

# # Print weather
# print(weather)

In [25]:
# medal_types = ['bronze', 'silver', 'gold']

# for medal in medal_types:

#     # Create the file name: file_name
#     file_name = "Summer Olympic medals/%s_top5.csv" % medal
    
#     # Create list of column names: columns
#     columns = ['Country', medal]
    
#     # Read file_name into a DataFrame: df
#     medal_df = pd.read_csv(file_name, header=0, index_col='Country', names=columns)

#     # Append medal_df to medals
#     medals.append(medal_df)

# # Concatenate medals horizontally: medals
# medals = pd.concat(medals, axis='columns')

# # Print medals
# print(medals)

---
# Concatenation, keys, & MultiIndexes

### Loading rainfall data

In [26]:
import pandas as pd

rain2013 = pd.read_csv('q1_rainfall_2013.csv', index_col='Month', parse_dates=True)
rain2014 = pd.read_csv('q1_rainfall_2014.csv', index_col='Month', parse_dates=True)
print(rain2013)
print(rain2014)

       Percipitation
Month               
Jan         0.096129
Feb         0.067143
Mar         0.061613
       Percipitation
Month               
Jan         0.050323
Feb         0.082143
Mar         0.070968


### Concatenating rows



In [27]:
pd.concat([rain2013, rain2014], axis=0)

Unnamed: 0_level_0,Percipitation
Month,Unnamed: 1_level_1
Jan,0.096129
Feb,0.067143
Mar,0.061613
Jan,0.050323
Feb,0.082143
Mar,0.070968


### Using multi-index on rows


In [28]:
rain1314 = pd.concat([rain2013, rain2014], keys=[2013, 2014], axis=0)

rain1314

Unnamed: 0_level_0,Unnamed: 1_level_0,Percipitation
Unnamed: 0_level_1,Month,Unnamed: 2_level_1
2013,Jan,0.096129
2013,Feb,0.067143
2013,Mar,0.061613
2014,Jan,0.050323
2014,Feb,0.082143
2014,Mar,0.070968


### Accessing a multi-index


In [29]:
rain1314.loc[2014]

Unnamed: 0_level_0,Percipitation
Month,Unnamed: 1_level_1
Jan,0.050323
Feb,0.082143
Mar,0.070968


### Concatenating columns

In [30]:
rain1314 = pd.concat([rain2013, rain2014], axis='columns')

rain1314

Unnamed: 0_level_0,Percipitation,Percipitation
Month,Unnamed: 1_level_1,Unnamed: 2_level_1
Jan,0.096129,0.050323
Feb,0.067143,0.082143
Mar,0.061613,0.070968


### Using a multi-index on columns

In [31]:
rain1314 = pd.concat([rain2013, rain2014], keys=[2013, 2014], axis='columns')
rain1314

Unnamed: 0_level_0,2013,2014
Unnamed: 0_level_1,Percipitation,Percipitation
Month,Unnamed: 1_level_2,Unnamed: 2_level_2
Jan,0.096129,0.050323
Feb,0.067143,0.082143
Mar,0.061613,0.070968


In [47]:
# can be acess in a dictionary-style
rain1314[2013]

Unnamed: 0_level_0,Percipitation
Month,Unnamed: 1_level_1
Jan,0.096129
Feb,0.067143
Mar,0.061613


### pd.concat() with dict

-`dict = {key:value}`

`pd.concat(dict, axis='columns')` == `pd.concat([values,values,..], keys=[key, key,..], axis='columns')`

In [32]:
rain_dict = {2013: rain2013, 2014: rain2014}

rain1314 = pd.concat(rain_dict, axis='columns') 

rain1314

Unnamed: 0_level_0,2013,2014
Unnamed: 0_level_1,Percipitation,Percipitation
Month,Unnamed: 1_level_2,Unnamed: 2_level_2
Jan,0.096129,0.050323
Feb,0.067143,0.082143
Mar,0.061613,0.070968


---
# Let’s practice!

---

```python
for medal in medal_types:

    file_name = "%s_top5.csv" % medal
    
    # Read file_name into a DataFrame: medal_df
    medal_df = pd.read_csv(file_name,index_col='Country')
    
    # Append medal_df to medals
    medals.append(medal_df)
    
# Concatenate medals: medals
medals = pd.concat(medals, keys=['bronze', 'silver','gold'])

# Print medals in entirety
print(medals)

```

```python
# Sort the entries of medals: medals_sorted
medals_sorted = medals.sort_index(level=0)

# Print the number of Bronze medals won by Germany
print(medals_sorted.loc[('bronze','Germany')])

# Print data about silver medals
print(medals_sorted.loc['silver'])

# Create alias for pd.IndexSlice: idx
idx = pd.IndexSlice

# Print all the data on medals won by the United Kingdom
print(medals_sorted.loc[idx[:,'United Kingdom'], :])
```

```python
# Concatenate dataframes: february
february = pd.concat(dataframes, axis=1,keys=['Hardware', 'Software', 'Service'])

# Print february.info()
print(february.info())

# Assign pd.IndexSlice: idx
idx = pd.IndexSlice

# Create the slice: slice_2_8
slice_2_8 = february.loc['Feb 2, 2015':'Feb 8, 2015', idx[:, 'Company']]

# Print slice_2_8
print(slice_2_8)
```

```python
# Make the list of tuples: month_list
month_list = [('january',jan), ('february', feb),('march', mar)]

# Create an empty dictionary: month_dict
month_dict = {}

for month_name, month_data in month_list:

    # Group month_data: month_dict[month_name]
    month_dict[month_name] = month_data.groupby('Company').sum()

# Concatenate data in month_dict: sales
sales = pd.concat(month_dict)

# Print sales
print(sales)

# Print all sales by Mediacore
idx = pd.IndexSlice
print(sales.loc[idx[:, 'Mediacore'], :])
```

---
# Outer & inner joins

### Using with arrays

In [33]:
import pandas as pd
import numpy as np

A = np.arange(8).reshape(2,4) + 0.1
A

array([[0.1, 1.1, 2.1, 3.1],
       [4.1, 5.1, 6.1, 7.1]])

In [34]:
B = np.arange(6).reshape(2,3) + 0.2
B

array([[0.2, 1.2, 2.2],
       [3.2, 4.2, 5.2]])

In [35]:
C = np.arange(12).reshape(3,4) + 0.3
C

array([[ 0.3,  1.3,  2.3,  3.3],
       [ 4.3,  5.3,  6.3,  7.3],
       [ 8.3,  9.3, 10.3, 11.3]])

### Stacking arrays horizontally

In [36]:
np.hstack([B,A])

array([[0.2, 1.2, 2.2, 0.1, 1.1, 2.1, 3.1],
       [3.2, 4.2, 5.2, 4.1, 5.1, 6.1, 7.1]])

In [37]:
np.concatenate([B,A], axis=1)

array([[0.2, 1.2, 2.2, 0.1, 1.1, 2.1, 3.1],
       [3.2, 4.2, 5.2, 4.1, 5.1, 6.1, 7.1]])

### Stacking arrays vertically

In [38]:
np.vstack([A,C])

array([[ 0.1,  1.1,  2.1,  3.1],
       [ 4.1,  5.1,  6.1,  7.1],
       [ 0.3,  1.3,  2.3,  3.3],
       [ 4.3,  5.3,  6.3,  7.3],
       [ 8.3,  9.3, 10.3, 11.3]])

In [39]:
np.concatenate([A,C], axis=0)

array([[ 0.1,  1.1,  2.1,  3.1],
       [ 4.1,  5.1,  6.1,  7.1],
       [ 0.3,  1.3,  2.3,  3.3],
       [ 4.3,  5.3,  6.3,  7.3],
       [ 8.3,  9.3, 10.3, 11.3]])

### Incompatible array dimensions

```python
In [11]: np.concatenate([A, B], axis=0) # incompatible columns 
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
----> 1 np.concatenate([A, B], axis=0) # incompatible columns
ValueError: all the input array dimensions except for the concatenation axis must match
exactly

In [12]: np.concatenate([A, C], axis=1) # incompatible rows
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
----> 1 np.concatenate([A, C], axis=1) # incompatible rows
ValueError: all the input array dimensions except for the concatenation axis must match
exactly
```

### Population & unemployment data


In [40]:
population = pd.read_csv('population_00.csv', index_col=0)
unemployment = pd.read_csv('unemployment_00.csv', index_col=0)

print(population)
print(unemployment)

               2010 Census Population
Zip Code ZCTA                        
57538                             322
59916                             130
37660                           40038
2860                            45199
       unemployment  participants
Zip                              
2860           0.11         34447
46167          0.02          4800
1097           0.33            42
80808          0.07          4310


### Converting to arrays

In [41]:
population_array = np.array(population)
population_array  # Index info is lost

array([[  322],
       [  130],
       [40038],
       [45199]])

In [42]:
unemployment_array = np.array(unemployment)
unemployment_array

array([[1.1000e-01, 3.4447e+04],
       [2.0000e-02, 4.8000e+03],
       [3.3000e-01, 4.2000e+01],
       [7.0000e-02, 4.3100e+03]])

### Manipulating data as arrays

In [43]:
np.concatenate([population_array, unemployment_array],axis=1)

array([[3.2200e+02, 1.1000e-01, 3.4447e+04],
       [1.3000e+02, 2.0000e-02, 4.8000e+03],
       [4.0038e+04, 3.3000e-01, 4.2000e+01],
       [4.5199e+04, 7.0000e-02, 4.3100e+03]])

## Joins
- Joining tables: Combining rows of multiple tables

### Outer join
    - Union of index sets (all labels, no repetition)
    - Missing fields filled with NaN
### Inner join
    - Intersection of index sets (only common labels)

### Concatenation & inner join
 only the row label present in both dataframes undexes 
 - 2860 in this case
 
is preserved in the joined DataFrame

In [44]:
pd.concat([population, unemployment], axis=1, join='inner')

Unnamed: 0,2010 Census Population,unemployment,participants
2860,45199,0.11,34447


### Concatenation & outer join

- If we unspecified, the joint parameters defaults to `outer`
- all row indecies from the original two indexes exits in the jjoined DataFrame Index
- wjen a row occurs in one DataFrame but not the other, the missing columns entries are filled with nuymm values.

In [45]:
pd.concat([population, unemployment], axis=1, join='outer')

Unnamed: 0,2010 Census Population,unemployment,participants
1097,,0.33,42.0
2860,45199.0,0.11,34447.0
37660,40038.0,,
46167,,0.02,4800.0
57538,322.0,,
59916,130.0,,
80808,,0.07,4310.0


### Inner join on other axis

- DF is empty bc no column index label appears in both population and unemployment

In [46]:
pd.concat([population, unemployment], join='inner', axis=0)

57538
59916
37660
2860
2860
46167
1097
80808


---
# Let’s practice!

In [75]:
china.head()

Unnamed: 0_level_0,GDP
Year,Unnamed: 1_level_1
1960-01-01,59.184116
1961-01-01,49.55705
1962-01-01,46.685179
1963-01-01,50.097303
1964-01-01,59.062255


In [None]:
#load data
bronze = pd.read_csv('Summer Olympic medals/Bronze.csv', index_col='NOC')
silver = pd.read_csv('Summer Olympic medals/Silver.csv', index_col='NOC')
gold = pd.read_csv('Summer Olympic medals/Gold.csv', index_col='NOC')

In [79]:
china = pd.read_csv('GDP/gdp_china.csv', index_col='Year', parse_dates=True, names=['Year','China'])
us = pd.read_csv('GDP/gdp_usa.csv', index_col='DATE', names=['Year','US'])

ValueError: Index DATE invalid

In [81]:
!pandas --version

/bin/sh: 1: pandas: not found


In [73]:
# Create the list of DataFrames: medal_list
medal_list = [bronze,silver,gold]

# Concatenate medal_list horizontally using an inner join: medals
medals = pd.concat(medal_list,keys=['bronze','silver', 'gold'], axis=1, join='inner')

# Print medals
print(medals)


                    bronze                         silver          \
                   Country   Total                Country   Total   
NOC                                                                 
USA          United States  1052.0          United States  1195.0   
URS           Soviet Union   584.0           Soviet Union   627.0   
GBR         United Kingdom   505.0         United Kingdom   591.0   
FRA                 France   475.0                 France   461.0   
GER                Germany   454.0                Germany   350.0   
AUS              Australia   413.0              Australia   369.0   
ITA                  Italy   374.0                  Italy   394.0   
HUN                Hungary   345.0                Hungary   308.0   
SWE                 Sweden   325.0                 Sweden   349.0   
NED            Netherlands   320.0            Netherlands   250.0   
ROU                Romania   282.0                Romania   187.0   
JPN                  Japan   270.0

In [74]:
# Resample and tidy china: china_annual
china_annual = china.resample('A').pct_change(10).dropna()

# Resample and tidy us: us_annual
us_annual = us.resample('A').pct_change(10).dropna()

# Concatenate china_annual and us_annual: gdp
gdp = pd.concat([china_annual, us_annual],axis=1, join='inner')

# Resample gdp and print
print(gdp.resample('10A').last())

.resample() is now a deferred operation
You called pct_change(...) on this deferred object which materialized it into a dataframe
by implicitly taking the mean.  Use .resample(...).mean() instead
  from ipykernel import kernelapp as app


TypeError: Only valid with DatetimeIndex, TimedeltaIndex or PeriodIndex, but got an instance of 'Index'