# Reading multiple data files

## Tools for pandas data import
- pd.read_csv() for CSV files
    - dataframe = pd.read_csv(filepath)
    - dozens of optional input parameters
- Other data import tools:
    - pd.read_excel()
    - pd.read_html()
    - pd.read_json()

### Loading separate files


In [1]:
import pandas as pd

# Load each month's sales file into its own, separately named DataFrame.
dataframe0 = pd.read_csv('Sales/sales-jan-2015.csv')

# A distinct name keeps the January frame from being overwritten.
dataframe1 = pd.read_csv('Sales/sales-feb-2015.csv')

### Using a loop


In [2]:
# Paths of the sales files to load
filenames = ['Sales/sales-jan-2015.csv', 'Sales/sales-feb-2015.csv']

# Read each file in turn; dataframes ends up with one DataFrame per path,
# in the same order as filenames
dataframes = []
for f in filenames:
    dataframes.append(pd.read_csv(f))

### Using a comprehension


In [3]:
# Paths of the sales files to load
filenames = ['Sales/sales-jan-2015.csv', 'Sales/sales-feb-2015.csv']

# Build the list in a single expression: one DataFrame per path, in order
dataframes = [pd.read_csv(f) for f in filenames]


### Using glob


In [4]:
from glob import glob

# Collect every path matching the pattern. glob() makes no promise about the
# order of its results, so wrap the call in sorted() if the order matters.
filenames = glob('Sales/sales*.csv')

# Read each matched file into its own DataFrame
dataframes = [pd.read_csv(f) for f in filenames]

# Let’s practice!

In [5]:
# Import pandas
import pandas as pd

# Each medal type is stored in its own CSV file in the same folder; load
# them into three separately named DataFrames.

# Read 'Bronze.csv' into a DataFrame: bronze
bronze = pd.read_csv('Summer Olympic medals/Bronze.csv')

# Read 'Silver.csv' into a DataFrame: silver
silver = pd.read_csv('Summer Olympic medals/Silver.csv')

# Read 'Gold.csv' into a DataFrame: gold
gold = pd.read_csv('Summer Olympic medals/Gold.csv')

# Print the first five rows of gold
print(gold.head())

   NOC         Country   Total
0  USA   United States  2088.0
1  URS    Soviet Union   838.0
2  GBR  United Kingdom   498.0
3  FRA          France   378.0
4  GER         Germany   407.0


In [6]:
# Import pandas
import pandas as pd

# Create the list of file names: filenames
# (Gold comes first, so dataframes[0] below is the gold table)
filenames = ['Summer Olympic medals/Gold.csv', 'Summer Olympic medals/Silver.csv', 'Summer Olympic medals/Bronze.csv']

# Create the list of three DataFrames: dataframes
# One DataFrame is appended per file, in the same order as filenames
dataframes = []
for filename in filenames:
    dataframes.append(pd.read_csv(filename))

# Print top 5 rows of 1st DataFrame in dataframes
print(dataframes[0].head())

   NOC         Country   Total
0  USA   United States  2088.0
1  URS    Soviet Union   838.0
2  GBR  United Kingdom   498.0
3  FRA          France   378.0
4  GER         Germany   407.0


In [7]:
# Import pandas
import pandas as pd

# NOTE: gold, silver and bronze are not defined in this cell; they are the
# DataFrames read from the medal CSV files in an earlier cell.

# Make a copy of gold: medals
# (working on a copy keeps the changes below from touching gold itself)
medals = gold.copy()

# Create list of new column labels: new_labels
# ('Total' becomes 'Gold'; the other two labels are unchanged)
new_labels = ['NOC', 'Country', 'Gold']

# Rename the columns of medals using new_labels
medals.columns = new_labels

# Add columns 'Silver' & 'Bronze' to medals
medals['Silver'] = silver['Total']
medals['Bronze'] = bronze['Total']

# Print the head of medals
print(medals.head())

   NOC         Country    Gold  Silver  Bronze
0  USA   United States  2088.0  1195.0  1052.0
1  URS    Soviet Union   838.0   627.0   584.0
2  GBR  United Kingdom   498.0   591.0   505.0
3  FRA          France   378.0   461.0   475.0
4  GER         Germany   407.0   350.0   454.0


---
# Reindexing DataFrames

### “Indexes” vs. “Indices”
- indices: many index labels within Index data structures
- indexes: many pandas Index data structures

### Importing weather data
```python
In [1]: import pandas as pd
In [2]: w_mean = pd.read_csv('quarterly_mean_temp.csv', index_col='Month')
In [3]: w_max = pd.read_csv('quarterly_max_temp.csv', index_col='Month')
```

### Examining the data

```python
In [4]: print(w_mean)
         Mean TemperatureF
Month
Apr      61.956044
Jan      32.133333
Jul      68.934783
Oct      43.434783
In [5]: print(w_max)
        Max TemperatureF
Month
Jan          68
Apr          89
Jul          91
Oct          84
```

### The DataFrame indexes

```python
In [6]: print(w_mean.index)
Index(['Apr', 'Jan', 'Jul', 'Oct'], dtype='object', name='Month')
In [7]: print(w_max.index)
Index(['Jan', 'Apr', 'Jul', 'Oct'], dtype='object', name='Month')
In [8]: print(type(w_mean.index))
<class 'pandas.indexes.base.Index'>
```

### Using `.reindex()`


```python
In [9]: ordered = ['Jan', 'Apr', 'Jul', 'Oct']
In [10]: w_mean2 = w_mean.reindex(ordered)

In [11]: print(w_mean2)
             Mean TemperatureF
Month
Jan          32.133333
Apr          61.956044
Jul          68.934783
Oct          43.434783   
 ```


### Using .sort_index()



```python
In [12]: w_mean2.sort_index()
Out[12]:
            Mean TemperatureF
Month
Apr         61.956044
Jan         32.133333
Jul         68.934783
Oct         43.434783
```

### Reindex from a DataFrame Index


```python
In [13]: w_mean.reindex(w_max.index)
Out[13]:
          Mean TemperatureF
Month
Jan        32.133333
Apr        61.956044
Jul        68.934783
Oct        43.434783
```

### Reindexing with missing labels


```python
In [14]: w_mean3 = w_mean.reindex(['Jan', 'Apr', 'Dec'])
In [15]: print(w_mean3)
          Mean TemperatureF
Month
Jan       32.133333
Apr       61.956044
Dec       NaN
```

### Reindex from a DataFrame Index



```python
In [16]: w_max.reindex(w_mean3.index)
Out[16]:
       Max TemperatureF
Month
Jan    68.0
Apr    89.0
Dec    NaN
In [17]: w_max.reindex(w_mean3.index).dropna()
Out[17]:
        Max TemperatureF
Month
Jan     68.0
Apr     89.0
```

### Order matters

```python
In [18]: w_max.reindex(w_mean.index)
Out[18]:
         Max TemperatureF
Month
Apr         89
Jan         68
Jul         91
Oct         84
In [19]: w_mean.reindex(w_max.index)
Out[19]:
           Mean TemperatureF
Month
Jan         32.133333
Apr         61.956044
Jul         68.934783
Oct         43.434783
```

---
# Let’s practice!

```python
# Import pandas
import pandas as pd

# weather1 is the input to every sort below; each sorted result is bound to
# its own new name (weather2, weather3, weather4).

# Read 'monthly_max_temp.csv' into a DataFrame: weather1
# (the 'Month' column becomes the index)
weather1 = pd.read_csv('monthly_max_temp.csv', index_col='Month')

# Print the head of weather1
print(weather1.head())

# Sort the index of weather1 in alphabetical order: weather2
weather2 = weather1.sort_index()

# Print the head of weather2
print(weather2.head())

# Sort the index of weather1 in reverse alphabetical order: weather3
weather3 = weather1.sort_index(ascending=False)

# Print the head of weather3
print(weather3.head())

# Sort weather1 numerically using the values of 'Max TemperatureF': weather4
weather4 = weather1.sort_values('Max TemperatureF')

# Print the head of weather4
print(weather4.head())
```

```python
# Import pandas
import pandas as pd

# NOTE(review): neither `weather1` nor `year` is defined in this snippet.
# weather1 presumably comes from the previous snippet, and year is presumably
# a list of month labels supplied by the exercise environment -- confirm.

# Reindex weather1 using the list year: weather2
weather2 = weather1.reindex(year)

# Print weather2
print(weather2)

# Reindex weather1 using the list year with forward-fill: weather3
# (.ffill() is applied after reindexing, to the reindexed result)
weather3 = weather1.reindex(year).ffill()

# Print weather3
print(weather3)
```

```python
# Import pandas
import pandas as pd

# NOTE(review): names_1981 and names_1881 are not defined in this snippet;
# presumably they are DataFrames supplied by the exercise environment --
# confirm.

# Reindex names_1981 with index of names_1881: common_names
common_names = names_1981.reindex(names_1881.index)

# Print shape of common_names
print(common_names.shape)

# Drop rows with null counts: common_names
# (the name is rebound, so the pre-dropna frame is no longer available)
common_names = common_names.dropna()

# Print shape of new common_names
print(common_names.shape)
```

# Arithmetic with Series & DataFrames

### Loading weather data

In [8]:
# Load the weather file with the 'Date' column as the index;
# parse_dates=True asks pandas to parse that index as dates
weather = pd.read_csv('pittsburgh2013.csv',
                     index_col='Date',
                     parse_dates=True)
# Show the first five rows
weather.head(5)

Unnamed: 0_level_0,Max TemperatureF,Mean TemperatureF,Min TemperatureF,Max Dew PointF,MeanDew PointF,Min DewpointF,Max Humidity,Mean Humidity,Min Humidity,Max Sea Level PressureIn,...,Max VisibilityMiles,Mean VisibilityMiles,Min VisibilityMiles,Max Wind SpeedMPH,Mean Wind SpeedMPH,Max Gust SpeedMPH,PrecipitationIn,CloudCover,Events,WindDirDegrees
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2013-01-01,32,28,21,30,27,16,100,89,77,30.1,...,10,6,2,10,8,,0.0,8,Snow,277
2013-01-02,25,21,17,14,12,10,77,67,55,30.27,...,10,10,10,14,5,,0.0,4,,272
2013-01-03,32,24,16,19,15,9,77,67,56,30.25,...,10,10,10,17,8,26.0,0.0,3,,229
2013-01-04,30,28,27,21,19,17,75,68,59,30.28,...,10,10,6,23,16,32.0,0.0,4,,250
2013-01-05,34,30,25,23,20,16,75,68,61,30.42,...,10,10,10,16,10,23.0,0.21,5,,221


In [9]:
# PrecipitationIn values from July 1 - July 7 2013
# (the date-label slice includes both end points, giving seven rows)

weather.loc['2013-7-1':'2013-7-7', 'PrecipitationIn']

Date
2013-07-01    0.18
2013-07-02    0.14
2013-07-03    0.00
2013-07-04    0.25
2013-07-05    0.02
2013-07-06    0.06
2013-07-07    0.10
Name: PrecipitationIn, dtype: float64

### Scalar multiplication

In [10]:
# PrecipitationIn values from July 1 - July 7 2013
# Multiply every value by the scalar 2.54
# (presumably converting inches to centimetres -- confirm the column's units)

weather.loc['2013-7-1':'2013-7-7', 'PrecipitationIn']* 2.54

Date
2013-07-01    0.4572
2013-07-02    0.3556
2013-07-03    0.0000
2013-07-04    0.6350
2013-07-05    0.0508
2013-07-06    0.1524
2013-07-07    0.2540
Name: PrecipitationIn, dtype: float64

### Absolute temperature range

In [11]:
# Min temp and Max temp values from July 1 - July 7 2013
# Passing a list of two column labels yields a two-column DataFrame

week1_range = weather.loc['2013-07-01':'2013-07-07',['Min TemperatureF',
                                                     'Max TemperatureF']] 

week1_range

Unnamed: 0_level_0,Min TemperatureF,Max TemperatureF
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2013-07-01,66,79
2013-07-02,66,84
2013-07-03,71,86
2013-07-04,70,86
2013-07-05,69,86
2013-07-06,70,89
2013-07-07,70,77


### Average temperature

In [12]:
# Mean temp values from July 1 - July 7 2013
# Passing a single column label yields a Series indexed by Date

week1_mean = weather.loc['2013-07-01':'2013-07-07',
                                 'Mean TemperatureF']

week1_mean

Date
2013-07-01    72
2013-07-02    74
2013-07-03    78
2013-07-04    77
2013-07-05    76
2013-07-06    78
2013-07-07    72
Name: Mean TemperatureF, dtype: int64

### Relative temperature range

In [13]:
## Divide the range values by the mean values
## WRONG
## The `/` operator matches the Series' index (the dates) against the
## DataFrame's column labels. No labels match, so every entry is NaN.

week1_range / week1_mean

  return this.join(other, how=how, return_indexers=return_indexers)


Unnamed: 0_level_0,2013-07-01 00:00:00,2013-07-02 00:00:00,2013-07-03 00:00:00,2013-07-04 00:00:00,2013-07-05 00:00:00,2013-07-06 00:00:00,2013-07-07 00:00:00,Min TemperatureF,Max TemperatureF
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2013-07-01,,,,,,,,,
2013-07-02,,,,,,,,,
2013-07-03,,,,,,,,,
2013-07-04,,,,,,,,,
2013-07-05,,,,,,,,,
2013-07-06,,,,,,,,,
2013-07-07,,,,,,,,,


### Relative temperature range

In [14]:
# Use .divide() with axis='rows' so week1_mean is matched against the row
# index (Date) instead of the column labels

# Each column (min temp, max temp) is divided by that day's mean temperature
week1_range.divide(week1_mean,axis='rows')

Unnamed: 0_level_0,Min TemperatureF,Max TemperatureF
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2013-07-01,0.916667,1.097222
2013-07-02,0.891892,1.135135
2013-07-03,0.910256,1.102564
2013-07-04,0.909091,1.116883
2013-07-05,0.907895,1.131579
2013-07-06,0.897436,1.141026
2013-07-07,0.972222,1.069444


### Percentage changes

In [15]:
# Change relative to the previous row, scaled to percent; the first row has
# no previous value and is therefore NaN
week1_mean.pct_change() * 100

Date
2013-07-01         NaN
2013-07-02    2.777778
2013-07-03    5.405405
2013-07-04   -1.282051
2013-07-05   -1.298701
2013-07-06    2.631579
2013-07-07   -7.692308
Name: Mean TemperatureF, dtype: float64

### Bronze Olympic medals

In [16]:
# Load the top-5 bronze table, using the first column (NOC) as the index
bronze = pd.read_csv('Summer Olympic medals/bronze_top5.csv', index_col=0)

bronze

Unnamed: 0_level_0,Country,Total
NOC,Unnamed: 1_level_1,Unnamed: 2_level_1
USA,United States,1052
URS,Soviet Union,584
GBR,United Kingdom,505
FRA,France,475
GER,Germany,454


### Silver Olympic medals

In [17]:
# Load the top-5 silver table, using the first column (NOC) as the index
silver = pd.read_csv('Summer Olympic medals/silver_top5.csv', index_col=0)

silver

Unnamed: 0_level_0,Country,Total
NOC,Unnamed: 1_level_1,Unnamed: 2_level_1
USA,United States,1195
USR,Soviet Union,627
GBR,United Kingdom,591
FRA,France,461
ITA,Italy,394


### Gold Olympic medals

In [18]:
# Load the top-5 gold table, using the first column (NOC) as the index
gold = pd.read_csv('Summer Olympic medals/gold_top5.csv', index_col=0)

gold

Unnamed: 0_level_0,Country,Total
NOC,Unnamed: 1_level_1,Unnamed: 2_level_1
USA,United States,2088
URS,Soviet Union,838
GBR,United Kingdom,498
ITA,Italy,460
GER,Germany,407


### Adding bronze, silver

In [19]:
# Addition aligns the two frames on the NOC index. Labels present in both
# are combined ('Total' is summed, the 'Country' strings are concatenated);
# labels present in only one frame come out as NaN.
bronze + silver

Unnamed: 0_level_0,Country,Total
NOC,Unnamed: 1_level_1,Unnamed: 2_level_1
FRA,FranceFrance,936.0
GBR,United KingdomUnited Kingdom,1096.0
GER,,
ITA,,
URS,,
USA,United StatesUnited States,2247.0
USR,,


In [20]:
# Select the row labelled 'USA' from bronze
bronze.loc['USA']

Country    United States
Total               1052
Name: USA, dtype: object

In [21]:
# Select the row labelled 'USA' from silver
silver.loc['USA']

Country    United States
Total               1195
Name: USA, dtype: object

### Using the .add() method

In [22]:
# Method form of the addition; without fill_value, labels found in only one
# frame still come out as NaN
bronze.add(silver)

Unnamed: 0_level_0,Country,Total
NOC,Unnamed: 1_level_1,Unnamed: 2_level_1
FRA,FranceFrance,936.0
GBR,United KingdomUnited Kingdom,1096.0
GER,,
ITA,,
URS,,
USA,United StatesUnited States,2247.0
USR,,


### Using a fill_value

In [23]:
# NOTE(review): left disabled -- presumably because fill_value=0 cannot be
# added to the string 'Country' column of these frames. Adding only the
# numeric column, e.g. bronze['Total'].add(silver['Total'], fill_value=0),
# should avoid that -- confirm before re-enabling.
#bronze.add(silver, fill_value=0)

### Adding bronze, silver, gold

In [24]:
# With three frames, only labels present in all three get a value; any
# label missing from at least one frame comes out as NaN
bronze + silver + gold

Unnamed: 0_level_0,Country,Total
NOC,Unnamed: 1_level_1,Unnamed: 2_level_1
FRA,,
GBR,United KingdomUnited KingdomUnited Kingdom,1594.0
GER,,
ITA,,
URS,,
USA,United StatesUnited StatesUnited States,4335.0
USR,,


### Chaining .add()

In [25]:
# NOTE(review): left disabled -- presumably for the same reason as the
# single fill_value call: 0 cannot be added to the string 'Country' column.
# Chaining on the numeric 'Total' column alone should work -- confirm.
#bronze.add(silver, fill_value=0).add(gold, fill_value=0)

---
# Let’s practice!

In [26]:
# Extract selected columns from weather as new DataFrame: temps_f
temps_f = weather[['Min TemperatureF', 'Mean TemperatureF', 'Max TemperatureF']]

# Convert temps_f to celsius: temps_c
# (the arithmetic is applied to every value in all three columns)
temps_c = (temps_f -32) * 5/9

# Rename 'F' in column names with 'C': temps_c.columns
# (replaces every 'F'; safe here because the only 'F' in each of the three
# labels is the trailing unit letter)
temps_c.columns = temps_c.columns.str.replace('F', 'C')

# Print first 5 rows of temps_c
print(temps_c.head())

            Min TemperatureC  Mean TemperatureC  Max TemperatureC
Date                                                             
2013-01-01         -6.111111          -2.222222          0.000000
2013-01-02         -8.333333          -6.111111         -3.888889
2013-01-03         -8.888889          -4.444444          0.000000
2013-01-04         -2.777778          -2.222222         -1.111111
2013-01-05         -3.888889          -1.111111          1.111111


In [27]:
import pandas as pd

# Read 'GDP/gdp_usa.csv' into a DataFrame: gdp
# (the 'DATE' column becomes the index and is parsed as dates)
gdp = pd.read_csv('GDP/gdp_usa.csv',index_col='DATE', parse_dates=True)

# Slice all the gdp data from 2008 onward: post2008
post2008 = gdp.loc['2008':]

# Print the last 8 rows of post2008
print(post2008.tail(8))

# Resample post2008 by year, keeping last(): yearly
# NOTE(review): newer pandas releases deprecate the 'A' alias in favour of
# 'YE' -- confirm against the installed pandas version.
yearly = post2008.resample('A').last()

# Print yearly
print(yearly)

# Compute percentage growth of yearly: yearly['growth']
# pct_change() runs on the whole frame; that works here because yearly has a
# single column ('VALUE') at this point. The first year has no predecessor,
# so its growth is NaN.
yearly['growth'] = yearly.pct_change() * 100

# Print yearly again
print(yearly)

              VALUE
DATE               
2014-07-01  17569.4
2014-10-01  17692.2
2015-01-01  17783.6
2015-04-01  17998.3
2015-07-01  18141.9
2015-10-01  18222.8
2016-01-01  18281.6
2016-04-01  18436.5
              VALUE
DATE               
2008-12-31  14549.9
2009-12-31  14566.5
2010-12-31  15230.2
2011-12-31  15785.3
2012-12-31  16297.3
2013-12-31  16999.9
2014-12-31  17692.2
2015-12-31  18222.8
2016-12-31  18436.5
              VALUE    growth
DATE                         
2008-12-31  14549.9       NaN
2009-12-31  14566.5  0.114090
2010-12-31  15230.2  4.556345
2011-12-31  15785.3  3.644732
2012-12-31  16297.3  3.243524
2013-12-31  16999.9  4.311144
2014-12-31  17692.2  4.072377
2015-12-31  18222.8  2.999062
2016-12-31  18436.5  1.172707


In [28]:
!ls 

01Preparing_data.ipynb		    pa_zipcode_population.csv
02Concatenating_data.ipynb	    pittsburgh2013.csv
03Merging_data.ipynb		    population_00.csv
04Case Study_Summer_Olympics.ipynb  population_01.csv
automobiles.csv			    population_02.csv
Baby names			    q1_rainfall_2013.csv
exchange.csv			    q1_rainfall_2014.csv
GDP				    revenue.csv
gdp-2013.csv			    Sales
managers_b.csv			    sp500.csv
managers.csv			    stocks-2013.csv
oil_price.csv			    Summer Olympic medals
pa_counties.csv			    unemployment_00.csv
pa_zipcode_city.csv


In [29]:
# Import pandas
import pandas as pd

# Both files are read with 'Date' as a parsed date index, so the two frames
# can be matched up by date in the multiplication below.

# Read 'sp500.csv' into a DataFrame: sp500
sp500 = pd.read_csv('sp500.csv', index_col='Date', parse_dates=True)

# Read 'exchange.csv' into a DataFrame: exchange
exchange = pd.read_csv('exchange.csv', index_col='Date', parse_dates=True)

# Subset 'Open' & 'Close' columns from sp500: dollars
dollars = sp500[['Open', 'Close']]

# Print the head of dollars
print(dollars.head())

# Convert dollars to pounds: pounds
# (axis='rows' matches the 'GBP/USD' Series against the Date index, so both
# columns of each row are multiplied by that date's rate)
pounds = dollars.multiply(exchange['GBP/USD'], axis='rows')

# Print the head of pounds
print(pounds.head())

                   Open        Close
Date                                
2015-01-02  2058.899902  2058.199951
2015-01-05  2054.439941  2020.579956
2015-01-06  2022.150024  2002.609985
2015-01-07  2005.550049  2025.900024
2015-01-08  2030.609985  2062.139893
                   Open        Close
Date                                
2015-01-02  1340.364425  1339.908750
2015-01-05  1348.616555  1326.389506
2015-01-06  1332.515980  1319.639876
2015-01-07  1330.562125  1344.063112
2015-01-08  1343.268811  1364.126161
