In [22]:
#Import dependencies
import pandas as pd
import numpy as np
import glob
import os
from sqlalchemy import create_engine

### Import & Merge County Data

In [2]:
#Import multiple CSV files into 1 dataframe
path= os.path.join("Resources", "CountyWeather")
all_files = glob.glob(path + "/*.csv")

li = []

for filename in all_files:
    df = pd.read_csv(filename, index_col=None, header=0)
    #Add a new column that recognizes the name of the CSV file
    df['COUNTY'] = os.path.basename(filename)
    li.append(df)

county_df = pd.concat(li, axis=0, ignore_index=True)
county_df.head()

Unnamed: 0,STATION,NAME,LATITUDE,LONGITUDE,ELEVATION,DATE,AWND,AWND_ATTRIBUTES,PRCP,PRCP_ATTRIBUTES,TAVG,TAVG_ATTRIBUTES,TMAX,TMAX_ATTRIBUTES,TMIN,TMIN_ATTRIBUTES,COUNTY
0,USR0000CBDY,"BRADLEY CALIFORNIA, CA US",35.8644,-120.8031,163.7,2013-01,,,,,45.4,",U",60.6,",,,U",30.3,",,,U",Monterey.csv
1,USR0000CBDY,"BRADLEY CALIFORNIA, CA US",35.8644,-120.8031,163.7,2013-02,,,,,48.5,",U",64.7,",,,U",32.2,",,,U",Monterey.csv
2,USR0000CBDY,"BRADLEY CALIFORNIA, CA US",35.8644,-120.8031,163.7,2013-03,,,,,58.2,",U",75.8,",,,U",40.6,",,,U",Monterey.csv
3,USR0000CBDY,"BRADLEY CALIFORNIA, CA US",35.8644,-120.8031,163.7,2013-04,,,,,63.5,",U",81.5,",,,U",45.6,",,,U",Monterey.csv
4,USR0000CBDY,"BRADLEY CALIFORNIA, CA US",35.8644,-120.8031,163.7,2013-05,,,,,67.2,",U",84.9,",,,U",49.4,",,,U",Monterey.csv


### Review & cleanse data

In [3]:
#Drop excess columns
county_df = county_df.drop(['ELEVATION','PRCP_ATTRIBUTES','TAVG_ATTRIBUTES','TMAX_ATTRIBUTES',
                            'TMIN_ATTRIBUTES','AWND_ATTRIBUTES'],1)
county_df.tail()

Unnamed: 0,STATION,NAME,LATITUDE,LONGITUDE,DATE,AWND,PRCP,TAVG,TMAX,TMIN,COUNTY
78158,US1CAAL0008,"BERKELEY 3.8 ESE, CA US",37.855675,-122.230053,2012-12,,8.21,,,,Alameda.csv
78159,US1CAAL0008,"BERKELEY 3.8 ESE, CA US",37.855675,-122.230053,2013-01,,0.38,,,,Alameda.csv
78160,US1CAAL0008,"BERKELEY 3.8 ESE, CA US",37.855675,-122.230053,2013-02,,0.5,,,,Alameda.csv
78161,US1CAAL0008,"BERKELEY 3.8 ESE, CA US",37.855675,-122.230053,2013-03,,0.92,,,,Alameda.csv
78162,US1CAAL0008,"BERKELEY 3.8 ESE, CA US",37.855675,-122.230053,2013-04,,1.39,,,,Alameda.csv


In [4]:
#Confirm datatypes for each column
county_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 78163 entries, 0 to 78162
Data columns (total 11 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   STATION    78163 non-null  object 
 1   NAME       78163 non-null  object 
 2   LATITUDE   78163 non-null  float64
 3   LONGITUDE  78163 non-null  float64
 4   DATE       78163 non-null  object 
 5   AWND       6557 non-null   float64
 6   PRCP       59293 non-null  float64
 7   TAVG       43168 non-null  float64
 8   TMAX       43802 non-null  float64
 9   TMIN       43692 non-null  float64
 10  COUNTY     78163 non-null  object 
dtypes: float64(7), object(4)
memory usage: 6.6+ MB


In [5]:
#Check for null data
county_df.count()

STATION      78163
NAME         78163
LATITUDE     78163
LONGITUDE    78163
DATE         78163
AWND          6557
PRCP         59293
TAVG         43168
TMAX         43802
TMIN         43692
COUNTY       78163
dtype: int64

In [6]:
# Drop the .csv from the County column
county_df['COUNTY'] = county_df['COUNTY'].str.replace(r'\.csv', '')
county_df.head()

Unnamed: 0,STATION,NAME,LATITUDE,LONGITUDE,DATE,AWND,PRCP,TAVG,TMAX,TMIN,COUNTY
0,USR0000CBDY,"BRADLEY CALIFORNIA, CA US",35.8644,-120.8031,2013-01,,,45.4,60.6,30.3,Monterey
1,USR0000CBDY,"BRADLEY CALIFORNIA, CA US",35.8644,-120.8031,2013-02,,,48.5,64.7,32.2,Monterey
2,USR0000CBDY,"BRADLEY CALIFORNIA, CA US",35.8644,-120.8031,2013-03,,,58.2,75.8,40.6,Monterey
3,USR0000CBDY,"BRADLEY CALIFORNIA, CA US",35.8644,-120.8031,2013-04,,,63.5,81.5,45.6,Monterey
4,USR0000CBDY,"BRADLEY CALIFORNIA, CA US",35.8644,-120.8031,2013-05,,,67.2,84.9,49.4,Monterey


In [7]:
# Collapse SD1 & SD2, LA1 & LA2
county_df['COUNTY'] = county_df['COUNTY'].str.replace(r'(_).*', '')
county_df.head()

# dataframe.column.str.replace("(::).*","")


Unnamed: 0,STATION,NAME,LATITUDE,LONGITUDE,DATE,AWND,PRCP,TAVG,TMAX,TMIN,COUNTY
0,USR0000CBDY,"BRADLEY CALIFORNIA, CA US",35.8644,-120.8031,2013-01,,,45.4,60.6,30.3,Monterey
1,USR0000CBDY,"BRADLEY CALIFORNIA, CA US",35.8644,-120.8031,2013-02,,,48.5,64.7,32.2,Monterey
2,USR0000CBDY,"BRADLEY CALIFORNIA, CA US",35.8644,-120.8031,2013-03,,,58.2,75.8,40.6,Monterey
3,USR0000CBDY,"BRADLEY CALIFORNIA, CA US",35.8644,-120.8031,2013-04,,,63.5,81.5,45.6,Monterey
4,USR0000CBDY,"BRADLEY CALIFORNIA, CA US",35.8644,-120.8031,2013-05,,,67.2,84.9,49.4,Monterey


### Drop unneeded data
AWND, TAVG & TMIN

In [8]:
#Drop columns
county_df = county_df.drop(['AWND','TAVG','TMIN',],1)
# county_df.tail()
county_df.count()

STATION      78163
NAME         78163
LATITUDE     78163
LONGITUDE    78163
DATE         78163
PRCP         59293
TMAX         43802
COUNTY       78163
dtype: int64

In [9]:
#Drop null data from PRCP
# county_df = county_df.dropna(subset = ['PRCP'])
# county_df.count()

## Calculate missing data & replace NaNs

### Calculate TMAX median

In [10]:
#Group the DF by County & Date, then calculate 
MED = county_df.groupby(['COUNTY','DATE'])[['TMAX']].apply(np.nanmedian)
MED.name = 'MEDIAN_TMAX'
county_df = county_df.join(MED, on=['COUNTY','DATE'])
county_df.head()

  r, k = function_base._ureduce(a, func=_nanmedian, axis=axis, out=out,


Unnamed: 0,STATION,NAME,LATITUDE,LONGITUDE,DATE,PRCP,TMAX,COUNTY,MEDIAN_TMAX
0,USR0000CBDY,"BRADLEY CALIFORNIA, CA US",35.8644,-120.8031,2013-01,,60.6,Monterey,60.75
1,USR0000CBDY,"BRADLEY CALIFORNIA, CA US",35.8644,-120.8031,2013-02,,64.7,Monterey,62.05
2,USR0000CBDY,"BRADLEY CALIFORNIA, CA US",35.8644,-120.8031,2013-03,,75.8,Monterey,67.6
3,USR0000CBDY,"BRADLEY CALIFORNIA, CA US",35.8644,-120.8031,2013-04,,81.5,Monterey,69.35
4,USR0000CBDY,"BRADLEY CALIFORNIA, CA US",35.8644,-120.8031,2013-05,,84.9,Monterey,73.7


### Calculate TMAX maximum

In [11]:
#Group the DF by County & Date, then calculate 
MAX = county_df.groupby(['COUNTY','DATE'])[['TMAX']].apply(np.nanmax)
MAX.name = 'MAXIM_TMAX'
county_df = county_df.join(MAX, on=['COUNTY','DATE'])
county_df.head()

  return libreduction.apply_frame_axis0(sdata, f, names, starts, ends)


Unnamed: 0,STATION,NAME,LATITUDE,LONGITUDE,DATE,PRCP,TMAX,COUNTY,MEDIAN_TMAX,MAXIM_TMAX
0,USR0000CBDY,"BRADLEY CALIFORNIA, CA US",35.8644,-120.8031,2013-01,,60.6,Monterey,60.75,64.2
1,USR0000CBDY,"BRADLEY CALIFORNIA, CA US",35.8644,-120.8031,2013-02,,64.7,Monterey,62.05,65.8
2,USR0000CBDY,"BRADLEY CALIFORNIA, CA US",35.8644,-120.8031,2013-03,,75.8,Monterey,67.6,75.8
3,USR0000CBDY,"BRADLEY CALIFORNIA, CA US",35.8644,-120.8031,2013-04,,81.5,Monterey,69.35,81.5
4,USR0000CBDY,"BRADLEY CALIFORNIA, CA US",35.8644,-120.8031,2013-05,,84.9,Monterey,73.7,84.9


### Calculate PRCP mean

In [12]:
#Group the DF by County & Date, then calculate 
MEAN = county_df.groupby(['COUNTY','DATE'])[['PRCP']].apply(np.nanmean)
MEAN.name = 'MEAN_PRCP'
county_df = county_df.join(MEAN, on=['COUNTY','DATE'])
county_df.head()

  return libreduction.apply_frame_axis0(sdata, f, names, starts, ends)


Unnamed: 0,STATION,NAME,LATITUDE,LONGITUDE,DATE,PRCP,TMAX,COUNTY,MEDIAN_TMAX,MAXIM_TMAX,MEAN_PRCP
0,USR0000CBDY,"BRADLEY CALIFORNIA, CA US",35.8644,-120.8031,2013-01,,60.6,Monterey,60.75,64.2,1.1815
1,USR0000CBDY,"BRADLEY CALIFORNIA, CA US",35.8644,-120.8031,2013-02,,64.7,Monterey,62.05,65.8,0.660526
2,USR0000CBDY,"BRADLEY CALIFORNIA, CA US",35.8644,-120.8031,2013-03,,75.8,Monterey,67.6,75.8,0.725
3,USR0000CBDY,"BRADLEY CALIFORNIA, CA US",35.8644,-120.8031,2013-04,,81.5,Monterey,69.35,81.5,0.407778
4,USR0000CBDY,"BRADLEY CALIFORNIA, CA US",35.8644,-120.8031,2013-05,,84.9,Monterey,73.7,84.9,0.030588


### Calculate monthly precipitation

In [13]:
#Group the DF by County & Date, then calculate 
SUM = county_df.groupby(['COUNTY','DATE'])[['PRCP']].apply(np.nansum)
SUM.name = 'SUM_PRCP'
county_df = county_df.join(SUM, on=['COUNTY','DATE'])
county_df.head()

Unnamed: 0,STATION,NAME,LATITUDE,LONGITUDE,DATE,PRCP,TMAX,COUNTY,MEDIAN_TMAX,MAXIM_TMAX,MEAN_PRCP,SUM_PRCP
0,USR0000CBDY,"BRADLEY CALIFORNIA, CA US",35.8644,-120.8031,2013-01,,60.6,Monterey,60.75,64.2,1.1815,23.63
1,USR0000CBDY,"BRADLEY CALIFORNIA, CA US",35.8644,-120.8031,2013-02,,64.7,Monterey,62.05,65.8,0.660526,12.55
2,USR0000CBDY,"BRADLEY CALIFORNIA, CA US",35.8644,-120.8031,2013-03,,75.8,Monterey,67.6,75.8,0.725,13.05
3,USR0000CBDY,"BRADLEY CALIFORNIA, CA US",35.8644,-120.8031,2013-04,,81.5,Monterey,69.35,81.5,0.407778,7.34
4,USR0000CBDY,"BRADLEY CALIFORNIA, CA US",35.8644,-120.8031,2013-05,,84.9,Monterey,73.7,84.9,0.030588,0.52


In [14]:
#Check data for one month to check the columns created properly
county_df[county_df["DATE"]=='2013-01']

Unnamed: 0,STATION,NAME,LATITUDE,LONGITUDE,DATE,PRCP,TMAX,COUNTY,MEDIAN_TMAX,MAXIM_TMAX,MEAN_PRCP,SUM_PRCP
0,USR0000CBDY,"BRADLEY CALIFORNIA, CA US",35.864400,-120.803100,2013-01,,60.6,Monterey,60.75,64.2,1.181500,23.63
84,US1CAMT0023,"BIG SUR 9.5 NNW, CA US",36.398021,-121.868511,2013-01,2.92,,Monterey,60.75,64.2,1.181500,23.63
144,US1CAMT0022,"CARMEL VALLEY VILLAGE 5.4 WNW, CA US",36.521800,-121.810000,2013-01,1.77,,Monterey,60.75,64.2,1.181500,23.63
162,USC00044555,"KING CITY, CA US",36.206900,-121.137700,2013-01,0.71,63.1,Monterey,60.75,64.2,1.181500,23.63
246,US1CAMT0021,"MARINA 0.8 SSE, CA US",36.673173,-121.784101,2013-01,0.69,,Monterey,60.75,64.2,1.181500,23.63
...,...,...,...,...,...,...,...,...,...,...,...,...
77799,USR0000COKS,"OAKLAND SOUTH CALIFORNIA, CA US",37.786100,-122.144700,2013-01,,54.0,Alameda,56.00,60.5,0.555625,8.89
77895,USR0000CCLV,"CALAVERAS ROAD CALIFORNIA, CA US",37.553100,-121.843900,2013-01,,52.0,Alameda,56.00,60.5,0.555625,8.89
77990,USC00046144,"NEWARK, CA US",37.514700,-122.032500,2013-01,0.27,56.3,Alameda,56.00,60.5,0.555625,8.89
78086,US1CAAL0007,"FREMONT 2.6 ESE, CA US",37.515049,-121.952305,2013-01,0.43,,Alameda,56.00,60.5,0.555625,8.89


### Export CSV

In [15]:
county_df.to_csv("Resources/county_clean.csv")

### Merge & Export CSV

In [16]:
wildfire_df = pd.read_csv("Resources/wildfire_clean.csv")
wildfire_df.head()

Unnamed: 0.1,Unnamed: 0,AcresBurned,ArchiveYear,Counties,CountyIds,CrewsInvolved,Fatalities,Latitude,Longitude,Name,PersonnelInvolved,Started,StructuresDamaged,StructuresDestroyed,UniqueId,Year,Month,Year-Month
0,0,257314.0,2013,Tuolumne,55,,,37.857,-120.086,Rim Fire,,2013-08-17 15:25:00+00:00,,,5fb18d4d-213f-4d83-a179-daaf11939e78,2013,8,2013-08
1,1,30274.0,2013,Los Angeles,19,,,34.585595,-118.423176,Powerhouse Fire,,2013-05-30 15:28:00+00:00,,,bf37805e-1cc2-4208-9972-753e47874c87,2013,5,2013-05
2,2,27531.0,2013,Riverside,33,,,33.7095,-116.72885,Mountain Fire,,2013-07-15 13:43:00+00:00,,,a3149fec-4d48-427c-8b2c-59e8b79d59db,2013,7,2013-07
3,3,27440.0,2013,Placer,31,,,39.12,-120.65,American Fire,,2013-08-10 16:30:00+00:00,,,8213f5c7-34fa-403b-a4bc-da2ace6e6625,2013,8,2013-08
4,4,24251.0,2013,Ventura,56,47.0,,0.0,0.0,Springs Fire,2167.0,2013-05-02 07:01:00+00:00,6.0,10.0,46731fb8-3350-4920-bdf7-910ac0eb715c,2013,5,2013-05


In [17]:
#Join wildfire & county data on COUNTY NAME & DATE
# merged_df = left_df.merge(right_df, how='inner', left_on=["A", "B"], right_on=["A2","B2"])

wildfire_county_df = wildfire_df.merge(county_df, left_on=['Counties','Year-Month'], right_on=['COUNTY', 'DATE'])
wildfire_county_df.head()

Unnamed: 0.1,Unnamed: 0,AcresBurned,ArchiveYear,Counties,CountyIds,CrewsInvolved,Fatalities,Latitude,Longitude,Name,...,LATITUDE,LONGITUDE,DATE,PRCP,TMAX,COUNTY,MEDIAN_TMAX,MAXIM_TMAX,MEAN_PRCP,SUM_PRCP
0,0,257314.0,2013,Tuolumne,55,,,37.857,-120.086,Rim Fire,...,37.8769,-119.3436,2013-08,0.59,68.3,Tuolumne,68.3,68.3,0.59,0.59
1,26,1070.0,2013,Tuolumne,55,,,38.25108,-120.02778,Power Fire,...,37.8769,-119.3436,2013-08,0.59,68.3,Tuolumne,68.3,68.3,0.59,0.59
2,2,27531.0,2013,Riverside,33,,,33.7095,-116.72885,Mountain Fire,...,33.9511,-117.388,2013-07,0.2,92.6,Riverside,96.6,107.6,0.217778,5.88
3,2,27531.0,2013,Riverside,33,,,33.7095,-116.72885,Mountain Fire,...,34.0178,-116.1878,2013-07,,92.1,Riverside,96.6,107.6,0.217778,5.88
4,2,27531.0,2013,Riverside,33,,,33.7095,-116.72885,Mountain Fire,...,34.0608,-114.7322,2013-07,,107.1,Riverside,96.6,107.6,0.217778,5.88


In [18]:
 wildfire_county_df.count()

Unnamed: 0             17423
AcresBurned            17423
ArchiveYear            17423
Counties               17423
CountyIds              17423
CrewsInvolved           1595
Fatalities               105
Latitude               17423
Longitude              17423
Name                   17423
PersonnelInvolved       1943
Started                17423
StructuresDamaged        525
StructuresDestroyed     1356
UniqueId               17423
Year                   17423
Month                  17423
Year-Month             17423
STATION                17423
NAME                   17423
LATITUDE               17423
LONGITUDE              17423
DATE                   17423
PRCP                   12687
TMAX                   10222
COUNTY                 17423
MEDIAN_TMAX            17355
MAXIM_TMAX             17355
MEAN_PRCP              17422
SUM_PRCP               17423
dtype: int64

In [19]:
wildfire_county_df.to_csv("Resources/wildfire_county_clean.csv")