In [1]:
#Import dependencies
import pandas as pd
import glob
import os

### Import & Merge County Data

In [9]:
#Import multiple CSV files into 1 dataframe
path= os.path.join("Resources", "CountyWeather")
all_files = glob.glob(path + "/*.csv")

li = []

for filename in all_files:
    df = pd.read_csv(filename, index_col=None, header=0)
    #Add a new column that recognizes the name of the CSV file
    df['COUNTY'] = os.path.basename(filename)
    li.append(df)

county_df = pd.concat(li, axis=0, ignore_index=True)
county_df.head()

Unnamed: 0,STATION,NAME,LATITUDE,LONGITUDE,ELEVATION,DATE,PRCP,PRCP_ATTRIBUTES,TAVG,TAVG_ATTRIBUTES,TMAX,TMAX_ATTRIBUTES,TMIN,TMIN_ATTRIBUTES,COUNTY,AWND,AWND_ATTRIBUTES
0,USR0000CBOG,"BOGARD RANGER STATION CALIFORNIA, CA US",40.5981,-121.0831,1722.1,2013-01,,,21.6,",U",41.1,",,,U",2.2,",,,U",LASSEN_COUNTY.csv,,
1,USR0000CBOG,"BOGARD RANGER STATION CALIFORNIA, CA US",40.5981,-121.0831,1722.1,2013-02,,,28.6,",U",47.7,",,,U",9.4,",,,U",LASSEN_COUNTY.csv,,
2,USR0000CBOG,"BOGARD RANGER STATION CALIFORNIA, CA US",40.5981,-121.0831,1722.1,2013-03,,,37.4,",U",53.1,",,,U",21.7,",,,U",LASSEN_COUNTY.csv,,
3,USR0000CBOG,"BOGARD RANGER STATION CALIFORNIA, CA US",40.5981,-121.0831,1722.1,2013-04,,,43.0,",U",59.1,",,,U",26.8,",,,U",LASSEN_COUNTY.csv,,
4,USR0000CBOG,"BOGARD RANGER STATION CALIFORNIA, CA US",40.5981,-121.0831,1722.1,2013-05,,,48.5,",U",66.1,",,,U",31.0,",,,U",LASSEN_COUNTY.csv,,


In [10]:
#Drop excess columns
county_df = county_df.drop(['ELEVATION','PRCP_ATTRIBUTES','TAVG_ATTRIBUTES','TMAX_ATTRIBUTES',
                            'TMIN_ATTRIBUTES','AWND_ATTRIBUTES'],1)
county_df.tail()

Unnamed: 0,STATION,NAME,LATITUDE,LONGITUDE,DATE,PRCP,TAVG,TMAX,TMIN,COUNTY,AWND
78158,US1CAAL0008,"BERKELEY 3.8 ESE, CA US",37.855675,-122.230053,2012-12,8.21,,,,Alameda.csv,
78159,US1CAAL0008,"BERKELEY 3.8 ESE, CA US",37.855675,-122.230053,2013-01,0.38,,,,Alameda.csv,
78160,US1CAAL0008,"BERKELEY 3.8 ESE, CA US",37.855675,-122.230053,2013-02,0.5,,,,Alameda.csv,
78161,US1CAAL0008,"BERKELEY 3.8 ESE, CA US",37.855675,-122.230053,2013-03,0.92,,,,Alameda.csv,
78162,US1CAAL0008,"BERKELEY 3.8 ESE, CA US",37.855675,-122.230053,2013-04,1.39,,,,Alameda.csv,


In [11]:
#Confirm datatypes for each column
county_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 78163 entries, 0 to 78162
Data columns (total 11 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   STATION    78163 non-null  object 
 1   NAME       78163 non-null  object 
 2   LATITUDE   78163 non-null  float64
 3   LONGITUDE  78163 non-null  float64
 4   DATE       78163 non-null  object 
 5   PRCP       59293 non-null  float64
 6   TAVG       43168 non-null  float64
 7   TMAX       43802 non-null  float64
 8   TMIN       43692 non-null  float64
 9   COUNTY     78163 non-null  object 
 10  AWND       6557 non-null   float64
dtypes: float64(7), object(4)
memory usage: 6.6+ MB


In [12]:
#Check for null data
county_df.count()

STATION      78163
NAME         78163
LATITUDE     78163
LONGITUDE    78163
DATE         78163
PRCP         59293
TAVG         43168
TMAX         43802
TMIN         43692
COUNTY       78163
AWND          6557
dtype: int64

In [13]:
# Drop the .csv from the County column
county_df['COUNTY'] = county_df['COUNTY'].str.replace(r'\.csv', '')
county_df.head()

Unnamed: 0,STATION,NAME,LATITUDE,LONGITUDE,DATE,PRCP,TAVG,TMAX,TMIN,COUNTY,AWND
0,USR0000CBOG,"BOGARD RANGER STATION CALIFORNIA, CA US",40.5981,-121.0831,2013-01,,21.6,41.1,2.2,LASSEN_COUNTY,
1,USR0000CBOG,"BOGARD RANGER STATION CALIFORNIA, CA US",40.5981,-121.0831,2013-02,,28.6,47.7,9.4,LASSEN_COUNTY,
2,USR0000CBOG,"BOGARD RANGER STATION CALIFORNIA, CA US",40.5981,-121.0831,2013-03,,37.4,53.1,21.7,LASSEN_COUNTY,
3,USR0000CBOG,"BOGARD RANGER STATION CALIFORNIA, CA US",40.5981,-121.0831,2013-04,,43.0,59.1,26.8,LASSEN_COUNTY,
4,USR0000CBOG,"BOGARD RANGER STATION CALIFORNIA, CA US",40.5981,-121.0831,2013-05,,48.5,66.1,31.0,LASSEN_COUNTY,


In [None]:
# Collapse SD1 & SD2, LA1 & LA2
county_df['COUNTY'] = county_df['COUNTY'].str.replace(r'\.csv', '')
county_df.head()

### Drop unneeded data
AWND, TAVG & TMIN

In [14]:
#Drop columns
county_df = county_df.drop(['AWND','TAVG','TMIN',],1)
# county_df.tail()
county_df.count()

STATION      78163
NAME         78163
LATITUDE     78163
LONGITUDE    78163
DATE         78163
PRCP         59293
TMAX         43802
COUNTY       78163
dtype: int64

In [15]:
#Drop null data from PRCP
# county_df = county_df.dropna(subset = ['PRCP'])
# county_df.count()

## Calculate average for missing data

### Calculate PRCP averages

In [16]:
#Alt option
#Split NaNs into separate DF (keep index), then do .loc on COUNTY to replace prcp with the average
#Then merge back 

In [27]:
# Split the PRCP NaNs into seperate DF
county_prcp_nan_df = county_df[county_df['PRCP'].isna()]
county_prcp_nan_df

Unnamed: 0,STATION,NAME,LATITUDE,LONGITUDE,DATE,PRCP,TMAX,COUNTY
0,USR0000CBOG,"BOGARD RANGER STATION CALIFORNIA, CA US",40.5981,-121.0831,2013-01,,41.1,LASSEN_COUNTY
1,USR0000CBOG,"BOGARD RANGER STATION CALIFORNIA, CA US",40.5981,-121.0831,2013-02,,47.7,LASSEN_COUNTY
2,USR0000CBOG,"BOGARD RANGER STATION CALIFORNIA, CA US",40.5981,-121.0831,2013-03,,53.1,LASSEN_COUNTY
3,USR0000CBOG,"BOGARD RANGER STATION CALIFORNIA, CA US",40.5981,-121.0831,2013-04,,59.1,LASSEN_COUNTY
4,USR0000CBOG,"BOGARD RANGER STATION CALIFORNIA, CA US",40.5981,-121.0831,2013-05,,66.1,LASSEN_COUNTY
...,...,...,...,...,...,...,...,...
77978,USC00046144,"NEWARK, CA US",37.5147,-122.0325,2012-01,,59.9,Alameda
77979,USC00046144,"NEWARK, CA US",37.5147,-122.0325,2012-02,,62.1,Alameda
77980,USC00046144,"NEWARK, CA US",37.5147,-122.0325,2012-03,,62.0,Alameda
77981,USC00046144,"NEWARK, CA US",37.5147,-122.0325,2012-04,,67.1,Alameda


In [28]:
# Check to make sure PRCP = 0
county_prcp_nan_df.count()

STATION      18870
NAME         18870
LATITUDE     18870
LONGITUDE    18870
DATE         18870
PRCP             0
TMAX         18603
COUNTY       18870
dtype: int64

In [30]:
# Calculate the average precipitation by county
avg_df = county_df.groupby('COUNTY')['PRCP'].mean()
avg_df

COUNTY
Alameda                    1.566296
Alpine                     3.218148
Amador                     3.766632
Butte                      2.867806
Calaveras                  3.343754
Colusa                     1.211646
ContraCosta                1.821944
DelNorte                   7.408404
ElDorado                   3.462347
Fresno                     1.098750
Glenn                      1.509468
Humboldt                   4.900106
Imperial                   0.192384
Inyo                       0.405597
KERN_COUNTY                0.693266
KINGS_COUNTY               0.557091
LAKE_COUNTY                3.261282
LASSEN_COUNTY              1.619313
LOS_ANGELES_COUNTY         0.720127
LOS_ANGELES_COUNTY_2017    1.351986
MADERA_COUNTY              1.721277
MARIN__COUNTY              3.190233
MARIPOSA_COUNTY            2.600790
MENDOCINO_COUNTY           4.553711
MERCED_COUNTY              0.851996
MODOC_COUNTY               1.854977
MONO_COUNTY                2.000094
MONTEREY_COUNTY      

In [17]:
## Run mode at county level, then assign mode to NAN within that county
#Apply to PRCP & TMAX
# Set Nan to 0, then groupby COUNTY, fillforward

In [18]:
#Calculate the average precipitation by county
# avg_df = county_df.groupby('COUNTY')['PRCP'].mean()

In [19]:
#Reassign NAs with the average precipation by county
# county_df.groupby('COUNTY')['PRCP'].fillna(county_df.groupby('COUNTY')['PRCP'].mean())

#Try doing it with iloc instead of groupby, then do fillna
# Turn above cell into DF, then join
#Locate Sutter.csv in COUNTY column and set it equal to the avg prcp value


# county_df.loc[county_df.COUNTY.isin(avg_df)]
##Will this replace everything or just the NaNs?

In [20]:
# county_df['PRCP']

In [21]:
# county_df.head()

### Calculate TMAX averages

### Calculate monthly precipitation

## This won't work yet because the index has different # of records than the sum

In [22]:
#Calculate monthly precipitation (sum all stations per county)
# test_df['monthly_prcp'] = county_df.groupby(['COUNTY','DATE'])['PRCP'].sum()
# test_df

In [23]:
#Option 2
# county_df.assign(monthly_prcp=county_df.COUNTY.map(county_df.groupby(['COUNTY','DATE'])['PRCP'].sum()))


In [24]:
# #Drop all columns except COUNTY, date, prcp
# county_df_prcp = county_df_prcp.drop(['LATITUDE','LONGITUDE','TAVG','TMAX',
#                             'TMIN','AWND'],1)
# county_df_prcp = county_df_prcp.rename(columns={'PRCP': 'MONTHLY_PRCP'})
# county_df_prcp.tail()a

In [25]:
#Add monthly precipitation to county_df
# test_df = pd.merge(county_df, county_df_prcp,how='left', 
#                    left_on=['COUNTY','COUNTY'], right_on = ['DATE','DATE'])
# test_df.tail()

# Change wildfire data to by year, month, year-month; pull out time into separate column
# Export CSVs so everyone has the same data

In [26]:
# county_df.to_csv("Resources/county_clean.csv")
# wildfire_df.to_csv("Resources/wildfire_clean.csv")