In [18]:
#Import dependencies
import pandas as pd
import glob
import os
import datetime
from sqlalchemy import create_engine

### Import & Cleanse Wildfire Data

In [2]:
# Load the raw California fire-incident records and preview the first rows
wildfire_csv_path = "Resources/California_Fire_Incidents.csv"
wildfire_df = pd.read_csv(wildfire_csv_path)
wildfire_df.head()

Unnamed: 0,AcresBurned,Active,AdminUnit,AirTankers,ArchiveYear,CalFireIncident,CanonicalUrl,ConditionStatement,ControlStatement,Counties,...,SearchKeywords,Started,Status,StructuresDamaged,StructuresDestroyed,StructuresEvacuated,StructuresThreatened,UniqueId,Updated,WaterTenders
0,257314.0,False,Stanislaus National Forest/Yosemite National Park,,2013,True,/incidents/2013/8/17/rim-fire/,,,Tuolumne,...,"Rim Fire, Stanislaus National Forest, Yosemite...",2013-08-17T15:25:00Z,Finalized,,,,,5fb18d4d-213f-4d83-a179-daaf11939e78,2013-09-06T18:30:00Z,
1,30274.0,False,USFS Angeles National Forest/Los Angeles Count...,,2013,True,/incidents/2013/5/30/powerhouse-fire/,,,Los Angeles,...,"Powerhouse Fire, May 2013, June 2013, Angeles ...",2013-05-30T15:28:00Z,Finalized,,,,,bf37805e-1cc2-4208-9972-753e47874c87,2013-06-08T18:30:00Z,
2,27531.0,False,CAL FIRE Riverside Unit / San Bernardino Natio...,,2013,True,/incidents/2013/7/15/mountain-fire/,,,Riverside,...,"Mountain Fire, July 2013, Highway 243, Highway...",2013-07-15T13:43:00Z,Finalized,,,,,a3149fec-4d48-427c-8b2c-59e8b79d59db,2013-07-30T18:00:00Z,
3,27440.0,False,Tahoe National Forest,,2013,False,/incidents/2013/8/10/american-fire/,,,Placer,...,"American Fire, August 2013, Deadwood Ridge, Fo...",2013-08-10T16:30:00Z,Finalized,,,,,8213f5c7-34fa-403b-a4bc-da2ace6e6625,2013-08-30T08:00:00Z,
4,24251.0,False,Ventura County Fire/CAL FIRE,,2013,True,/incidents/2013/5/2/springs-fire/,Acreage has been reduced based upon more accur...,,Ventura,...,"Springs Fire, May 2013, Highway 101, Camarillo...",2013-05-02T07:01:00Z,Finalized,6.0,10.0,,,46731fb8-3350-4920-bdf7-910ac0eb715c,2013-05-11T06:30:00Z,11.0


In [3]:
# Drop columns that are not used further in this notebook.
# The labels are passed through the `columns=` keyword: the positional axis
# form (`drop(labels, 1)`) is deprecated since pandas 1.3 and raises a
# TypeError on pandas 2.x, where every argument after `labels` is keyword-only.
wildfire_df = wildfire_df.drop(columns=['Active','AdminUnit','AirTankers','CalFireIncident','CanonicalUrl',
                                        'ConditionStatement','ControlStatement','Dozers','Engines','Extinguished',
                                        'Featured','Final','FuelType','Helicopters','Injuries','Location',
                                        'MajorIncident','PercentContained','Public','SearchDescription','SearchKeywords',
                                        'Status','StructuresEvacuated','StructuresThreatened','Updated','WaterTenders'])
wildfire_df.head()

Unnamed: 0,AcresBurned,ArchiveYear,Counties,CountyIds,CrewsInvolved,Fatalities,Latitude,Longitude,Name,PersonnelInvolved,Started,StructuresDamaged,StructuresDestroyed,UniqueId
0,257314.0,2013,Tuolumne,55,,,37.857,-120.086,Rim Fire,,2013-08-17T15:25:00Z,,,5fb18d4d-213f-4d83-a179-daaf11939e78
1,30274.0,2013,Los Angeles,19,,,34.585595,-118.423176,Powerhouse Fire,,2013-05-30T15:28:00Z,,,bf37805e-1cc2-4208-9972-753e47874c87
2,27531.0,2013,Riverside,33,,,33.7095,-116.72885,Mountain Fire,,2013-07-15T13:43:00Z,,,a3149fec-4d48-427c-8b2c-59e8b79d59db
3,27440.0,2013,Placer,31,,,39.12,-120.65,American Fire,,2013-08-10T16:30:00Z,,,8213f5c7-34fa-403b-a4bc-da2ace6e6625
4,24251.0,2013,Ventura,56,47.0,,0.0,0.0,Springs Fire,2167.0,2013-05-02T07:01:00Z,6.0,10.0,46731fb8-3350-4920-bdf7-910ac0eb715c


In [4]:
# Confirm the dtype and non-null count of each remaining column
# (info() prints its summary directly rather than returning a frame)
wildfire_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1636 entries, 0 to 1635
Data columns (total 14 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   AcresBurned          1633 non-null   float64
 1   ArchiveYear          1636 non-null   int64  
 2   Counties             1636 non-null   object 
 3   CountyIds            1636 non-null   object 
 4   CrewsInvolved        171 non-null    float64
 5   Fatalities           21 non-null     float64
 6   Latitude             1636 non-null   float64
 7   Longitude            1636 non-null   float64
 8   Name                 1636 non-null   object 
 9   PersonnelInvolved    204 non-null    float64
 10  Started              1636 non-null   object 
 11  StructuresDamaged    67 non-null     float64
 12  StructuresDestroyed  175 non-null    float64
 13  UniqueId             1636 non-null   object 
dtypes: float64(8), int64(1), object(5)
memory usage: 179.1+ KB


In [5]:
# Drop duplicate incidents based on UniqueId, keeping the first row seen for
# each id (the drop_duplicates default).
# The result is reassigned rather than using inplace=True, matching the
# reassign pattern the other cleaning cells in this notebook use.
wildfire_df = wildfire_df.drop_duplicates(subset="UniqueId")
wildfire_df.head()

Unnamed: 0,AcresBurned,ArchiveYear,Counties,CountyIds,CrewsInvolved,Fatalities,Latitude,Longitude,Name,PersonnelInvolved,Started,StructuresDamaged,StructuresDestroyed,UniqueId
0,257314.0,2013,Tuolumne,55,,,37.857,-120.086,Rim Fire,,2013-08-17T15:25:00Z,,,5fb18d4d-213f-4d83-a179-daaf11939e78
1,30274.0,2013,Los Angeles,19,,,34.585595,-118.423176,Powerhouse Fire,,2013-05-30T15:28:00Z,,,bf37805e-1cc2-4208-9972-753e47874c87
2,27531.0,2013,Riverside,33,,,33.7095,-116.72885,Mountain Fire,,2013-07-15T13:43:00Z,,,a3149fec-4d48-427c-8b2c-59e8b79d59db
3,27440.0,2013,Placer,31,,,39.12,-120.65,American Fire,,2013-08-10T16:30:00Z,,,8213f5c7-34fa-403b-a4bc-da2ace6e6625
4,24251.0,2013,Ventura,56,47.0,,0.0,0.0,Springs Fire,2167.0,2013-05-02T07:01:00Z,6.0,10.0,46731fb8-3350-4920-bdf7-910ac0eb715c


In [6]:
# Check for null data: count() reports the number of non-null values per
# column, so a column whose count is below the row count has missing values
wildfire_df.count()

AcresBurned            1606
ArchiveYear            1609
Counties               1609
CountyIds              1609
CrewsInvolved           165
Fatalities               13
Latitude               1609
Longitude              1609
Name                   1609
PersonnelInvolved       197
Started                1609
StructuresDamaged        60
StructuresDestroyed     158
UniqueId               1609
dtype: int64

In [7]:
# Remove incidents that have no AcresBurned value recorded
required_columns = ['AcresBurned']
wildfire_df = wildfire_df.dropna(subset=required_columns)
wildfire_df.head()

Unnamed: 0,AcresBurned,ArchiveYear,Counties,CountyIds,CrewsInvolved,Fatalities,Latitude,Longitude,Name,PersonnelInvolved,Started,StructuresDamaged,StructuresDestroyed,UniqueId
0,257314.0,2013,Tuolumne,55,,,37.857,-120.086,Rim Fire,,2013-08-17T15:25:00Z,,,5fb18d4d-213f-4d83-a179-daaf11939e78
1,30274.0,2013,Los Angeles,19,,,34.585595,-118.423176,Powerhouse Fire,,2013-05-30T15:28:00Z,,,bf37805e-1cc2-4208-9972-753e47874c87
2,27531.0,2013,Riverside,33,,,33.7095,-116.72885,Mountain Fire,,2013-07-15T13:43:00Z,,,a3149fec-4d48-427c-8b2c-59e8b79d59db
3,27440.0,2013,Placer,31,,,39.12,-120.65,American Fire,,2013-08-10T16:30:00Z,,,8213f5c7-34fa-403b-a4bc-da2ace6e6625
4,24251.0,2013,Ventura,56,47.0,,0.0,0.0,Springs Fire,2167.0,2013-05-02T07:01:00Z,6.0,10.0,46731fb8-3350-4920-bdf7-910ac0eb715c


In [8]:
# Re-check for null data after dropping rows with a missing AcresBurned:
# count() reports non-null values per column, so AcresBurned should now
# match the other fully populated columns
wildfire_df.count()

AcresBurned            1606
ArchiveYear            1606
Counties               1606
CountyIds              1606
CrewsInvolved           165
Fatalities               13
Latitude               1606
Longitude              1606
Name                   1606
PersonnelInvolved       197
Started                1606
StructuresDamaged        60
StructuresDestroyed     158
UniqueId               1606
dtype: int64

In [9]:
# Parse the 'Started' timestamp strings into pandas datetimes, then
# re-inspect the dtypes to confirm the conversion took effect
started_parsed = pd.to_datetime(wildfire_df['Started'])
wildfire_df['Started'] = started_parsed
wildfire_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1606 entries, 0 to 1632
Data columns (total 14 columns):
 #   Column               Non-Null Count  Dtype              
---  ------               --------------  -----              
 0   AcresBurned          1606 non-null   float64            
 1   ArchiveYear          1606 non-null   int64              
 2   Counties             1606 non-null   object             
 3   CountyIds            1606 non-null   object             
 4   CrewsInvolved        165 non-null    float64            
 5   Fatalities           13 non-null     float64            
 6   Latitude             1606 non-null   float64            
 7   Longitude            1606 non-null   float64            
 8   Name                 1606 non-null   object             
 9   PersonnelInvolved    197 non-null    float64            
 10  Started              1606 non-null   datetime64[ns, UTC]
 11  StructuresDamaged    60 non-null     float64            
 12  StructuresDestroyed 

In [10]:
# Split the Started timestamp into year, month and year-month:
#   Year        - integer calendar year
#   Month       - zero-padded two-character month string ("01".."12")
#   Year-Month  - "YYYY-MM" string
# The string columns come from the built-in .dt.strftime accessor instead of
# a per-value format map plus a row-wise apply/join; for the four-digit years
# in this data it produces the same strings.
started = wildfire_df['Started']
wildfire_df['Year'] = started.dt.year
wildfire_df['Month'] = started.dt.strftime('%m')
wildfire_df['Year-Month'] = started.dt.strftime('%Y-%m')

wildfire_df.head()

Unnamed: 0,AcresBurned,ArchiveYear,Counties,CountyIds,CrewsInvolved,Fatalities,Latitude,Longitude,Name,PersonnelInvolved,Started,StructuresDamaged,StructuresDestroyed,UniqueId,Year,Month,Year-Month
0,257314.0,2013,Tuolumne,55,,,37.857,-120.086,Rim Fire,,2013-08-17 15:25:00+00:00,,,5fb18d4d-213f-4d83-a179-daaf11939e78,2013,8,2013-08
1,30274.0,2013,Los Angeles,19,,,34.585595,-118.423176,Powerhouse Fire,,2013-05-30 15:28:00+00:00,,,bf37805e-1cc2-4208-9972-753e47874c87,2013,5,2013-05
2,27531.0,2013,Riverside,33,,,33.7095,-116.72885,Mountain Fire,,2013-07-15 13:43:00+00:00,,,a3149fec-4d48-427c-8b2c-59e8b79d59db,2013,7,2013-07
3,27440.0,2013,Placer,31,,,39.12,-120.65,American Fire,,2013-08-10 16:30:00+00:00,,,8213f5c7-34fa-403b-a4bc-da2ace6e6625,2013,8,2013-08
4,24251.0,2013,Ventura,56,47.0,,0.0,0.0,Springs Fire,2167.0,2013-05-02 07:01:00+00:00,6.0,10.0,46731fb8-3350-4920-bdf7-910ac0eb715c,2013,5,2013-05


### Export CSV

In [11]:
# Write the cleaned wildfire frame to CSV.
# NOTE(review): to_csv writes the row index by default, and the index has gaps
# after the de-duplication and dropna steps. Consider index=False if nothing
# reading this file relies on that column (confirm against consumers first).
wildfire_df.to_csv("Resources/wildfire_clean.csv")