# Notable Fires Web Scrape

### OBJECTIVE

- This notebook scrapes California wildfire information from the Wikipedia URL listed below.

In [18]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import requests
from bs4 import BeautifulSoup

import selenium
import regex as re

In [2]:
# Source page: Wikipedia's list of California wildfires.
url = "https://en.wikipedia.org/wiki/List_of_California_wildfires"

In [3]:
# Download the page. The timeout keeps the cell from hanging forever on a
# stalled connection, and raise_for_status() stops the notebook on an HTTP
# error instead of letting later cells parse an error page.
res = requests.get(url, timeout=30)
res.raise_for_status()

In [4]:
# Parse the downloaded HTML. The parser is named explicitly so the resulting
# tree does not depend on which optional parsers happen to be installed
# (and so BeautifulSoup does not warn about guessing one).
soup = BeautifulSoup(res.content, 'html.parser')

In [5]:
# Collect every <table> element carrying the 'wikitable' CSS class.
tables = soup.find_all('table', attrs={'class': 'wikitable'})

In [6]:
# Build snake_case column names from the header cells of the table at index 4,
# then add a trailing 'table' column used to record which table a row came from.
main_header = []
for header_cell in tables[4].find_all('th'):
    column_name = header_cell.text.lower().strip().replace(' ', '_')
    main_header.append(column_name)
main_header.append('table')
main_header

['name',
 'county',
 'acres',
 'hectares',
 'start',
 'contained',
 'notes',
 'ref',
 'table']

In [7]:
# Map each column name to its position in the header row of the table at
# index 4. The same positions are applied to every table, so this lookup is
# built once rather than on each loop iteration.
headers = {
    th.text.lower().strip().replace(' ', '_'): index
    for index, th in enumerate(tables[4].find_all('th'))
}

# One dict per data row across all tables; 'table' records the source table index.
notable_fires = []
for i, table in enumerate(tables):
    # Skip the first <tr>, which is the header row.
    for row in table.find_all('tr')[1:]:
        cells = row.find_all('td')
        notable_f = {}

        for feature in main_header:
            position = headers.get(feature)
            # A feature with no header position (e.g. 'table') or a row with
            # too few cells yields NaN, rather than relying on a bare except.
            if position is not None and position < len(cells):
                notable_f[feature] = cells[position].text.strip()
            else:
                notable_f[feature] = np.nan

        notable_f['table'] = i
        notable_fires.append(notable_f)

# Persist the raw scrape of all tables.
pd.DataFrame(notable_fires).to_csv('./Data/notable_fires_data.csv', index=False)

In [8]:
# Reload the raw scrape that the previous cell wrote to disk.
df = pd.read_csv(filepath_or_buffer='./Data/notable_fires_data.csv')

In [9]:
# Keep only the rows scraped from the table at index 4. The explicit .copy()
# makes this an independent frame, so the column assignments in later cells
# do not trigger SettingWithCopyWarning or write through to `df`.
notable_fires = df.loc[df['table'] == 4].copy()

In [10]:
# Preview the first five rows of the filtered table.
notable_fires.head(n=5)

Unnamed: 0,name,county,acres,hectares,start,contained,notes,ref,table
80,Rumsey,Yolo,39138,15838.6,"October 10, 2004","October 16, 2004",5 structures destroyed,[49],4
81,Old,San Bernardino,91281,36940.1,"October 21, 2003","November 25, 2003",975 structures destroyed,[50],4
82,Simi,Ventura,108204,43788.6,"October 25, 2003","November 5, 2003",315 structures destroyed,[51],4
83,Topanga,Los Angeles,24175,9783.3,"September 28, 2005","October 6, 2005",,[52],4
84,Esperanza,Riverside,41173,16662.1,"October 26, 2006","November 1, 2006","5 fatalities, 54 structures destroyed",[53][circular reference],4


In [19]:
# Matches one or more trailing numeric footnote markers such as "[49]" or "[1][2]".
# Raw string: "\[" and "\d" are not valid escapes in a plain string literal.
footnote_pattern = re.compile(r'(\[\d+\])+$')


def _strip_footnotes(value):
    """Remove trailing "[n]" footnote markers from strings; return other values unchanged.

    Non-string cells (NaN for missing text, integers such as the table index)
    are passed through as-is, because the regex substitution only accepts strings.
    """
    if isinstance(value, str):
        return footnote_pattern.sub('', value)
    return value


for col in notable_fires.columns:
    print(col)
    notable_fires[col] = notable_fires[col].map(_strip_footnotes)

name
county
acres
hectares
start
contained
notes


In [15]:
# Remove commas from these text columns (e.g. "October 10, 2004" -> "October 10 2004").
# regex=False makes the literal-comma replacement explicit.
comma_columns = ['hectares', 'start', 'contained', 'acres', 'county']
for column in comma_columns:
    notable_fires[column] = notable_fires[column].str.replace(",", "", regex=False)

# Show missing values as empty strings.
notable_fires = notable_fires.fillna('')

# Drop the bookkeeping columns. errors='ignore' lets this cell be re-run
# after the columns have already been removed.
notable_fires = notable_fires.drop(columns=['table', 'ref'], errors='ignore')

In [20]:
# Preview the first five rows after cleaning.
notable_fires.head(n=5)

Unnamed: 0,name,county,acres,hectares,start,contained,notes
80,Rumsey,Yolo,39138,15838.6,October 10 2004,October 16 2004,5 structures destroyed
81,Old,San Bernardino,91281,36940.1,October 21 2003,November 25 2003,975 structures destroyed
82,Simi,Ventura,108204,43788.6,October 25 2003,November 5 2003,315 structures destroyed
83,Topanga,Los Angeles,24175,9783.3,September 28 2005,October 6 2005,
84,Esperanza,Riverside,41173,16662.1,October 26 2006,November 1 2006,"5 fatalities, 54 structures destroyed"


In [17]:
# Write the cleaned table without the index column.
# NOTE(review): this targets '../data/', whereas the raw scrape above is
# written to and read from './Data/' — confirm the two locations are intended.
notable_fires.to_csv(
    '../data/notable_fires_data.csv',
    index=False,
)

### SUMMARY

- This notebook scrapes notable California fire information from a Wikipedia page. The columns include the name of the fire, county, acres, hectares, start date, containment date, and any notes related to the fire.