# Data Collection and Cleaning

In this notebook, we are scraping all the data we need for our project. 

In [1]:
import requests, time, re
import pandas as pd
import datetime

from bs4 import BeautifulSoup

## Cleaning Up - Column Titles

In [2]:
def clean_title(title):
    """Turn a table heading into a lowercase, underscore-separated column name.

    The heading is lowercased, the characters ``(``, ``)``, ``.`` and ``°``
    are removed, and every space is replaced by an underscore.

    Parameters
    ----------
    title : str
        Heading text, e.g. ``"Wind Gust (mph)"``.

    Returns
    -------
    str
        The normalized name, e.g. ``"wind_gust_mph"``.
    """
    without_symbols = re.sub("[(.°)]", "", title.lower())
    return without_symbols.replace(" ", "_")

## Scraping the Data

Scrape the data from the website month-by-month and store it in a `DataFrame`.

In [3]:
def get_data(url, timeout=30):
    """Scrape the daily observation table of one monthly-history page.

    The page is expected to contain a ``<table id="obsTable">`` laid out as:

    * a ``<thead>`` row whose first cell becomes the first part of every date
      string (presumably the year) and whose remaining cells are the
      measurement group titles; groups 1-6 each span three value columns, and
      the third column of group 6 is named ``wind_gust_mph``;
    * a first ``<tbody>`` whose row holds the month abbreviation followed by
      one sub-title per value column (appended to the column name, except for
      the last column);
    * further ``<tbody>`` sections with one row per day.

    Parameters
    ----------
    url : str
        Address of the monthly-history page.
    timeout : float, optional
        Seconds to wait for the server before giving up, so a stalled
        connection cannot hang the notebook forever.

    Returns
    -------
    pandas.DataFrame
        One row per day. ``date`` is a ``"<first header cell>-<month>-<day>"``
        string, the last column is text with newlines and tabs removed, and
        every other column is ``float``. The frame is empty when the page has
        no ``obsTable``.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    KeyError
        If the month cell is not a three-letter English month abbreviation.
    ValueError
        If a measurement cell is neither numeric nor a missing-value dash.
    """
    columns = []
    raw_data = {}
    date = ""
    months = {'jan': '1', 'feb': '2', 'mar': '3', 'apr': '4', 'may': '5', 'jun': '6',
              'jul': '7', 'aug': '8', 'sep': '9', 'oct': '10', 'nov': '11', 'dec': '12'}

    req = requests.get(url, timeout=timeout)
    # Fail loudly instead of silently returning an empty frame for this month.
    req.raise_for_status()
    soup = BeautifulSoup(req.text, "html.parser")

    # Go to the table containing the daily weather history & observations
    for table in soup.find_all("table", {"id": "obsTable"}):

        # Parse the head of the table
        for thead in table.find_all("thead"):
            for tr in thead.find_all("tr"):
                # Get column title
                for i, th in enumerate(tr.find_all("th")):
                    title = clean_title(th.text)
                    if i == 0:
                        # The first header cell starts the date string.
                        date = title
                        columns.append(clean_title("Date"))
                    elif i in range(1, 7):
                        # Each of these groups covers three value columns.
                        columns.append(title)
                        columns.append(title)
                        if i == 6:
                            # The third column of the sixth group is the gust.
                            columns.append(clean_title("Wind Gust (mph)"))
                        else:
                            columns.append(title)
                    else:
                        columns.append(title)

        # Parse the body of the table
        for j, tbody in enumerate(table.find_all("tbody")):
            for tr in tbody.find_all("tr"):
                # Look the cells up once per row instead of once per cell.
                cells = tr.find_all("td")
                last = len(cells) - 1

                # Get column subtitle
                if j == 0:
                    for k, td in enumerate(cells):
                        if k == 0:
                            date += "-" + months[td.text.lower()]
                        elif k != last:
                            columns[k] += "_" + td.text.lower()
                # Get data
                else:
                    for l, td in enumerate(cells):
                        values = raw_data.setdefault(columns[l], [])

                        if l == 0:
                            values.append("%s-%s" % (date, td.text))
                        elif l != last:
                            values.append(_parse_measurement(td.text))
                        else:
                            values.append(td.text.replace("\n", "").replace("\t", ""))

    return pd.DataFrame(raw_data)


def _parse_measurement(text):
    """Convert the text of a numeric table cell to ``float``.

    A cell made up only of dashes marks a missing observation and is stored as
    ``0.0`` (the value this notebook has always used for missing data). Unlike
    a blanket ``replace("-", "0")``, a leading minus sign is kept, so negative
    readings are not turned into positive ones.
    """
    text = text.replace("\n", "").strip()
    if text and not text.strip("-"):
        return 0.0
    return float(text)

## Cleaning the Data

Split the `date` column into separate `year`, `month` and `day` columns.

In [4]:
def clean_data(df):
    """Replace the ``date`` column with ``year``, ``month`` and ``day`` columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``date`` column that is either a datetime column or
        strings of the form ``year-month-day`` (``/`` is also accepted as the
        separator).

    Returns
    -------
    pandas.DataFrame
        A new frame whose first three columns are the integer ``year``,
        ``month`` and ``day``, followed by every other column of ``df``.
        ``df`` itself is left unmodified.

    Raises
    ------
    ValueError
        If a string date does not split into exactly three parts.
    """
    dates = df["date"]

    if pd.api.types.is_datetime64_any_dtype(dates):
        # Already parsed (e.g. by pd.to_datetime): read the parts directly.
        new_date = pd.DataFrame({
            "year": dates.dt.year,
            "month": dates.dt.month,
            "day": dates.dt.day,
        })
    else:
        # The scraper joins the parts with "-", so split on that (and on "/").
        new_date = dates.astype(str).str.split("[-/]", expand=True)
        if len(new_date) and new_date.shape[1] != 3:
            raise ValueError(
                "expected dates of the form year-month-day, got %d part(s)"
                % new_date.shape[1]
            )
        # Guarantees the three columns exist even for an empty frame.
        new_date = new_date.reindex(columns=range(3))
        new_date.columns = ["year", "month", "day"]
        new_date = new_date.astype(int)

    # drop() returns a copy, so the caller's frame keeps its date column.
    return pd.concat([new_date, df.drop("date", axis=1)], axis=1)

We have not yet decided whether it is better to analyze the data on a yearly basis in this notebook and store the result here, or to do this in the next notebooks if/when we need it.

In [5]:
# Monthly-history page of the KSBP (San Luis Obispo) station; the two %d
# placeholders are filled with the year and the month number.
url_template = (
    "https://www.wunderground.com/history/airport/KSBP/%d/%d/1/MonthlyHistory.html"
    "?req_city=San Luis Obispo&req_state=CA&req_statename=California"
    "&reqdb.zip=93405&reqdb.magic=1&reqdb.wmo=99999&MR=1"
)

dfs = []

for year in range(2012, 2018):
    # 2017 is scraped for January-June only; earlier years are scraped in full.
    if year == 2017:
        months = range(1, 7)
    else:
        months = range(1, 13)

    for month in months:
        df = get_data(url_template % (year, month))
        dfs.append(df)

        # Short pause between two consecutive requests.
        time.sleep(0.1)

# Stack the monthly frames into one frame with a fresh 0..n-1 index.
data = pd.concat(dfs).reset_index(drop=True)

In [6]:
# Parse the scraped "year-month-day" strings into datetime values.
data["date"] = pd.to_datetime(data["date"], format="%Y-%m-%d")

# Keep only the days before June 12th, 2017 (aka before finals week).
finals_week_start = datetime.datetime.strptime("2017-6-12", "%Y-%m-%d")
is_before_finals = data["date"] < finals_week_start
data = data[is_before_finals]
data

Unnamed: 0,date,dew_point_f_avg,dew_point_f_high,dew_point_f_low,events,humidity_%_avg,humidity_%_high,humidity_%_low,precip_in_sum,sea_level_press_in_avg,...,sea_level_press_in_low,temp_f_avg,temp_f_high,temp_f_low,visibility_mi_avg,visibility_mi_high,visibility_mi_low,wind_gust_mph_high,wind_mph_avg,wind_mph_high
0,2012-01-01,44.0,50.0,34.0,Fog,80.0,100.0,25.0,0.00,30.15,...,30.08,56.0,73.0,39.0,6.0,10.0,0.0,0.0,1.0,8.0
1,2012-01-02,47.0,52.0,43.0,Fog,93.0,100.0,63.0,0.00,30.23,...,30.19,52.0,63.0,42.0,4.0,10.0,0.0,0.0,3.0,14.0
2,2012-01-03,43.0,50.0,37.0,Fog,85.0,100.0,32.0,0.01,30.24,...,30.17,58.0,77.0,39.0,6.0,10.0,0.0,0.0,2.0,10.0
3,2012-01-04,42.0,47.0,37.0,,69.0,96.0,33.0,0.00,30.24,...,30.20,56.0,73.0,39.0,10.0,10.0,8.0,0.0,1.0,9.0
4,2012-01-05,42.0,51.0,36.0,,66.0,93.0,23.0,0.00,30.15,...,30.09,60.0,78.0,42.0,10.0,10.0,7.0,22.0,4.0,18.0
5,2012-01-06,48.0,52.0,41.0,,84.0,96.0,62.0,0.01,30.07,...,30.01,54.0,61.0,46.0,7.0,10.0,4.0,25.0,9.0,21.0
6,2012-01-07,35.0,40.0,29.0,,39.0,57.0,26.0,0.00,30.07,...,30.02,62.0,71.0,52.0,10.0,10.0,10.0,26.0,7.0,15.0
7,2012-01-08,32.0,41.0,23.0,,49.0,73.0,16.0,0.00,30.16,...,30.10,56.0,72.0,39.0,10.0,10.0,10.0,0.0,2.0,7.0
8,2012-01-09,31.0,36.0,22.0,,52.0,76.0,15.0,0.00,30.18,...,30.15,54.0,73.0,36.0,10.0,10.0,10.0,0.0,2.0,7.0
9,2012-01-10,35.0,48.0,28.0,,69.0,89.0,30.0,0.00,30.13,...,30.09,50.0,66.0,33.0,10.0,10.0,10.0,0.0,2.0,10.0


In [7]:
# Write the filtered weather history to CSV; index=False keeps the row index
# out of the file.
# NOTE(review): assumes a `data/` directory already exists next to the notebook — confirm.
data.to_csv("data/slo_weather_history.csv", index=False)