# Data Collection and Cleaning

In this notebook, we are scraping all the data we need for our project. 

In [1]:
import os
import re
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup

## Cleaning Up - Column Titles

In [2]:
def clean_title(title):
    """Turn a table header into a lowercase, underscore-separated name.

    The text is lowercased, every "(", ")", "." and "°" character is removed,
    and each remaining space is replaced by an underscore.

    Parameters
    ----------
    title : str
        Raw header text.

    Returns
    -------
    str
        The normalized title, e.g. "Wind Gust (mph)" -> "wind_gust_mph".
    """
    without_symbols = re.sub("[(.°)]", "", title.lower())
    return re.sub(" ", "_", without_symbols)

## Scraping the Data

Scrape the data from the website month-by-month and store it in a `DataFrame`.

In [3]:
def _to_number(text):
    """Convert the text of a numeric table cell to a float.

    A cell made up only of "-" characters is read as 0.0 (presumably the
    site's placeholder for a missing reading -- TODO confirm). Any other text
    is passed to ``float`` unchanged, so a leading minus sign is kept and
    negative readings stay negative.

    Raises
    ------
    ValueError
        If the cell is empty or is not a valid number.
    """
    cleaned = text.replace("\n", "").strip()
    if cleaned and set(cleaned) == {"-"}:
        return 0.0
    return float(cleaned)


def get_data(url):
    """Download a page and parse its ``obsTable`` table into a DataFrame.

    Column names are built from the table head (via ``clean_title``) joined by
    "_" to the lowercased subtitle cells found in the first ``tbody``. Headers
    1 to 6 each expand into three columns; the third column of header 6 is
    named from "Wind Gust (mph)".

    The first column is ``date``: each value is
    "<first header>/<first subtitle cell>/<first cell of the row>". The last
    column is kept as text with newlines and tabs removed. Every column in
    between is converted to ``float``.

    Parameters
    ----------
    url : str
        Address of the page to scrape.

    Returns
    -------
    pandas.DataFrame
        One row per data row of the table; empty if no ``obsTable`` is found.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    ValueError
        If a numeric cell cannot be converted to a float.
    """
    columns = []
    raw_data = {}
    date = ""

    # A timeout stops a stalled connection from blocking the notebook, and
    # raise_for_status reports HTTP errors here instead of letting an error
    # page be parsed as if it were data.
    req = requests.get(url, timeout=30)
    req.raise_for_status()
    soup = BeautifulSoup(req.text, "html.parser")

    # Go to the table containing the daily weather history & observations
    for table in soup.find_all("table", {"id": "obsTable"}):

        # Parse the head of the table
        for thead in table.find_all("thead"):
            for tr in thead.find_all("tr"):
                # Get column title
                for i, th in enumerate(tr.find_all("th")):
                    if i == 0:
                        # The first header cell seeds the date prefix.
                        date = clean_title(th.text)
                        columns.append(clean_title("Date"))
                    elif 1 <= i <= 6:
                        # These headers span three sub-columns each.
                        title = clean_title(th.text)
                        columns.extend([title, title])
                        if i == 6:
                            columns.append(clean_title("Wind Gust (mph)"))
                        else:
                            columns.append(title)
                    else:
                        columns.append(clean_title(th.text))

        # Parse the body of the table
        for j, tbody in enumerate(table.find_all("tbody")):
            for tr in tbody.find_all("tr"):
                cells = tr.find_all("td")
                last = len(cells) - 1

                # Get column subtitle
                if j == 0:
                    for k, td in enumerate(cells):
                        if k == 0:
                            date += "/" + td.text.lower()
                        elif k != last:
                            columns[k] += "_" + td.text.lower()
                # Get data
                else:
                    for l, td in enumerate(cells):
                        values = raw_data.setdefault(columns[l], [])

                        if l == 0:
                            values.append("%s/%s" % (date, td.text))
                        elif l != last:
                            values.append(_to_number(td.text))
                        else:
                            values.append(td.text.replace("\n", "").replace("\t", ""))

    return pd.DataFrame(raw_data)

## Cleaning the Data

Split the `date` column into separate `year`, `month` and `day` columns.

In [4]:
def clean_data(df):
    """Replace the ``date`` column with ``year``, ``month`` and ``day`` columns.

    Each ``date`` value is split on "/" and the first three parts become the
    ``year``, ``month`` and ``day`` columns, kept as strings. The input frame
    is left untouched, so calling this function again on the same frame gives
    the same result.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string ``date`` column whose parts are separated by "/".

    Returns
    -------
    pandas.DataFrame
        A new frame with the date parts first, followed by the remaining
        columns of ``df`` in their original order.
    """
    date_parts = (
        df['date']
        .str.split('/', expand=True)
        .rename(columns={0: 'year', 1: 'month', 2: 'day'})
    )

    # drop() without inplace returns a copy, so the caller's frame keeps 'date'.
    return pd.concat([date_parts, df.drop(columns='date')], axis=1)

**TODO:** We have not yet decided whether it is better to analyze the data on a yearly basis in this notebook and store the result in a notebook, or to do this in the next notebooks if/when we need it.

In [5]:
# Monthly-history page to scrape; the two %d slots are filled with year and month.
URL_TEMPLATE = "https://www.wunderground.com/history/airport/KSBP/%d/%d/1/MonthlyHistory.html?req_city=San Luis Obispo&req_state=CA&req_statename=California&reqdb.zip=93405&reqdb.magic=1&reqdb.wmo=99999&MR=1"
OUTPUT_DIR = "data"

# to_csv does not create missing directories, so make sure the target exists
# before the first file is written.
os.makedirs(OUTPUT_DIR, exist_ok=True)

for year in range(2013, 2018):
    # For 2017 only months 1-5 are fetched; every other year gets all twelve.
    months = range(1, 6) if year == 2017 else range(1, 13)

    for month in months:
        data = get_data(URL_TEMPLATE % (year, month))
        data = clean_data(data)

        # One CSV per month, named <year>_<month>.csv.
        data.to_csv("%s/%d_%d.csv" % (OUTPUT_DIR, year, month), index=False)

        # Short pause between consecutive requests.
        time.sleep(0.1)
    
#     dfs = []
#     for month in months:
#         dfs.append(pd.read_csv("data/%d_%d.csv" % (year, month)))
        
#     yearly_data = pd.concat(dfs)