In [1]:
import numpy as np
import pandas as pd
from datetime import datetime, timedelta
from collections import OrderedDict
import pickle
from bs4 import BeautifulSoup
import requests
import time, os
from selenium import webdriver
import random

In [2]:
#create list of desired months to scrape
# dates holds [first month, stop month] as "YYYY-MM"; the stop month itself is
# excluded, so ["2008-01", "2021-01"] yields 2008-1 through 2020-12.
dates = ["2008-01", "2021-01"]
start, end = [datetime.strptime(date_text, '%Y-%m') for date_text in dates]

# Count months directly instead of walking every day and formatting it with
# strftime("%Y-%-m"): the "%-m" flag is a platform extension that is not
# available everywhere (it fails on Windows).
total_months = (end.year - start.year) * 12 + (end.month - start.month)
first_month_index = start.year * 12 + (start.month - 1)

# Labels are "<year>-<month>" with no zero padding on the month, which is the
# form appended to the history URLs further down.
month_ordereddict = OrderedDict(
    ("{}-{}".format(month_index // 12, month_index % 12 + 1), None)
    for month_index in range(first_month_index, first_month_index + total_months)
).keys()
month_list = list(month_ordereddict)

In [3]:
# Weather data is scraped for five large cities in the state, each in a
# different geographical area. Every value is the URL path fragment for one
# city; the month to fetch is appended directly after it.
kenner, baton_rouge, lake_charles, alexandria, shreveport = (
    "KMSY/date/",  # kenner
    "KBTR/date/",  # baton_rouge
    "KLCH/date/",  # lake_charles
    "KAEX/date/",  # alexandria
    "KSHV/date/",  # shreveport
)

In [4]:
# All city URL fragments, in the order the cities are scraped.
cities = [
    kenner,
    baton_rouge,
    lake_charles,
    alexandria,
    shreveport,
]

In [5]:
# Common prefix of every monthly-history page URL; a city fragment and then a
# month label are appended to it to form the page address.
weather_site_base_url = "https://www.wunderground.com/history/monthly/"

In [6]:
# Prefix every city fragment with the site's base URL (one entry per city,
# same order as `cities`).
url_cities = [weather_site_base_url + city_fragment for city_fragment in cities]

In [7]:
# Full page URL for every (city, month) pair: all months of the first city,
# then all months of the second city, and so on.
urls = [
    city_prefix + month_label
    for city_prefix in url_cities
    for month_label in month_list
]

In [8]:
# Column names for the DataFrames built from the scraped rows: an ID column,
# then max/avg/min triples for five measurements, then total precipitation.
# NOTE(review): the "dew_humidity" names look like they are meant to describe
# humidity rather than dew point - confirm before renaming, since saved
# pickles carry these column names.
results_header_row_vals = (
    ["ID"]
    + ["max_temp_deg_f", "avg_temp_deg_f", "min_temp_deg_f"]
    + ["max_dew_point_deg_f", "avg_dew_point_deg_f", "min_dew_point_deg_f"]
    + ["max_dew_humidity_pct", "avg_dew_humidity_pct", "min_dew_humidity_pct"]
    + ["max_wind_speed_mph", "avg_wind_speed_mph", "min_wind_speed_mph"]
    + ["max_pressure_hg", "avg_pressure_hg", "min_pressure_hg"]
    + ["tot_precipitation_in"]
)

In [9]:
# Path to the ChromeDriver executable. It defaults to the location used on the
# original machine; set the CHROMEDRIVER_PATH environment variable before
# starting the kernel to point at a different install without editing the
# notebook.
chromedriver = os.environ.get("CHROMEDRIVER_PATH", "/Applications/chromedriver")

# Publish the chosen path under the "webdriver.chrome.driver" key as well.
os.environ["webdriver.chrome.driver"] = chromedriver

In [10]:
# Start the Chrome browser session that every scrape cell below drives; the
# chromedriver path is passed as the first positional argument.
# NOTE(review): passing the path positionally looks like the Selenium 3 calling
# convention; newer Selenium releases expect a Service object instead - confirm
# against the installed version.
driver = webdriver.Chrome(chromedriver)

In [12]:
#perform scrape for each city, scraped separately to avoid losing data if error raised
kenner_urls = [url for url in urls if kenner in url]

kenner_list = []         # one list of cell values per scraped table row
kenner_failed_urls = []  # pages that never yielded an observation table

# How many times a page is loaded before it is given up on.
max_fetch_attempts = 3

for url in kenner_urls:

    # The lookup chain below raises AttributeError whenever one of the expected
    # elements is missing from the page source (find() returned None), which
    # happens occasionally. Reload the page a bounded number of times and then
    # skip it, so that a single bad page cannot abort the whole run before
    # anything has been pickled.
    html_table_columns = []
    for attempt in range(max_fetch_attempts):
        driver.get(url)

        # random 0.5-2.5 s pause before the page source is read
        time.sleep(.5+2*random.random())

        soup = BeautifulSoup(driver.page_source, 'html.parser')

        try:
            # observation-table container -> table -> tbody -> first row ->
            # that row's direct <td> children (one per column group)
            html_table_columns = (
                soup.find(attrs={"class": "observation-table"})
                .find("table")
                .find("tbody")
                .find("tr")
                .find_all("td", recursive=False)
            )
        except AttributeError:
            html_table_columns = []

        if html_table_columns:
            break

    if not html_table_columns:
        kenner_failed_urls.append(url)
        print("No observation table found after {} attempts, skipping {}".format(max_fetch_attempts, url))
        continue

    # Row identifier built from the URL: station code and month joined by "_",
    # e.g. "KMSY_2008-1". Every row scraped from this page gets the same ID.
    row_id = '_'.join(url.rsplit('/', 3)[1::2])

    # Collect each column group's <tr> elements once instead of once per row.
    column_rows = [html_column.find_all("tr") for html_column in html_table_columns]
    num_rows = len(column_rows[0])

    # Index 0 of every column group is skipped (presumably its header row).
    for row_index in range(1, num_rows):

        new_row = []

        for html_column_rows in column_rows:

            html_current_column_cells = html_column_rows[row_index].find_all("td", recursive=False)

            for html_cell in html_current_column_cells:

                # The first cell of the row is replaced by the row ID.
                if not new_row:
                    new_row.append(row_id)
                    continue

                # Use the cell's first child as its value. An empty cell gives
                # "", and a child that is a tag rather than text contributes
                # its text, instead of raising.
                first_child = html_cell.contents[0] if html_cell.contents else ""
                if isinstance(first_child, str):
                    result_cell_content = first_child.strip()
                else:
                    result_cell_content = first_child.get_text(strip=True)

                new_row.append(result_cell_content)

        kenner_list.append(new_row)

kenner_scrape = pd.DataFrame(kenner_list, columns=results_header_row_vals)
kenner_scrape.to_pickle('kenner_scrape.pkl')

In [13]:
#perform scrape for each city, scraped separately to avoid losing data if error raised
baton_rouge_urls = [url for url in urls if baton_rouge in url]

baton_rouge_list = []         # one list of cell values per scraped table row
baton_rouge_failed_urls = []  # pages that never yielded an observation table

# How many times a page is loaded before it is given up on.
max_fetch_attempts = 3

for url in baton_rouge_urls:

    # The lookup chain below raises AttributeError whenever one of the expected
    # elements is missing from the page source (find() returned None), which
    # happens occasionally. Reload the page a bounded number of times and then
    # skip it, so that a single bad page cannot abort the whole run before
    # anything has been pickled.
    html_table_columns = []
    for attempt in range(max_fetch_attempts):
        driver.get(url)

        # random 0.5-2.5 s pause before the page source is read
        time.sleep(.5+2*random.random())

        soup = BeautifulSoup(driver.page_source, 'html.parser')

        try:
            # observation-table container -> table -> tbody -> first row ->
            # that row's direct <td> children (one per column group)
            html_table_columns = (
                soup.find(attrs={"class": "observation-table"})
                .find("table")
                .find("tbody")
                .find("tr")
                .find_all("td", recursive=False)
            )
        except AttributeError:
            html_table_columns = []

        if html_table_columns:
            break

    if not html_table_columns:
        baton_rouge_failed_urls.append(url)
        print("No observation table found after {} attempts, skipping {}".format(max_fetch_attempts, url))
        continue

    # Row identifier built from the URL: station code and month joined by "_",
    # e.g. "KBTR_2008-1". Every row scraped from this page gets the same ID.
    row_id = '_'.join(url.rsplit('/', 3)[1::2])

    # Collect each column group's <tr> elements once instead of once per row.
    column_rows = [html_column.find_all("tr") for html_column in html_table_columns]
    num_rows = len(column_rows[0])

    # Index 0 of every column group is skipped (presumably its header row).
    for row_index in range(1, num_rows):

        new_row = []

        for html_column_rows in column_rows:

            html_current_column_cells = html_column_rows[row_index].find_all("td", recursive=False)

            for html_cell in html_current_column_cells:

                # The first cell of the row is replaced by the row ID.
                if not new_row:
                    new_row.append(row_id)
                    continue

                # Use the cell's first child as its value. An empty cell gives
                # "", and a child that is a tag rather than text contributes
                # its text, instead of raising.
                first_child = html_cell.contents[0] if html_cell.contents else ""
                if isinstance(first_child, str):
                    result_cell_content = first_child.strip()
                else:
                    result_cell_content = first_child.get_text(strip=True)

                new_row.append(result_cell_content)

        baton_rouge_list.append(new_row)

baton_rouge_scrape = pd.DataFrame(baton_rouge_list, columns=results_header_row_vals)

baton_rouge_scrape.to_pickle('baton_rouge_scrape.pkl')

In [15]:
#perform scrape for each city, scraped separately to avoid losing data if error raised
lake_charles_urls = [url for url in urls if lake_charles in url]

lake_charles_list = []         # one list of cell values per scraped table row
lake_charles_failed_urls = []  # pages that never yielded an observation table

# How many times a page is loaded before it is given up on.
max_fetch_attempts = 3

for url in lake_charles_urls:

    # The lookup chain below raises AttributeError whenever one of the expected
    # elements is missing from the page source (find() returned None), which
    # happens occasionally. Reload the page a bounded number of times and then
    # skip it, so that a single bad page cannot abort the whole run before
    # anything has been pickled.
    html_table_columns = []
    for attempt in range(max_fetch_attempts):
        driver.get(url)

        # random 0.5-2.5 s pause before the page source is read
        time.sleep(.5+2*random.random())

        soup = BeautifulSoup(driver.page_source, 'html.parser')

        try:
            # observation-table container -> table -> tbody -> first row ->
            # that row's direct <td> children (one per column group)
            html_table_columns = (
                soup.find(attrs={"class": "observation-table"})
                .find("table")
                .find("tbody")
                .find("tr")
                .find_all("td", recursive=False)
            )
        except AttributeError:
            html_table_columns = []

        if html_table_columns:
            break

    if not html_table_columns:
        lake_charles_failed_urls.append(url)
        print("No observation table found after {} attempts, skipping {}".format(max_fetch_attempts, url))
        continue

    # Row identifier built from the URL: station code and month joined by "_",
    # e.g. "KLCH_2008-1". Every row scraped from this page gets the same ID.
    row_id = '_'.join(url.rsplit('/', 3)[1::2])

    # Collect each column group's <tr> elements once instead of once per row.
    column_rows = [html_column.find_all("tr") for html_column in html_table_columns]
    num_rows = len(column_rows[0])

    # Index 0 of every column group is skipped (presumably its header row).
    for row_index in range(1, num_rows):

        new_row = []

        for html_column_rows in column_rows:

            html_current_column_cells = html_column_rows[row_index].find_all("td", recursive=False)

            for html_cell in html_current_column_cells:

                # The first cell of the row is replaced by the row ID.
                if not new_row:
                    new_row.append(row_id)
                    continue

                # Use the cell's first child as its value. An empty cell gives
                # "", and a child that is a tag rather than text contributes
                # its text, instead of raising.
                first_child = html_cell.contents[0] if html_cell.contents else ""
                if isinstance(first_child, str):
                    result_cell_content = first_child.strip()
                else:
                    result_cell_content = first_child.get_text(strip=True)

                new_row.append(result_cell_content)

        lake_charles_list.append(new_row)

lake_charles_scrape = pd.DataFrame(lake_charles_list, columns=results_header_row_vals)

lake_charles_scrape.to_pickle('lake_charles_scrape.pkl')

In [12]:
#perform scrape for each city, scraped separately to avoid losing data if error raised
alexandria_urls = [url for url in urls if alexandria in url]

alexandria_list = []         # one list of cell values per scraped table row
alexandria_failed_urls = []  # pages that never yielded an observation table

# How many times a page is loaded before it is given up on.
max_fetch_attempts = 3

for url in alexandria_urls:

    # The lookup chain below raises AttributeError whenever one of the expected
    # elements is missing from the page source (find() returned None), which
    # happens occasionally. Reload the page a bounded number of times and then
    # skip it, so that a single bad page cannot abort the whole run before
    # anything has been pickled.
    html_table_columns = []
    for attempt in range(max_fetch_attempts):
        driver.get(url)

        # random 0.5-2.5 s pause before the page source is read
        time.sleep(.5+2*random.random())

        soup = BeautifulSoup(driver.page_source, 'html.parser')

        try:
            # observation-table container -> table -> tbody -> first row ->
            # that row's direct <td> children (one per column group)
            html_table_columns = (
                soup.find(attrs={"class": "observation-table"})
                .find("table")
                .find("tbody")
                .find("tr")
                .find_all("td", recursive=False)
            )
        except AttributeError:
            html_table_columns = []

        if html_table_columns:
            break

    if not html_table_columns:
        alexandria_failed_urls.append(url)
        print("No observation table found after {} attempts, skipping {}".format(max_fetch_attempts, url))
        continue

    # Row identifier built from the URL: station code and month joined by "_",
    # e.g. "KAEX_2008-1". Every row scraped from this page gets the same ID.
    row_id = '_'.join(url.rsplit('/', 3)[1::2])

    # Collect each column group's <tr> elements once instead of once per row.
    column_rows = [html_column.find_all("tr") for html_column in html_table_columns]
    num_rows = len(column_rows[0])

    # Index 0 of every column group is skipped (presumably its header row).
    for row_index in range(1, num_rows):

        new_row = []

        for html_column_rows in column_rows:

            html_current_column_cells = html_column_rows[row_index].find_all("td", recursive=False)

            for html_cell in html_current_column_cells:

                # The first cell of the row is replaced by the row ID.
                if not new_row:
                    new_row.append(row_id)
                    continue

                # Use the cell's first child as its value. An empty cell gives
                # "", and a child that is a tag rather than text contributes
                # its text, instead of raising.
                first_child = html_cell.contents[0] if html_cell.contents else ""
                if isinstance(first_child, str):
                    result_cell_content = first_child.strip()
                else:
                    result_cell_content = first_child.get_text(strip=True)

                new_row.append(result_cell_content)

        alexandria_list.append(new_row)

alexandria_scrape = pd.DataFrame(alexandria_list, columns=results_header_row_vals)

alexandria_scrape.to_pickle('alexandria_scrape.pkl')

In [13]:
#perform scrape for each city, scraped separately to avoid losing data if error raised
shreveport_urls = [url for url in urls if shreveport in url]

shreveport_list = []         # one list of cell values per scraped table row
shreveport_failed_urls = []  # pages that never yielded an observation table

# How many times a page is loaded before it is given up on.
max_fetch_attempts = 3

for url in shreveport_urls:

    # The lookup chain below raises AttributeError whenever one of the expected
    # elements is missing from the page source (find() returned None), which
    # happens occasionally. Reload the page a bounded number of times and then
    # skip it, so that a single bad page cannot abort the whole run before
    # anything has been pickled.
    html_table_columns = []
    for attempt in range(max_fetch_attempts):
        driver.get(url)

        # random 0.5-2.5 s pause before the page source is read
        time.sleep(.5+2*random.random())

        soup = BeautifulSoup(driver.page_source, 'html.parser')

        try:
            # observation-table container -> table -> tbody -> first row ->
            # that row's direct <td> children (one per column group)
            html_table_columns = (
                soup.find(attrs={"class": "observation-table"})
                .find("table")
                .find("tbody")
                .find("tr")
                .find_all("td", recursive=False)
            )
        except AttributeError:
            html_table_columns = []

        if html_table_columns:
            break

    if not html_table_columns:
        shreveport_failed_urls.append(url)
        print("No observation table found after {} attempts, skipping {}".format(max_fetch_attempts, url))
        continue

    # Row identifier built from the URL: station code and month joined by "_",
    # e.g. "KSHV_2008-1". Every row scraped from this page gets the same ID.
    row_id = '_'.join(url.rsplit('/', 3)[1::2])

    # Collect each column group's <tr> elements once instead of once per row.
    column_rows = [html_column.find_all("tr") for html_column in html_table_columns]
    num_rows = len(column_rows[0])

    # Index 0 of every column group is skipped (presumably its header row).
    for row_index in range(1, num_rows):

        new_row = []

        for html_column_rows in column_rows:

            html_current_column_cells = html_column_rows[row_index].find_all("td", recursive=False)

            for html_cell in html_current_column_cells:

                # The first cell of the row is replaced by the row ID.
                if not new_row:
                    new_row.append(row_id)
                    continue

                # Use the cell's first child as its value. An empty cell gives
                # "", and a child that is a tag rather than text contributes
                # its text, instead of raising.
                first_child = html_cell.contents[0] if html_cell.contents else ""
                if isinstance(first_child, str):
                    result_cell_content = first_child.strip()
                else:
                    result_cell_content = first_child.get_text(strip=True)

                new_row.append(result_cell_content)

        shreveport_list.append(new_row)

shreveport_scrape = pd.DataFrame(shreveport_list, columns=results_header_row_vals)

shreveport_scrape.to_pickle('shreveport_scrape.pkl')

In [14]:
# Shut down the browser session opened for scraping.
driver.quit()