# Webscraping NYC Weather Data from WeatherUnderground

<b> Import Required Libraries </b>

In [38]:
import pandas as pd
from bs4 import BeautifulSoup
from datetime import date, timedelta
import time
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

<b> Initialize the list with all the dates that need to be scraped from website </b>

In [39]:
# Date window to scrape: sdate is included, edate is excluded
# (the range below stops one day before edate).
sdate = date(2021,11,15)
edate = date(2023,12,1)

# One timestamp per calendar day. 'D' is the documented daily frequency alias;
# the lowercase spelling is deprecated in recent pandas releases.
dates_dt = pd.date_range(sdate, edate - timedelta(days=1), freq='D')

# ISO 'YYYY-MM-DD' strings, the form appended to the history URL further down.
dates = dates_dt.strftime('%Y-%m-%d').tolist()

<b> A function to create an empty dictionary with the required fields </b>

In [40]:
def get_dict():
    """Return a new weather record with every field set to an empty string.

    The keys, in order, are: date, time, temperature, dew_point, humidity,
    wind, wind_speed, wind_gust, pressure, precip, condition. A fresh dict is
    built on every call, so callers can fill it in without affecting others.
    """
    field_names = (
        'date',
        'time',
        'temperature',
        'dew_point',
        'humidity',
        'wind',
        'wind_speed',
        'wind_gust',
        'pressure',
        'precip',
        'condition',
    )
    return dict.fromkeys(field_names, '')

<b> Initializing the Selenium Web Driver </b>

In [41]:
# Chrome launch flags: run without a visible window and with the GPU disabled.
options = Options()
for chrome_flag in ('--headless', '--disable-gpu'):
    options.add_argument(chrome_flag)

# ChromeDriverManager().install() supplies the chromedriver path handed to the Service.
service = Service(executable_path=ChromeDriverManager().install())


In [42]:
# Single Chrome session shared by the scraping code below; get_weather() reads
# this module-level `driver` for every page it loads.
driver = webdriver.Chrome(service= service, options=options)

<h3><b> Scraping using Beautiful Soup </b></h3>


**Beautiful Soup**
1. It is a Python library that is used for web scraping purposes to scrape the data out of HTML and XML files.
2. It creates a parse tree from page source code that can be used to extract data in a hierarchical and more readable manner.

**Procedure:-**

1. Scrape HTML Content from the weblink using Selenium Driver
2. Parse HTML Code with Beautiful Soup
3. With the help of Beautiful Soup find the required table using HTML tags 
4. Extract Text from the table
5. Save it in a temporary dictionary initialized with above defined function
6. Add the dictionary to a list and return list


In [44]:
def get_weather(url, date):
    """Scrape one day's observation table from a history page.

    Loads ``url`` in the module-level Selenium ``driver``, waits a fixed five
    seconds, parses the page source with BeautifulSoup and reads the first
    ``<table>`` whose class attribute is exactly
    ``'mat-table cdk-table mat-sort ng-star-inserted'``.

    Parameters
    ----------
    url : str
        Address of the page to load.
    date : str
        Value stored unchanged in the ``'date'`` field of every returned row.
        (Inside this function the name shadows ``datetime.date``.)

    Returns
    -------
    list of dict
        One dict per table row, shaped like ``get_dict()``, in page order.
        The list is empty when the table is not found. Reading stops at the
        first row that lacks one of the ten expected cells or a ``<span>``
        inside a cell; rows collected before that point are still returned.
    """
    # Reading a module-level name does not need a `global` declaration.
    driver.get(url)
    time.sleep(5)  # fixed pause before reading the page source
    soup = BeautifulSoup(driver.page_source, "html.parser")
    table = soup.find("table", {'class':'mat-table cdk-table mat-sort ng-star-inserted'})

    one_day_list = []
    if table is None:
        # Table not present in the page source: report "no rows" explicitly
        # instead of relying on an AttributeError being swallowed.
        return one_day_list

    # Record fields in the same order as the table cells (cell 0 -> 'time', ...).
    columns = ('time', 'temperature', 'dew_point', 'humidity', 'wind',
               'wind_speed', 'wind_gust', 'pressure', 'precip', 'condition')

    # The first <tr> is skipped; remaining rows are read in order.
    for tr in table.find_all('tr')[1:]:
        tds = tr.find_all('td')
        dict_weather = get_dict()
        dict_weather['date'] = date
        try:
            for position, field in enumerate(columns):
                dict_weather[field] = tds[position].find('span').get_text()
        except (IndexError, AttributeError):
            # IndexError: fewer cells than expected; AttributeError: a cell
            # without a <span>. Stop here, as before, but let unrelated errors
            # (including KeyboardInterrupt) propagate instead of hiding them.
            break
        one_day_list.append(dict_weather)

    return one_day_list

<b> Steps for getting weather for each date and saving it in a list </b>

In [45]:
# Scrape every day in `dates` and collect all rows in one flat list.
final_list = []
for day_number, d in enumerate(dates, start=1):
    print(day_number)  # progress: 1-based position of the day being fetched
    url = 'https://www.wunderground.com/history/daily/us/ny/new-york-city/KLGA/date/'+d
    day_rows = get_weather(url, d)
    if len(day_rows) < 12:
        # Fewer than 12 rows is treated as a failed or partial load: try once
        # more, but keep whichever attempt returned more rows so a worse retry
        # can no longer discard data that was already scraped.
        retry_rows = get_weather(url, d)
        if len(retry_rows) > len(day_rows):
            day_rows = retry_rows
    final_list.extend(day_rows)
    print(d)  # printed after the day's rows have been stored

1
2021-11-15
2
2021-11-16
3
2021-11-17
4
2021-11-18
5
2021-11-19
6
2021-11-20
7
2021-11-21
8
2021-11-22
9
2021-11-23
10
2021-11-24
11
2021-11-25
12
2021-11-26
13
2021-11-27
14
2021-11-28
15
2021-11-29
16
2021-11-30
17
2021-12-01
18
2021-12-02
19
2021-12-03
20
2021-12-04
21
2021-12-05
22
2021-12-06
23
2021-12-07
24
2021-12-08
25
2021-12-09
26
2021-12-10
27
2021-12-11
28
2021-12-12
29
2021-12-13
30
2021-12-14
31
2021-12-15
32
2021-12-16
33
2021-12-17
34
2021-12-18
35
2021-12-19
36
2021-12-20
37
2021-12-21
38
2021-12-22
39
2021-12-23
40
2021-12-24
41
2021-12-25
42
2021-12-26
43
2021-12-27
44
2021-12-28
45
2021-12-29
46
2021-12-30
47
2021-12-31
48
2022-01-01
49
2022-01-02
50
2022-01-03
51
2022-01-04
52
2022-01-05
53
2022-01-06
54
2022-01-07
55
2022-01-08
56
2022-01-09
57
2022-01-10
58
2022-01-11
59
2022-01-12
60
2022-01-13
61
2022-01-14
62
2022-01-15
63
2022-01-16
64
2022-01-17
65
2022-01-18
66
2022-01-19
67
2022-01-20
68
2022-01-21
69
2022-01-22
70
2022-01-23
71
2022-01-24
72
2022-01-25
7

<b> Converting the list of dictionary to a dataframe </b>

In [47]:
# One DataFrame row per scraped record; the dict keys become the columns.
df_weather_missing = pd.DataFrame(final_list)

In [48]:
# Show the scraped frame as the cell output for a visual sanity check.
df_weather_missing

Unnamed: 0,date,time,temperature,dew_point,humidity,wind,wind_speed,wind_gust,pressure,precip,condition
0,2021-11-15,12:51 AM,45 °F,34 °F,65 °%,WSW,10 °mph,0 °mph,29.76 °in,0.0 °in,Cloudy
1,2021-11-15,1:51 AM,44 °F,34 °F,68 °%,W,3 °mph,0 °mph,29.75 °in,0.0 °in,Cloudy
2,2021-11-15,2:51 AM,43 °F,34 °F,71 °%,WSW,5 °mph,0 °mph,29.74 °in,0.0 °in,Mostly Cloudy
3,2021-11-15,3:51 AM,43 °F,34 °F,71 °%,SW,7 °mph,0 °mph,29.72 °in,0.0 °in,Fair
4,2021-11-15,4:51 AM,42 °F,35 °F,76 °%,SW,6 °mph,0 °mph,29.73 °in,0.0 °in,Partly Cloudy
...,...,...,...,...,...,...,...,...,...,...,...
20741,2023-11-30,7:51 PM,49 °F,27 °F,43 °%,SW,8 °mph,0 °mph,30.11 °in,0.0 °in,Mostly Cloudy
20742,2023-11-30,8:51 PM,48 °F,28 °F,46 °%,SSW,8 °mph,0 °mph,30.11 °in,0.0 °in,Mostly Cloudy
20743,2023-11-30,9:51 PM,47 °F,29 °F,50 °%,SW,9 °mph,0 °mph,30.12 °in,0.0 °in,Mostly Cloudy
20744,2023-11-30,10:51 PM,47 °F,28 °F,48 °%,SW,7 °mph,17 °mph,30.13 °in,0.0 °in,Partly Cloudy


<b> Saving the dataframe to a CSV file so that it can be preprocessed and joined with the collision dataset </b>

In [49]:
# mode='a' appends to weather_data_nov2021_2023.csv instead of overwriting it,
# so running this cell again adds the same rows a second time.
# index=False omits the row index; header=None is falsy, so no column-name row
# is written.
df_weather_missing.to_csv('weather_data_nov2021_2023.csv', mode='a', index = False, header=None)

In [50]:
# Shut down the Selenium driver now that scraping is finished.
driver.quit()