<a href="https://colab.research.google.com/github/eniolaoduntan/Web-Scraping/blob/main/Web_Scraping_and_Preprocessing_Weather_Data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [41]:
from bs4 import BeautifulSoup #for parsing html
import requests
import csv
import pandas as pd
import matplotlib.pyplot as plt #for visualisation

In [2]:
#AccuWeather daily-forecast page to scrape, stored in a variable for the request made further down
url = "https://www.accuweather.com/en/us/deer-park/11729/daily-weather-forecast/2102238"

In [6]:
#Some sites refuse requests that carry the default python-requests User-Agent,
#so a custom User-Agent header is sent instead.
#timeout stops the cell from hanging indefinitely if the server never answers.
response = requests.get(url, headers={'user-agent': 'hss'}, timeout=30)
#Fail here on 4xx/5xx responses rather than silently parsing an error page,
#which would only show up later as empty or mismatched lists.
response.raise_for_status()

In [None]:
#Peek at the start of the raw payload only; the full page is far too large to read in cell output
response.content[:1000]

In [8]:
#Parse the downloaded HTML with BeautifulSoup's built-in "html.parser" backend
#and keep the parsed tree in the variable "soup"
soup = BeautifulSoup(response.content, features="html.parser")

In [None]:
#Print only the beginning of the indented HTML; printing the whole page floods the notebook.
#Raise the slice limit (or use the browser inspector) when hunting for a specific element.
print(soup.prettify()[:3000])

In [10]:
#Collect every <span> whose class attribute is "module-header sub date"
#(found by inspecting the date element in the browser) into the variable "date"
date = soup.find_all('span', attrs={'class': "module-header sub date"})

In [12]:
#extract date (text values only) with a list comprehension
#Reads from soup rather than from the existing "date" variable so the cell can be re-run:
#once "date" holds plain strings, asking each item for .text would raise AttributeError
date = [tag.text for tag in soup.find_all('span', class_ = "module-header sub date")]

In [13]:
#Display the extracted date strings to check the scrape visually
date

['7/17',
 '7/18',
 '7/19',
 '7/20',
 '7/21',
 '7/22',
 '7/23',
 '7/24',
 '7/25',
 '7/26',
 '7/27',
 '7/28',
 '7/29',
 '7/30',
 '7/31',
 '8/1',
 '8/2',
 '8/3',
 '8/4',
 '8/5',
 '8/6',
 '8/7',
 '8/8',
 '8/9',
 '8/10',
 '8/11',
 '8/12',
 '8/13',
 '8/14',
 '8/15',
 '8/16',
 '8/17',
 '8/18',
 '8/19',
 '8/20',
 '8/21',
 '8/22',
 '8/23',
 '8/24',
 '8/25',
 '8/26',
 '8/27',
 '8/28',
 '8/29',
 '8/30']

In [14]:
#Collect the <span class="high"> elements, which hold the daily high temperature
#(found by inspecting and expanding the temperature element)
high_temp = soup.find_all('span', attrs={'class': "high"})

In [15]:
#extract high temperature (text values only) with a list comprehension
#Reads from soup rather than from the existing "high_temp" variable so the cell can be re-run:
#once "high_temp" holds plain strings, asking each item for .text would raise AttributeError
high_temp = [tag.text for tag in soup.find_all('span', class_ = "high")]

In [None]:
#Display the extracted high-temperature strings to check the scrape visually
high_temp

In [17]:
#Collect the <span class="low"> elements, which hold the daily low temperature
#(found by inspecting and expanding the temperature element)
low_temp = soup.find_all('span', attrs={'class': "low"})

In [18]:
#extract low temperature (text values only) with a list comprehension
#Reads from soup rather than from the existing "low_temp" variable so the cell can be re-run:
#once "low_temp" holds plain strings, asking each item for .text would raise AttributeError
low_temp = [tag.text for tag in soup.find_all('span', class_ = "low")]

In [None]:
#Display the extracted low-temperature strings to check the scrape visually
low_temp

In [20]:
#Collect the precipitation ("water drop") elements: <div> tags carrying the class "precip"
waterdrop = soup.find_all('div', attrs={'class': "precip"})

In [21]:
#Extract text values
#Reads from soup rather than from the existing "waterdrop" variable so the cell can be re-run:
#once "waterdrop" holds plain strings, asking each item for .text would raise AttributeError.
#The surrounding newlines/tabs are kept on purpose; they are removed in the cleaning step.
waterdrop = [tag.text for tag in soup.find_all('div', class_ = "precip")]

In [22]:
#Display the raw precipitation strings; they still carry newlines, tabs and a trailing %
waterdrop

['\n\n\t\t87%\n\t',
 '\n\n\t\t65%\n\t',
 '\n\n\t\t4%\n\t',
 '\n\n\t\t13%\n\t',
 '\n\n\t\t25%\n\t',
 '\n\n\t\t25%\n\t',
 '\n\n\t\t59%\n\t',
 '\n\n\t\t59%\n\t',
 '\n\n\t\t61%\n\t',
 '\n\n\t\t63%\n\t',
 '\n\n\t\t1%\n\t',
 '\n\n\t\t3%\n\t',
 '\n\n\t\t68%\n\t',
 '\n\n\t\t10%\n\t',
 '\n\n\t\t20%\n\t',
 '\n\n\t\t25%\n\t',
 '\n\n\t\t25%\n\t',
 '\n\n\t\t20%\n\t',
 '\n\n\t\t57%\n\t',
 '\n\n\t\t62%\n\t',
 '\n\n\t\t5%\n\t',
 '\n\n\t\t5%\n\t',
 '\n\n\t\t20%\n\t',
 '\n\n\t\t39%\n\t',
 '\n\n\t\t25%\n\t',
 '\n\n\t\t25%\n\t',
 '\n\n\t\t25%\n\t',
 '\n\n\t\t10%\n\t',
 '\n\n\t\t20%\n\t',
 '\n\n\t\t25%\n\t',
 '\n\n\t\t57%\n\t',
 '\n\n\t\t55%\n\t',
 '\n\n\t\t40%\n\t',
 '\n\n\t\t65%\n\t',
 '\n\n\t\t59%\n\t',
 '\n\n\t\t59%\n\t',
 '\n\n\t\t69%\n\t',
 '\n\n\t\t10%\n\t',
 '\n\n\t\t20%\n\t',
 '\n\n\t\t59%\n\t',
 '\n\n\t\t55%\n\t',
 '\n\n\t\t25%\n\t',
 '\n\n\t\t25%\n\t',
 '\n\n\t\t8%\n\t',
 '\n\n\t\t67%\n\t']

In [24]:
#Assemble the four scraped lists into a single pandas DataFrame called "data",
#one column per list (all four lists must have the same length)
data = pd.DataFrame(
    dict(
        date=date,
        high_temp=high_temp,
        low_temp=low_temp,
        waterdrop=waterdrop,
    )
)

In [25]:
#Summarise column dtypes and non-null counts before any cleaning is applied
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45 entries, 0 to 44
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   date       45 non-null     object
 1   high_temp  45 non-null     object
 2   low_temp   45 non-null     object
 3   waterdrop  45 non-null     object
dtypes: object(4)
memory usage: 1.5+ KB


In [26]:
#Summary of each column; on string columns describe() reports count/unique/top/freq
data.describe()

Unnamed: 0,date,high_temp,low_temp,waterdrop
count,45,45,45,45
unique,45,12,11,22
top,7/17,85°,/69°,\n\n\t\t25%\n\t
freq,1,9,9,10


In [29]:
#Clean data
#Convert the date column from 'M/D' strings to a real datetime type.
#The page omits the year, so it is supplied here; update FORECAST_YEAR when re-scraping.
FORECAST_YEAR = 2024

#Guard: if the column is already datetime (cell re-run), leave it alone instead of
#failing on datetime + string concatenation.
if not pd.api.types.is_datetime64_any_dtype(data['date']):
    #An explicit format avoids month/day ambiguity and format-inference warnings
    parsed = pd.to_datetime(data['date'] + f'/{FORECAST_YEAR}', format='%m/%d/%Y')
    #A forecast window that starts late in the year runs into January: any row dated
    #earlier than a preceding row has wrapped, so shift it into the following year.
    wrapped = parsed < parsed.cummax()
    data['date'] = parsed.mask(wrapped, parsed + pd.DateOffset(years=1))

In [33]:
#High Temp Column
#Remove the degree symbol and make the data type numeric.
#astype(str) keeps the cell re-runnable: the .str accessor raises on a column that is already numeric.
#regex=False makes the replacement a literal match regardless of pandas version.
data['high_temp'] = pd.to_numeric(
    data['high_temp'].astype(str).str.replace('°', '', regex=False)
)

In [35]:
#Display the high_temp column to confirm the conversion to a numeric dtype
data['high_temp']

0     90
1     85
2     85
3     84
4     86
5     85
6     80
7     77
8     79
9     81
10    81
11    84
12    85
13    88
14    85
15    84
16    85
17    85
18    84
19    80
20    84
21    85
22    85
23    84
24    81
25    80
26    79
27    80
28    82
29    81
30    83
31    82
32    83
33    83
34    81
35    79
36    77
37    76
38    76
39    77
40    77
41    79
42    82
43    76
44    77
Name: high_temp, dtype: int64

In [34]:
#Low Temp Column
#Remove the degree symbol and the leading '/' separator, then make the data type numeric.
#astype(str) keeps the cell re-runnable: the .str accessor raises on a column that is already numeric.
#regex=False makes both replacements literal matches regardless of pandas version.
data['low_temp'] = pd.to_numeric(
    data['low_temp']
    .astype(str)
    .str.replace('°', '', regex=False)
    .str.replace('/', '', regex=False)
)

In [36]:
#Display the low_temp column to confirm the conversion to a numeric dtype
data['low_temp']

0     74
1     68
2     69
3     69
4     70
5     72
6     71
7     70
8     71
9     70
10    71
11    72
12    73
13    68
14    72
15    71
16    72
17    72
18    71
19    71
20    72
21    72
22    71
23    71
24    70
25    69
26    68
27    69
28    69
29    70
30    70
31    69
32    70
33    70
34    69
35    70
36    68
37    64
38    62
39    68
40    69
41    69
42    68
43    61
44    63
Name: low_temp, dtype: int64

In [37]:
#Water Drop column
#Preview the cleaned values: strip the surrounding newlines/tabs, then the trailing %.
#Stripping whitespace generically (rather than matching the exact "\n\n\t\t" run) still works
#if the page's indentation changes; astype(str) lets the preview run after the column is numeric.
data['waterdrop'].astype(str).str.strip().str.rstrip('%')

0     87
1     65
2      4
3     13
4     25
5     25
6     59
7     59
8     61
9     63
10     1
11     3
12    68
13    10
14    20
15    25
16    25
17    20
18    57
19    62
20     5
21     5
22    20
23    39
24    25
25    25
26    25
27    10
28    20
29    25
30    57
31    55
32    40
33    65
34    59
35    59
36    69
37    10
38    20
39    59
40    55
41    25
42    25
43     8
44    67
Name: waterdrop, dtype: object

In [38]:
#Change column to numeric and pass back to water drop column
#Strip surrounding whitespace and the trailing % instead of matching exact newline/tab runs,
#so a change in the page's indentation does not break the conversion.
#astype(str) keeps the cell re-runnable once the column is already numeric.
data['waterdrop'] = pd.to_numeric(
    data['waterdrop'].astype(str).str.strip().str.rstrip('%')
)

In [40]:
#Check the cleaned dataframe: date should now be a datetime and the other three columns numeric
data

Unnamed: 0,date,high_temp,low_temp,waterdrop
0,2024-07-17,90,74,87
1,2024-07-18,85,68,65
2,2024-07-19,85,69,4
3,2024-07-20,84,69,13
4,2024-07-21,86,70,25
5,2024-07-22,85,72,25
6,2024-07-23,80,71,59
7,2024-07-24,77,70,59
8,2024-07-25,79,71,61
9,2024-07-26,81,70,63
