# ![](https://ga-dash.s3.amazonaws.com/production/assets/logo-9f88ae6c9c3871690e33280fcf557f33.png)  Project 4 - Prediction of Dengue Cases

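We first need to import three datasets:
1. Weather Data – daily weather records, downloaded as monthly CSV files from weather.gov.sg
2. Google Data – Google Trends search interest for "dengue" in Singapore, retrieved with `pytrends`
3. Dengue Data – weekly counts of dengue and dengue haemorrhagic fever (DHF) cases, read from a local CSV file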

In this notebook, we import all three datasets and save each one as a raw pickle file; data cleaning starts in the next notebook.

In [42]:
# importing libraries

import requests
import os

import pandas as pd

import chardet
from io import BytesIO

from pytrends.request import TrendReq

import pickle

## 1. Weather Data

In [16]:
# Download one CSV of daily weather data per month (1980-01 through 2023-12)
# from the S24 file series and save each file unchanged under `output_directory`.
base_url = "http://www.weather.gov.sg/files/dailydata/DAILYDATA_S24_{:04d}{:02d}.csv"
output_directory = "../data/weather_data"
timeout_seconds = 30  # give up on a stalled request instead of hanging the whole loop

# Create the output directory if it doesn't exist (no-op when it already does)
os.makedirs(output_directory, exist_ok=True)

for year in range(1980, 2024):
    for month in range(1, 13):
        url = base_url.format(year, month)
        filename = f"DAILYDATA_S24_{year}{month:02d}.csv"
        filepath = os.path.join(output_directory, filename)

        try:
            response = requests.get(url, timeout=timeout_seconds)
        except requests.RequestException as error:
            # A network-level failure for one month must not abort the
            # remaining downloads; report it like any other failed month.
            print(f"Failed to download: {filename} ({error})")
            continue

        if response.status_code == 200:
            # Write the raw bytes so the file on disk matches the server copy.
            with open(filepath, 'wb') as file:
                file.write(response.content)
            print(f"Downloaded: {filename}")
        else:
            # Months with no published file (e.g. not yet available) end up here.
            print(f"Failed to download: {filename}")

Downloaded: DAILYDATA_S24_198001.csv
Downloaded: DAILYDATA_S24_198002.csv
Downloaded: DAILYDATA_S24_198003.csv
Downloaded: DAILYDATA_S24_198004.csv
Downloaded: DAILYDATA_S24_198005.csv
Downloaded: DAILYDATA_S24_198006.csv
Downloaded: DAILYDATA_S24_198007.csv
Downloaded: DAILYDATA_S24_198008.csv
Downloaded: DAILYDATA_S24_198009.csv
Downloaded: DAILYDATA_S24_198010.csv
Downloaded: DAILYDATA_S24_198011.csv
Downloaded: DAILYDATA_S24_198012.csv
Downloaded: DAILYDATA_S24_198101.csv
Downloaded: DAILYDATA_S24_198102.csv
Downloaded: DAILYDATA_S24_198103.csv
Downloaded: DAILYDATA_S24_198104.csv
Downloaded: DAILYDATA_S24_198105.csv
Downloaded: DAILYDATA_S24_198106.csv
Downloaded: DAILYDATA_S24_198107.csv
Downloaded: DAILYDATA_S24_198108.csv
Downloaded: DAILYDATA_S24_198109.csv
Downloaded: DAILYDATA_S24_198110.csv
Downloaded: DAILYDATA_S24_198111.csv
Downloaded: DAILYDATA_S24_198112.csv
Downloaded: DAILYDATA_S24_198201.csv
Downloaded: DAILYDATA_S24_198202.csv
Downloaded: DAILYDATA_S24_198203.csv
D

In [46]:
# Fetch every monthly S24 CSV (1980-01 through 2023-12), parse each response
# directly into a DataFrame, and combine all months into one master CSV file.
base_url = "http://www.weather.gov.sg/files/dailydata/DAILYDATA_S24_{:04d}{:02d}.csv"
output_directory = "../data/weather_data"
timeout_seconds = 30  # give up on a stalled request instead of hanging the whole loop

os.makedirs(output_directory, exist_ok=True)

csv_files = []  # Names of the monthly files that were fetched successfully
master_dfs = []  # One DataFrame per successfully fetched month

for year in range(1980, 2024):
    for month in range(1, 13):
        url = base_url.format(year, month)
        try:
            response = requests.get(url, timeout=timeout_seconds)
        except requests.RequestException as error:
            # Keep going: one unreachable month should not discard the rest.
            print(f"Failed to download: {year}-{month:02d} ({error})")
            continue

        if response.status_code == 200:
            file_name = f"DAILYDATA_S24_{year}{month:02d}.csv"
            csv_files.append(file_name)

            # Detect the encoding per file rather than assuming one for all months
            encoding = chardet.detect(response.content)['encoding']

            # Parse the downloaded bytes in memory (nothing is written per month here)
            month_df = pd.read_csv(BytesIO(response.content), encoding=encoding)
            master_dfs.append(month_df)
            print(f"Downloaded and appended: {year}-{month:02d}")
        else:
            print(f"Failed to download: {year}-{month:02d}")

# pd.concat raises an opaque error on an empty list; fail with a clear message instead.
if not master_dfs:
    raise ValueError("No monthly weather files could be downloaded; nothing to concatenate.")

# Concatenate all individual DataFrames into a single master DataFrame.
# NOTE(review): the monthly headers are not identical -- the saved output of this
# notebook shows both "Highest 30 Min Rainfall (mm)" and "Highest 30 min Rainfall (mm)"
# (likewise for 60/120) -- so concat keeps them as separate, partly empty columns.
# Left untouched here because cleaning is done in the next notebook.
master_df = pd.concat(master_dfs, ignore_index=True)

# Save the concatenated DataFrame to the master CSV file inside the output directory
master_csv_path = os.path.join(output_directory, "master_weather_data.csv")
master_df.to_csv(master_csv_path, index=False)

print("Master CSV file saved.")

Downloaded and appended: 1980-01
Downloaded and appended: 1980-02
Downloaded and appended: 1980-03
Downloaded and appended: 1980-04
Downloaded and appended: 1980-05
Downloaded and appended: 1980-06
Downloaded and appended: 1980-07
Downloaded and appended: 1980-08
Downloaded and appended: 1980-09
Downloaded and appended: 1980-10
Downloaded and appended: 1980-11
Downloaded and appended: 1980-12
Downloaded and appended: 1981-01
Downloaded and appended: 1981-02
Downloaded and appended: 1981-03
Downloaded and appended: 1981-04
Downloaded and appended: 1981-05
Downloaded and appended: 1981-06
Downloaded and appended: 1981-07
Downloaded and appended: 1981-08
Downloaded and appended: 1981-09
Downloaded and appended: 1981-10
Downloaded and appended: 1981-11
Downloaded and appended: 1981-12
Downloaded and appended: 1982-01
Downloaded and appended: 1982-02
Downloaded and appended: 1982-03
Downloaded and appended: 1982-04
Downloaded and appended: 1982-05
Downloaded and appended: 1982-06
Downloaded

In [47]:
# Load the combined weather CSV written by the previous cell back into a DataFrame
weather = pd.read_csv('../data/weather_data/master_weather_data.csv')

In [48]:
# Display the combined weather DataFrame as the cell output
weather

Unnamed: 0,Station,Year,Month,Day,Daily Rainfall Total (mm),Highest 30 Min Rainfall (mm),Highest 60 Min Rainfall (mm),Highest 120 Min Rainfall (mm),Mean Temperature (°C),Maximum Temperature (°C),Minimum Temperature (°C),Mean Wind Speed (km/h),Max Wind Speed (km/h),Highest 30 min Rainfall (mm),Highest 60 min Rainfall (mm),Highest 120 min Rainfall (mm)
0,Changi,1980,1,1,0.0,—,—,—,—,—,—,—,—,,,
1,Changi,1980,1,2,0.0,—,—,—,—,—,—,—,—,,,
2,Changi,1980,1,3,0.0,—,—,—,—,—,—,—,—,,,
3,Changi,1980,1,4,0.0,—,—,—,—,—,—,—,—,,,
4,Changi,1980,1,5,8.0,—,—,—,—,—,—,—,—,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15913,Changi,2023,7,27,0.0,,,,30.0,32.4,28.4,14.7,33.3,0.0,0.0,0.0
15914,Changi,2023,7,28,0.2,,,,30.0,32.6,28.4,14.0,31.5,0.2,0.2,0.2
15915,Changi,2023,7,29,0.0,,,,29.8,32.4,28.4,20.9,42.6,0.0,0.0,0.0
15916,Changi,2023,7,30,0.0,,,,29.7,32.8,28.1,20.2,46.3,0.0,0.0,0.0


In [49]:
# Number of missing (NaN) entries in each weather column
weather.isna().sum()

Station                              0
Year                                 0
Month                                0
Day                                  0
Daily Rainfall Total (mm)            0
Highest 30 Min Rainfall (mm)      1217
Highest 60 Min Rainfall (mm)      1217
Highest 120 Min Rainfall (mm)     1217
Mean Temperature (°C)                0
Maximum Temperature (°C)             0
Minimum Temperature (°C)             0
Mean Wind Speed (km/h)               0
Max Wind Speed (km/h)                0
Highest 30 min Rainfall (mm)     14701
Highest 60 min Rainfall (mm)     14701
Highest 120 min Rainfall (mm)    14701
dtype: int64

In [50]:
# Persist the raw weather DataFrame to disk as a pickle file
weather_pickle_path = '../data/00_Weather_Raw.pickle'
with open(weather_pickle_path, 'wb') as pickle_file:
    pickle.dump(weather, pickle_file)

## 2. Google Data

In [32]:
# Set up the pytrends object
# NOTE(review): `tz` is the timezone offset (in minutes) sent to Google Trends;
# 360 may have been carried over from a library example rather than chosen for
# Singapore -- confirm the intended value before relying on date boundaries.
pytrends = TrendReq(hl='en-US', tz=360)

# Define search term and geographical location
search_term = 'dengue'
geo_location = 'SG'

# Build the payload
pytrends.build_payload([search_term], cat=0, timeframe='all', geo=geo_location, gprop='')

# Get the search interest over time. With timeframe='all' the rows come back
# one per month (see the preview of the saved CSV further down), not per week.
interest_data = pytrends.interest_over_time()

# Make sure the target folder exists; to_csv does not create missing directories
os.makedirs('../data/google_data', exist_ok=True)

# Save the data to a CSV file (the date index is kept as the first column)
interest_data.to_csv('../data/google_data/dengue_search_interest.csv', index=True)

In [33]:
# Load the search-interest CSV saved by the previous cell back into a DataFrame
google = pd.read_csv('../data/google_data/dengue_search_interest.csv')

In [34]:
# Display the search-interest DataFrame as the cell output
google

Unnamed: 0,date,dengue,isPartial
0,2004-01-01,0,False
1,2004-02-01,20,False
2,2004-03-01,0,False
3,2004-04-01,14,False
4,2004-05-01,6,False
...,...,...,...
231,2023-04-01,10,False
232,2023-05-01,10,False
233,2023-06-01,12,False
234,2023-07-01,12,False


In [35]:
# Count the rows for each value of the `isPartial` column
google['isPartial'].value_counts()

False    235
True       1
Name: isPartial, dtype: int64

In [37]:
# List the distinct values that occur in the `dengue` column
google['dengue'].unique()

array([  0,  20,  14,   6,  12,   8,  16,  10,  15,  23,  17,  19,  22,
        30, 100,  62,  18,  11,  13,   7,   4,  33,  43,  21,   9,   5,
        35,  38,  64,  34,  27,  24,  32,  31,  25,  29,  26,  46,  57,
        41,  39])

In [44]:
# Persist the raw Google search-interest DataFrame to disk as a pickle file
google_pickle_path = '../data/00_Google_Raw.pickle'
with open(google_pickle_path, 'wb') as pickle_file:
    pickle.dump(google, pickle_file)

## 3. Dengue Data

In [39]:
# Load the weekly dengue / dengue haemorrhagic fever case counts from the local CSV file
dengue = pd.read_csv('../data/dengue_data/WeeklyNumberofDengueandDengueHaemorrhagicFeverCases.csv')

In [40]:
# Display the dengue case DataFrame as the cell output
dengue

Unnamed: 0,year,eweek,type_dengue,number
0,2014,1,Dengue,436.0
1,2014,1,DHF,1.0
2,2014,2,Dengue,479.0
3,2014,2,DHF,0.0
4,2014,3,Dengue,401.0
...,...,...,...,...
525,2018,51,DHF,1.0
526,2018,52,Dengue,160.0
527,2018,52,DHF,0.0
528,2018,53,Dengue,


In [45]:
# Persist the raw dengue case DataFrame to disk as a pickle file
dengue_pickle_path = '../data/00_Dengue_Raw.pickle'
with open(dengue_pickle_path, 'wb') as pickle_file:
    pickle.dump(dengue, pickle_file)