# Compile Temperature Data from 2018-2020

Notebook used to visit each folders in "Raw Temperature Data" and to append each datasets together to form a complete list of temperatures recorded islandwide between 2018 and 2020

In [71]:
import os
import re
import pandas as pd
import numpy as np

In [72]:
# Raw Temperature Data Folder
data_directory = os.path.join(os.path.dirname(os.getcwd()), "data", "Raw Temperature Data")

In [73]:
# Concat DataFrames together

df_output = pd.DataFrame()

for folder in os.listdir(data_directory):
    if folder == ".DS_Store":
            continue

    folder_directory = os.path.join(data_directory, folder)
    print(folder)
    print("-"*20)
    count = 0

    for file in os.listdir(folder_directory):
        if file == ".DS_Store":
            continue
        print(file)
        count += 1
        filepath = os.path.join(folder_directory, file)        
        df_monthly = pd.read_csv(filepath, encoding="UTF-8", encoding_errors='ignore')
        raw_col_names = list(df_monthly.columns)
        new_col_names = {c: re.sub("\s+", "_", c.replace("°", "").strip().lower()) for c in raw_col_names}
        df_monthly.rename(new_col_names, inplace=True, axis=1)
        df_output = pd.concat([df_output, df_monthly])

    print(count)
    print("\n")

Ang Mo Kio
--------------------
DAILYDATA_S109_201908.csv
DAILYDATA_S109_201909.csv
DAILYDATA_S109_202009.csv
DAILYDATA_S109_202008.csv
DAILYDATA_S109_201808.csv
DAILYDATA_S109_201809.csv
DAILYDATA_S109_201802.csv
DAILYDATA_S109_201803.csv
DAILYDATA_S109_201801.csv
DAILYDATA_S109_201810.csv
DAILYDATA_S109_201804.csv
DAILYDATA_S109_201805.csv
DAILYDATA_S109_201811.csv
DAILYDATA_S109_201807.csv
DAILYDATA_S109_201812.csv
DAILYDATA_S109_201806.csv
DAILYDATA_S109_202005.csv
DAILYDATA_S109_202011.csv
DAILYDATA_S109_201901.csv
DAILYDATA_S109_202010.csv
DAILYDATA_S109_202004.csv
DAILYDATA_S109_202012.csv
DAILYDATA_S109_202006.csv
DAILYDATA_S109_201902.csv
DAILYDATA_S109_201903.csv
DAILYDATA_S109_202007.csv
DAILYDATA_S109_202003.csv
DAILYDATA_S109_201907.csv
DAILYDATA_S109_201912.csv
DAILYDATA_S109_201906.csv
DAILYDATA_S109_202002.csv
DAILYDATA_S109_201910.csv
DAILYDATA_S109_201904.csv
DAILYDATA_S109_201905.csv
DAILYDATA_S109_201911.csv
DAILYDATA_S109_202001.csv
36


Tengah
--------------------

### Inspect Output DataFrame

In [74]:
df_output.sample(5)

Unnamed: 0,station,year,month,day,daily_rainfall_total_(mm),highest_30_min_rainfall_(mm),highest_60_min_rainfall_(mm),highest_120_min_rainfall_(mm),mean_temperature_(c),maximum_temperature_(c),minimum_temperature_(c),mean_wind_speed_(km/h),max_wind_speed_(km/h)
25,Jurong Island,2020,7,26,0.2,0.2,0.2,0.2,28.3,31.3,26.8,11.9,51.1
25,Sembawang,2019,2,26,0.0,,,,,32.8,25.1,17.3,38.9
14,Tengah,2020,1,15,0.0,0.0,0.0,0.0,,32.4,25.0,13.3,37.1
4,Jurong Island,2018,2,5,0.0,0.0,0.0,0.0,27.7,31.4,25.6,7.6,38.5
13,Ang Mo Kio,2020,12,14,41.6,25.0,30.4,36.2,26.2,32.0,22.9,4.7,33.9


#### Clean Values in Float Columns

In [75]:
def clean_float_value(x):
    if type(x) != float:
        if x == "-":
            return np.nan
        return float(x.strip())
    return x

float_col = [
    "daily_rainfall_total_(mm)",
    "highest_30_min_rainfall_(mm)",
    "highest_60_min_rainfall_(mm)",
    "highest_120_min_rainfall_(mm)",
    "mean_temperature_(c)",
    "maximum_temperature_(c)",
    "minimum_temperature_(c)",
    "mean_wind_speed_(km/h)",
    "max_wind_speed_(km/h)"
]

for col in float_col:
    df_output[col] = df_output[col].apply(
        lambda x: clean_float_value(x)
    )

In [70]:
for col in float_col:
    print("Quantiles:")
    print(df_output[col].quantile([0, 0.25, 0.5, 0.75, 1]))

    pct_missing = df_output["daily_rainfall_total_(mm)"].isnull().sum() / len(df_output) * 100
    print(f"Percentage of Rows Missing: {pct_missing:.2f}%")


Quantiles:
0.00      0.0
0.25      0.0
0.50      0.0
0.75      5.4
1.00    185.2
Name: daily_rainfall_total_(mm), dtype: float64
Percentage of Rows Missing: 2.33%
Quantiles:
0.00     0.0
0.25     0.0
0.50     0.0
0.75     3.4
1.00    64.8
Name: highest_30_min_rainfall_(mm), dtype: float64
Percentage of Rows Missing: 2.33%
Quantiles:
0.00      0.0
0.25      0.0
0.50      0.0
0.75      4.0
1.00    102.6
Name: highest_60_min_rainfall_(mm), dtype: float64
Percentage of Rows Missing: 2.33%
Quantiles:
0.00      0.0
0.25      0.0
0.50      0.0
0.75      4.6
1.00    122.8
Name: highest_120_min_rainfall_(mm), dtype: float64
Percentage of Rows Missing: 2.33%
Quantiles:
0.00    22.2
0.25    27.3
0.50    28.2
0.75    28.9
1.00    31.7
Name: mean_temperature_(c), dtype: float64
Percentage of Rows Missing: 2.33%
Quantiles:
0.00    22.8
0.25    31.0
0.50    32.1
0.75    33.1
1.00    38.0
Name: maximum_temperature_(c), dtype: float64
Percentage of Rows Missing: 2.33%
Quantiles:
0.00    20.5
0.25    24

### Export File

In [82]:
df_output

Unnamed: 0,station,year,month,day,daily_rainfall_total_(mm),highest_30_min_rainfall_(mm),highest_60_min_rainfall_(mm),highest_120_min_rainfall_(mm),mean_temperature_(c),maximum_temperature_(c),minimum_temperature_(c),mean_wind_speed_(km/h),max_wind_speed_(km/h)
0,Ang Mo Kio,2019,8,1,0.0,0.0,0.0,0.0,29.3,32.8,27.2,11.5,30.2
1,Ang Mo Kio,2019,8,2,0.0,0.0,0.0,0.0,29.3,32.7,27.3,13.3,34.9
2,Ang Mo Kio,2019,8,3,0.0,0.0,0.0,0.0,29.5,33.3,27.5,13.3,31.3
3,Ang Mo Kio,2019,8,4,0.0,0.0,0.0,0.0,29.0,31.3,27.4,12.2,31.7
4,Ang Mo Kio,2019,8,5,2.0,2.0,2.0,2.0,29.0,31.3,27.0,12.2,31.3
...,...,...,...,...,...,...,...,...,...,...,...,...,...
25,Choa Chu Kang (South),2019,9,26,0.0,0.0,0.0,0.0,28.2,33.1,24.5,5.4,24.5
26,Choa Chu Kang (South),2019,9,27,4.4,4.2,4.2,4.2,27.8,33.0,26.0,4.7,27.0
27,Choa Chu Kang (South),2019,9,28,0.2,0.2,0.2,0.2,28.1,32.9,24.7,5.0,23.4
28,Choa Chu Kang (South),2019,9,29,10.6,4.8,5.0,9.4,26.9,33.0,24.5,4.3,31.3


In [84]:
df_output.to_csv("../data/daily_temperature_2018_2020.csv", encoding="utf-8", index=False)