# 1. Import Packages and Read Files

In [56]:
#Import packages
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

from sklearn.feature_extraction.text import CountVectorizer

# this setting widens how many characters pandas will display in a column:
pd.options.display.max_colwidth = 400

In [57]:
#Read file as stations
stations = pd.read_csv('stations.csv')

In [58]:
# Convert all columns to lowercase and replace spaces in column names.
stations.columns = stations.columns.str.lower().str.replace(' ', '_')

In [59]:
# Check for missing values.
stations.isnull().sum()

unnamed:_0      0
station_code    0
station_name    0
longitude       0
latitude        0
dtype: int64

In [60]:
#Check data in stations
stations.head()

Unnamed: 0,unnamed:_0,station_code,station_name,longitude,latitude
0,0,S06,Paya Lebar,103.903733,1.357133
1,1,S07,Macritchie Reservoir,103.8339,1.3418
2,2,S08,Lower Peirce Reservoir,103.827067,1.370033
3,3,S101,Jurong (North),103.7134,1.350533
4,4,S102,Semakau Island,103.765717,1.190167


In [61]:
#drop unnamed column
stations.drop(['unnamed:_0'], axis=1, inplace=True)

In [62]:
#Check data again
stations.head()

Unnamed: 0,station_code,station_name,longitude,latitude
0,S06,Paya Lebar,103.903733,1.357133
1,S07,Macritchie Reservoir,103.8339,1.3418
2,S08,Lower Peirce Reservoir,103.827067,1.370033
3,S101,Jurong (North),103.7134,1.350533
4,S102,Semakau Island,103.765717,1.190167


In [63]:
#Check shape in mss
stations.shape

(63, 4)

In [64]:
#Read file as mss
mss = pd.read_csv('mss_data.csv')

In [65]:
# Convert all columns to lowercase and replace spaces in column names.
mss.columns = mss.columns.str.lower().str.replace(' ', '_')

In [66]:
#Check data in mss
mss.head()

Unnamed: 0,unnamed:_0,station,year,month,day,daily_rainfall_total,highest_30_min_rainfall,highest_60_min_rainfall,highest_120_min_rainfall,mean_temperature,maximum_temperature,minimum_temperature,mean_wind_speed,max_wind_speed
0,0,Paya Lebar,2009.0,1.0,1.0,0.0,,,,,32.7,24.0,8.4,
1,1,Paya Lebar,2009.0,1.0,2.0,0.5,,,,,30.5,24.0,12.3,
2,2,Paya Lebar,2009.0,1.0,3.0,0.0,,,,,30.1,25.0,13.5,
3,3,Paya Lebar,2009.0,1.0,4.0,1.6,,,,,30.2,24.6,13.0,
4,4,Paya Lebar,2009.0,1.0,5.0,2.5,,,,,32.3,25.2,15.4,


In [67]:
#drop unnamed column
mss.drop(['unnamed:_0'], axis=1, inplace=True)

In [55]:
# Check for missing values.
mss.isnull().sum()

station                          0
year                           394
month                          394
day                            394
daily_rainfall_total          8929
highest_30_min_rainfall     118965
highest_60_min_rainfall     119064
highest_120_min_rainfall    119061
mean_temperature            197072
maximum_temperature         189395
minimum_temperature         189437
mean_wind_speed             189406
max_wind_speed              190440
dtype: int64

In [14]:
#Check data in mss
mss.head()

Unnamed: 0,Station,Year,Month,Day,Daily Rainfall Total,Highest 30 Min Rainfall,Highest 60 Min Rainfall,Highest 120 Min Rainfall,Mean Temperature,Maximum Temperature,Minimum Temperature,Mean Wind Speed,Max Wind Speed
0,Paya Lebar,2009.0,1.0,1.0,0.0,,,,,32.7,24.0,8.4,
1,Paya Lebar,2009.0,1.0,2.0,0.5,,,,,30.5,24.0,12.3,
2,Paya Lebar,2009.0,1.0,3.0,0.0,,,,,30.1,25.0,13.5,
3,Paya Lebar,2009.0,1.0,4.0,1.6,,,,,30.2,24.6,13.0,
4,Paya Lebar,2009.0,1.0,5.0,2.5,,,,,32.3,25.2,15.4,


In [16]:
#Check shape in mss
mss.shape

(283907, 13)

In [20]:
#Check column header for station
for col in stations.columns:
    print(col)

station_code
station_name
longitude
latitude


In [21]:
#Check column header for mss
for col in mss.columns:
    print(col)

Station
Year
Month
Day
Daily Rainfall Total
Highest 30 Min Rainfall
Highest 60 Min Rainfall
Highest 120 Min Rainfall
Mean Temperature
Maximum Temperature
Minimum Temperature
Mean Wind Speed
Max Wind Speed


In [29]:
#To check if station name (in stations) and Station (in mss) can be used to map
print(stations['station_name'].unique()), print(mss['Station'].unique()) 
print(stations['station_name'].unique().shape), print(mss['Station'].unique().shape) 

['Paya Lebar' 'Macritchie Reservoir' 'Lower Peirce Reservoir'
 'Jurong (North)' 'Semakau Island' 'Admiralty' 'Admiralty West'
 'Pulau Ubin' 'East Coast Parkway' 'Marina Barrage' 'Ang Mo Kio'
 'Choa Chu Kang (West)' 'Serangoon North' 'Newton' 'Lim Chu Kang'
 'Marine Parade' 'Choa Chu Kang (Central)' 'Tuas South' 'Pasir Panjang'
 'Jurong Island' 'Dhoby Ghaut' 'Nicoll Highway' 'Botanic Garden'
 'Choa Chu Kang (South)' 'Khatib' 'Whampoa' 'Tengah' 'Changi' 'Seletar'
 'Pasir Ris (West)' 'Kampong Bahru' 'Jurong Pier' 'Ulu Pandan' 'Serangoon'
 'Jurong (East)' 'Mandai' 'Tai Seng' 'Jurong (West)' 'Upper Thomson'
 'Clementi' 'Buangkok' 'Sentosa Island' 'Chai Chee' 'Boon Lay (West)'
 'Bukit Panjang' 'Kranji Reservoir' 'Upper Peirce Reservoir' 'Kent Ridge'
 'Tanjong Pagar' 'Queenstown' 'Tanjong Katong' 'Somerset (Road)'
 'Sembawang' 'Punggol' 'Tuas West' 'Simei' 'Boon Lay (East)' 'Toa Payoh'
 'Tuas' 'Bukit Timah' 'Yishun' 'Buona Vista' 'Pasir Ris (Central)']
['Paya Lebar' 'Macritchie Reservoir' 'Lo

(None, None)

In [32]:
Combined_1 = stations.merge(mss, left_on='station_name', right_on='Station')
Combined_1.head()

Unnamed: 0,station_code,station_name,longitude,latitude,Station,Year,Month,Day,Daily Rainfall Total,Highest 30 Min Rainfall,Highest 60 Min Rainfall,Highest 120 Min Rainfall,Mean Temperature,Maximum Temperature,Minimum Temperature,Mean Wind Speed,Max Wind Speed
0,S06,Paya Lebar,103.903733,1.357133,Paya Lebar,2009.0,1.0,1.0,0.0,,,,,32.7,24.0,8.4,
1,S06,Paya Lebar,103.903733,1.357133,Paya Lebar,2009.0,1.0,2.0,0.5,,,,,30.5,24.0,12.3,
2,S06,Paya Lebar,103.903733,1.357133,Paya Lebar,2009.0,1.0,3.0,0.0,,,,,30.1,25.0,13.5,
3,S06,Paya Lebar,103.903733,1.357133,Paya Lebar,2009.0,1.0,4.0,1.6,,,,,30.2,24.6,13.0,
4,S06,Paya Lebar,103.903733,1.357133,Paya Lebar,2009.0,1.0,5.0,2.5,,,,,32.3,25.2,15.4,


In [33]:
Combined_1.shape

(283907, 17)