In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import os
import operator
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras
from numpy import unique
from numpy import reshape
from sklearn.metrics import accuracy_score
from keras.models import Sequential
from keras.layers import Conv1D, Conv2D, Dense, BatchNormalization, Flatten, MaxPooling1D, Dropout, LSTM
from keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import warnings
warnings.filterwarnings("ignore")

In [2]:
path = r"C:\Users\suhas\Documents\careerfoundry\ml\datasets"
df_weather = pd.read_csv(os.path.join(path, "weather-processed.csv"))
df_pleasant = pd.read_csv(os.path.join(path, "pleasant-weather.csv"))
# data heads
df_weather.head()

Unnamed: 0,DATE,MONTH,BASEL_cloud_cover,BASEL_wind_speed,BASEL_humidity,BASEL_pressure,BASEL_global_radiation,BASEL_precipitation,BASEL_snow_depth,BASEL_sunshine,...,VALENTIA_cloud_cover,VALENTIA_humidity,VALENTIA_pressure,VALENTIA_global_radiation,VALENTIA_precipitation,VALENTIA_snow_depth,VALENTIA_sunshine,VALENTIA_temp_mean,VALENTIA_temp_min,VALENTIA_temp_max
0,19600101,1,7,2.1,0.85,1.018,0.32,0.09,0,0.7,...,5,0.88,1.0003,0.45,0.34,0,4.7,8.5,6.0,10.9
1,19600102,1,6,2.1,0.84,1.018,0.36,1.05,0,1.1,...,7,0.91,1.0007,0.25,0.84,0,0.7,8.9,5.6,12.1
2,19600103,1,8,2.1,0.9,1.018,0.18,0.3,0,0.0,...,7,0.91,1.0096,0.17,0.08,0,0.1,10.5,8.1,12.9
3,19600104,1,3,2.1,0.92,1.018,0.58,0.0,0,4.1,...,7,0.86,1.0184,0.13,0.98,0,0.0,7.4,7.3,10.6
4,19600105,1,6,2.1,0.95,1.018,0.65,0.14,0,5.4,...,3,0.8,1.0328,0.46,0.0,0,5.7,5.7,3.0,8.4


In [3]:
df_weather.shape

(22950, 170)

In [4]:
# Remove stations without data in pleasant weather
df_weather = df_weather.drop(['GDANSK_cloud_cover', 'GDANSK_humidity', 'GDANSK_precipitation', 'GDANSK_snow_depth', 'GDANSK_temp_mean', 'GDANSK_temp_min', 'GDANSK_temp_max',
                        'ROMA_cloud_cover', 'ROMA_wind_speed', 'ROMA_humidity', 'ROMA_pressure', 'ROMA_sunshine', 'ROMA_temp_mean',
                        'TOURS_wind_speed', 'TOURS_humidity', 'TOURS_pressure', 'TOURS_global_radiation', 'TOURS_precipitation', 'TOURS_temp_mean', 'TOURS_temp_min', 'TOURS_temp_max'], axis=1)
# Drop date,month column since its unnecessary

In [5]:
#df_pleasant.shape

In [6]:
# Check for null values
#df_weather.isnull().sum()
#df_pleasant.isnull().sum()

In [7]:
# Get observation types
observations = ['cloud_cover', 'wind_speed', 'humidity', 'pressure', 'global_radiation', 'precipitation', 'snow_depth', 'sunshine', 'temp_mean', 'temp_min', 'temp_max']

In [8]:
# Create a dictionary to store station count
station_counts = {}

for obs in observations:
    # Select columns related to observation
    columns = [col for col in df_weather.columns if col.endswith(obs)]
    station_counts[obs] = len(columns)

# Print the count of stations for each observation type
print("Number of stations:")
for obs, count in station_counts.items():
    print(f"{obs}: {count} stations")


Number of stations:
cloud_cover: 14 stations
wind_speed: 9 stations
humidity: 14 stations
pressure: 14 stations
global_radiation: 15 stations
precipitation: 15 stations
snow_depth: 6 stations
sunshine: 15 stations
temp_mean: 15 stations
temp_min: 15 stations
temp_max: 15 stations


In [9]:
# Remove snow depth and windspeed due to lacking data
dropped_cols = [col for col in df_weather.columns if '_wind_speed' in col or '_snow_depth' in col]
df_weather = df_weather.drop(dropped_cols, axis=1)

In [10]:
df_weather.shape

(22950, 134)

In [11]:
# Locate missing entry columns
all_columns = df_weather.columns.tolist()
all_columns = [col for col in all_columns] 

# get unique values
weather_stations = set()

# print said stations
for col in all_columns:
    station_name = col.split('_')[0]  # Split the column name at the underscore and take the first part
    weather_stations.add(station_name)
print(weather_stations)

{'KASSEL', 'MADRID', 'MONTH', 'MUNCHENB', 'BASEL', 'BELGRADE', 'DEBILT', 'LJUBLJANA', 'SONNBLICK', 'DATE', 'OSLO', 'STOCKHOLM', 'BUDAPEST', 'VALENTIA', 'DUSSELDORF', 'MAASTRICHT', 'HEATHROW'}


In [12]:
# Find stations missing observations
observations = ['cloud_cover', 'humidity', 'pressure']
missing_stations_by_observation = {}

for obs in observations:
    # Select columns related to the current observation
    columns = [col for col in df_weather.columns if col.endswith(obs)]
    # Extract station names by removing the observation type 
    station_names = set([col.replace(f'_{obs}', '') for col in columns])
    # Identify stations that are in all_stations but missing
    missing_stations = weather_stations - station_names
    # Store the missing station names in dict
    missing_stations_by_observation[obs] = missing_stations

for obs, missing_stations in missing_stations_by_observation.items():
    print(f"\nStations missing from {obs}:")
    if missing_stations:
        for station in missing_stations:
            print(station)
    else:
        print("None")


Stations missing from cloud_cover:
MONTH
DATE
KASSEL

Stations missing from humidity:
MONTH
STOCKHOLM
DATE

Stations missing from pressure:
MONTH
DATE
MUNCHENB


In [13]:
# Fill in data using closest stations
df_weather.columns.get_loc('HEATHROW_temp_max')
# Find the position for stockholm insertion
df_weather.columns.get_loc('STOCKHOLM_cloud_cover') 
# Munchenb pressure
df_weather.columns.get_loc('MUNCHENB_cloud_cover')

# insert new data
df_weather.insert(56,'KASSEL_cloud_cover', df_weather['DUSSELDORF_cloud_cover'])
df_weather.insert(119, 'STOCKHOLM_humidity', df_weather['OSLO_humidity'])
df_weather.insert(94,'MUNCHENB_pressure',df_weather['BASEL_pressure'])


In [14]:
df_weather.head()

Unnamed: 0,DATE,MONTH,BASEL_cloud_cover,BASEL_humidity,BASEL_pressure,BASEL_global_radiation,BASEL_precipitation,BASEL_sunshine,BASEL_temp_mean,BASEL_temp_min,...,STOCKHOLM_temp_max,VALENTIA_cloud_cover,VALENTIA_humidity,VALENTIA_pressure,VALENTIA_global_radiation,VALENTIA_precipitation,VALENTIA_sunshine,VALENTIA_temp_mean,VALENTIA_temp_min,VALENTIA_temp_max
0,19600101,1,7,0.85,1.018,0.32,0.09,0.7,6.5,0.8,...,4.9,5,0.88,1.0003,0.45,0.34,4.7,8.5,6.0,10.9
1,19600102,1,6,0.84,1.018,0.36,1.05,1.1,6.1,3.3,...,5.0,7,0.91,1.0007,0.25,0.84,0.7,8.9,5.6,12.1
2,19600103,1,8,0.9,1.018,0.18,0.3,0.0,8.5,5.1,...,4.1,7,0.91,1.0096,0.17,0.08,0.1,10.5,8.1,12.9
3,19600104,1,3,0.92,1.018,0.58,0.0,4.1,6.3,3.8,...,2.3,7,0.86,1.0184,0.13,0.98,0.0,7.4,7.3,10.6
4,19600105,1,6,0.95,1.018,0.65,0.14,5.4,3.0,-0.7,...,4.3,3,0.8,1.0328,0.46,0.0,5.7,5.7,3.0,8.4


In [15]:
df_weather.shape

(22950, 137)

In [16]:
# drop unneeded column from df_pleasant
df_pleasant.drop(columns = 'DATE', inplace = True)


In [17]:
df_pleasant.shape

(22950, 15)

In [18]:
# Export cleaned dataset
df_weather.to_csv(os.path.join(path, 'weather-cleaned.csv'), index=False)

In [19]:
# Reload new dataset we just created
X = pd.read_csv(os.path.join(path,'weather-cleaned.csv'), index_col=False)
y = df_pleasant

# Turn X and y into np arrays
X = np.array(X)
y = np.array(y)

# reshape X
X = X.reshape(-1,15, 9)


In [20]:
X.shape

(23290, 15, 9)

In [21]:
y.shape

(22950, 15)

In [22]:
# Split data
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state = 42)

ValueError: Found input variables with inconsistent numbers of samples: [23290, 22950]

In [None]:
# First CNN
epochs = 20
batch_size = 16
n_hidden = 32

timesteps = len(X_train[0])
input_dim = len(X_train[0][0])
n_classes = len(y_train[0])

model = Sequential()
model.add(Conv1D(n_hidden, kernel_size=2, activation='relu', input_shape=(timesteps, input_dim)))
model.add(Dense(16, activation='relu'))
model.add(MaxPooling1D())
model.add(Flatten())
model.add(Dense(n_classes, activation='softmax')) # Function options

In [None]:
model.summary()

In [None]:
# compile and fit
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs, verbose=2)

In [None]:
# Station names
stations = {
0: 'BASEL',
1: 'BELGRADE',
2: 'BUDAPEST',
3: 'DEBILT',
4: 'DUSSELDORF',
5: 'HEATHROW',
6: 'KASSEL',
7: 'LJUBLJANA',
8: 'MAASTRICHT',
9: 'MADRID',
10: 'MUNCHENB',
11: 'OSLO',
12: 'SONNBLICK',
13: 'STOCKHOLM',
14: 'VALENTIA'

}

In [None]:
# Confusion matrix
def confusion_matrix(y_true, y_pred):
    y_true = pd.Series([stations[y] for y in np.argmax(y_true, axis=1)])
    y_pred = pd.Series([stations[y] for y in np.argmax(y_pred, axis=1)])

    return pd.crosstab(y_true, y_pred, rownames=['True'], colnames=['Pred'])
print(confusion_matrix(y_test, model.predict(X_test)))

In [None]:
# Tanh attempt due to poor results
epochs = 20
batch_size = 16
n_hidden = 128

timesteps = len(X_train[0])
input_dim = len(X_train[0][0])
n_classes = len(y_train[0])

model = Sequential()
model.add(Conv1D(n_hidden, kernel_size=2, activation='relu', input_shape=(timesteps, input_dim)))
model.add(Dense(16, activation='relu'))
model.add(MaxPooling1D())
model.add(Flatten())
model.add(Dense(n_classes, activation='tanh')) # Options: sigmoid, tanh, softmax, relu

In [None]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs, verbose=2)

In [None]:
print(confusion_matrix(y_test, model.predict(X_test)))

In [None]:
epochs = 16
batch_size = 4
n_hidden = 4

timesteps = len(X_train[0])
input_dim = len(X_train[0][0])
n_classes = len(y_train[0])

model = Sequential()
model.add(Conv1D(n_hidden, kernel_size=2, activation='relu', input_shape=(timesteps, input_dim)))
model.add(Dense(16, activation='relu'))
model.add(MaxPooling1D())
model.add(Flatten())
model.add(Dense(n_classes, activation='relu')) # Options: sigmoid, tanh, softmax, relu

In [None]:
model.summary()

In [None]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs, verbose=2)

In [None]:
print(confusion_matrix(y_test, model.predict(X_test)))

In [None]:
print("Model Accuracy: ", accuracy_score(y_test, y_pred))