# 1. Import Libraries & Data

In [2]:
# Import Libraries
import pandas as pd
import numpy as np
import os 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

In [3]:
# Location of the raw weather dataset (absolute path on the author's machine)
path = (
    '/Users/charlottelin/Documents/'
    '11-2025 ClimateWins Machine Learning Analysis/'
    '02 Data/Raw Data/'
    'Dataset-weather-prediction-dataset-processed.csv'
)

In [4]:
# Read the raw CSV at `path` into a DataFrame
df = pd.read_csv(filepath_or_buffer=path)

In [5]:
# Preview the first five rows to confirm the file loaded as expected
df.head(n=5)

Unnamed: 0,DATE,MONTH,BASEL_cloud_cover,BASEL_wind_speed,BASEL_humidity,BASEL_pressure,BASEL_global_radiation,BASEL_precipitation,BASEL_snow_depth,BASEL_sunshine,...,VALENTIA_cloud_cover,VALENTIA_humidity,VALENTIA_pressure,VALENTIA_global_radiation,VALENTIA_precipitation,VALENTIA_snow_depth,VALENTIA_sunshine,VALENTIA_temp_mean,VALENTIA_temp_min,VALENTIA_temp_max
0,19600101,1,7,2.1,0.85,1.018,0.32,0.09,0,0.7,...,5,0.88,1.0003,0.45,0.34,0,4.7,8.5,6.0,10.9
1,19600102,1,6,2.1,0.84,1.018,0.36,1.05,0,1.1,...,7,0.91,1.0007,0.25,0.84,0,0.7,8.9,5.6,12.1
2,19600103,1,8,2.1,0.9,1.018,0.18,0.3,0,0.0,...,7,0.91,1.0096,0.17,0.08,0,0.1,10.5,8.1,12.9
3,19600104,1,3,2.1,0.92,1.018,0.58,0.0,0,4.1,...,7,0.86,1.0184,0.13,0.98,0,0.0,7.4,7.3,10.6
4,19600105,1,6,2.1,0.95,1.018,0.65,0.14,0,5.4,...,3,0.8,1.0328,0.46,0.0,0,5.7,5.7,3.0,8.4


In [6]:
# Report the frame's dimensions as (row count, column count)
(len(df.index), len(df.columns))

(22950, 170)

In [7]:
# Add an "id" column (a copy of the current row index) for potential use.
# The guard makes this cell safe to re-run: an unconditional in-place
# reset_index + rename inserts a fresh 'index' column on every execution and
# renames it to 'id' again, leaving duplicate 'id' columns in the frame.
if 'id' not in df.columns:
    df = df.reset_index().rename(columns={'index': 'id'})
df.head()

Unnamed: 0,id,DATE,MONTH,BASEL_cloud_cover,BASEL_wind_speed,BASEL_humidity,BASEL_pressure,BASEL_global_radiation,BASEL_precipitation,BASEL_snow_depth,...,VALENTIA_cloud_cover,VALENTIA_humidity,VALENTIA_pressure,VALENTIA_global_radiation,VALENTIA_precipitation,VALENTIA_snow_depth,VALENTIA_sunshine,VALENTIA_temp_mean,VALENTIA_temp_min,VALENTIA_temp_max
0,0,19600101,1,7,2.1,0.85,1.018,0.32,0.09,0,...,5,0.88,1.0003,0.45,0.34,0,4.7,8.5,6.0,10.9
1,1,19600102,1,6,2.1,0.84,1.018,0.36,1.05,0,...,7,0.91,1.0007,0.25,0.84,0,0.7,8.9,5.6,12.1
2,2,19600103,1,8,2.1,0.9,1.018,0.18,0.3,0,...,7,0.91,1.0096,0.17,0.08,0,0.1,10.5,8.1,12.9
3,3,19600104,1,3,2.1,0.92,1.018,0.58,0.0,0,...,7,0.86,1.0184,0.13,0.98,0,0.0,7.4,7.3,10.6
4,4,19600105,1,6,2.1,0.95,1.018,0.65,0.14,0,...,3,0.8,1.0328,0.46,0.0,0,5.7,5.7,3.0,8.4


# 2. Data Cleaning

In [8]:
# Check for missing values
missing_per_column = df.isnull().sum()

# The per-column Series displayed below is truncated for wide frames, so only a
# handful of columns are actually visible. Verify the total across every
# column explicitly and name the offending columns if any value is missing.
assert missing_per_column.sum() == 0, (
    f"Missing values found in: {missing_per_column[missing_per_column > 0].to_dict()}"
)
missing_per_column

id                     0
DATE                   0
MONTH                  0
BASEL_cloud_cover      0
BASEL_wind_speed       0
                      ..
VALENTIA_snow_depth    0
VALENTIA_sunshine      0
VALENTIA_temp_mean     0
VALENTIA_temp_min      0
VALENTIA_temp_max      0
Length: 171, dtype: int64

#### There are no missing values in this dataset. No imputation is needed.

In [9]:
# Check for duplicates
# The 'id' column holds a distinct value for every row, so including it in the
# comparison makes every row unique and the check can never flag anything.
# Compare on all remaining columns instead.
dup_check_cols = [col for col in df.columns if col != 'id']
df_dup = df[df.duplicated(subset=dup_check_cols)]

In [10]:
# Display the rows flagged as duplicates (an empty frame means none were found)
df_dup

Unnamed: 0,id,DATE,MONTH,BASEL_cloud_cover,BASEL_wind_speed,BASEL_humidity,BASEL_pressure,BASEL_global_radiation,BASEL_precipitation,BASEL_snow_depth,...,VALENTIA_cloud_cover,VALENTIA_humidity,VALENTIA_pressure,VALENTIA_global_radiation,VALENTIA_precipitation,VALENTIA_snow_depth,VALENTIA_sunshine,VALENTIA_temp_mean,VALENTIA_temp_min,VALENTIA_temp_max


#### There are no duplicates in this dataset.

# 3. Data Scaling

In [11]:
# Create the standardizer; centering and unit-variance scaling are the
# defaults, written out here for the reader
scaler = StandardScaler(with_mean=True, with_std=True)

In [12]:
# Identify numeric columns (excluding identifiers)
exclude_cols = ['DATE', 'id']

# include='number' matches every numeric dtype (int32, float32, ...), not only
# int64/float64, so no numeric feature is silently left out of scaling.
# errors='ignore' skips identifier columns that are absent from the frame.
num_cols = (
    df.drop(columns=exclude_cols, errors='ignore')
      .select_dtypes(include='number')
      .columns
)

In [13]:
# Fit the scaler on the numeric columns, then overwrite those columns with
# their scaled values.
# NOTE(review): the scaler is fit on the full dataset; if a train/test split is
# applied downstream, confirm whether fitting on the training portion only is
# intended.
scaled_values = scaler.fit(df[num_cols]).transform(df[num_cols])
df[num_cols] = scaled_values

In [14]:
# Verify mean and std of the scaled columns (rounded to 3 decimals);
# only the first five columns are printed
scaled_means = df[num_cols].mean().round(3)
scaled_stds = df[num_cols].std().round(3)

print("Mean of scaled features (should be ~0):")
print(scaled_means.head())
print("\nStandard deviation of scaled features (should be ~1):")
print(scaled_stds.head())

Mean of scaled features (should be ~0):
MONTH                0.0
BASEL_cloud_cover   -0.0
BASEL_wind_speed    -0.0
BASEL_humidity       0.0
BASEL_pressure      -0.0
dtype: float64

Standard deviation of scaled features (should be ~1):
MONTH                1.0
BASEL_cloud_cover    1.0
BASEL_wind_speed     1.0
BASEL_humidity       1.0
BASEL_pressure       1.0
dtype: float64


In [15]:
# Preview the first five rows to confirm the numeric columns now hold scaled values
df.head(n=5)

Unnamed: 0,id,DATE,MONTH,BASEL_cloud_cover,BASEL_wind_speed,BASEL_humidity,BASEL_pressure,BASEL_global_radiation,BASEL_precipitation,BASEL_snow_depth,...,VALENTIA_cloud_cover,VALENTIA_humidity,VALENTIA_pressure,VALENTIA_global_radiation,VALENTIA_precipitation,VALENTIA_snow_depth,VALENTIA_sunshine,VALENTIA_temp_mean,VALENTIA_temp_min,VALENTIA_temp_max
0,0,19600101,-1.599964,0.660514,-0.02793,0.826097,-0.001949,-1.101066,-0.265148,-0.179228,...,-0.443701,0.761754,-1.299744,-0.806427,-0.088407,-0.024706,0.372147,-0.668215,-0.519743,-0.752237
1,1,19600102,-1.599964,0.244897,-0.02793,0.73576,-0.001949,-1.058108,1.65876,-0.179228,...,0.783085,1.18358,-1.262455,-1.042055,0.503361,-0.024706,-0.829285,-0.548046,-0.629054,-0.407141
2,2,19600103,-1.599964,1.07613,-0.02793,1.277781,-0.001949,-1.25142,0.155707,-0.179228,...,0.783085,1.18358,-0.432779,-1.136306,-0.396127,-0.024706,-1.0095,-0.067372,0.054135,-0.177078
3,3,19600104,-1.599964,-1.001953,-0.02793,1.458455,-0.001949,-0.821838,-0.445514,-0.179228,...,0.783085,0.480538,0.387574,-1.183432,0.669056,-0.024706,-1.039536,-0.998679,-0.164486,-0.838511
4,4,19600105,-1.599964,0.244897,-0.02793,1.729466,-0.001949,-0.746661,-0.164944,-0.179228,...,-1.670486,-0.363113,1.72997,-0.794645,-0.49081,-0.024706,0.672505,-1.509396,-1.339569,-1.471186


# 4. Export Dataframe

In [16]:
# Destination for the pickled scaled dataset (absolute path on the author's machine)
pickle_path = (
    "/Users/charlottelin/Documents/"
    "11-2025 ClimateWins Machine Learning Analysis/"
    "02 Data/Prepared Data/"
    "Dataset-scaled.pkl"
)

In [17]:
# Serialize the scaled DataFrame to the pickle file at `pickle_path`
df.to_pickle(path=pickle_path)

In [18]:
# Destination for the CSV copy of the scaled dataset (absolute path on the author's machine)
csv_path = (
    "/Users/charlottelin/Documents/"
    "11-2025 ClimateWins Machine Learning Analysis/"
    "02 Data/Prepared Data/"
    "Dataset-weather-prediction-dataset-scaled.csv"
)

# Write the scaled DataFrame to CSV, leaving the row index out of the file
df.to_csv(path_or_buf=csv_path, index=False)