In [16]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split

In [25]:
# Load the dataset from its CSV file into a DataFrame
DATA_PATH = "/content/drive/MyDrive/Colab Notebooks/Assignment_2/weather_forecast_data.csv"
df = pd.read_csv(DATA_PATH)

In [18]:
# Task 1: Preprocessing
# 1. Check for missing values
def checkMissing(df):
    """Print the number of missing values found in each column of ``df``.

    Args:
        df: pandas DataFrame to inspect.

    Returns:
        None. The header line and the per-column counts are written to stdout.
    """
    # isna() is the alias of isnull(); summing the boolean mask counts the gaps
    missing_per_column = df.isna().sum()
    print("Missing values per column:")
    print(missing_per_column)
# Print the per-column missing-value counts for the loaded frame
checkMissing(df)

Missing values per column:
Temperature    25
Humidity       40
Wind_Speed     32
Cloud_Cover    33
Pressure       27
Rain            0
dtype: int64


In [None]:
# 2. Handle missing values by two techniques (Use one of them)
# Technique A: drop every row that contains at least one missing value.
# The result is stored in its own frame instead of mutating `df` in place, so
# the mean/mode imputation in the next cell still sees the original missing
# values and this cell gives the same result when re-run.
# To adopt this technique for the rest of the notebook, assign `df = df_dropped`.
df_dropped = df.dropna()
checkMissing(df_dropped)

Missing values per column:
Temperature    0
Humidity       0
Wind_Speed     0
Cloud_Cover    0
Pressure       0
Rain           0
dtype: int64


In [26]:
# Replacing them with the average of the feature
# NOTE(review): the fill statistics below are computed on the whole frame, i.e.
# before the train/test split performed later in the notebook, so rows that end
# up in the test set contribute to the mean/mode used for imputation.

# Fill missing values in numeric columns with the column mean
for col in df.select_dtypes(include='number').columns:
  if df[col].isnull().any():
    df[col] = df[col].fillna(df[col].mean())

# Replace missing values in categorical columns with the column mode
for col in df.select_dtypes(include='object').columns:
  if df[col].isnull().any():
    modes = df[col].mode()
    # mode() ignores missing values, so it is empty when the entire column is
    # missing; skip such a column instead of failing on the first-element lookup
    if not modes.empty:
      df[col] = df[col].fillna(modes.iloc[0])

checkMissing(df)
df.head()

Missing values per column:
Temperature    0
Humidity       0
Wind_Speed     0
Cloud_Cover    0
Pressure       0
Rain           0
dtype: int64


Unnamed: 0,Temperature,Humidity,Wind_Speed,Cloud_Cover,Pressure,Rain
0,19.096119,71.651723,14.782324,48.699257,987.95476,no rain
1,27.112464,84.183705,13.289986,10.375646,1035.43087,no rain
2,20.433329,42.290424,7.216295,6.673307,1033.628086,no rain
3,19.576659,40.67928,4.568833,55.026758,1038.8323,no rain
4,19.82806,93.353211,0.104489,30.687566,1009.423717,no rain


In [20]:
# Turn the categorical 'Rain' column into integer class labels
encoder = LabelEncoder()
# Learn the label -> integer mapping first, then apply it to the same column
encoder.fit(df['Rain'])
df['Rain'] = encoder.transform(df['Rain'])
print(df.head())

   Temperature   Humidity  Wind_Speed  Cloud_Cover     Pressure  Rain
0    19.096119  71.651723   14.782324    48.699257   987.954760     0
1    27.112464  84.183705   13.289986    10.375646  1035.430870     0
2    20.433329  42.290424    7.216295     6.673307  1033.628086     0
3    19.576659  40.679280    4.568833    55.026758  1038.832300     0
4    19.828060  93.353211    0.104489    30.687566  1009.423717     0


In [28]:
# Hold out 20% of the rows for testing; the remaining 80% are used for training
y = df['Rain']  # Target
X = df.drop('Rain', axis=1)  # Features: every column except the target
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,  # fixed seed so the split is the same on every run
)

In [None]:
# 3. Summary statistics of the numeric columns (training features only)
train_summary = X_train.describe()
print(train_summary)

In [23]:
# The features have different ranges and magnitudes, so they are not on a common scale
# Feature Scaling: StandardScaler
# NOTE: this cell overwrites X_train / X_test in place and rebinds `scaler`,
# the same names the Min-Max scaling cell uses.
to_scale = X_train.columns
# Fit on the training split only, then apply the same fitted scaler to both splits
scaler = StandardScaler()
scaler.fit(X_train[to_scale])
for split in (X_train, X_test):
    split[to_scale] = scaler.transform(split[to_scale])

In [29]:
# Feature Scaling: Min-Max Scaling
# NOTE: this cell overwrites X_train / X_test in place and rebinds `scaler`,
# the same names the StandardScaler cell uses.
to_scale = X_train.columns
# Fit on the training split only, then apply the same fitted scaler to both splits
scaler = MinMaxScaler()
scaler.fit(X_train[to_scale])
for split in (X_train, X_test):
    split[to_scale] = scaler.transform(split[to_scale])