In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.neighbors import KNeighborsClassifier

In [36]:
# Load the weather forecast dataset from CSV into a DataFrame.
# NOTE(review): the absolute path looks like a Google Drive mount inside Colab —
# confirm Drive is mounted (or adjust the path) before running this cell.
df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/Assignment_2/weather_forecast_data.csv")

In [None]:
# 1. Check for missing values
def check_missing(df):
    """Print how many null entries each column of ``df`` contains.

    Args:
        df: DataFrame to inspect.

    Returns:
        None. A heading line followed by the per-column null counts is
        written to stdout.
    """
    null_counts = df.isnull().sum()
    print("Missing values per column:", null_counts, sep="\n")

In [None]:
# 2. Handle missing values
def handle_missing(df, strategy):
    """Return a new DataFrame with missing values dropped or imputed.

    Args:
        df: Input DataFrame. It is not modified.
        strategy: ``"drop"`` removes every row that contains a missing value;
            ``"replace"`` fills numeric columns with their mean and
            object-dtype columns with their most frequent value.

    Returns:
        A new DataFrame with the chosen strategy applied.

    Raises:
        ValueError: If ``strategy`` is neither ``"drop"`` nor ``"replace"``.
    """
    if strategy == "drop":
        return df.dropna()
    if strategy != "replace":
        raise ValueError("Invalid missing value strategy")

    df_copy = df.copy()
    # Fill numeric columns with mean
    for col in df_copy.select_dtypes(include="number").columns:
        df_copy[col] = df_copy[col].fillna(df_copy[col].mean())
    # Fill categorical columns with mode
    for col in df_copy.select_dtypes(include="object").columns:
        modes = df_copy[col].mode()
        # mode() is empty when the column has no observed values at all.
        # There is nothing to impute from, so leave the column untouched
        # rather than failing on the lookup of the first mode.
        if modes.empty:
            continue
        df_copy[col] = df_copy[col].fillna(modes.iloc[0])
    return df_copy

In [None]:
# 3. Preprocessing
def preprocess(df, scaling):
    """Split ``df`` into train/test sets and scale the numeric features.

    Args:
        df: DataFrame holding the feature columns plus a ``"Rain"`` target
            column.
        scaling: ``"min-max"`` selects ``MinMaxScaler``; any other value
            selects ``StandardScaler``.

    Returns:
        ``(X_train, X_test, y_train, y_test)``: the feature DataFrames with
        numeric columns scaled, and the label-encoded target arrays.
    """
    X = df.drop(columns=["Rain"])
    y = df["Rain"]

    # Encode target
    y = LabelEncoder().fit_transform(y)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # The split frames are selections taken from X; take explicit copies so the
    # column assignments below write to independent frames and do not trigger
    # pandas' SettingWithCopyWarning.
    X_train = X_train.copy()
    X_test = X_test.copy()

    # Scale numeric features
    scaler = MinMaxScaler() if scaling == "min-max" else StandardScaler()
    toscale = X_train.select_dtypes(include="number").columns
    # Skip scaling when there are no numeric columns; the scaler cannot be
    # fitted on a frame with zero features.
    if len(toscale) > 0:
        # Fit on the training split only, then reuse those statistics for the
        # test split.
        X_train[toscale] = scaler.fit_transform(X_train[toscale])
        X_test[toscale] = scaler.transform(X_test[toscale])

    return X_train, X_test, y_train, y_test

In [None]:
# 4. Evaluate model
def evaluate_model(model, X_test, y_test):
    """Score a fitted classifier on the held-out split.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_test: Test features passed to ``model.predict``.
        y_test: True labels for ``X_test``.

    Returns:
        ``(accuracy, precision, recall)`` computed with scikit-learn's
        ``accuracy_score``, ``precision_score`` and ``recall_score`` using
        their default settings.
    """
    predictions = model.predict(X_test)
    scorers = (accuracy_score, precision_score, recall_score)
    return tuple(scorer(y_test, predictions) for scorer in scorers)

## **k-Nearest Neighbors Using scikit-learn**

In [67]:
# 5. kNN sklearn
def knn(k_values, X_train, y_train, X_test, y_test):
    """Fit and score one scikit-learn kNN classifier per value in ``k_values``.

    Args:
        k_values: Iterable of neighbor counts to try.
        X_train: Training features.
        y_train: Training labels.
        X_test: Test features.
        y_test: Test labels.

    Returns:
        A list of ``(k, accuracy, precision, recall)`` tuples, one per entry
        of ``k_values`` and in the same order.
    """
    results = []
    # Iterate the k_values argument rather than a notebook-level global, so
    # the function works for any caller and on a fresh kernel.
    for k in k_values:
        classifier = KNeighborsClassifier(n_neighbors=k)
        classifier.fit(X_train, y_train)
        results.append((k, *evaluate_model(classifier, X_test, y_test)))
    return results

# 6. Compare missing value strategies
def compare_strategies_knn(df, k_values):
    """Print a kNN metrics table for each missing-value handling technique.

    For both ``"drop"`` and ``"replace"``, the data is cleaned with
    ``handle_missing``, split and min-max scaled with ``preprocess``, and then
    evaluated with ``knn`` for every k in ``k_values``.

    Args:
        df: Raw DataFrame, possibly containing missing values.
        k_values: Neighbor counts forwarded to ``knn``.

    Returns:
        None. The results are written to stdout.
    """
    table_header = (
        "Comparison of KNN using sklearn:",
        "k | Accuracy | Precision | Recall",
        "---------------------------------",
    )

    for technique in ("drop", "replace"):
        print(f"\nMissing Value Technique: {technique}")
        cleaned = handle_missing(df, technique)
        X_train, X_test, y_train, y_test = preprocess(cleaned, "min-max")

        print(*table_header, sep="\n")

        # One row per k: left-aligned k, then metrics rounded to two decimals.
        for result in knn(k_values, X_train, y_train, X_test, y_test):
            print(f"{result[0]:<2}| {result[1]:.2f}     | {result[2]:.2f}      | {result[3]:.2f}")



# Reload the raw dataset and report the number of missing values per column.
df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/Assignment_2/weather_forecast_data.csv")
check_missing(df)

# Neighbor counts (k) to evaluate under each missing-value technique.
values = [3, 5, 7, 10, 13]
compare_strategies_knn(df, values)

Missing values per column:
Temperature    25
Humidity       40
Wind_Speed     32
Cloud_Cover    33
Pressure       27
Rain            0
dtype: int64

Missing Value Technique: drop
Comparison of KNN using sklearn:
k | Accuracy | Precision | Recall
---------------------------------
3 | 0.97     | 0.94      | 0.85
5 | 0.96     | 0.89      | 0.82
7 | 0.96     | 0.90      | 0.81
10| 0.96     | 0.95      | 0.79
13| 0.97     | 0.95      | 0.84

Missing Value Technique: replace
Comparison of KNN using sklearn:
k | Accuracy | Precision | Recall
---------------------------------
3 | 0.97     | 0.89      | 0.84
5 | 0.97     | 0.92      | 0.79
7 | 0.97     | 0.88      | 0.80
10| 0.97     | 0.96      | 0.79
13| 0.97     | 0.92      | 0.84
