In [212]:
import numpy as np
import pandas as pd
import time
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
import matplotlib.pyplot as plt
import seaborn as sns

### PART 2

#### -------- Load and prepare the dataset --------

In [213]:
# Load the dataset
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder


day_df = pd.read_csv('data/day.csv')
hour_df = pd.read_csv('data/hour.csv')

# Add source column to each dataset
# day_df['source'] = 'day'
# hour_df['source'] = 'hour'

# Merge the datasets
merged_df = pd.concat([day_df, hour_df], ignore_index=True)
merged_df = merged_df.drop(['dteday'], axis=1)

#TEST İÇİN 10 VERİYE İN
# merged_df = merged_df.head(10)

non_numeric_cols = merged_df.select_dtypes(exclude=['int64', 'float64']).columns
if len(non_numeric_cols) > 0:
      merged_df = pd.get_dummies(merged_df, columns=non_numeric_cols, drop_first=True)

merged_df = merged_df.dropna()

# ----------- Feature Engineering -----------
# Define categorical and numerical features
categorical_features = ['season', 'yr', 'mnth', 'hr', 'holiday', 'weekday', 'workingday', 'weathersit']
numerical_features = ['temp', 'atemp', 'hum', 'windspeed']

# Create preprocessing for categorical features (one-hot encoding) and numerical features (scaling)
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])

# ---------- features ok -----------------

X = merged_df.drop('cnt', axis=1).to_numpy()
y = merged_df['cnt'].to_numpy().ravel()

scaler = StandardScaler()
X      = scaler.fit_transform(X)

X.shape, y.shape


((17379, 15), (17379,))

In [214]:
merged_df

Unnamed: 0,instant,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt,hr
731,1,1,0,1,0,6,0,1,0.24,0.2879,0.81,0.0000,3,13,16,0.0
732,2,1,0,1,0,6,0,1,0.22,0.2727,0.80,0.0000,8,32,40,1.0
733,3,1,0,1,0,6,0,1,0.22,0.2727,0.80,0.0000,5,27,32,2.0
734,4,1,0,1,0,6,0,1,0.24,0.2879,0.75,0.0000,3,10,13,3.0
735,5,1,0,1,0,6,0,1,0.24,0.2879,0.75,0.0000,0,1,1,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18105,17375,1,1,12,0,1,1,2,0.26,0.2576,0.60,0.1642,11,108,119,19.0
18106,17376,1,1,12,0,1,1,2,0.26,0.2576,0.60,0.1642,8,81,89,20.0
18107,17377,1,1,12,0,1,1,1,0.26,0.2576,0.60,0.1642,7,83,90,21.0
18108,17378,1,1,12,0,1,1,1,0.26,0.2727,0.56,0.1343,13,48,61,22.0


#### -------- Develop custom distance functions ---------

In [215]:
def manhattan_distance(x1, x2):
    return np.sum(np.abs(x1 - x2))


#### -------- KNN regression --------

In [216]:
def knn_predict_regressor(X_train, y_train, X_test, k):
    print('KNN Regressor Start')

    y_pred = []
    for test_point in X_test:
        distances = []

        # Manhattan a göre hesapla
        for i in range(len(X_train)):
            dist = manhattan_distance(test_point, X_train[i])
            distances.append((dist, y_train[i]))
        
        distances.sort(key=lambda x: x[0])
        k_nearest = distances[:k]
        k_nearest_values = [pair[1] for pair in k_nearest]
        
        prediction = (sum(k_nearest_values) / len(k_nearest_values)) 
        
        y_pred.append(prediction)

    print('KNN Regressor End')
    
    return np.array(y_pred)



#### k-fold cross-validation

In [217]:
def k_fold_cross_validation(X, y, k_neighbors, n_folds=3):
    print('K-Fold Cross Validation Start')

    accuracy_scores = []
    precision_scores = []
    recall_scores = []
    f1_scores = []
    conf_matrices = []
    run_times = []
    fold_results = []

    kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)

    # verileri scale et. bazı değerler küçük bazı büyük. standardize et. 
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    
    fold_num = 1
    for train_index, test_index in kf.split(X_scaled):
        X_train, X_test = X_scaled[train_index], X_scaled[test_index]
        y_train, y_test = y[train_index], y[test_index]
        
        print('y_test:', y_test)
        # y'yi float yap. predict float oldugundan accuracy hesaplanırken hata veriyor.
        y_test = y_test.astype(float)
        print('y_test float:', y_test)
        
        # KNN Çalıştır 
        start_time = time.time()
        y_pred = knn_predict_regressor(X_train, y_train, X_test, k_neighbors)
        end_time = time.time()
        run_time = end_time - start_time
        
        # Performans hesapla
        print('y_test:', y_test)
        print('y_pred:', y_pred)
        
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, average='weighted')
        recall = recall_score(y_test, y_pred, average='weighted')
        f1 = f1_score(y_test, y_pred, average='weighted')
        conf_matrix = confusion_matrix(y_test, y_pred)
        
        # performansları listelerine ekle
        accuracy_scores.append(accuracy)
        precision_scores.append(precision)
        recall_scores.append(recall)
        f1_scores.append(f1)
        conf_matrices.append(conf_matrix)
        run_times.append(run_time)
        
        # tümünü fold result listesinde topla
        fold_results.append({
            'fold': fold_num,
            'accuracy': accuracy,
            'precision': precision,
            'recall': recall,
            'f1': f1,
            'conf_matrix': conf_matrix,
            'run_time': run_time
        })
        
        fold_num += 1

    print('K-Fold Cross Validation End')
    
    return {
        'accuracy': np.mean(accuracy_scores),
        'precision': np.mean(precision_scores),
        'recall': np.mean(recall_scores),
        'f1': np.mean(f1_scores),
        'conf_matrices': conf_matrices,
        'run_time': np.mean(run_times),
        'fold_results': fold_results
    }


#### Create a function to plot confusion matrix

In [218]:
def plot_confusion_matrix(conf_matrices, title):
    avg_conf_matrix = np.mean(conf_matrices, axis=0)
    plt.figure(figsize=(8, 6))
    
    sns.heatmap(avg_conf_matrix, annot=True, fmt='.2f', cmap='Blues',
                xticklabels=['True (0)', 'False (1)'],
                yticklabels=['True (0)', 'False (1)'])
    
    plt.xlabel('Predicted Label')
    plt.ylabel('True Label')
    plt.title(f'Confusion Matrix: {title}')
    # plt.tight_layout()
    plt.show()


#### Run all functions

In [None]:

print("KNN Code starts here")
k = 3

# Çalıştır 
overall_start_time = time.time()
results = k_fold_cross_validation(X, y, k)
overall_end_time = time.time()
overall_run_time = overall_end_time - overall_start_time

print("KNN Code ok. ")


KNN Code starts here
K-Fold Cross Validation Start
y_test: [ 16  13   1 ... 224  89  61]
y_test float: [ 16.  13.   1. ... 224.  89.  61.]
KNN Regressor Start


#### Sonuçlar;

In [None]:
# Fold bazında sonuçları yaz
print("\n --- Results by Fold:")
print("-" * 100)
print(f"{'Fold':<6} {'Accuracy':<10} {'Precision':<10} {'Recall':<10} {'F1 Score':<10} {'Run Time':<10}")
print("-" * 100)

for fold_result in results['fold_results']:
    print(f"{fold_result['fold']:<6} {fold_result['accuracy']:.4f} {fold_result['precision']:.4f} {fold_result['recall']:.4f} {fold_result['f1']:.4f} {fold_result['run_time']:.4f}s")

# Modelin sonucu
print("\n--- Results Summary:")
print("-" * 50)
print(f"Configuration: k={k}")
print(f"Average Accuracy: {results['accuracy']:.4f}")
print(f"Average Precision: {results['precision']:.4f}")
print(f"Average Recall: {results['recall']:.4f}")
print(f"Average F1 Score: {results['f1']:.4f}")
print(f"Average Fold Run Time: {results['run_time']:.4f}s")
print(f"Total Run Time: {overall_run_time:.4f}s")

# Grafiği çiz.
plot_confusion_matrix(results['conf_matrices'], f"k={k}")


 --- Results by Fold:
----------------------------------------------------------------------------------------------------
Fold   Accuracy   Precision  Recall     F1 Score   Run Time  
----------------------------------------------------------------------------------------------------
1      0.0000 0.0000 0.0000 0.0000 0.0002s
2      0.0000 0.0000 0.0000 0.0000 0.0002s
3      0.0000 0.0000 0.0000 0.0000 0.0003s

--- Results Summary:
--------------------------------------------------
Configuration: k=3
Average Accuracy: 0.0000
Average Precision: 0.0000
Average Recall: 0.0000
Average F1 Score: 0.0000
Average Fold Run Time: 0.0002s
Total Run Time: 0.0288s


ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (3,) + inhomogeneous part.