Name: Abhishek Gupta
Uid: 2021700027
Class: Te Ds
Exp-6

The following cell imports the required libraries: pandas for data manipulation, and the scikit-learn modules used for preprocessing, model training, hyperparameter tuning, and evaluation.

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error


In [2]:
# Read the traffic dataset from "traffic.csv" into a DataFrame
data = pd.read_csv(filepath_or_buffer="traffic.csv")

In [4]:
# Parse the 'DateTime' column into pandas datetime values so that the
# `.dt` accessor can be used on it afterwards
data = data.assign(DateTime=pd.to_datetime(data["DateTime"]))

In [5]:
# Derive integer calendar components from the parsed timestamps; the new
# columns are appended in the order Year, Month, Day, Hour
data = data.assign(
    Year=data["DateTime"].dt.year,
    Month=data["DateTime"].dt.month,
    Day=data["DateTime"].dt.day,
    Hour=data["DateTime"].dt.hour,
)

In [6]:
# Separate the target ('Vehicles') from the feature table
y = data["Vehicles"]
X = data.drop("Vehicles", axis=1)  # every remaining column stays in X


In [7]:
# Hold out 20% of the rows for testing; the fixed seed keeps the shuffle
# reproducible between runs
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [9]:
# Define preprocessing steps for numerical and categorical features

# Numeric calendar features: fill missing values with the column mean,
# then standardise.
numeric_features = ['Year', 'Month', 'Day', 'Hour']
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical features: fill missing values with the most frequent value,
# then one-hot encode.
categorical_features = ['Junction']
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    # handle_unknown='ignore' encodes a category that was not present when
    # the encoder was fitted (e.g. missing from one CV training fold or from
    # the training split) as an all-zero row instead of raising at
    # transform time.
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

In [10]:
# Route each group of columns through its own transformer: 'num' handles
# the numeric features and 'cat' handles the categorical ones
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])


In [11]:
# Search space for the KNN step; the 'knn__' prefix addresses the pipeline
# step named 'knn'
param_grid = dict(
    knn__n_neighbors=list(range(3, 10, 2)),  # neighbour counts 3, 5, 7, 9
    knn__weights=['uniform', 'distance'],  # weighting of neighbours in prediction
    knn__p=[1, 2],  # Minkowski power parameter
)

In [12]:
# Chain the preprocessing stage and the KNN regressor into one estimator
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('knn', KNeighborsRegressor()),
])

In [13]:
# Exhaustively evaluate every parameter combination with 5-fold
# cross-validation, scored by negative mean squared error
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
)
grid_search.fit(X_train, y_train)

# Report the winning combination
best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)


Best Hyperparameters: {'knn__n_neighbors': 3, 'knn__p': 2, 'knn__weights': 'uniform'}


In [14]:
# Take the pipeline configured with the best hyperparameters.
# GridSearchCV is constructed with its default refit=True, so
# `best_estimator_` has already been refitted on the full (X_train, y_train)
# inside `grid_search.fit`; fitting it a second time would only repeat that
# work.
best_knn = grid_search.best_estimator_

In [15]:
# Generate predictions for the held-out test features
y_pred = best_knn.predict(X=X_test)

In [16]:
# Score the predictions against the true test targets using mean squared error
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print("Mean Squared Error:", mse)

Mean Squared Error: 27.031610787845203
