In [4]:
# train_knn.py

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, accuracy_score
import joblib

# 1. Load dataset
data_path = "/content/vibe_dataset_balanced_7000.csv"   # change if needed
df = pd.read_csv(data_path)

# 2. Define features and target
target_col = "vibe"
feature_cols = [
    "latitude",
    "longitude",
    "elevation",
    "weather",
    "time_of_day",
    "population",
    "cultural_aspect",
    "language",
]

X = df[feature_cols]
y = df[target_col]

# 3. Separate numeric and categorical features
numeric_features = ["latitude", "longitude", "elevation", "population"]
categorical_features = ["weather", "time_of_day", "cultural_aspect", "language"]

# 4. Preprocessor: scale numeric + one-hot encode categorical
numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown="ignore")

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

# 5. Define KNN model
knn = KNeighborsClassifier(
    n_neighbors=5,
    weights="distance",    # gives closer points more importance
    metric="minkowski",    # Euclidean distance by default
)

# 6. Build full pipeline
model = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("knn", knn),
    ]
)

# 7. Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,   # keep class balance
)

# 8. Train the model
model.fit(X_train, y_train)

# 9. Evaluate on test data (optional but useful during training)
y_pred = model.predict(X_test)
print(" KNN Test Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n")
print(classification_report(y_test, y_pred))

# 10. Save the trained pipeline as a .pkl file
model_path = "knn_vibe_model.pkl"
joblib.dump(model, model_path)
print(f"\n Trained KNN model saved to: {model_path}")


 KNN Test Accuracy: 0.9730848861283644

Classification Report:

              precision    recall  f1-score   support

   backwater       0.99      1.00      0.99        96
       beach       0.96      0.97      0.96        97
        city       0.96      0.96      0.96        97
     coastal       0.98      0.96      0.97        97
coastal-city       0.95      0.99      0.97        97
    cultural       0.99      0.97      0.98        96
      desert       1.00      1.00      1.00        97
      forest       1.00      1.00      1.00        97
    heritage       0.92      0.97      0.94        96
   hill-town       0.98      0.97      0.97        96
    mountain       0.99      0.98      0.98        97
       rural       0.95      0.94      0.94        96
   spiritual       0.99      1.00      0.99        97
 temple-town       0.99      0.99      0.99        96
       urban       0.96      0.91      0.93        97

    accuracy                           0.97      1449
   macro avg    

In [5]:
# test_knn.py

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
import joblib

# -----------------------------
# 1. Load the trained model
# -----------------------------
model_path = "knn_vibe_model.pkl"   # make sure this was trained on the SAME dataset version
model = joblib.load(model_path)
print(f" Loaded trained model from: {model_path}")

# -----------------------------
# 2. Load the dataset
#    IMPORTANT: use the SAME file you used in train_knn.py
# -----------------------------
data_path = "/content/vibe_dataset_balanced_7000.csv"  # or vibe_dataset_7000_balanced.csv or your chosen file
df = pd.read_csv(data_path)

target_col = "vibe"
feature_cols = [
    "latitude",
    "longitude",
    "elevation",
    "weather",
    "time_of_day",
    "population",
    "cultural_aspect",
    "language",
]

X = df[feature_cols]
y = df[target_col]

# -----------------------------
# 3. Recreate the SAME train-test split as in training
# -----------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# -----------------------------
# 4. Example: Predict vibe for a NEW location

# Example: a beach-like location (you can tweak values)
new_location = pd.DataFrame(
    [{
        "latitude": 27.0392,          # Goa-ish
        "longitude": 88.2639,
        "elevation": 2100,              # near sea level
        "weather": "cold",           # typical coastal
        "time_of_day": "afternoon",
        "population": 170000,         # medium tourist town
        "cultural_aspect": "tibetan",
        "language": "hindi",
    }]
)

print("\n Features for new location:")
print(new_location)

predicted_vibe = model.predict(new_location)[0]
print("\n Predicted vibe for new location:", predicted_vibe)


 Loaded trained model from: knn_vibe_model.pkl

 Features for new location:
   latitude  longitude  elevation weather time_of_day  population  \
0   27.0392    88.2639       2100    cold   afternoon      170000   

  cultural_aspect language  
0         tibetan    hindi  

 Predicted vibe for new location: mountain
