In [None]:
# Install liac-arff with pip, invoked through the interpreter at
# sys.executable (the Python process running this cell) using IPython's `!`
# shell escape.
import sys
!{sys.executable} -m pip install liac-arff



In [None]:
# Install the notebook's third-party packages with the same
# `sys.executable -m pip` pattern. liac-arff is listed here as well, so this
# command also covers the package installed by the previous cell.
# NOTE(review): no versions are pinned, so the installed releases depend on
# when this cell is run.
import sys
!{sys.executable} -m pip install pandas numpy liac-arff scikit-learn matplotlib seaborn




In [None]:
# Smoke test: import pandas, numpy, arff and pathlib.Path. The message at the
# bottom is only printed if none of these imports raised.
import pandas as pd
import numpy as np
import arff
from pathlib import Path

print("All imports are working!")


In [None]:

from pathlib import Path
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, RobustScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
import joblib


In [None]:
import arff
from pathlib import Path
import pandas as pd

# The file name ends in .csv, but its contents are parsed with the ARFF loader.
path = Path("../data/datasetone.csv")
with path.open("r") as arff_file:
    raw = arff.load(arff_file)

# The first element of each attribute entry is used as the column name.
arff_columns = [attribute[0] for attribute in raw["attributes"]]
df = pd.DataFrame(raw["data"], columns=arff_columns)

# Displayed as a tuple: preview of the first rows plus (rows, columns).
(df.head(), df.shape)


In [None]:
# Write the ARFF-parsed frame to datasetone_fixed.csv; index=False leaves the
# row index out of the file.
df.to_csv("../data/datasetone_fixed.csv", index=False)


In [None]:
# Reload the frame from datasetone_fixed.csv (this replaces the ARFF-parsed
# `df`) and preview the first rows.
df = pd.read_csv("../data/datasetone_fixed.csv")
df.head()


In [None]:
# Print the frame's column names, non-null counts and dtypes.
df.info()

In [None]:
# Summary statistics for every column; include='all' also covers the
# non-numeric columns.
df.describe(include='all')

In [None]:
# Count of missing values in each column.
df.isna().sum()

In [None]:
# Split the columns of `df` by dtype.
# - Categorical: object-dtype columns.
# - Numeric: every numeric dtype. "number" is used instead of an explicit
#   int64/float64 list so that columns stored with another width (int32,
#   float32, nullable Int64, ...) are not silently left out.
# Note: `df` still contains the target column at this point, so it shows up
# in one of these lists if its dtype is object or numeric.
cat_cols = df.select_dtypes(include=["object"]).columns.tolist()
num_cols = df.select_dtypes(include="number").columns.tolist()

print("Categorical:", cat_cols)
print("Numeric:", num_cols)


In [None]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

In [None]:
# Expose the dtype-based column lists under the names used by the
# preprocessing cells (same list objects, not copies).
categorical, numeric = cat_cols, num_cols

In [None]:
# Imputers + encoders
numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])


In [None]:
# Categorical branch: fill missing values with the most frequent value, then
# one-hot encode; handle_unknown="ignore" lets transform accept categories
# that were not present when the encoder was fitted.
categorical_transformer = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("encoder", OneHotEncoder(handle_unknown="ignore")),
    ]
)

# Route the numeric and categorical column lists to their own branch.
preprocessor = ColumnTransformer(
    [
        ("num", numeric_transformer, numeric),
        ("cat", categorical_transformer, categorical),
    ]
)

print("Preprocessor ready!")

In [None]:
# Separate the label column from the feature columns.
target = "Violation.Type"
y = df[target]
X = df.drop(columns=[target])

print("X shape:", X.shape)
print("y shape:", y.shape)

# Class distribution of the target (displayed as the cell output).
y.value_counts()

In [None]:
# Recompute feature lists from X (target already dropped)
categorical = X.select_dtypes(include=["object"]).columns.tolist()
numeric     = X.select_dtypes(include=["int64","float64"]).columns.tolist()

# (Re)build the preprocessor using feature columns from X
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

numeric_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler",  StandardScaler())
])

categorical_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore"))
])

preprocessor = ColumnTransformer([
    ("num", numeric_transformer, numeric),
    ("cat", categorical_transformer, categorical)
])


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the split is seeded and stratified
# on y.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

for split_label, split_frame in (("Train size:", X_train), ("Test size:", X_test)):
    print(split_label, split_frame.shape)



In [None]:
from sklearn.base import clone
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report

# Baseline classifiers to compare, keyed by the label used in the printed
# report and in `results`.
models = {
    "LogReg": LogisticRegression(max_iter=300),
    "RandomForest": RandomForestClassifier(n_estimators=150, random_state=42),
    "KNN": KNeighborsClassifier(n_neighbors=7)
}

# Test-set accuracy for each model name.
results = {}

for name, model in models.items():
    # Pipeline fits its steps in place without copying them. Wrapping the
    # shared `preprocessor` object directly would make every pipeline point
    # at the same transformer, so each fit would overwrite the fitted state
    # held by the pipelines before it. clone() gives each pipeline its own
    # unfitted copy with identical parameters.
    pipe = Pipeline([("preprocess", clone(preprocessor)), ("model", model)])
    pipe.fit(X_train, y_train)
    preds = pipe.predict(X_test)

    acc = accuracy_score(y_test, preds)
    results[name] = acc

    print("\n============================")
    print(name)
    print("Accuracy:", acc)
    print(classification_report(y_test, preds))


In [None]:
from sklearn.base import clone
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, f1_score, classification_report


# One (k, accuracy, weighted F1) tuple per neighbourhood size tried.
knn_results = []
for k in [3, 5, 11]:
    knn = KNeighborsClassifier(n_neighbors=k)
    # Pipeline fits its steps in place without copying them, so each pipeline
    # gets its own unfitted copy of the preprocessor instead of sharing (and
    # refitting) the single `preprocessor` object.
    knn_pipe = Pipeline([("preprocess", clone(preprocessor)), ("model", knn)])
    knn_pipe.fit(X_train, y_train)
    knn_preds = knn_pipe.predict(X_test)

    acc = accuracy_score(y_test, knn_preds)
    # Weighted F1: per-class F1 averaged with weights equal to class support.
    f1w = f1_score(y_test, knn_preds, average="weighted")
    knn_results.append((k, acc, f1w))

    print(f"\n=== KNN (k={k}) ===")
    print("Accuracy:", round(acc, 4), " | F1(w):", round(f1w, 4))
    print(classification_report(y_test, knn_preds))


In [None]:
print("\n=== SUMMARY OF ALL MODELS (DATASET 1) ===")

# Report every entry stored in `results`, not only LogReg and RandomForest,
# so the KNN baseline from the model-comparison loop is no longer missing
# from the summary. The label is padded to 23 characters, which reproduces
# the alignment of the original LogReg / RandomForest lines exactly.
for model_name, model_acc in results.items():
    label = f"{model_name} Accuracy:"
    print(f"{label:<23}{model_acc:.4f}")

# Results of the separate KNN sweep over different k values.
for k, acc, f1 in knn_results:
    print(f"KNN (k={k}) Accuracy:   {acc:.4f} | F1: {f1:.4f}")
