In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report

# Load the dataset
df = pd.read_csv('/content/drive/My Drive/machine-learning-project/data/data_original.csv')

# drop id column from df
df = df.drop(columns=['id'])

# Separate features and target variable
X = df.drop(columns=['label'])  # All columns except 'label'
y = df['label']  # Target variable

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features (important for KNN)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)




In [2]:
# Initialize KNN with default parameters
knn = KNeighborsClassifier()

# Fit the model to the training data
knn.fit(X_train_scaled, y_train)

# Make predictions
y_pred = knn.predict(X_test_scaled)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.5062111801242236
Classification Report:
               precision    recall  f1-score   support

           0       0.50      0.57      0.53       139
           1       0.00      0.00      0.00        27
           2       0.53      0.54      0.54       156

    accuracy                           0.51       322
   macro avg       0.34      0.37      0.36       322
weighted avg       0.47      0.51      0.49       322



In [3]:
# # Set the parameters for GridSearchCV
# param_grid = {
#     'n_neighbors': range(1, 21),  # Test values for k from 1 to 20
#     'weights': ['uniform', 'distance'],  # Two options for weights
#     'metric': ['euclidean', 'manhattan', 'minkowski']  # Distance metrics
# }

# # Initialize GridSearchCV with cross-validation
# grid_search = GridSearchCV(KNeighborsClassifier(), param_grid, cv=5, n_jobs=-1)

# # Fit the GridSearchCV to find the best parameters
# grid_search.fit(X_train_scaled, y_train)

# # Print the best parameters
# print("Best Parameters from Grid Search:", grid_search.best_params_)

# # Get the best estimator (the best KNN model with the optimal parameters)
# best_knn = grid_search.best_estimator_

# # Evaluate the best model
# y_pred_best = best_knn.predict(X_test_scaled)
# print("Optimized Accuracy:", accuracy_score(y_test, y_pred_best))
# print("Optimized Classification Report:\n", classification_report(y_test, y_pred_best))


In [6]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Load your data
df = pd.read_csv('/content/drive/My Drive/machine-learning-project/data/data_original.csv')

# drop id column from df
df = df.drop(columns=['id'])

# Ensure the target variable is correctly separated (assuming 'label' is the target)
X = df.drop(columns=['label'])  # Features (drop the label column)
y = df['label']  # Target variable

# Handle missing values if needed (e.g., filling with mean, median, or dropping rows)
X.fillna(X.mean(), inplace=True)

# Step 1: Standardizing the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Step 2: Apply PCA (you can choose the number of components based on explained variance)
pca = PCA(n_components=0.95)  # Retain 95% of the variance
X_pca = pca.fit_transform(X_scaled)

# Step 3: Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_pca, y, test_size=0.3, random_state=42)

# Step 4: Train a Random Forest classifier
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Step 5: Make predictions and evaluate the model
y_pred = rf.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

# Optional: You can also print the explained variance of each principal component
print("Explained variance ratio of each principal component:", pca.explained_variance_ratio_)

# Optional: Check the number of components
print("Number of components:", pca.n_components_)


Accuracy: 0.5383022774327122
Classification Report:
               precision    recall  f1-score   support

           0       0.52      0.59      0.55       218
           1       0.00      0.00      0.00        42
           2       0.56      0.59      0.57       223

    accuracy                           0.54       483
   macro avg       0.36      0.39      0.38       483
weighted avg       0.49      0.54      0.51       483

Explained variance ratio of each principal component: [0.35999145 0.19980712 0.13982929 0.05443362 0.03695887 0.02360533
 0.01246401 0.01139741 0.00994885 0.00978093 0.00850084 0.00827805
 0.00492581 0.00440486 0.00425295 0.0039036  0.00376875 0.00347895
 0.00300954 0.00261462 0.00246979 0.00233867 0.00208004 0.00193924
 0.00186045 0.00174915 0.00167676 0.00162138 0.00139919 0.00132777
 0.00128967 0.00127262 0.00119817 0.00116781 0.0011174  0.00108111
 0.00098279 0.00094674 0.00092223 0.00090098 0.00085933 0.00084897
 0.00082996 0.00078679 0.00075721 0.0007248

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
