In [2]:
# Make files stored on Google Drive reachable from this Colab runtime.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [44]:
# Import necessary libraries

import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.svm import SVC

In [7]:
# Location of the dataset on the mounted Drive.
path_to_csv = '/content/drive/MyDrive/Colab Notebooks/tested.csv'

# Load the CSV into the DataFrame used by the rest of the notebook.
df = pd.read_csv(filepath_or_buffer=path_to_csv)

In [48]:
# Separate the target (Y) from the predictors (X).
# 'Survived' is assumed to be the label; every other column of df becomes a predictor.
Y = df['Survived']  # target column
X = df.drop(columns=['Survived'])  # everything except the target
# NOTE(review): the preview printed by this cell still lists 'Unnamed: 0' and
# 'PassengerId' among the predictors - presumably row identifiers rather than
# real features; confirm whether they should be fed to the model.
X.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,892,3,male,34.5,0,0,7.8292,Q
1,893,3,female,47.0,1,0,7.0,S
2,894,2,male,62.0,0,0,9.6875,Q
3,895,3,male,27.0,0,0,8.6625,S
4,896,3,female,22.0,1,1,12.2875,S


In [10]:
# Report how many distinct labels the target holds and name the task type.
print(f"Number of categories in target Y: {len(Y.unique())}")
if len(Y.unique()) == 2:
    print("Binary classification")
else:
    # Any count other than exactly two is reported as multiclass.
    print("Multiclass classification")

Number of categories in target Y: 2
Binary classification


In [12]:
# Preprocessing: replace missing numerical values with the column mean.
# Only float64/int64 columns are touched; other columns are left as they are.
numerical_X = X.select_dtypes(include=['float64', 'int64'])

# NOTE(review): the means are taken over every row of X, i.e. before the
# train/test split performed later in the notebook, so held-out rows
# contribute to the fill values.
column_means = numerical_X.mean()
X[numerical_X.columns] = numerical_X.fillna(value=column_means)

In [34]:
# Create a column transformer for preprocessing.
#
# The feature lists are built from X in this cell so the notebook still runs
# after "Restart & Run All" (no other visible cell defines them).
# NOTE(review): 'Unnamed: 0' and 'PassengerId' look like row identifiers in the
# X.head() preview and are therefore left out of the model inputs - confirm.
ID_COLUMNS = ['Unnamed: 0', 'PassengerId']
numerical_features = [
    col for col in X.select_dtypes(include='number').columns
    if col not in ID_COLUMNS
]
categorical_features = X.select_dtypes(exclude='number').columns.tolist()

# Columns not listed in either group are dropped by ColumnTransformer's
# default remainder='drop'.
preprocessor = ColumnTransformer(
    transformers=[
        # Numerical features: fill gaps with the mean learned during fit, then scale.
        ('num', Pipeline([
            ('imputer', SimpleImputer(strategy='mean')),
            ('scaler', StandardScaler()),
        ]), numerical_features),
        # Categorical features: fill gaps with the most frequent value, then
        # one-hot encode. handle_unknown='ignore' keeps transform from raising
        # when a category was absent from the data the encoder was fit on
        # (e.g. a rare value missing from one cross-validation fold).
        ('cat', Pipeline([
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('onehot', OneHotEncoder(handle_unknown='ignore')),
        ]), categorical_features),
    ]
)

In [35]:
# Chain the preprocessing step and the classifier into a single estimator,
# so both are fit together on whatever data the pipeline is given.
svm_classifier = SVC()  # Support Vector Classifier with default settings
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('svm', svm_classifier),
])

In [36]:
# Hyperparameter search space for the SVM step of the pipeline.
# Keys use the '<step name>__<parameter>' form so they reach the 'svm' step.
svm_c_values = [0.1, 1, 10]          # regularization strength candidates
svm_kernel_values = ['linear', 'rbf']  # kernel types to compare
svm_gamma_values = ['scale', 'auto']   # kernel coefficient settings

param_grid = {
    'svm__C': svm_c_values,
    'svm__kernel': svm_kernel_values,
    'svm__gamma': svm_gamma_values,
}

In [37]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    random_state=42,
    test_size=0.2,
)

In [38]:
# Exhaustive search over param_grid, scored by accuracy with 4-fold cross-validation.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='accuracy',
    cv=4,
)
# Fit on the training portion only; the test split stays untouched until evaluation.
grid_search.fit(X_train, Y_train)

In [39]:
# Print the best parameters found
# best_params_ holds the parameter combination selected by the fitted grid search.
print(f"Best parameters: {grid_search.best_params_}")

Best parameters: {'svm__C': 0.1, 'svm__gamma': 'scale', 'svm__kernel': 'linear'}


In [40]:
# Predict the held-out rows with the fitted grid search and score them.
Y_pred = grid_search.predict(X_test)
accuracy = accuracy_score(y_true=Y_test, y_pred=Y_pred)
print(f"Test set accuracy: {accuracy:.2f}")


Test set accuracy: 1.00


In [45]:
# Compute F1-score, precision and recall on the test predictions in one pass.
# Each metric takes the true labels first and the predictions second.
f1, precision, recall = (
    metric(Y_test, Y_pred)
    for metric in (f1_score, precision_score, recall_score)
)


In [46]:
# Summarize all test-set metrics, one per line, rounded to two decimals.
print(
    f"Test set accuracy: {accuracy:.2f}",
    f"F1 Score: {f1:.2f}",
    f"Precision: {precision:.2f}",
    f"Recall: {recall:.2f}",
    sep="\n",
)


Test set accuracy: 1.00
F1 Score: 1.00
Precision: 1.00
Recall: 1.00


In [42]:
# Per-class precision / recall / F1 table for the test predictions.
report = classification_report(Y_test, Y_pred)
print("Classification Report:")
print(report)


Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        50
           1       1.00      1.00      1.00        34

    accuracy                           1.00        84
   macro avg       1.00      1.00      1.00        84
weighted avg       1.00      1.00      1.00        84

