# Modeling


In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score
import numpy as np

# ---------------------------------------------------------------------------
# Data loading
# ---------------------------------------------------------------------------
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

# Keep the test ids aside: they are needed for the submission file but are
# dropped from the feature frames below.
test_ids = test_data['id']

# The id column is a row identifier rather than a feature, so it is removed
# from both frames before modelling.
train_data = train_data.drop(columns=['id'])
test_data = test_data.drop(columns=['id'])

# Label column and feature matrix taken from the training frame.
y = train_data['Target']
X = train_data.drop(columns=['Target'])

# ---------------------------------------------------------------------------
# Encoding
# ---------------------------------------------------------------------------
# Every object- or int64-typed feature is treated as categorical.
categorical_columns = X.select_dtypes(include=['object', 'int64']).columns

# One encoder per column, fit on the union of train and test values so that
# categories appearing only in the test file can still be transformed.
label_encoders = {}
for col in categorical_columns:
    if col not in test_data.columns:
        continue
    encoder = LabelEncoder()
    encoder.fit(pd.concat([X[col], test_data[col]]))
    label_encoders[col] = encoder
    X[col] = encoder.transform(X[col])
    test_data[col] = encoder.transform(test_data[col])

# ---------------------------------------------------------------------------
# Imputation and scaling
# ---------------------------------------------------------------------------
# NOTE(review): both transformers are fit on every training row before the
# hold-out split and the cross-validation below, so fold statistics leak into
# the evaluation. Consider wrapping them in a Pipeline with each model.
imputer = SimpleImputer(strategy='mean')
imputer.fit(X)
X = imputer.transform(X)
test_data = imputer.transform(test_data)

scaler = StandardScaler()
scaler.fit(X)
X = scaler.transform(X)
test_data = scaler.transform(test_data)

# ---------------------------------------------------------------------------
# Hold-out split (80% train / 20% validation, fixed seed)
# ---------------------------------------------------------------------------
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# ---------------------------------------------------------------------------
# Candidate models, keyed by display name
# ---------------------------------------------------------------------------
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42),
    "K-Nearest Neighbors": KNeighborsClassifier(),
    "Support Vector Machine": SVC(random_state=42),
}

# Cross-validated scoring helper shared by all candidate models
def evaluate_model(model, X, y, cv=5):
    """Cross-validate *model* on (X, y) and return the raw score dictionary.

    Accuracy is scored as-is; precision, recall and F1 are computed with
    ``average='weighted'``. The value returned is whatever
    ``cross_validate`` produces, so per-fold test scores are found under the
    keys ``test_accuracy``, ``test_precision``, ``test_recall`` and
    ``test_f1``. Train scores are not computed.
    """
    weighted_metrics = (
        ('precision', precision_score),
        ('recall', recall_score),
        ('f1', f1_score),
    )
    scoring = {'accuracy': make_scorer(accuracy_score)}
    for key, metric in weighted_metrics:
        scoring[key] = make_scorer(metric, average='weighted')
    return cross_validate(
        model, X, y, cv=cv, scoring=scoring, return_train_score=False
    )

# Evaluate each model with cross-validation on the training split and keep
# the mean and standard deviation of every metric across folds.
metric_names = ('accuracy', 'precision', 'recall', 'f1')
results = {}
for model_name, model in models.items():
    scores = evaluate_model(model, X_train, y_train)
    summary = {}
    for metric in metric_names:
        fold_scores = scores[f'test_{metric}']
        summary[f'mean_{metric}'] = np.mean(fold_scores)
        summary[f'std_{metric}'] = np.std(fold_scores)
    results[model_name] = summary

# Display the cross-validation results (one row per model)
results_df = pd.DataFrame(results).T
print(results_df)

# Choose the best model based on the highest mean cross-validated F1 score
best_model_name = results_df['mean_f1'].idxmax()
best_model = models[best_model_name]

# Sanity-check the chosen model on the hold-out split, which is otherwise
# never used: fit on the training split only, then score on X_val.
best_model.fit(X_train, y_train)
val_accuracy = accuracy_score(y_val, best_model.predict(X_val))
print(f"{best_model_name} hold-out accuracy: {val_accuracy:.4f}")

# Retrain the best model on the entire training set (training + validation
# rows) so the final predictions use all labelled data. After this point
# best_model has seen X_val, so X_val is no longer an unbiased check for it.
best_model.fit(X, y)

# Predict on the test dataset using the best model
test_predictions = best_model.predict(test_data)

# Pair each prediction with its original test id
test_predictions_df = pd.DataFrame({'id': test_ids, 'Target': test_predictions})

# Save the predictions to a CSV file
test_predictions_df.to_csv('test_predictions.csv', index=False)

print("Predictions saved to test_predictions.csv")


                        mean_accuracy  std_accuracy  mean_precision  \
Logistic Regression          0.814667      0.001256        0.811317   
Random Forest                0.824632      0.002490        0.823115   
Gradient Boosting            0.825792      0.003517        0.824904   
K-Nearest Neighbors          0.775199      0.004028        0.769939   
Support Vector Machine       0.819862      0.002738        0.819797   

                        std_precision  mean_recall  std_recall   mean_f1  \
Logistic Regression          0.001234     0.814667    0.001256  0.810798   
Random Forest                0.002105     0.824632    0.002490  0.822407   
Gradient Boosting            0.003203     0.825792    0.003517  0.824041   
K-Nearest Neighbors          0.004652     0.775199    0.004028  0.771380   
Support Vector Machine       0.002373     0.819862    0.002738  0.817119   

                          std_f1  
Logistic Regression     0.001208  
Random Forest           0.002204  
Gradient Bo