# Modeling


In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score
import numpy as np

# Read the training and test splits from the working directory
train_data, test_data = (
    pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv')
)

# Keep the test-set passenger IDs; they are needed later to label predictions
test_ids = test_data['PassengerId']

# Function to preprocess data
def preprocess_data(df):
    """Return a numeric, model-ready copy of a passenger DataFrame.

    The input must contain the columns 'PassengerId', 'Name', 'Sex', 'Age',
    'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin' and 'Embarked'. Any other
    column (e.g. 'Survived') is passed through unchanged.

    Steps performed:
      * 'Title' is extracted from 'Name' (the word preceding a '.'),
        rare titles are grouped, and the result is mapped to 1-5
        (0 for any title not in the mapping or not found).
      * 'Sex' is mapped to 0 (male) / 1 (female).
      * Missing 'Age' and 'Fare' values are filled with the column median
        of the frame being processed.
      * Missing 'Embarked' values are filled with the column mode, then
        mapped to 0 (C) / 1 (Q) / 2 (S).
      * 'FamilySize' = 'SibSp' + 'Parch' + 1 is added.
      * 'PassengerId', 'Name', 'Ticket' and 'Cabin' are dropped.

    Args:
        df: pandas DataFrame with the columns listed above.

    Returns:
        A new DataFrame; the frame passed in is left unmodified.

    Raises:
        ValueError / pandas casting error: if 'Sex' or 'Embarked' contains a
            value outside the mappings above (the integer cast cannot
            represent the resulting NaN).
    """
    # Work on a copy so the caller's frame is not altered as a side effect.
    df = df.copy()

    # Extract titles from names. The pattern is a raw string: '\.' in a
    # normal string literal is an invalid escape sequence and triggers a
    # warning on recent Python versions.
    df['Title'] = df['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)

    # Replace rare titles with 'Rare'
    rare_titles = ['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr',
                   'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona']
    df['Title'] = df['Title'].replace(rare_titles, 'Rare')

    # Normalise French / alternative spellings to their common equivalents
    df['Title'] = df['Title'].replace('Mlle', 'Miss')
    df['Title'] = df['Title'].replace('Ms', 'Miss')
    df['Title'] = df['Title'].replace('Mme', 'Mrs')

    # Map titles to ordinal values; anything unmapped (or missing) becomes 0
    title_mapping = {'Mr': 1, 'Miss': 2, 'Mrs': 3, 'Master': 4, 'Rare': 5}
    df['Title'] = df['Title'].map(title_mapping)
    df['Title'] = df['Title'].fillna(0)

    # Convert Sex to numeric
    df['Sex'] = df['Sex'].map({'male': 0, 'female': 1}).astype(int)

    # Fill missing Age values with the median, using the same plain pandas
    # fill as Fare below instead of a separate imputer object.
    df['Age'] = df['Age'].fillna(df['Age'].median())

    # Fill missing Embarked values with the most frequent port, then encode
    df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])
    df['Embarked'] = df['Embarked'].map({'C': 0, 'Q': 1, 'S': 2}).astype(int)

    # Fill missing Fare values
    df['Fare'] = df['Fare'].fillna(df['Fare'].median())

    # Create new feature FamilySize (siblings/spouses + parents/children + self)
    df['FamilySize'] = df['SibSp'] + df['Parch'] + 1

    # Drop identifier / free-text columns that are not used as features
    df = df.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1)

    return df

# Run both frames through the shared preprocessing step
train_data = preprocess_data(train_data)
test_data = preprocess_data(test_data)

# Target column first, then every remaining column as a feature
y = train_data['Survived']
X = train_data.drop(columns='Survived')

# Hold out 20% of the labelled rows; the seed keeps the split reproducible
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Candidate classifiers, keyed by the display name used in the results table
models = {
    "Logistic Regression": LogisticRegression(
        max_iter=1000,
        random_state=42,
    ),
    "Random Forest": RandomForestClassifier(random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42),
    "K-Nearest Neighbors": KNeighborsClassifier(),
    "Support Vector Machine": SVC(random_state=42),
}

# Function to evaluate models using cross-validation with multiple metrics
def evaluate_model(model, X, y, cv=5):
    """Cross-validate ``model`` on ``(X, y)`` with four classification metrics.

    Accuracy is scored as-is; precision, recall and F1 are scored with
    ``average='weighted'``.

    Args:
        model: estimator passed to ``cross_validate``.
        X: feature matrix.
        y: target labels.
        cv: cross-validation argument forwarded to ``cross_validate``
            (default 5).

    Returns:
        The dict returned by ``cross_validate``; per-fold test scores are
        under 'test_accuracy', 'test_precision', 'test_recall' and
        'test_f1'. Train scores are not computed.
    """
    weighted_metrics = {
        'precision': precision_score,
        'recall': recall_score,
        'f1': f1_score,
    }

    # Accuracy needs no averaging mode; the rest share average='weighted'.
    scoring = {'accuracy': make_scorer(accuracy_score)}
    for metric_name, metric_fn in weighted_metrics.items():
        scoring[metric_name] = make_scorer(metric_fn, average='weighted')

    return cross_validate(
        model, X, y, cv=cv, scoring=scoring, return_train_score=False
    )

# Evaluate each model using cross-validation on the 80% training split.
# For every metric, record the mean and standard deviation across folds.
metric_names = ('accuracy', 'precision', 'recall', 'f1')
results = {}
for model_name, model in models.items():
    scores = evaluate_model(model, X_train, y_train)
    summary = {}
    for metric in metric_names:
        fold_scores = scores[f'test_{metric}']
        summary[f'mean_{metric}'] = np.mean(fold_scores)
        summary[f'std_{metric}'] = np.std(fold_scores)
    results[model_name] = summary

# Display the cross-validation results (one row per model)
results_df = pd.DataFrame(results).T
print(results_df)

# Given the minimal difference in F1-scores and better overall performance in other metrics,
# Gradient Boosting can be chosen as the best model.

# Choose Gradient Boosting instead of Logistic Regression
best_model = models["Gradient Boosting"]

# Retrain the best model on the entire labelled training set (X, y) rather
# than only the 80% split used for cross-validation, so the final model
# sees every labelled row before predicting on the test set.
best_model.fit(X, y)

# Predict on the test dataset using the best model
test_predictions = best_model.predict(test_data)

# Create a DataFrame to store the predictions, keyed by the original test IDs
test_predictions_df = pd.DataFrame({'PassengerId': test_ids, 'Survived': test_predictions})

# Save the predictions to a CSV file
test_predictions_df.to_csv('test_predictions.csv', index=False)

print("Predictions saved to test_predictions.csv")

  df['Title'] = df['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)


                        mean_accuracy  std_accuracy  mean_precision  \
Logistic Regression          0.818783      0.027458        0.817634   
Random Forest                0.800611      0.020646        0.800845   
Gradient Boosting            0.820201      0.007703        0.819618   
K-Nearest Neighbors          0.723372      0.023060        0.721927   
Support Vector Machine       0.678322      0.017733        0.687164   

                        std_precision  mean_recall  std_recall   mean_f1  \
Logistic Regression          0.028035     0.818783    0.027458  0.817201   
Random Forest                0.019265     0.800611    0.020646  0.799739   
Gradient Boosting            0.009113     0.820201    0.007703  0.816762   
K-Nearest Neighbors          0.023271     0.723372    0.023060  0.722304   
Support Vector Machine       0.039842     0.678322    0.017733  0.629891   

                          std_f1  
Logistic Regression     0.028974  
Random Forest           0.019447  
Gradient Bo