# **Task_28**

In [1]:
import seaborn as sns
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [2]:
# Fetch seaborn's bundled "titanic" example dataset as a DataFrame;
# every later cell works from this `titanic` frame.
titanic = sns.load_dataset(name='titanic')

In [3]:
# Keep only passengers for which every modelling column (the five features
# plus the target) is present; a row is dropped if any of them is missing.
required_columns = ['age', 'sex', 'fare', 'class', 'embarked', 'survived']
titanic = titanic.dropna(axis=0, how='any', subset=required_columns)



In [4]:
# Convert categorical features to numerical
# Integer code for each label. The `class` codes follow the
# First/Second/Third order; the `sex` and `embarked` codes are arbitrary.
SEX_CODES = {'male': 0, 'female': 1}
CLASS_CODES = {'First': 1, 'Second': 2, 'Third': 3}
EMBARKED_CODES = {'C': 0, 'Q': 1, 'S': 2}

# Encode only while the columns still hold their original labels. Without this
# guard, re-running the cell would look the integer codes up in the string-keyed
# dicts above and silently turn all three columns into NaN.
if not pd.api.types.is_numeric_dtype(titanic['sex']):
    # `assign` builds a new frame instead of writing column-by-column into the
    # result of `dropna`, which avoids SettingWithCopyWarning.
    # `class` is a Python keyword, hence the dict-unpacking form.
    titanic = titanic.assign(**{
        'sex': titanic['sex'].map(SEX_CODES),
        # `class` is loaded as a categorical column; cast the mapped result to a
        # plain integer dtype so all features are ordinary numeric columns.
        'class': titanic['class'].map(CLASS_CODES).astype('int64'),
        'embarked': titanic['embarked'].map(EMBARKED_CODES),
    })


In [7]:
# Feature matrix (five predictors) and binary target
feature_columns = ['age', 'sex', 'fare', 'class', 'embarked']
X = titanic.loc[:, feature_columns]
y = titanic.loc[:, 'survived']

# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)


In [6]:
# Standardize the features.
# The scaler learns its statistics from the training split only, and that same
# fitted transform is then applied to both the training and the testing split.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

# **Cross-Validation**

In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline

# Define (not yet fit) the estimator to be cross-validated.
# Scaling and the classifier are wrapped in one pipeline so that, inside
# cross_val_score, the scaler is re-fitted on each fold's training part only.
# Scaling the whole training set once beforehand lets every validation fold
# contribute to the mean/variance used to transform it, which leaks information
# and makes the scores slightly optimistic. Standardization is unaffected by a
# previous rescaling of the same columns, so this is correct whether or not
# X_train has already been standardized.
model = make_pipeline(StandardScaler(), LogisticRegression())

# Perform 5-fold cross-validation on the training split only; the test split
# stays untouched for the final evaluation.
cv_scores = cross_val_score(model, X_train, y_train, cv=5)
print("Cross-Validation Scores:", cv_scores)
print("Mean Cross-Validation Score:", cv_scores.mean())


Cross-Validation Scores: [0.71       0.85       0.84       0.76767677 0.80808081]
Mean Cross-Validation Score: 0.7951515151515152


**Cross-Validation:** Helps assess how well the model generalizes to unseen data by averaging its score over several train/validation splits of the training set.

# **Overfitting**

In [9]:
from sklearn.ensemble import RandomForestClassifier

# High-capacity model: a 100-tree random forest, seeded for repeatability,
# fitted on the training split.
complex_model = RandomForestClassifier(random_state=42, n_estimators=100).fit(
    X_train, y_train
)

# Accuracy on the data the forest was fitted on vs. on the held-out data;
# a large gap between the two is the sign of overfitting.
train_score, test_score = (
    complex_model.score(features, labels)
    for features, labels in ((X_train, y_train), (X_test, y_test))
)

print("Training Score (Random Forest):", train_score)
print("Testing Score (Random Forest):", test_score)


Training Score (Random Forest): 0.9919678714859438
Testing Score (Random Forest): 0.7616822429906542


**Overfitting:** Demonstrated by a complex model (Random Forest) performing significantly better on the training set than on the testing set.

# **Underfitting**

In [10]:
from sklearn.tree import DecisionTreeClassifier

# Deliberately low-capacity model: a decision tree limited to depth 1,
# i.e. at most a single split.
simple_model = DecisionTreeClassifier(random_state=42, max_depth=1)
simple_model.fit(X_train, y_train)

# Score the tree on both splits first, then report the two accuracies together
train_score_simple = simple_model.score(X_train, y_train)
test_score_simple = simple_model.score(X_test, y_test)

print("Training Score (Decision Tree):", train_score_simple)
print("Testing Score (Decision Tree):", test_score_simple)


Training Score (Decision Tree): 0.7911646586345381
Testing Score (Decision Tree): 0.7523364485981309


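**Underfitting:** Demonstrated by a simple model (Decision Tree with max depth 1) that is too constrained to fit even the training data well: its training score (≈0.79) is far below the Random Forest's (≈0.99) and close to its own testing score (≈0.75), so the model performs comparably modestly on both the training set and the testing set.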