In [24]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline

# Load the dataset
data = pd.read_csv('train.csv')

# Display the first few rows of the dataset
print(data.head())

# Display summary statistics
print(data.describe())

# Display information about the dataset.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
data.info()

# Check for missing values
print(data.isnull().sum())

# Fill missing values for 'Age' with the median age and 'Embarked' with the mode.
# The result is assigned back to the column: calling fillna(..., inplace=True)
# on a column selection is chained assignment, which is deprecated in pandas
# and does not update the parent frame under Copy-on-Write.
data['Age'] = data['Age'].fillna(data['Age'].median())
data['Embarked'] = data['Embarked'].fillna(data['Embarked'].mode()[0])

# Drop 'Cabin' (too many missing values) together with 'Ticket', 'PassengerId'
# and 'Name', which are not used as features in this model.
data = data.drop(columns=['Cabin', 'Ticket', 'PassengerId', 'Name'])

# Encode 'Sex' column as integers
label_encoder = LabelEncoder()
data['Sex'] = label_encoder.fit_transform(data['Sex'])

# One-hot encode 'Embarked' column (first level dropped as the baseline)
data = pd.get_dummies(data, columns=['Embarked'], drop_first=True)

# Features and target variable
X = data.drop(columns=['Survived'])
y = data['Survived']

# Split the data into training and testing sets before applying SMOTE
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Evaluate using cross-validation.
# Resampling and scaling are wrapped in an imblearn Pipeline so that, inside
# each fold, SMOTE and the scaler are fitted on that fold's training part only.
# Cross-validating on data that was resampled/scaled up front lets synthetic
# samples and scaling statistics leak into the validation folds and inflates
# the reported accuracy. The pipeline applies SMOTE during fit only, so the
# validation folds are scored on real, un-resampled rows.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_pipeline = Pipeline(steps=[
    ('smote', SMOTE(random_state=42)),
    ('scaler', StandardScaler()),
    ('rf', RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42)),
])
cross_val_scores = cross_val_score(cv_pipeline, X_train, y_train, cv=cv)

print(f'Cross-Validation Accuracy Scores: {cross_val_scores}')
print(f'Mean Cross-Validation Accuracy: {cross_val_scores.mean()}')

# Apply SMOTE to handle class imbalance on the training set only
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

# Scale the data: statistics come from the (resampled) training set and are
# re-used unchanged for the held-out test set.
scaler = StandardScaler()
X_train_res_scaled = scaler.fit_transform(X_train_res)
X_test_scaled = scaler.transform(X_test)

# Train a depth-limited Random Forest on the whole training set and predict
# on the held-out test set
rf = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42)
rf.fit(X_train_res_scaled, y_train_res)
y_pred = rf.predict(X_test_scaled)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print('Confusion Matrix:')
print(conf_matrix)
print('Classification Report:')
print(class_report)


   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500   NaN        S  
  

In [28]:
import pandas as pd
# Load the Titanic dataset
data = pd.read_csv('train.csv')


# Drop unnecessary columns
data = data.drop(columns=['PassengerId', 'Cabin', 'Ticket'])

# Extract titles from the 'Name' column: the first run of letters that is
# immediately followed by a period. The pattern is a raw string so that `\.`
# reaches the regex engine as an escaped dot instead of being an invalid
# string escape (a SyntaxWarning on recent Python versions).
data['Title'] = data['Name'].str.extract(r'([A-Za-z]+)\.', expand=False)

# Map similar titles together
title_mapping = {
    'Mr': 'Mr',
    'Miss': 'Miss',
    'Mrs': 'Mrs',
    'Master': 'Master',
    'Dr': 'Rare',
    'Rev': 'Rare',
    'Col': 'Rare',
    'Major': 'Rare',
    'Mlle': 'Miss',
    'Countess': 'Royalty',
    'Ms': 'Miss',
    'Lady': 'Royalty',
    'Jonkheer': 'Royalty',
    'Don': 'Royalty',
    'Dona': 'Royalty',
    'Mme': 'Mrs',
    'Capt': 'Rare',
    'Sir': 'Royalty'
}
# Titles missing from the mapping (or names with no title at all) would map to
# NaN and, after one-hot encoding, silently merge into the all-False baseline
# row. Route them to the 'Rare' bucket instead.
data['Title'] = data['Title'].map(title_mapping).fillna('Rare')

# Drop the 'Name' column as it is no longer needed
data = data.drop(columns=['Name'])

# Fill missing values. Results are assigned back to the column rather than
# using fillna(..., inplace=True) on a column selection, which is chained
# assignment and does not update the frame under pandas Copy-on-Write.
data['Age'] = data['Age'].fillna(data['Age'].median())
data['Embarked'] = data['Embarked'].fillna(data['Embarked'].mode()[0])
data['Fare'] = data['Fare'].fillna(data['Fare'].median())

# Encode categorical features
label_encoder = LabelEncoder()
data['Sex'] = label_encoder.fit_transform(data['Sex'])
data = pd.get_dummies(data, columns=['Embarked', 'Title'], drop_first=True)

# Analyze survival counts by title.
# Because of drop_first=True the alphabetically first title ('Master') has no
# dummy column of its own; it is the row in which every Title_* flag is False.
title_survival = data[['Title_Mr', 'Title_Miss', 'Title_Mrs',  'Title_Royalty', 'Title_Rare', 'Survived']]
title_survival_grouped = title_survival.groupby(title_survival.columns.tolist()).size().unstack().fillna(0)

# Display the survival counts by title (columns: Survived = 0 / 1)
print(title_survival_grouped)


Survived                                                  0    1
Title_Mr Title_Miss Title_Mrs Title_Royalty Title_Rare          
False    False      False     False         False        17   23
                                            True         13    5
                              True          False         2    3
                    True      False         False        26  100
         True       False     False         False        55  130
True     False      False     False         False       436   81
