In [127]:
# Data manipulation and analysis
import pandas as pd
import numpy as np

# Data visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Machine Learning
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
# from sklearn.impute import SimpleImputer
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# EDA with pandas-profiling
import ydata_profiling as pdp

# Miscellaneous
import warnings
warnings.filterwarnings('ignore')

# Regex
import re

# Read the training CSV into the working DataFrame
df = pd.read_csv(filepath_or_buffer="titanic/train.csv")

# Optional (left disabled): build an HTML profiling report of the loaded data
# profile = pdp.ProfileReport(df, title="Pandas Profiling Report")
# profile.to_file("pandas_profiling_report.html")  # Save report as HTML

## Data Preparation

### Data Cleaning

In [128]:
# Columns removed before any further cleaning
columns_to_drop = ['PassengerId', 'Ticket', 'Cabin']

# Re-read the CSV and drop those columns in one step; because the file is
# reloaded here, re-running this cell always starts from the same data.
df = pd.read_csv("titanic/train.csv").drop(columns=columns_to_drop)


#### Missing Values

In [129]:
# Number of missing values in each column
missing_count = df.isnull().sum()

# Share of rows (in percent) that are missing in each column
missing_percentage = missing_count / len(df) * 100

# Combine the count and percentage into one summary table
df_missing = pd.DataFrame({
    'Missing Count': missing_count,
    'Missing Percentage': missing_percentage
})

# Keep only columns that actually have missing values, most affected first
df_missing = df_missing[df_missing['Missing Percentage'] > 0]
df_missing = df_missing.sort_values(by='Missing Count', ascending=False)

# Display the summary (computed before any imputation below)
print(df_missing)


# Fill missing values in 'Embarked' with the most frequent value.
# The result is assigned back to the column: calling fillna(..., inplace=True)
# on df['Embarked'] operates on an intermediate object, which is deprecated in
# pandas 2.x and leaves df unchanged under Copy-on-Write.
most_frequent_embarked = df['Embarked'].mode()[0]
df['Embarked'] = df['Embarked'].fillna(most_frequent_embarked)

# Fill missing values in 'Age' with the median value (same assignment form)
median_age = df['Age'].median()
df['Age'] = df['Age'].fillna(median_age)


          Missing Count  Missing Percentage
Age                 177           19.865320
Embarked              2            0.224467


#### Outliers

In [130]:
# Restrict the analysis to numeric columns and measure their skewness
numeric_columns = df.select_dtypes(include=['number'])
skewness_values = numeric_columns.skew()

# Tabulate skewness as (Column, Skewness) rows, most skewed first
df_skew = (
    skewness_values
    .rename_axis('Column')
    .reset_index(name='Skewness')
    .sort_values(by='Skewness', ascending=False)
)

print("Skewness of numeric columns:")
print(df_skew)

# Columns whose skewness exceeds 1 are the ones that get capped
columns_to_cap = df_skew.loc[df_skew['Skewness'] > 1, 'Column']

# Replace every value above the column's 95th percentile with that percentile
for col in columns_to_cap:
    col_values = df[col]
    upper_cap = col_values.quantile(0.95)
    df[col] = np.where(col_values > upper_cap, upper_cap, col_values)

# Report the skewness of the capped columns again
print("\nNew skewness after capping:")
print(df[columns_to_cap].skew())


Skewness of numeric columns:
     Column  Skewness
5      Fare  4.787317
3     SibSp  3.695352
4     Parch  2.749117
2       Age  0.510245
0  Survived  0.478523
1    Pclass -0.630548

New skewness after capping:
Fare     1.717339
SibSp    1.938712
Parch    1.679480
dtype: float64


#### Duplicates

In [131]:
# Rows that repeat an earlier row across all columns
is_duplicate = df.duplicated()
duplicate_rows = df.loc[is_duplicate]

# Show them (an empty frame means there are no duplicates)
print("Duplicate Rows:")
print(duplicate_rows)


Duplicate Rows:
Empty DataFrame
Columns: [Survived, Pclass, Name, Sex, Age, SibSp, Parch, Fare, Embarked]
Index: []


### Feature Engineering

In [132]:
# Regex covering a variety of academic, professional, noble, clerical and
# military titles.
# The alternatives sit in a non-capturing group, opened by \b and closed by
# (?!\w) rather than a trailing \b. A trailing \b can never match after a
# title that ends in '.' when a space or the end of the string follows
# (e.g. "Rev. Thomas"), so those titles were silently missed.
title_pattern = r'\b(?:Dr|Prof|Ph\.D\.|M\.Sc\.|B\.Sc\.|M\.A\.|B\.A\.|MBA|MD|DDS|DVM|JD|LLD|Sir|Dame|Lord|Lady|Baron|Baroness|Rev\.|Father|Sister|Capt|Col|Major|Lt|Sgt|Admiral|General|Eng\.|Architect|Attorney)(?!\w)'

# 'Title' is 1 when the name contains one of the titles above, otherwise 0.
# str.contains performs the same case-sensitive search as re.search, but
# vectorised; na=False maps a missing name to 0 instead of raising.
df['Title'] = (
    df['Name']
    .str.contains(title_pattern, regex=True, na=False)
    .astype('int64')
)

# The raw name is no longer needed once the flag has been derived
df = df.drop(columns=['Name'])


### Data Transformation

In [133]:
# Treat passenger class as a categorical variable, then list every
# categorical (object / category) column
df['Pclass'] = df['Pclass'].astype('category')
categorical_columns = df.select_dtypes(include=['object', 'category']).columns

# Display the names of all categorical columns
print("Categorical Variables:")
print(categorical_columns)

# Label encode the 'Sex' column
label_encoder = LabelEncoder()
df['Sex'] = label_encoder.fit_transform(df['Sex'])

# One-hot encode the 'Pclass' and 'Embarked' columns (all levels kept)
df = pd.get_dummies(df, columns=['Pclass', 'Embarked'], drop_first=False)

# Step 1: Convert all boolean columns to integers (0 and 1).
# A column-wise astype replaces the element-wise DataFrame.applymap call,
# which is deprecated since pandas 2.1; if there are no boolean columns this
# is a no-op.
bool_columns = df.select_dtypes(include='bool').columns
df = df.astype({column: 'int64' for column in bool_columns})

# Step 2: Ensure all columns are numeric (raises if a column cannot be parsed)
df = df.apply(pd.to_numeric)

# Display the first few rows to verify the transformations
print(df.head())




Categorical Variables:
Index(['Pclass', 'Sex', 'Embarked'], dtype='object')
   Survived  Sex   Age  SibSp  Parch     Fare  Title  Pclass_1  Pclass_2  \
0         0    1  22.0    1.0    0.0   7.2500      0         0         0   
1         1    0  38.0    1.0    0.0  71.2833      0         1         0   
2         1    0  26.0    0.0    0.0   7.9250      0         0         0   
3         1    0  35.0    1.0    0.0  53.1000      0         1         0   
4         0    1  35.0    0.0    0.0   8.0500      0         0         0   

   Pclass_3  Embarked_C  Embarked_Q  Embarked_S  
0         1           0           0           1  
1         0           1           0           0  
2         1           0           0           1  
3         0           0           0           1  
4         1           0           0           1  
