In [84]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.utils import resample  # Alternative to SMOTE


In [5]:
# Apply seaborn's "whitegrid" style to plots created after this cell.
sns.set_style("whitegrid")

In [12]:
# Loading dataset
# The CSV location can be overridden with the TITANIC_CSV environment variable so the
# notebook is not tied to a single machine; the original local path stays the default.
import os

file_path = os.environ.get(
    "TITANIC_CSV",
    "C:\\Users\\Dhanush\\Documents\\CODSOFT\\task 1\\Titanic-Dataset.csv",
)
df = pd.read_csv(file_path)

# First rows, to eyeball the columns that were read.
print(df.head())

# Check for missing values
print(df.isnull().sum())

# Summary statistics of the numeric columns.
print(df.describe())


   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500   NaN        S  
Pa

In [13]:
# Per-column count of missing entries (isna is the same check as isnull).
df.isna().sum()


PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [20]:
# Split the rows by the value of the 'Survived' label column (0 vs 1) and
# report the size of each part.
survived_label = df['Survived']
df_majority = df[survived_label == 0]
df_minority = df[survived_label == 1]

print(df_majority.shape, df_minority.shape)


(549, 12) (342, 12)


In [21]:
# Print, in order: column dtypes, the first rows, and (rows, columns).
for summary in (df.dtypes, df.head(), df.shape):
    print(summary)


PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object
   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket    

In [17]:
# First rows followed by the (rows, columns) tuple, one per line.
print(df.head(), df.shape, sep="\n")


   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500   NaN        S  
(8

In [36]:
# Normalise column names: trim surrounding whitespace, then lower-case everything.
trimmed_names = df.columns.str.strip()
df.columns = trimmed_names.str.lower()
print(df.columns)  # Verify changes


Index(['passengerid', 'survived', 'pclass', 'name', 'sex', 'age', 'sibsp',
       'parch', 'ticket', 'fare', 'cabin', 'embarked'],
      dtype='object')


In [37]:
# Trim surrounding whitespace from every column name and show the result.
stripped_names = df.columns.str.strip()
df.columns = stripped_names
print(df.columns)


Index(['passengerid', 'survived', 'pclass', 'name', 'sex', 'age', 'sibsp',
       'parch', 'ticket', 'fare', 'cabin', 'embarked'],
      dtype='object')


In [34]:
# Split rows by outcome label. Column names were lower-cased earlier in the notebook,
# so the label column is "survived"; indexing it directly also makes a missing column
# fail with a clear KeyError instead of silently comparing against an empty Series.
df_majority = df[df["survived"] == 0]
df_minority = df[df["survived"] == 1]


In [38]:
# Show the object's type and its first rows, one per line.
print(type(df), df.head(), sep="\n")


<class 'pandas.core.frame.DataFrame'>
   passengerid  survived  pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                name     sex   age  sibsp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   parch            ticket     fare cabin embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0         

In [39]:
# Preview the first five rows (5 is head()'s default).
print(df.head(5))

   passengerid  survived  pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                name     sex   age  sibsp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   parch            ticket     fare cabin embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500   NaN        S  


In [40]:
# Split rows by the "survived" label (0 vs 1).
# Index the column directly: the previous df.get(..., pd.Series()) fallback turned a
# missing column into a comparison against an empty Series, hiding the real problem.
df_majority = df[df["survived"] == 0]
df_minority = df[df["survived"] == 1]


In [42]:
# Split by label, then grow the survived == 1 group to the size of the survived == 0 group.
labels = df["survived"]
df_majority = df[labels == 0]
df_minority = df[labels == 1]

# Oversample the minority class: draw with replacement until it matches the majority size.
target_size = len(df_majority)
df_minority_upsampled = resample(
    df_minority,
    replace=True,
    n_samples=target_size,
    random_state=42,
)

df_balanced = pd.concat([df_majority, df_minority_upsampled])

# New class distribution (displayed as the cell's result).
df_balanced['survived'].value_counts()


survived
0    549
1    549
Name: count, dtype: int64

In [43]:
# Same oversampling as a plain split + resample, followed by a seeded shuffle.
survival = df["survived"]
df_majority = df.loc[survival == 0]
df_minority = df.loc[survival == 1]

# Sample the survived == 1 rows with replacement up to the size of the other group.
df_minority_upsampled = resample(
    df_minority,
    replace=True,
    n_samples=len(df_majority),
    random_state=42,
)

combined = pd.concat([df_majority, df_minority_upsampled])

# Shuffle every row (frac=1) with a fixed seed and discard the old index.
shuffled = combined.sample(frac=1, random_state=42)
df_balanced = shuffled.reset_index(drop=True)

print(df_balanced["survived"].value_counts())


survived
0    549
1    549
Name: count, dtype: int64


In [47]:
# NOTE(review): X is not defined in any cell visible above this one — presumably it
# came from a cell that was later removed; confirm before relying on Run All.
print(X.isna().sum())  # If any column has NaN, we handle it


passengerid      0
pclass           0
name             0
sex              0
age            213
sibsp            0
parch            0
ticket           0
fare             0
cabin          810
embarked_Q       0
embarked_S       0
dtype: int64


In [50]:
# Re-check the per-column NaN counts of X after the preceding handling step.
print(X.isna().sum())  # Checking if any column still has NaN values


passengerid       0
pclass            0
name           1098
sex            1098
age               0
sibsp             0
parch             0
ticket            0
fare              0
cabin          1098
embarked_Q        0
embarked_S        0
dtype: int64


In [51]:
# Remove the name, cabin and ticket columns from X in place.
X.drop(columns=["name", "cabin", "ticket"], inplace=True)


In [52]:
# Converting male to 0, female to 1; any other value becomes NaN.
# NOTE(review): the null counts printed for X before this cell already show "sex" as
# entirely NaN, in which case this mapping only produces NaN — confirm the input.
sex_codes = {"male": 0, "female": 1}
X["sex"] = X["sex"].map(sex_codes)


In [54]:
# Per-column NaN counts of X after encoding "sex".
print(X.isna().sum())


passengerid       0
pclass            0
sex            1098
age               0
sibsp             0
parch             0
fare              0
embarked_Q        0
embarked_S        0
dtype: int64


In [56]:
# Show the distinct values left in X["sex"] (the closing parenthesis was missing,
# which made this cell a SyntaxError).
print(X["sex"].unique())


[nan]


In [59]:
# Rename a capitalised "Sex" column to "sex" in place.
# NOTE(review): column names were already lower-cased earlier in the notebook, so there
# may be no "Sex" column at this point and this call may be a no-op — confirm.
df.rename(columns={"Sex": "sex"}, inplace=True)


In [65]:
# First rows followed by the column index, one per line.
print(df.head(), df.columns, sep="\n")

   passengerid  survived  pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                name     sex   age  sibsp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   parch            ticket     fare cabin embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500   NaN        S  
In

In [67]:
# Store a numeric encoding of "sex" (male -> 0, female -> 1) in a separate,
# capitalised "Sex" column; the original "sex" column is left untouched here.
sex_to_code = {"male": 0, "female": 1}
df["Sex"] = df["sex"].map(sex_to_code)


In [69]:
# Confirm the encoded column's dtype and the distinct codes it contains.
sex_encoded = df["Sex"]
print(sex_encoded.dtype)
print(sex_encoded.unique())


int64
[0 1]


In [71]:
# Normalise the "sex" labels: cast to string, lower-case, trim surrounding whitespace.
sex_text = df["sex"].astype(str)
sex_text = sex_text.str.lower()
df["sex"] = sex_text.str.strip()


In [72]:
# List the distinct labels remaining in "sex" after cleaning.
unique_sex_values = df["sex"].unique()
print(unique_sex_values)


['male' 'female']


In [73]:
# Replace the cleaned "sex" labels with numeric codes (male -> 0, female -> 1);
# labels outside the mapping become NaN.
label_to_code = {"male": 0, "female": 1}
df["sex"] = df["sex"].map(label_to_code)


In [74]:
# Report how many encoded values are missing, then fill any gaps with the most frequent code.
# The result is assigned back to the column: calling fillna(inplace=True) on a column
# selection is deprecated in pandas 2.x and does not update df under copy-on-write.
print(df["Sex"].isnull().sum())
df["Sex"] = df["Sex"].fillna(df["Sex"].mode()[0])


0


In [75]:
# Distinct codes in the encoded column, then NaN counts for every column.
encoded_values = df["Sex"].unique()  # [0, 1]
null_counts = df.isnull().sum()
print(encoded_values)
print(null_counts)


[0 1]
passengerid      0
survived         0
pclass           0
name             0
sex              0
age            177
sibsp            0
parch            0
ticket           0
fare             0
cabin          687
embarked         2
Sex              0
dtype: int64


In [76]:
# Impute the remaining gaps: median for "age", most frequent value for "embarked".
# Results are assigned back instead of using fillna(inplace=True) on a column selection,
# which is deprecated in pandas 2.x and does not update df under copy-on-write.
df["age"] = df["age"].fillna(df["age"].median())
df["embarked"] = df["embarked"].fillna(df["embarked"].mode()[0])

# Dropping cabin as there are too many missing values.
# errors="ignore" keeps this cell re-runnable once the column is already gone.
df = df.drop(columns=["cabin"], errors="ignore")


In [77]:
# Target is the "survived" column; features are every other column of df.
y = df["survived"]
X = df.drop(columns=["survived"])


In [79]:
from sklearn.model_selection import train_test_split

# Build a purely numeric feature matrix. Leaving the text columns in X makes the later
# median-imputation and scaling steps fail, and the capitalised "Sex" column duplicates
# the encoded "sex" column (both are produced by the same male/female mapping).
columns_to_exclude = ["survived", "name", "ticket", "cabin", "Sex"]
features = df.drop(columns=columns_to_exclude, errors="ignore")

if "embarked" in features.columns:
    # One-hot encode the port of embarkation. drop_first=True removes the redundant
    # first level; dtype=int keeps the matrix numeric rather than boolean.
    features = pd.get_dummies(features, columns=["embarked"], drop_first=True, dtype=int)

# NOTE(review): "passengerid" looks like a row identifier rather than a predictor; it is
# kept because the earlier X outputs in this notebook include it — consider dropping it.
X = features
y = df["survived"]

# Split into training and testing sets 80% train, 20% test.
# stratify=y keeps the class proportions the same in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


In [86]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

X_train = pd.DataFrame(X_train)
X_test = pd.DataFrame(X_test)

# Treat +/-inf as missing *before* imputing, so one imputation pass handles both cases.
X_train = X_train.replace([np.inf, -np.inf], np.nan)
X_test = X_test.replace([np.inf, -np.inf], np.nan)

# Per-column medians are learned from the training split only and reused for the test
# split, so no test-set statistics are used during preprocessing.
train_medians = X_train.median(numeric_only=True)
X_train = X_train.fillna(train_medians)
X_test = X_test.fillna(train_medians)

# Downstream cells expect plain NumPy arrays.
X_train = X_train.to_numpy()
X_test = X_test.to_numpy()

# Applying StandardScaler: fit on the training data, apply the same transform to the test data.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


In [87]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Logistic Regression Model with scikit-learn's default settings.
model = LogisticRegression()

# Fit on the scaled training features and their labels.
model.fit(X=X_train, y=y_train)


In [88]:
# Prediction on test data: class labels from the fitted logistic-regression model,
# kept in y_pred for the evaluation cell.
y_pred = model.predict(X_test)


In [89]:
# Score the logistic-regression predictions against the held-out labels.
accuracy = accuracy_score(y_test, y_pred)
report_text = classification_report(y_test, y_pred)
matrix = confusion_matrix(y_test, y_pred)

print(f"Logistic Regression Model Accuracy: {accuracy:.2f}")

print("\nClassification Report:\n", report_text)

print("\nConfusion Matrix:\n", matrix)


Logistic Regression Model Accuracy: 0.81

Classification Report:
               precision    recall  f1-score   support

           0       0.82      0.88      0.85       110
           1       0.79      0.70      0.74        69

    accuracy                           0.81       179
   macro avg       0.80      0.79      0.79       179
weighted avg       0.81      0.81      0.81       179


Confusion Matrix:
 [[97 13]
 [21 48]]


In [90]:
##

In [91]:
from sklearn.ensemble import RandomForestClassifier

# Random Forest Model: 100 trees, fixed seed; fit() returns the fitted estimator itself.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

y_pred_rf = rf_model.predict(X_test)

# Same three metrics as for the logistic-regression model.
accuracy_rf = accuracy_score(y_test, y_pred_rf)
rf_report = classification_report(y_test, y_pred_rf)
rf_matrix = confusion_matrix(y_test, y_pred_rf)

print(f"Random Forest Model Accuracy: {accuracy_rf:.2f}")
print("\nClassification Report:\n", rf_report)
print("\nConfusion Matrix:\n", rf_matrix)


Random Forest Model Accuracy: 0.82

Classification Report:
               precision    recall  f1-score   support

           0       0.82      0.90      0.86       110
           1       0.81      0.70      0.75        69

    accuracy                           0.82       179
   macro avg       0.82      0.80      0.81       179
weighted avg       0.82      0.82      0.82       179


Confusion Matrix:
 [[99 11]
 [21 48]]


In [92]:
from sklearn.model_selection import GridSearchCV

# Candidate values for each hyperparameter (3 x 3 x 3 combinations).
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, None],
    min_samples_split=[2, 5, 10],
)

# Grid Search: 5-fold cross-validation on the training data, ranked by accuracy.
base_forest = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(
    estimator=base_forest,
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
)
grid_search.fit(X_train, y_train)

# Estimator refitted with the best-scoring parameter combination.
best_rf_model = grid_search.best_estimator_

y_pred_best_rf = best_rf_model.predict(X_test)
accuracy_best_rf = accuracy_score(y_test, y_pred_best_rf)
print(f"Tuned Random Forest Model Accuracy: {accuracy_best_rf:.2f}")


Tuned Random Forest Model Accuracy: 0.80


In [None]:
# New column: element-wise sum of the sibsp and parch columns.
# NOTE(review): in the cells visible here X is never rebuilt from df after this point,
# so the models shown do not use this column — confirm whether that is intended.
df['family_size'] = df['sibsp'].add(df['parch'])

In [None]:
from sklearn.utils import resample

# Column names were lower-cased earlier in the notebook, so the label column is
# 'survived'; the capitalised 'Survived' used before would raise a KeyError.
df_majority = df[df['survived'] == 0]
df_minority = df[df['survived'] == 1]

# Undersample: draw, without replacement, as many survived == 0 rows as there are
# survived == 1 rows.
df_majority_downsampled = resample(df_majority, replace=False, n_samples=len(df_minority), random_state=42)

df_balanced = pd.concat([df_majority_downsampled, df_minority])

# Both classes should now have the same count.
print(df_balanced['survived'].value_counts())


In [93]:
from sklearn.ensemble import RandomForestClassifier

# Refit the 100-tree, seed-42 random forest on the current training arrays.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X=X_train, y=y_train)

y_pred_rf = rf_model.predict(X_test)

accuracy_rf = accuracy_score(y_test, y_pred_rf)
print(f"Random Forest Model Accuracy: {accuracy_rf:.2f}")

# Per-class precision/recall/F1, then the confusion matrix.
forest_report = classification_report(y_test, y_pred_rf)
forest_matrix = confusion_matrix(y_test, y_pred_rf)
print("\nClassification Report:\n", forest_report)
print("\nConfusion Matrix:\n", forest_matrix)


Random Forest Model Accuracy: 0.82

Classification Report:
               precision    recall  f1-score   support

           0       0.82      0.90      0.86       110
           1       0.81      0.70      0.75        69

    accuracy                           0.82       179
   macro avg       0.82      0.80      0.81       179
weighted avg       0.82      0.82      0.82       179


Confusion Matrix:
 [[99 11]
 [21 48]]


In [None]:
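Final Verdict:
Random Forest (test accuracy 0.82) performs slightly better than Logistic Regression (test accuracy 0.81).
Both models struggle with recall for survivors (0.70 for class 1, versus 0.88–0.90 for class 0), indicating that a sizeable share of survivors is misclassified as non-survivors.
Further improvements (feature engineering, hyperparameter tuning) may enhance performance; note that the grid search tried above reached only 0.80 test accuracy, so tuning alone did not help here.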
