In [2]:
import pandas as pd
# Load the Titanic passenger table from a CSV file; the path is relative
# to the notebook's working directory.
DATA_PATH = 'Titanic-Dataset.csv'
data = pd.read_csv(DATA_PATH)



In [3]:
# Quick look at the raw table: the first rows, summary statistics for the
# numeric columns, then per-column dtypes and non-null counts.
print(data.head())
print(data.describe())
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would append a stray "None" line to the output.
data.info()


   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500   NaN        S  
  

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# Separate the label column from the rest of the table.
y = data['Survived']
X = data.drop(columns=['Survived'])

# Columns handed to the preprocessor, grouped by how they are treated.
numerical_features = ['Age', 'Fare', 'Pclass']
categorical_features = ['Sex', 'Embarked']

# Hold out 30% of the rows for evaluation; the fixed seed makes the split
# repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Numeric columns: fill missing values with the median, then standardize.
numerical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical columns: fill missing values with the most frequent category,
# then one-hot encode. handle_unknown='ignore' keeps transform from failing
# on a category that was not present when the encoder was fitted.
categorical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group through its own sub-pipeline.
preprocessor = ColumnTransformer([
    ('num', numerical_pipeline, numerical_features),
    ('cat', categorical_pipeline, categorical_features),
])

# Chain preprocessing and the classifier so the imputers, scaler and encoder
# are fitted on the training rows only.
model = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(n_estimators=100, random_state=42)),
])

# Fit on the training split and predict the held-out rows.
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Report overall accuracy and the per-class precision/recall/F1 table.
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
print('Accuracy:', accuracy)
print('Classification Report:\n', report)


Accuracy: 0.7761194029850746
Classification Report:
               precision    recall  f1-score   support

           0       0.79      0.83      0.81       157
           1       0.75      0.69      0.72       111

    accuracy                           0.78       268
   macro avg       0.77      0.76      0.77       268
weighted avg       0.77      0.78      0.77       268



In [5]:
# Example: score a single hand-written passenger with the fitted pipeline.
# The preprocessor selects columns by name (Age, Fare, Pclass, Sex,
# Embarked), so the frame must provide those; SibSp and Parch are included
# here but are not among the selected features.
passenger = {
    'Pclass': 1,
    'Sex': 'female',
    'Age': 29,
    'SibSp': 0,
    'Parch': 0,
    'Fare': 100,
    'Embarked': 'C',
}
new_data = pd.DataFrame([passenger])

predictions = model.predict(new_data)
print('Predictions:', predictions)


Predictions: [1]


In [6]:
# Example: the same hand-written passenger as the preceding example except
# that Pclass is 3 instead of 1, to see whether the predicted label changes.
# The preprocessor selects columns by name (Age, Fare, Pclass, Sex,
# Embarked); SibSp and Parch are included but are not among them.
passenger = dict(
    Pclass=3,
    Sex='female',
    Age=29,
    SibSp=0,
    Parch=0,
    Fare=100,
    Embarked='C',
)
new_data = pd.DataFrame([passenger])

predictions = model.predict(new_data)
print('Predictions:', predictions)


Predictions: [1]
