In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn import preprocessing

In [2]:
import pandas as pd

def clean(df, fill_values=None, verbose=True):
    """Return a cleaned copy of a passenger DataFrame.

    Drops the "Ticket", "Cabin", "Name" and "PassengerId" columns when they
    are present, fills missing values in the numeric columns "SibSp",
    "Parch", "Fare" and "Age", and fills missing "Embarked" values with the
    placeholder category "U". The input frame is never modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. Any of the columns named above may be absent.
    fill_values : mapping or pandas.Series, optional
        Column name -> value used to fill missing entries of a numeric
        column. Columns not listed (or every column when this is None) are
        filled with that column's own median computed from ``df``. Passing
        statistics computed on the training data lets a test set be imputed
        with the same values as the training set.
    verbose : bool, default True
        When True, print the incoming column list and the columns that are
        being dropped.

    Returns
    -------
    pandas.DataFrame
        The cleaned copy.
    """
    if fill_values is None:
        fill_values = {}

    # Only drop the columns that actually exist, so the function works for
    # both the training frame and the test frame (and is safe to re-apply).
    columns_to_drop = ["Ticket", "Cabin", "Name", "PassengerId"]
    present = [col for col in columns_to_drop if col in df.columns]
    if verbose:
        print("Columns in the DataFrame:", df.columns.tolist())
        print("Columns being dropped:", present)

    # Work on an explicit copy so the column assignments below can never
    # write through to the caller's frame.
    cleaned = df.copy().drop(columns=present)

    # Impute numeric columns: caller-supplied value first, own median otherwise.
    numeric_cols = ["SibSp", "Parch", "Fare", "Age"]
    for col in numeric_cols:
        if col not in cleaned.columns:
            continue
        if col in fill_values:
            fill = fill_values[col]
        else:
            fill = cleaned[col].median()
        cleaned[col] = cleaned[col].fillna(fill)

    # Missing ports of embarkation become their own "U" category.
    if "Embarked" in cleaned.columns:
        cleaned["Embarked"] = cleaned["Embarked"].fillna("U")

    return cleaned

# Load the raw CSVs (paths are relative to the notebook's working directory).
training_set = pd.read_csv('data/train.csv')
test_set = pd.read_csv('data/test.csv')

# Keep the ids before clean() drops "PassengerId"; they are needed to build
# the submission file later.
test_ids = test_set["PassengerId"]

# Clean both datasets. Exceptions are intentionally NOT caught here: if
# cleaning fails, every later cell would otherwise run on raw or half-cleaned
# frames and fail somewhere far from the real cause.
training_set = clean(training_set)
test_set = clean(test_set)
print("Cleaning completed successfully.")

# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would emit a stray "None" line.
print("\nTraining set info:")
training_set.info()
print("\nTest set info:")
test_set.info()

Columns in the DataFrame: ['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']
Columns being dropped: ['Ticket', 'Cabin', 'Name', 'PassengerId']
Columns in the DataFrame: ['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']
Columns being dropped: ['Ticket', 'Cabin', 'Name', 'PassengerId']
Cleaning completed successfully.

Training set info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Sex       891 non-null    object 
 3   Age       891 non-null    float64
 4   SibSp     891 non-null    int64  
 5   Parch     891 non-null    int64  
 6   Fare      891 non-null    float64
 7   Embarked  891 non-null    object 
dtypes: float64(2), int64(4), object(2)
memory 

In [3]:
# Preview the first five cleaned training rows.
training_set.head(n=5)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S


In [4]:
# Categorical columns that get integer-encoded, and the encoder used for them.
cols = ["Sex", "Embarked"]
le = preprocessing.LabelEncoder()

In [5]:
# Integer-encode each categorical column. A separate LabelEncoder is fitted
# per column and kept in `encoders`, so every column's label <-> integer
# mapping stays available afterwards (a single shared encoder would only
# remember the last column it was fitted on).
encoders = {}
for col in cols:
    encoder = preprocessing.LabelEncoder()
    # Fit on the training data only, then apply the same mapping to the test
    # data; transform() raises if the test set contains an unseen label.
    training_set[col] = encoder.fit_transform(training_set[col])
    test_set[col] = encoder.transform(test_set[col])
    encoders[col] = encoder
    print(encoder.classes_)
training_set.head(5)


['female' 'male']
['C' 'Q' 'S' 'U']


Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,1,22.0,1,0,7.25,2
1,1,1,0,38.0,1,0,71.2833,0
2,1,3,0,26.0,0,0,7.925,2
3,1,1,0,35.0,1,0,53.1,2
4,0,3,1,35.0,0,0,8.05,2


In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
# Separate the features from the "Survived" target, then hold out 20% of the
# rows for validation (fixed seed so the split is reproducible).
X = training_set.drop(columns="Survived")
Y = training_set["Survived"]
X_train, X_val, Y_train, Y_val = train_test_split(
    X, Y, test_size=0.2, random_state=42
)

In [8]:
# Build the logistic-regression model, then fit it on the training split
# (fit() returns the estimator, so `classifier` ends up as the fitted model).
classifier = LogisticRegression(max_iter=1000, random_state=42)
classifier = classifier.fit(X_train, Y_train)

In [9]:
from sklearn.metrics import accuracy_score

# Predict the held-out validation rows and report the share predicted correctly.
predictions = classifier.predict(X_val)
accuracy_score(y_true=Y_val, y_pred=predictions)

0.8100558659217877

In [10]:
# Select the test features by the training column list so the model always
# receives the same columns in the same order it was fitted on, regardless of
# how test.csv happens to be laid out.
submission_predictions = classifier.predict(test_set[X_train.columns])

In [11]:
# Pair each saved test PassengerId with the model's predicted "Survived" label.
submission_columns = {
    "PassengerId": test_ids.values,
    "Survived": submission_predictions,
}
df = pd.DataFrame(submission_columns)

In [23]:
# Write the submission file without the DataFrame index column.
df.to_csv(path_or_buf="submission.csv", index=False)