In [1]:
import pandas as pd

In [2]:
# Load the Titanic training and test splits from the Kaggle input directory.
TRAIN_CSV = '/kaggle/input/titanic/train.csv'
TEST_CSV = '/kaggle/input/titanic/test.csv'

train_data = pd.read_csv(TRAIN_CSV)
test_data = pd.read_csv(TEST_CSV)

# Show the column names first, then let the notebook render the frame itself.
print(train_data.columns)
train_data

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [3]:
# Overall fraction of passengers in the training data who survived.
survived_column = train_data['Survived']
survived_ratio = survived_column.sum() / len(survived_column)
print(survived_ratio)

# Survival among women: number of survivors, group size, and their ratio.
is_female = train_data.Sex == 'female'
women_survived = train_data.loc[is_female, 'Survived'].sum()
print('women_survived:',women_survived)

women = len(train_data.loc[is_female])
print('women:',women)
print('woman ratio:', women_survived/women)

# Survival among men.
is_male = train_data.Sex == 'male'
man = train_data.loc[is_male, 'Survived']
print('man ratio:', man.sum()/len(man))

# Survival among passengers aged 12 or younger (rows with a missing Age
# compare as False and are therefore left out).
is_child = train_data.Age <= 12
child = train_data.loc[is_child, 'Survived']
print('child ratio:', child.sum()/len(child))

0.3838383838383838
women_survived: 233
women: 314
woman ratio: 0.7420382165605095
man ratio: 0.18890814558058924
child ratio: 0.5797101449275363


In [4]:
# List the dtype of every column; the `object` ones are the non-numeric
# columns that have to be dropped or encoded before modelling.
train_data.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

# Explore Categorical Variables

## Drop them

In [5]:
# Option 1 (disabled, kept for reference): drop every object-dtype column instead of encoding any of them.
# train_dataDropped = train_data.select_dtypes(exclude=['object'])
# test_dataDropped = test_data.select_dtypes(exclude=['object'])
# train_dataDropped.columns

## Let's try to keep the most valuable ones using OneHotEncoding

In [6]:
# Remove the Name, Ticket and Cabin columns from both splits; the remaining
# object columns (Sex, Embarked) are kept.
unused_text_columns = ['Name', 'Ticket', 'Cabin']

train_dataDropped = train_data.drop(columns=unused_text_columns)
test_dataDropped = test_data.drop(columns=unused_text_columns)

train_dataDropped.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Sex             object
Age            float64
SibSp            int64
Parch            int64
Fare           float64
Embarked        object
dtype: object

In [7]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the two remaining categorical columns. The encoder is fitted
# on the training split only and then reused for the test split, so both frames
# end up with the same indicator columns. handle_unknown='ignore' makes a
# category that was not seen during fitting encode as all zeros instead of
# raising.
categorical_columns = ['Sex', 'Embarked']

# The keyword that requests dense output was renamed from `sparse` to
# `sparse_output` in scikit-learn 1.2 and the old spelling was removed in 1.4,
# so neither spelling works on every version. Keeping the default (sparse)
# output and densifying with .toarray() produces the same array everywhere.
oneHotEncoder = OneHotEncoder(handle_unknown = 'ignore')
train_encoded = oneHotEncoder.fit_transform(train_dataDropped[categorical_columns]).toarray()
test_encoded = oneHotEncoder.transform(test_dataDropped[categorical_columns]).toarray()

encoded_columns = oneHotEncoder.get_feature_names_out(categorical_columns)

# Reuse each source frame's index so the column-wise concat below lines up row
# for row even when a frame no longer has a default RangeIndex (for example
# after rows were filtered out); with mismatched indexes the concat would
# silently produce extra rows filled with NaN.
encoded_train = pd.DataFrame(train_encoded, columns=encoded_columns, index=train_dataDropped.index)
encoded_test = pd.DataFrame(test_encoded, columns=encoded_columns, index=test_dataDropped.index)

# Swap the original categorical columns for their indicator columns.
train_dataDropped = pd.concat([train_dataDropped.drop(columns=categorical_columns), encoded_train], axis=1)
test_dataDropped = pd.concat([test_dataDropped.drop(columns=categorical_columns), encoded_test], axis=1)



# Explore Missing Data

In [8]:
# Count the missing entries in each column of the encoded training frame.
missing_values = train_dataDropped.isna().sum()
print(missing_values)

PassengerId       0
Survived          0
Pclass            0
Age             177
SibSp             0
Parch             0
Fare              0
Sex_female        0
Sex_male          0
Embarked_C        0
Embarked_Q        0
Embarked_S        0
Embarked_nan      0
dtype: int64


In [9]:
# Report the mean age, then fill every missing value in the training frame
# with the mean of the column it belongs to.
age_mean = train_dataDropped['Age'].mean()
print(age_mean)

column_means = train_dataDropped.mean()
train_dataDropped = train_dataDropped.fillna(column_means)

29.69911764705882


In [10]:
# Re-count missing entries per column to confirm the imputation left none.
train_dataDropped.isna().sum()

PassengerId     0
Survived        0
Pclass          0
Age             0
SibSp           0
Parch           0
Fare            0
Sex_female      0
Sex_male        0
Embarked_C      0
Embarked_Q      0
Embarked_S      0
Embarked_nan    0
dtype: int64

In [11]:
# Fill gaps in the test split with the column means of the *training* split
# rather than the test split's own means. The model is fitted on data imputed
# with training statistics, so the test rows must go through the same
# transformation; deriving the fill values from the test data would preprocess
# the two splits inconsistently. Columns present only in the training frame
# (such as Survived) are ignored by fillna.
test_dataDropped = test_dataDropped.fillna(train_dataDropped.mean())

In [12]:
# what we want to predict
y_train = train_dataDropped.Survived

# features: every column except the target and PassengerId. PassengerId is a
# row identifier (in the training data it simply counts up with the row
# index), so letting the model split on it can only fit noise.
feature_columns = train_dataDropped.columns.drop(['Survived', 'PassengerId'])
X_train = train_dataDropped[feature_columns]

# Select exactly the same columns, in the same order, from the test split so
# the fitted model sees a matching feature layout; a column missing from the
# test frame fails here with a KeyError instead of later inside predict().
X_val = test_dataDropped[feature_columns]

# Define Our Model

In [13]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_absolute_error

In [14]:
def score_dataset(X_train, y_train, X_val, passenger_ids=None, output_path='submission.csv'):
    """Fit a random forest, predict on ``X_val`` and write a submission CSV.

    Despite its name, this function does not compute any metric; it trains a
    model and saves its predictions.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Feature table used to fit the model.
    y_train : pandas.Series
        Target values, one per row of ``X_train``.
    X_val : pandas.DataFrame
        Feature table to predict on.
    passenger_ids : sequence, optional
        Identifiers written to the ``PassengerId`` column, one per row of
        ``X_val``. Defaults to ``test_dataDropped.PassengerId`` (the notebook
        global), which is what the function always used before this parameter
        existed.
    output_path : str, optional
        Where the CSV is written. Defaults to ``'submission.csv'``.

    Returns
    -------
    pandas.DataFrame
        The ``PassengerId`` / ``Survived`` table that was written to disk.

    Raises
    ------
    ValueError
        If ``passenger_ids`` and ``X_val`` do not have the same number of rows.
    """
    if passenger_ids is None:
        # Backward-compatible default: fall back to the notebook-level frame.
        passenger_ids = test_dataDropped.PassengerId

    # Fail before the (comparatively slow) fit when the ids cannot be paired
    # one-to-one with the rows that will be predicted.
    if len(passenger_ids) != len(X_val):
        raise ValueError(
            f'passenger_ids has {len(passenger_ids)} entries but X_val has {len(X_val)} rows'
        )

    # random_state is fixed so repeated runs give the same predictions.
    model = RandomForestClassifier(random_state=0)
    model.fit(X_train, y_train)
    predictions = model.predict(X_val)

    output = pd.DataFrame({
        'PassengerId': passenger_ids,
        'Survived': predictions,
    })
    output.to_csv(output_path, index=False)
    return output

In [15]:
# Fit the model on the prepared training data and write the predictions for
# X_val to submission.csv.
score_dataset(X_train, y_train, X_val)