In [18]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

In [2]:
mapping = {'Don': 'RoyaltyM', 'Dona': 'RoyaltyF', 'Mme': 'Miss', 'Ms': 'Miss',
    'Major': 'Officer', 'Lady': 'RoyaltyF', 'Sir': 'RoyaltyM', 
    'Mlle': 'Miss', 'Col': 'Officer', 'Capt': 'Officer',
    'Countess': 'RoyaltyF', 'Jonkheer': 'RoyaltyM'}

In [3]:
train = pd.read_csv('titanic/train.csv')
test = pd.read_csv('titanic/test.csv')

In [4]:
def populate_embarked(df):
    embarked = df.groupby('Embarked').count()['PassengerId']
    embarked_max = embarked[embarked == embarked.max()].index[0]
    df.loc[df['Embarked'].isnull(), 'Embarked'] = embarked_max
    return df

In [5]:
def set_titles(df, mapping):
    df['Title'] = df['Name'].str.extract('([A-Za-z]+)\.')
    df.replace({'Title': mapping}, inplace=True)
    return df

In [6]:
def populate_age(df):
    age_med = df.groupby('Title')['Age'].median()
    for title, age in age_med.iteritems():
        filter_age = df['Age'].isnull()
        filter_title = df['Title'] == title
        df.loc[(filter_age & filter_title), 'Age'] = age
    return df

In [7]:
train = populate_embarked(train)
train = set_titles(train, mapping)
train = populate_age(train)

In [8]:
test = populate_embarked(test)
test = set_titles(test, mapping)
test = populate_age(test)

In [9]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Name         418 non-null    object 
 3   Sex          418 non-null    object 
 4   Age          418 non-null    float64
 5   SibSp        418 non-null    int64  
 6   Parch        418 non-null    int64  
 7   Ticket       418 non-null    object 
 8   Fare         417 non-null    float64
 9   Cabin        91 non-null     object 
 10  Embarked     418 non-null    object 
 11  Title        418 non-null    object 
dtypes: float64(2), int64(4), object(6)
memory usage: 39.3+ KB


In [10]:
test['Fare'].fillna(test['Fare'].median(), inplace=True)

In [11]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Name         418 non-null    object 
 3   Sex          418 non-null    object 
 4   Age          418 non-null    float64
 5   SibSp        418 non-null    int64  
 6   Parch        418 non-null    int64  
 7   Ticket       418 non-null    object 
 8   Fare         418 non-null    float64
 9   Cabin        91 non-null     object 
 10  Embarked     418 non-null    object 
 11  Title        418 non-null    object 
dtypes: float64(2), int64(4), object(6)
memory usage: 39.3+ KB


In [12]:
train = train.drop(['PassengerId','Name','Ticket','Cabin'], axis=1)

In [13]:
result = pd.DataFrame(test['PassengerId'])
test = test.drop(['PassengerId', 'Name','Ticket','Cabin'], axis=1)

In [14]:
le = LabelEncoder()
classes = {}

In [15]:
le.fit(train['Sex'])
classes['Sex'] = le.classes_
train['Sex'] = le.transform(train['Sex'])

le.fit(train['Embarked'])
classes['Embarked'] = le.classes_
train['Embarked'] = le.transform(train['Embarked'])

le.fit(train['Title'])
classes['Title'] = le.classes_
train['Title'] = le.transform(train['Title'])

In [16]:
classes

{'Sex': array(['female', 'male'], dtype=object),
 'Embarked': array(['C', 'Q', 'S'], dtype=object),
 'Title': array(['Dr', 'Master', 'Miss', 'Mr', 'Mrs', 'Officer', 'Rev', 'RoyaltyF',
        'RoyaltyM'], dtype=object)}

In [17]:
le.fit(classes['Sex'])
test['Sex'] = le.transform(test['Sex'])

le.fit(classes['Embarked'])
test['Embarked'] = le.transform(test['Embarked'])

le.fit(classes['Title'])
test['Title'] = le.transform(test['Title'])

In [19]:
target = train['Survived']
train = train.drop(['Survived'], axis=1)

In [20]:
model_rf = RandomForestClassifier(n_estimators=100)
model_kn = KNeighborsClassifier(n_neighbors=20)

In [26]:
scores = cross_val_score(model_rf, train, target, cv=5)
print(scores)
print(scores.mean())

[0.77653631 0.78651685 0.85393258 0.75280899 0.81460674]
0.7968802962776975


In [25]:
scores = cross_val_score(model_kn, train, target, cv=5)
print(scores)
print(scores.mean())

[0.60335196 0.73595506 0.71348315 0.73033708 0.73595506]
0.7038164584771828


In [27]:
model_rf = model_rf.fit(train, target)
predictions = model_rf.predict(test)

In [29]:
predictions.shape

(418,)

In [31]:
result.insert(1, 'Survived', predictions)

ValueError: cannot insert Survived, already exists

In [32]:
result.to_csv('titanic/result.csv', index=False)