In [24]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

In [2]:
# Load train and test data
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

In [3]:
train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
train_df = train_df.drop(['Ticket', 'Cabin', 'PassengerId'], axis=1)
test_df = test_df.drop(['Ticket', 'Cabin'], axis=1)

combine = [train_df, test_df]
train_df.head()

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,7.25,S
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,71.2833,C
2,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,7.925,S
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,53.1,S
4,0,3,"Allen, Mr. William Henry",male,35.0,0,0,8.05,S


In [5]:
# Tworzymy zmienna title
for dataset in combine:
    dataset['Title'] = dataset.Name.str.extract(' ([A-Za-z]+)\.', expand=False)

dataset['Title'].unique()

array(['Mr', 'Mrs', 'Miss', 'Master', 'Ms', 'Col', 'Rev', 'Dr', 'Dona'],
      dtype=object)

In [6]:
# Replace values with 'Other'
for dataset in combine:
    dataset['Title'] = dataset['Title'].replace(['Lady', 'Master', 'Col', 'Rev', 'Dr', 'Dona'], 'Other')

dataset['Title'].unique()

array(['Mr', 'Mrs', 'Miss', 'Other', 'Ms'], dtype=object)

In [7]:
title_mapping = {
    "Mr": 1,
    "Mrs": 2,
    "Miss": 3,
    "Other": 4,
    "Ms": 5,
}

# Change title values to integers
for dataset in combine:
    dataset['Title'] = dataset['Title'].map(title_mapping)
    dataset['Title'] = dataset['Title'].fillna(0).astype(int)

# Drop name column
train_df = train_df.drop(['Name'], axis=1)
test_df = test_df.drop(['Name'], axis=1)

combine = [train_df, test_df]

In [8]:
for dataset in combine:
    dataset['FamilySize'] = dataset['SibSp'] + dataset['Parch'] + 1

train_df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title,FamilySize
0,0,3,male,22.0,1,0,7.25,S,1,2
1,1,1,female,38.0,1,0,71.2833,C,2,2
2,1,3,female,26.0,0,0,7.925,S,3,1
3,1,1,female,35.0,1,0,53.1,S,2,2
4,0,3,male,35.0,0,0,8.05,S,1,1


In [9]:
for dataset in combine:
    dataset['IsAlone'] = 0
    # Mark alone people as alone
    dataset.loc[dataset['FamilySize'] == 1, 'IsAlone'] = 1

# Remove useless columns
train_df = train_df.drop(['Parch', 'SibSp', 'FamilySize'], axis=1)
test_df = test_df.drop(['Parch', 'SibSp', 'FamilySize'], axis=1)
combine = [train_df, test_df]

In [10]:
dataset['Sex'].unique()

array(['male', 'female'], dtype=object)

In [11]:
for dataset in combine:
    dataset['Sex'] = dataset['Sex'].map({"male": 0, "female": 1}).astype(int)

In [12]:
# Most visited port
freq_port = train_df.Embarked.dropna().mode()[0]

for dataset in combine:
    dataset['Embarked'] = dataset['Embarked'].fillna(freq_port)

train_df['Embarked'].unique()

array(['S', 'C', 'Q'], dtype=object)

In [13]:
# Encoding embarked ports
for dataset in combine:
    dataset['Embarked'] = dataset['Embarked'].map({"S": 0, "C": 1, "Q": 2}).astype(int)

In [14]:
# Over 24% of data is missing in Age column
train_df['Age'].isna().sum() / train_df['Age'].count()

0.24789915966386555

In [15]:
guess_age = np.zeros((2,3))

# Missing data will be replaced with median for each class and sex
for dataset in combine:
    for i in range(0, 2):
        for j in range(0, 3):
            # Medians for each class and sex
            guess_df = dataset[(dataset["Sex"] == 1) & (dataset['Pclass'] == j + 1)]['Age'].dropna()
            age_guess = guess_df.median()
            # Round median to nearest int
            guess_age[i,j] = int(age_guess / 0.5 + 0.5) * 0.5
    
    for i in range(2):
        for j in range(3):
            dataset.loc[(dataset.Age.isnull()) & (dataset.Sex == i) & (dataset.Pclass == j + 1), 'Age'] = guess_age[i,j]
    dataset['Age'] = dataset['Age'].astype(int)

In [16]:
train_df['Fare'].describe()

count    891.000000
mean      32.204208
std       49.693429
min        0.000000
25%        7.910400
50%       14.454200
75%       31.000000
max      512.329200
Name: Fare, dtype: float64

In [17]:
train_df['Fare'].isna().sum()
test_df[test_df['Fare'].isna()] = test_df['Fare'].median()

In [18]:
# Replace values with according ranges
for dataset in combine:
    dataset.loc[dataset['Fare'] <= 7.91, 'Fare'] = 0
    dataset.loc[(dataset['Fare'] > 7.91) & (dataset["Fare"] <= 14.454), 'Fare'] = 1
    dataset.loc[(dataset['Fare'] > 14.454) & (dataset["Fare"] <= 31), 'Fare'] = 2
    dataset.loc[dataset["Fare"] > 31, 'Fare'] = 3
    dataset["Fare"] = dataset["Fare"].astype(int)

In [19]:
X = train_df.drop("Survived", axis=1)
Y = train_df["Survived"]
X_test = test_df.drop("PassengerId", axis=1).copy()
X_train, X_validation, Y_train, Y_validation = train_test_split(X, Y, test_size=0.3, random_state=123)

# Regresja liniowa

In [20]:
logreg = LogisticRegression()
logreg.fit(X_train, Y_train)

In [22]:
acc_log = logreg.score(X_train, Y_train) * 100
print("Logistic regression score:", acc_log)

Logistic regression score: 78.97271268057786


In [23]:
Y_pred = logreg.predict(X_validation)
acc_log = logreg.score(X_validation, Y_validation) * 100
print("Logistic regression score:", acc_log)

Logistic regression score: 81.34328358208955


Blad testowy modelu regresji liniowej jest duzy i wynosi 19%. Modele o malej pojemnosci, takie jak modele regresji liniowej, cechuje duzy blad systematyczny. Blad systematyczny definiuje sie jako spodziewany wynik generalizacji modelu.

# Model maszyny wektorow nosnych

In [31]:
svc_reg = SVC()
svc_reg.fit(X_train, Y_train)

In [32]:
acc_svc = svc_reg.score(X_train, Y_train) * 100
print("SVC:", acc_svc)

SVC: 71.42857142857143


# K najblizszych sasiadow

In [33]:
k_neighbours = KNeighborsClassifier()
k_neighbours.fit(X_train, Y_train)

In [35]:
acc_neigh = k_neighbours.score(X_train, Y_train) * 100
print("K-neighbours:", acc_neigh)

K-neighbours: 83.78812199036918


# Drzewo decyzyjne

In [36]:
DTC = DecisionTreeClassifier()
DTC.fit(X_train, Y_train)

In [37]:
acc_dtc = DTC.score(X_train, Y_train) * 100
print("K-neighbours:", acc_dtc)

K-neighbours: 93.57945425361156
