In [1]:
# import libraries
import numpy as np 
import pandas as pd 
import sklearn
import re
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
print(sklearn.__version__)

0.20.3


In [2]:
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")
data_process_list = [train_df, test_df]

In [3]:
# deal with Name
def re_name(x):
    find_Miss = re.findall("Miss", x)
    if find_Miss != []:
        return "Miss"
    find_Mrs = re.findall("Mrs", x)
    if find_Mrs != []:
        return "Mrs"
    find_Mr = re.findall("Mr", x)
    if find_Mr != []:
        return "Mr"
    find_Master = re.findall("Master", x)
    if find_Master != []:
        return "Master"
    return None

# function for age, transfor age into group (0, 10)(11, 20)(21, 30)(31, 40)(41, 80)
def trans_age(x):
    if x <= 10:
        return 1
    if x <= 20:
        return 2
    if x <= 30:
        return 3
    if x <= 40:
        return 4
    if x <= 80:
        return 5
    
# use transfor categorical data into numerical dataset
def dummy(x, value):
    if x == value:
        return 1
    else:
        return 0

In [4]:
delete_columns_list = []
# deal with categorical data
for df in data_process_list:
    
    # deal with name
    df["new_name"] = df["Name"].apply(lambda x: re_name(x))
    df["Mr_title"] = df["new_name"].apply(lambda x: dummy(x, "Mr"))
    df["Mrs_title"] = df["new_name"].apply(lambda x: dummy(x, "Mrs"))
    df["Miss_title"] = df["new_name"].apply(lambda x: dummy(x, "Miss"))
    df["Master_title"] = df["new_name"].apply(lambda x: dummy(x, "Master"))
    df["other_title"] = df["new_name"].isna() * 1

    # deal with embarked
    df["embarked_S"] = df["Embarked"].apply(lambda x: dummy(x, "S"))
    df["embarked_Q"] = df["Embarked"].apply(lambda x: dummy(x, "Q"))
    df["embarked_C"] = df["Embarked"].apply(lambda x: dummy(x, "C"))
    df["embarked_missing"] = df["Embarked"].isna() * 1

    # deal with Pclass
    df["pclass_1"] = df["Pclass"].apply(lambda x: dummy(x, 1))
    df["pclass_2"] = df["Pclass"].apply(lambda x: dummy(x, 2))
    df["pclass_3"] = df["Pclass"].apply(lambda x: dummy(x, 3)) 
    
    # deal with age
    df["trans_age"] = df["Age"].apply(lambda x: trans_age(x))
    df["age_0_10"] = df["trans_age"].apply(lambda x: dummy(x, 1))
    df["age_11_20"] = df["trans_age"].apply(lambda x: dummy(x, 2))
    df["age_21_30"] = df["trans_age"].apply(lambda x: dummy(x, 3))
    df["age_31_40"] = df["trans_age"].apply(lambda x: dummy(x, 4))
    df["age_41_80"] = df["trans_age"].apply(lambda x: dummy(x, 5))
    df["age_missinig"] = df["Age"].isna() * 1
    
    # deal with sex
    df["gender_male"] = df["Sex"].apply(lambda x: dummy(x, "male"))
    df["gender_female"] = df["Sex"].apply(lambda x: dummy(x, "female"))
    
    # Peers
    df["relatives"] = df["SibSp"] + df["Parch"]
    df["Alone"] = df["relatives"].apply(lambda x: dummy(x, 0))
    
delete_columns_list.append("Sex")
delete_columns_list.append("Name")
delete_columns_list.append("new_name")
delete_columns_list.append("Cabin")
delete_columns_list.append("Ticket")
delete_columns_list.append("Age")
delete_columns_list.append("trans_age")
delete_columns_list.append("Pclass")
delete_columns_list.append("Embarked")
delete_columns_list.append("SibSp")
delete_columns_list.append("Parch")

In [5]:
# delete unrelated columns
for df in data_process_list:
    for col in delete_columns_list:
        del df[col]

In [6]:
print(train_df.info())
print(test_df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 25 columns):
PassengerId         891 non-null int64
Survived            891 non-null int64
Fare                891 non-null float64
Mr_title            891 non-null int64
Mrs_title           891 non-null int64
Miss_title          891 non-null int64
Master_title        891 non-null int64
other_title         891 non-null int64
embarked_S          891 non-null int64
embarked_Q          891 non-null int64
embarked_C          891 non-null int64
embarked_missing    891 non-null int64
pclass_1            891 non-null int64
pclass_2            891 non-null int64
pclass_3            891 non-null int64
age_0_10            891 non-null int64
age_11_20           891 non-null int64
age_21_30           891 non-null int64
age_31_40           891 non-null int64
age_41_80           891 non-null int64
age_missinig        891 non-null int64
gender_male         891 non-null int64
gender_female       891 non-null in

In [7]:
# train_df = train_df.set_index(train_df["PassengerId"])
# This method will duplicate column passengerid to index instead of moving column to index
train_df = train_df.set_index("PassengerId")
test_df = test_df.set_index("PassengerId")

In [8]:
# fill nan value
train_df = train_df.fillna(0)
test_df =test_df.fillna(0)

In [9]:
train, test = train_test_split(train_df)

In [10]:
y_train = train["Survived"]
y_test = test["Survived"]

X_train = train
del X_train["Survived"]

X_test = test
del X_test["Survived"]

In [11]:
# logistic regression
lg = LogisticRegression(C=1, solver="liblinear", max_iter=100).fit(X_train, y_train)
train_score = lg.score(X_train, y_train)
pred_score = lg.score(X_test, y_test)
print("The train_score: %s" % train_score)

The train_score: 0.8203592814371258


In [12]:
lg_result = lg.predict(test_df)
lg_sub = pd.DataFrame(lg_result, test_df.index)
lg_sub = lg_sub.rename(columns={0: "Survived"})
lg_sub.to_csv("lg_submission.csv", encoding="utf-8")
# The score is 0.78468

In [13]:
# random forest
for e in range(10, 501, 10):
    randomTree = RandomForestClassifier(n_estimators=e).fit(X_train, y_train)
    pred_score = randomTree.score(X_test, y_test)
    train_score = randomTree.score(X_train, y_train)
    print("The estimator is %s, the pred_score is %s" % (e, pred_score))
    print("The estimator is %s, the train_score is %s" % (e, train_score))

The estimator is 10, the pred_score is 0.8430493273542601
The estimator is 10, the train_score is 0.938622754491018
The estimator is 20, the pred_score is 0.852017937219731
The estimator is 20, the train_score is 0.9476047904191617
The estimator is 30, the pred_score is 0.8251121076233184
The estimator is 30, the train_score is 0.9565868263473054
The estimator is 40, the pred_score is 0.8251121076233184
The estimator is 40, the train_score is 0.9550898203592815
The estimator is 50, the pred_score is 0.8251121076233184
The estimator is 50, the train_score is 0.9565868263473054
The estimator is 60, the pred_score is 0.8251121076233184
The estimator is 60, the train_score is 0.9565868263473054
The estimator is 70, the pred_score is 0.8295964125560538
The estimator is 70, the train_score is 0.9565868263473054
The estimator is 80, the pred_score is 0.8295964125560538
The estimator is 80, the train_score is 0.9565868263473054
The estimator is 90, the pred_score is 0.8430493273542601
The esti

In [15]:
result = randomTree.predict(test_df)
sub = pd.DataFrame(result, test_df.index)
sub.to_csv("rf_submission.csv", encoding="utf-8")
# The score is 0.75598

In [16]:
# use whole train dataset to train model
whole_X_train = train_df
whole_y_train = train_df["Survived"]
del whole_X_train["Survived"]

In [17]:
# random forest
randomTree = RandomForestClassifier(n_estimators=100).fit(whole_X_train, whole_y_train)
rf_score = randomTree.score(whole_X_train, whole_y_train)
print("The score of whole train dataset is %s" % (rf_score))

The score of whole train dataset is 0.9562289562289562


In [18]:
result2 = randomTree.predict(test_df)
sub2 = pd.DataFrame(result2, test_df.index)
sub2 = sub2.rename(columns={0: "Survived"})

In [19]:
sub2.to_csv("rf_submission2.csv", encoding="utf-8")

In [21]:
# logistic regression
lg2 = LogisticRegression(C=1, solver="liblinear", max_iter=100).fit(whole_X_train, whole_y_train)
lg2_score = lg2.score(whole_X_train, whole_y_train)
print("The lg2_score: %s" % lg2_score)

The lg2_score: 0.8338945005611672


In [23]:
result3 = lg2.predict(test_df)
sub3 = pd.DataFrame(result3, test_df.index)
sub3 = sub3.rename(columns={0: "Survived"})
sub3.to_csv("lg_submission2.csv", encoding="utf-8")
# The score is 0.78468