In [86]:
import numpy as np
import pandas as pd
import csv

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn import preprocessing
from sklearn.preprocessing import OneHotEncoder
import seaborn as sns

Read the data

In [87]:
with open("train.csv") as csvfile:
    titanic = pd.read_csv(csvfile)

titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


Drop the NA columns ( and predict the missing values)

In [88]:
fill_na_titanic = titanic
fill_na_titanic["Age"] = titanic["Age"].fillna(-1)
fill_na_titanic["Embarked"] = titanic["Embarked"].fillna("N")
fill_na_titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            891 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       891 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


Define the categorical data

In [89]:
fill_na_titanic[["Survived", "Pclass", "Sex", "SibSp", "Parch", "Embarked"]] = fill_na_titanic[["Survived", "Pclass", "Sex", "SibSp", "Parch", "Embarked"]].astype("category")
# fill_na_titanic['Sex'] = fill_na_titanic['Sex'].cat.reorder_categories(['male', 'female'], ordered=True)
# fill_na_titanic['Sex'] = fill_na_titanic['Sex'].cat.codes

# fill_na_titanic['Embarked'] = fill_na_titanic['Embarked'].cat.reorder_categories(['C', 'Q', 'S'], ordered=True)
# fill_na_titanic['Embarked'] = fill_na_titanic['Embarked'].cat.codes

fill_na_titanic["Fare"] = fill_na_titanic["Fare"].astype("float")
fill_na_titanic["Age"] = fill_na_titanic["Age"].astype("int")
fill_na_titanic["Embarked"] = fill_na_titanic["Embarked"].astype("category")

In [90]:
fill_na_titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null category
Pclass         891 non-null category
Name           891 non-null object
Sex            891 non-null category
Age            891 non-null int32
SibSp          891 non-null category
Parch          891 non-null category
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       891 non-null category
dtypes: category(6), float64(1), int32(1), int64(1), object(3)
memory usage: 44.8+ KB


Fill Miss value in Age by Navi Baye

In [91]:
na_age_titanic = fill_na_titanic.drop(columns=["PassengerId", "Name", "Ticket", "Cabin"])
def fill_miss(df, col, na):
    no_na_df = df[df[col] != na]
    no_na_df_feature = no_na_df.drop(columns = [col])
    no_na_df_target = no_na_df[[col]]
    # no_na_age_titanic
    only_na_df_test = df[df[col] == na]
    # only_na_age_titanic
    na_df_test = only_na_df_test.drop(columns=[col])
    # na_age_test
    X = pd.get_dummies(no_na_df_feature, prefix_sep='_', drop_first=True)
    Y = no_na_df_target

    gnb = GaussianNB()
    gnb.fit(X, Y)

    test = pd.get_dummies(na_df_test, prefix_sep='_', drop_first=True)
    gnb_pred = gnb.predict(test)
    return gnb_pred

gnb_pred = fill_miss(na_age_titanic, "Age",-1)
gnb_pred

  y = column_or_1d(y, warn=True)


array([65, 55, 13, 65, 15, 61, 49, 15, 20, 65, 61, 70, 15, 37, 56, 71, 49,
       61, 61, 15, 61, 61, 61, 55, 15, 61, 65, 49, 11, 61, 61, 11, 56, 61,
        9, 11, 46, 61, 15, 65, 15, 11, 70, 61,  9, 55, 20, 15, 61, 56, 65,
       15, 61, 15, 59, 61, 71, 55, 15, 44, 30, 61, 56, 11, 44, 63, 61, 63,
       61, 65, 15, 15, 70, 13, 15, 49, 61, 65,  9, 61, 65, 59, 55, 65, 61,
       65, 63, 55, 47, 61, 63, 65, 61, 59, 65, 61, 61, 59,  9, 47, 65, 61,
       15, 55, 61, 65, 65, 65, 64, 65,  5, 61, 62, 65, 58, 65, 61, 55, 65,
       15, 20, 65, 61,  7, 55, 65, 61, 61, 61, 15, 65, 65, 61, 20, 55, 61,
       61, 15, 61, 74, 63, 59, 15, 55, 15, 49, 61, 65, 15, 59, 61, 61, 80,
       61, 71, 70, 65, 65, 65, 11, 65, 11, 71, 61, 65, 61, 15, 65, 61, 56,
       11, 49, 65, 11, 61, 61, 11])

In [92]:
na_age_titanic.head(10)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22,1,0,7.25,S
1,1,1,female,38,1,0,71.2833,C
2,1,3,female,26,0,0,7.925,S
3,1,1,female,35,1,0,53.1,S
4,0,3,male,35,0,0,8.05,S
5,0,3,male,-1,0,0,8.4583,Q
6,0,1,male,54,0,0,51.8625,S
7,0,3,male,2,3,1,21.075,S
8,1,3,female,27,0,2,11.1333,S
9,1,2,female,14,1,0,30.0708,C


In [93]:
only_na_age_titanic = na_age_titanic[na_age_titanic["Age"] == -1]
no_na_age_titanic = na_age_titanic[na_age_titanic["Age"] != -1]
only_na_age_titanic["Age"] = gnb_pred
# only_na_age_titanic.head(10)
df_agefill = pd.concat([no_na_age_titanic, only_na_age_titanic], axis = 0)
# only_na_age_titanic
# no_na_age_titanic

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [94]:
df_agefill.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 0 to 888
Data columns (total 8 columns):
Survived    891 non-null category
Pclass      891 non-null category
Sex         891 non-null category
Age         891 non-null int32
SibSp       891 non-null category
Parch       891 non-null category
Fare        891 non-null float64
Embarked    891 non-null category
dtypes: category(6), float64(1), int32(1)
memory usage: 23.8 KB


Fill Missing value in Emabrked by Navi Baye

In [95]:
gnb_pred = fill_miss(df_agefill, "Embarked", "N")
gnb_pred

  y = column_or_1d(y, warn=True)


array(['C', 'C'], dtype='<U1')

In [96]:
na_em = df_agefill[df_agefill["Embarked"] == "N"]
not_na_em = df_agefill[df_agefill["Embarked"] != "N"]
na_em["Embarked"] = gnb_pred
df_agefill = pd.concat([not_na_em, na_em], axis = 0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Check the data frame after fill in the Missing value

In [97]:
final_df_train = df_agefill.sort_index()
final_df_train.Embarked = final_df_train.Embarked.astype("category")
final_df_train.head(10)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22,1,0,7.25,S
1,1,1,female,38,1,0,71.2833,C
2,1,3,female,26,0,0,7.925,S
3,1,1,female,35,1,0,53.1,S
4,0,3,male,35,0,0,8.05,S
5,0,3,male,65,0,0,8.4583,Q
6,0,1,male,54,0,0,51.8625,S
7,0,3,male,2,3,1,21.075,S
8,1,3,female,27,0,2,11.1333,S
9,1,2,female,14,1,0,30.0708,C


Set model in Train data

Logistic Regressiion

In [98]:
x_train = final_df_train.iloc[:, 1:]
y_train = final_df_train.iloc[:, 0]
X = pd.get_dummies(x_train, prefix_sep='_', drop_first=True)
Y = y_train
glm = LogisticRegression()
glm.fit(X,Y)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

Prediction

In [99]:
x_test_file = pd.read_csv("test.csv")
x_test_file.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Name           418 non-null object
Sex            418 non-null object
Age            332 non-null float64
SibSp          418 non-null int64
Parch          418 non-null int64
Ticket         418 non-null object
Fare           417 non-null float64
Cabin          91 non-null object
Embarked       418 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


In [100]:
x_test = x_test_file.drop(columns = ["PassengerId", "Cabin", "Name", "Ticket"])

x_test["Age"] = x_test["Age"].fillna(-1)
x_test["Fare"] = x_test["Fare"].fillna(-1)

x_test[["Pclass", "Sex", "SibSp", "Parch", "Embarked"]] = x_test[["Pclass", "Sex", "SibSp", "Parch", "Embarked"]].astype("category")
x_test["Fare"] = x_test["Fare"].astype("float")
x_test["Age"] = x_test["Age"].astype("int")

x_test.info()
# predictions = lm.predict(fill_na_titanic_test)
# pd.DataFrame({"PassengerId": x_test_file.iloc[:, 0],"Survived": predictions})
# x_test_file.iloc[:, 0]
# len(predictions)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 7 columns):
Pclass      418 non-null category
Sex         418 non-null category
Age         418 non-null int32
SibSp       418 non-null category
Parch       418 non-null category
Fare        418 non-null float64
Embarked    418 non-null category
dtypes: category(5), float64(1), int32(1)
memory usage: 8.1 KB


In [101]:
gnb_pred = fill_miss(x_test, "Age", -1)

  y = column_or_1d(y, warn=True)


In [102]:
no_na_age_x_test = x_test[x_test["Age"] == -1]
only_na_age_x_test = x_test[x_test["Age"] != -1]
# only_na_age_x_test
df_agefill_xtest = pd.concat([no_na_age_x_test, only_na_age_x_test], axis = 0)
df_agefill_xtest = df_agefill_xtest.sort_index()
# df_agefill_xtest

In [103]:
df_agefill_xtest["Fare"] = df_agefill_xtest["Fare"].astype("int")

gnb_pred = fill_miss(df_agefill_xtest, "Fare", -1.0)
gnb_pred

  y = column_or_1d(y, warn=True)


array([9])

In [104]:
gnb_pred[0]

9

In [110]:
no_na_fare_x_test = df_agefill_xtest[df_agefill_xtest["Fare"] != -1]
# df_agefill_xtest.at[df_agefill_xtest["Fare"] == -1].Fare = 9
only_na_fare_x_test = df_agefill_xtest[df_agefill_xtest["Fare"] == -1]

# df_agefill_xtest.at[[df_agefill_xtest["Fare"] == -1],['Fare']]
only_na_fare_x_test["Fare"] = gnb_pred
only_na_fare_x_test
df_farefill_xtest = pd.concat([no_na_fare_x_test, only_na_fare_x_test], axis = 0)
df_farefill_xtest = df_agefill_xtest.sort_index()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False
5,False,False,False,False,False,False,False
6,False,False,False,False,False,False,False
7,False,False,False,False,False,False,False
8,False,False,False,False,False,False,False
9,False,False,False,False,False,False,False


In [107]:
df_farefill_xtest.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 418 entries, 0 to 417
Data columns (total 7 columns):
Pclass      418 non-null category
Sex         418 non-null category
Age         418 non-null int32
SibSp       418 non-null category
Parch       418 non-null category
Fare        418 non-null int32
Embarked    418 non-null category
dtypes: category(5), int32(2)
memory usage: 9.6 KB


In [123]:
test.info()
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 418 entries, 0 to 417
Data columns (total 20 columns):
Age           418 non-null int32
Fare          418 non-null int32
Pclass_2      418 non-null uint8
Pclass_3      418 non-null uint8
Sex_male      418 non-null uint8
SibSp_1       418 non-null uint8
SibSp_2       418 non-null uint8
SibSp_3       418 non-null uint8
SibSp_4       418 non-null uint8
SibSp_5       418 non-null uint8
SibSp_8       418 non-null uint8
Parch_1       418 non-null uint8
Parch_2       418 non-null uint8
Parch_3       418 non-null uint8
Parch_4       418 non-null uint8
Parch_5       418 non-null uint8
Parch_6       418 non-null uint8
Parch_9       418 non-null uint8
Embarked_Q    418 non-null uint8
Embarked_S    418 non-null uint8
dtypes: int32(2), uint8(18)
memory usage: 13.9 KB
<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 0 to 829
Data columns (total 19 columns):
Age           891 non-null int32
Fare          891 non-null float64
Pclass_2    

In [126]:
gnb = GaussianNB()

X = df_agefill.drop(columns = ["Survived"])
X = pd.get_dummies(X, prefix_sep='_', drop_first=True)
X["Parch_9"] = 0


Y = df_agefill[["Survived"]]
gnb.fit(X, Y)

test = pd.get_dummies(df_farefill_xtest, prefix_sep='_', drop_first=True)
# test
gnb_pred = gnb.predict(test)
gnb_pred

  y = column_or_1d(y, warn=True)


array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

Produce the final result DF

In [135]:
x_test_file
df_pred = pd.DataFrame(gnb_pred, columns =  ["Survived"])
result = pd.concat([x_test_file["PassengerId"],df_pred ], axis=1, sort=False)
result.head(10)
result.info()

Unnamed: 0,PassengerId,Survived
0,892,1
1,893,1
2,894,1
3,895,1
4,896,1
5,897,1
6,898,1
7,899,1
8,900,1
9,901,1


In [136]:
result.to_csv('titanic_result.csv')