In [465]:
import numpy as np
import pandas as pd
import csv

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn import preprocessing
from sklearn.preprocessing import OneHotEncoder
import seaborn as sns

Read the data

In [511]:
with open("train.csv") as csvfile:
    titanic = pd.read_csv(csvfile)

titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB
None


Drop the NA columns ( and predict the missing values)

In [692]:
fill_na_titanic["Age"] = titanic["Age"].fillna(-1)
fill_na_titanic["Embarked"] = titanic["Embarked"].fillna("N")
fill_na_titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null category
Survived       891 non-null category
Pclass         891 non-null category
Name           891 non-null category
Sex            891 non-null category
Age            891 non-null float64
SibSp          891 non-null category
Parch          891 non-null category
Ticket         891 non-null category
Fare           891 non-null float64
Cabin          204 non-null category
Embarked       891 non-null object
dtypes: category(9), float64(2), object(1)
memory usage: 158.7+ KB


Define the categorical data

In [695]:
fill_na_titanic[["Survived", "Pclass", "Sex", "SibSp", "Parch", "Embarked"]] = fill_na_titanic[["Survived", "Pclass", "Sex", "SibSp", "Parch", "Embarked"]].astype("category")
# fill_na_titanic['Sex'] = fill_na_titanic['Sex'].cat.reorder_categories(['male', 'female'], ordered=True)
# fill_na_titanic['Sex'] = fill_na_titanic['Sex'].cat.codes

# fill_na_titanic['Embarked'] = fill_na_titanic['Embarked'].cat.reorder_categories(['C', 'Q', 'S'], ordered=True)
# fill_na_titanic['Embarked'] = fill_na_titanic['Embarked'].cat.codes

fill_na_titanic["Fare"] = fill_na_titanic["Fare"].astype("float")
fill_na_titanic["Age"] = fill_na_titanic["Age"].astype("int")
fill_na_titanic["Embarked"] = fill_na_titanic["Embarked"].astype("category")

In [696]:
fill_na_titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null category
Survived       891 non-null category
Pclass         891 non-null category
Name           891 non-null category
Sex            891 non-null category
Age            891 non-null int32
SibSp          891 non-null category
Parch          891 non-null category
Ticket         891 non-null category
Fare           891 non-null float64
Cabin          204 non-null category
Embarked       891 non-null category
dtypes: category(10), float64(1), int32(1)
memory usage: 149.3 KB


Fill Miss value in Age by Navi Baye

In [697]:
na_age_titanic = fill_na_titanic.drop(columns=["PassengerId", "Name", "Ticket", "Cabin"])
def fill_miss(df, col, na):
    no_na_df = df[df[col] != na]
    no_na_df_feature = no_na_df.drop(columns = [col])
    no_na_df_target = no_na_df[[col]]
    # no_na_age_titanic
    only_na_df_test = df[df[col] == na]
    # only_na_age_titanic
    na_df_test = only_na_df_test.drop(columns=[col])
    # na_age_test
    X = pd.get_dummies(no_na_df_feature, prefix_sep='_', drop_first=True)
    Y = no_na_df_target

    gnb = GaussianNB()
    gnb.fit(X, Y)

    test = pd.get_dummies(na_df_test, prefix_sep='_', drop_first=True)
    gnb_pred = gnb.predict(test)
    return gnb_pred

gnb_pred = fill_miss(na_age_titanic, "Age",-1)
gnb_pred

  y = column_or_1d(y, warn=True)


array([65, 55, 13, 65, 15, 61, 49, 15, 20, 65, 61, 70, 15, 37, 56, 71, 49,
       61, 61, 15, 61, 61, 61, 55, 15, 61, 65, 49, 11, 61, 61, 11, 56, 61,
        9, 11, 46, 61, 15, 65, 15, 11, 70, 61,  9, 55, 20, 15, 61, 56, 65,
       15, 61, 15, 59, 61, 71, 55, 15, 44, 30, 61, 56, 11, 44, 63, 61, 63,
       61, 65, 15, 15, 70, 13, 15, 49, 61, 65,  9, 61, 65, 59, 55, 65, 61,
       65, 63, 55, 47, 61, 63, 65, 61, 59, 65, 61, 61, 59,  9, 47, 65, 61,
       15, 55, 61, 65, 65, 65, 64, 65,  5, 61, 62, 65, 58, 65, 61, 55, 65,
       15, 20, 65, 61,  7, 55, 65, 61, 61, 61, 15, 65, 65, 61, 20, 55, 61,
       61, 15, 61, 74, 63, 59, 15, 55, 15, 49, 61, 65, 15, 59, 61, 61, 80,
       61, 71, 70, 65, 65, 65, 11, 65, 11, 71, 61, 65, 61, 15, 65, 61, 56,
       11, 49, 65, 11, 61, 61, 11])

In [679]:
only_na_age_titanic["Age"] = gnb_pred
only_na_age_titanic
df_agefill = pd.concat([no_na_age_titanic, only_na_age_titanic], axis = 0)
# only_na_age_titanic
# no_na_age_titanic

In [706]:
df_agefill.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 0 to 888
Data columns (total 8 columns):
Survived    891 non-null category
Pclass      891 non-null category
Sex         891 non-null category
Age         891 non-null int32
SibSp       891 non-null category
Parch       891 non-null category
Fare        891 non-null float64
Embarked    891 non-null category
dtypes: category(6), float64(1), int32(1)
memory usage: 23.8 KB


Fill Missing value in Emabrked by Navi Baye

In [681]:
gnb_pred = fill_miss(df_agefill, "Embarked", "N")
gnb_pred

  y = column_or_1d(y, warn=True)


array(['C', 'C'], dtype='<U1')

In [627]:
na_em["Embarked"] = gnb_pred
df_agefill = pd.concat([not_na_em, na_em], axis = 0)

Check the data frame after fill in the Missing value

In [646]:
final_df_train = df_agefill.sort_index()
final_df_train.Embarked = final_df_train.Embarked.astype("category")

str

Set model in Train data

Logistic Regressiion

In [648]:
x_train = final_df_train.iloc[:, 1:]
y_train = final_df_train.iloc[:, 0]
X = pd.get_dummies(x_train, prefix_sep='_', drop_first=True)
Y = y_train
glm = LogisticRegression()
glm.fit(X,Y)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

Prediction

In [661]:
x_test_file = pd.read_csv("test.csv")
x_test_file.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Name           418 non-null object
Sex            418 non-null object
Age            332 non-null float64
SibSp          418 non-null int64
Parch          418 non-null int64
Ticket         418 non-null object
Fare           417 non-null float64
Cabin          91 non-null object
Embarked       418 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


In [698]:
x_test = x_test_file.drop(columns = ["PassengerId", "Cabin", "Name", "Ticket"])

x_test["Age"] = x_test["Age"].fillna(-1)
x_test["Fare"] = x_test["Fare"].fillna(-1)

x_test[["Pclass", "Sex", "SibSp", "Parch", "Embarked"]] = x_test[["Pclass", "Sex", "SibSp", "Parch", "Embarked"]].astype("category")
x_test["Fare"] = x_test["Fare"].astype("float")
x_test["Age"] = x_test["Age"].astype("int")

x_test.info()
# predictions = lm.predict(fill_na_titanic_test)
# pd.DataFrame({"PassengerId": x_test_file.iloc[:, 0],"Survived": predictions})
# x_test_file.iloc[:, 0]
# len(predictions)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 7 columns):
Pclass      418 non-null category
Sex         418 non-null category
Age         418 non-null int32
SibSp       418 non-null category
Parch       418 non-null category
Fare        418 non-null float64
Embarked    418 non-null category
dtypes: category(5), float64(1), int32(1)
memory usage: 8.1 KB


In [735]:
gnb_pred = fill_miss(x_test, "Age", -1)

  y = column_or_1d(y, warn=True)


In [779]:
# only_na_age_x_test
df_agefill_xtest = pd.concat([no_na_age_x_test, only_na_age_x_test], axis = 0)
df_agefill_xtest = df_agefill_xtest.sort_index()
# df_agefill_xtest

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,male,34,0,0,7.8292,Q
1,3,female,47,1,0,7.0000,S
2,2,male,62,0,0,9.6875,Q
3,3,male,27,0,0,8.6625,S
4,3,female,22,1,1,12.2875,S
5,3,male,14,0,0,9.2250,S
6,3,female,30,0,0,7.6292,Q
7,2,male,26,1,1,29.0000,S
8,3,female,18,0,0,7.2292,C
9,3,male,21,2,0,24.1500,S


In [738]:
df_agefill_xtest["Fare"] = df_agefill_xtest["Fare"].astype("int")

gnb_pred = fill_miss(df_agefill_xtest, "Fare", -1.0)
gnb_pred

  y = column_or_1d(y, warn=True)


array([9])

In [756]:
gnb_pred[0]

9

In [791]:
# no_na_fare_x_test = df_agefill_xtest[df_agefill_xtest["Fare"] != -1]
#df_agefill_xtest.at[df_agefill_xtest["Fare"] == -1].Fare = 9

# only_na_fare_x_test
df_agefill_xtest.at[[df_agefill_xtest["Fare"] == -1],['Fare']]
# only_na_age_x_test
# df_agefill_xtest = pd.concat([no_na_fare_x_test, only_na_fare_x_test], axis = 0)
# df_agefill_xtest = df_agefill_xtest.sort_index()

ValueError: At based indexing on an integer index can only have integer indexers

In [781]:
df_agefill_xtest.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 418 entries, 0 to 417
Data columns (total 7 columns):
Pclass      418 non-null category
Sex         418 non-null category
Age         418 non-null int32
SibSp       418 non-null category
Parch       418 non-null category
Fare        418 non-null float64
Embarked    418 non-null category
dtypes: category(5), float64(1), int32(1)
memory usage: 11.2 KB
