In [23]:
!ls ~/my_drive/kaggle/

predict.csv  results1.csv  results2.csv  test.csv  train.csv


In [24]:
import math

from IPython import display
from matplotlib import cm
from matplotlib import gridspec
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn.metrics as metrics
import tensorflow as tf
from tensorflow.python.data import Dataset

# Only surface TensorFlow errors, and keep pandas output compact:
# at most 20 rows, floats rendered with one decimal place.
tf.logging.set_verbosity(tf.logging.ERROR)
pd.options.display.max_rows = 20
pd.options.display.float_format = '{:.1f}'.format

titanic_test = pd.read_csv('/content/my_drive/kaggle/test.csv')
titanic_train = pd.read_csv('/content/my_drive/kaggle/train.csv')

# Shuffle the training rows with a dedicated, seeded generator. The unseeded
# global np.random state produced a different row order on every run, which
# made every downstream result depend on the run.
SHUFFLE_SEED = 42
titanic_train = titanic_train.reindex(
    np.random.RandomState(SHUFFLE_SEED).permutation(titanic_train.index))
#titanic_test = titanic_test.reindex(np.random.permutation(titanic_test.index))
titanic_train

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
51,52,0,3,"Nosworthy, Mr. Richard Cater",male,21.0,0,0,A/4. 39886,7.8,,S
299,300,1,1,"Baxter, Mrs. James (Helene DeLaudeniere Chaput)",female,50.0,0,1,PC 17558,247.5,B58 B60,C
867,868,0,1,"Roebling, Mr. Washington Augustus II",male,31.0,0,0,PC 17590,50.5,A24,S
536,537,0,1,"Butt, Major. Archibald Willingham",male,45.0,0,0,113050,26.6,B38,S
26,27,0,3,"Emir, Mr. Farred Chehab",male,,0,0,2631,7.2,,C
663,664,0,3,"Coleff, Mr. Peju",male,36.0,0,0,349210,7.5,,S
295,296,0,1,"Lewy, Mr. Ervin G",male,,0,0,PC 17612,27.7,,C
225,226,0,3,"Berglund, Mr. Karl Ivar Sven",male,22.0,0,0,PP 4348,9.3,,S
799,800,0,3,"Van Impe, Mrs. Jean Baptiste (Rosalie Paula Go...",female,30.0,1,1,345773,24.1,,S
52,53,1,1,"Harper, Mrs. Henry Sleeper (Myna Haxtun)",female,49.0,1,0,PC 17572,76.7,D33,C


In [82]:
# Pool train and test so the imputation statistics below are computed over
# the combined passenger list.
all_data = pd.concat([titanic_train, titanic_test])

# Mean age per sex; used further down to replace missing ages.
mean_male = all_data.loc[all_data["Sex"] == "male", "Age"].mean()
mean_female = all_data.loc[all_data["Sex"] == "female", "Age"].mean()
print("female mean age: %1.0f" % mean_female)
print("male mean age: %1.0f" % mean_male)

# Missing fares are replaced by the pooled mean fare.
mean_fare = all_data["Fare"].mean()
all_data["Fare"] = all_data["Fare"].fillna(mean_fare)
titanic_test["Fare"] = titanic_test["Fare"].fillna(mean_fare)

for frame in (titanic_train, titanic_test):
    # A missing embarkation port is filled with "S".
    frame["Embarked"] = frame["Embarked"].fillna("S")

for frame in (titanic_train, titanic_test):
    # A missing age is filled with the mean age of the passenger's sex.
    for sex, sex_mean_age in (("male", mean_male), ("female", mean_female)):
        needs_age = (frame["Sex"] == sex) & frame["Age"].isnull()
        frame.loc[needs_age, "Age"] = sex_mean_age

def get_title(name):
    """Extract the honorific (e.g. "Mr", "Miss") from a passenger name.

    Names are expected in the form "Surname, Title. Given names". The title
    is the text between the first comma and the following period, stripped
    of surrounding whitespace.

    Args:
        name: The passenger name. Non-string values (such as NaN for a
            missing name) are accepted.

    Returns:
        The extracted title, or 'Unknown' when `name` is not a string,
        contains no '.', or contains no ','.
    """
    # Missing names arrive from pandas as float NaN; `'.' in name` would
    # raise TypeError on those.
    if not isinstance(name, str) or '.' not in name:
        return 'Unknown'

    parts = name.split(',')
    # Without a comma there is no "Surname," prefix, so parts[1] would raise
    # IndexError.
    if len(parts) < 2:
        return 'Unknown'

    return parts[1].split('.')[0].strip()
      
# Derive the "Title" column from "Name" in the pooled frame and in both
# model frames, using the same extraction function for all three.
for frame in (all_data, titanic_test, titanic_train):
    frame["Title"] = frame["Name"].apply(get_title)

# Cell output: number of passengers per title across train + test.
# (The earlier duplicate of this expression was never displayed and has been
# dropped.)
all_data.groupby(['Title']).size()

female mean age: 29
male mean age: 31


Title
Capt              1
Col               4
Don               1
Dona              1
Dr                8
Jonkheer          1
Lady              1
Major             2
Master           61
Miss            260
Mlle              2
Mme               1
Mr              757
Mrs             197
Ms                2
Rev               8
Sir               1
the Countess      1
dtype: int64

In [0]:

def build_feature_columns():
  """Assemble the feature columns passed to the classifier.

  Returns:
    A list of four feature columns, in this order: the "Sex" vocabulary
    column, the "Pclass" vocabulary column, the hashed "Title" column and
    the numeric "Fare" column.
  """
  # Numeric inputs.
  age_numeric = tf.feature_column.numeric_column("Age")
  fare_numeric = tf.feature_column.numeric_column("Fare")

  # Categorical inputs with an explicit vocabulary.
  sex_column = tf.feature_column.categorical_column_with_vocabulary_list(
      "Sex", ["female", "male"])
  pclass_column = tf.feature_column.categorical_column_with_vocabulary_list(
      "Pclass", [1, 2, 3])

  # Titles are hashed into 19 buckets rather than listed explicitly.
  title_column = tf.feature_column.categorical_column_with_hash_bucket(
      "Title", 19)

  # Disabled experiment:
  #embarked_column = tf.feature_column.categorical_column_with_vocabulary_list("Embarked",["Q", "S","C"])

  # NOTE(review): the bucketized age column is built here but is not part of
  # the returned list, so "Age" does not reach the model - confirm whether
  # that is intended.
  age_bucket = tf.feature_column.bucketized_column(
      age_numeric, [6, 12, 21, 30, 60])

  # Disabled experiment:
  #pclass_x_sex = tf.feature_column.crossed_column([pclass_column, sex_column], 10)

  return [sex_column, pclass_column, title_column, fare_numeric]
  

In [94]:
# Sample 80% of the rows (without replacement) as training data.
train_set = titanic_train.sample(frac=0.8, replace=False, random_state=777)
# The other 20% is reserved for cross validation: every row whose index label
# was not drawn into train_set. drop() replaces the former .loc[set(...)]
# lookup, because a set has no defined order and newer pandas releases reject
# a set as an indexer.
cv_set = titanic_train.drop(train_set.index)

cv_set.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 178 entries, 1 to 509
Data columns (total 13 columns):
PassengerId    178 non-null int64
Survived       178 non-null int64
Pclass         178 non-null int64
Name           178 non-null object
Sex            178 non-null object
Age            178 non-null float64
SibSp          178 non-null int64
Parch          178 non-null int64
Ticket         178 non-null object
Fare           178 non-null float64
Cabin          46 non-null object
Embarked       178 non-null object
Title          178 non-null object
dtypes: float64(2), int64(5), object(6)
memory usage: 19.5+ KB


In [0]:
# Create a TensorFlow LinearClassifier estimator that uses the feature
# columns returned by build_feature_columns().
estimator = tf.estimator.LinearClassifier(
    feature_columns=build_feature_columns())

In [0]:
# Training input: five feature columns plus the Survived label, taken from
# the 80% training split, shuffled, with no limit on the number of epochs.
train_input_fn = tf.estimator.inputs.pandas_input_fn(
    x=train_set[["Sex", "Pclass", "Age", "Fare", "Title"]],
    y=train_set["Survived"],
    num_epochs=None,
    shuffle=True,
    target_column='target',
)

# Validation input: the same columns from the held-out split, read exactly
# once and in order.
cv_input_fn = tf.estimator.inputs.pandas_input_fn(
    x=cv_set[["Sex", "Pclass", "Age", "Fare", "Title"]],
    y=cv_set["Survived"],
    num_epochs=1,
    shuffle=False,
)

In [0]:
# Train the estimator on train_input_fn for 5000 steps. The value returned by
# train() is kept as `model`, which the prediction cell calls predict() on.
model = estimator.train(input_fn=train_input_fn, steps=5000)

In [98]:
# Evaluate the estimator on cv_input_fn and print the resulting accuracy.
# NOTE(review): the printed label says "Test", but cv_input_fn is built from
# cv_set (the split held out of train.csv), not from the Kaggle test file.
scores = estimator.evaluate(input_fn=cv_input_fn)
print("\nTest Accuracy: {0:f}\n".format(scores['accuracy']))


Test Accuracy: 0.797753



In [32]:
# Preview the first rows of the test frame.
titanic_test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.7,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.7,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.3,,S


In [0]:



# Input pipeline for the unlabeled test passengers: a single pass over the
# frame, in its existing row order (shuffle=False), with no label column.
prediction_input_fn = tf.estimator.inputs.pandas_input_fn(
      x=titanic_test[["Sex","Pclass", "Title", "Age", "Fare"]],
      y=None,
      num_epochs=1,
      shuffle=False,
      target_column='target',
)
# Each prediction is a dict; keep the first entry of its 'class_ids' value.
test_predictions = list(model.predict(input_fn=prediction_input_fn))
test_predictions = np.array([item['class_ids'][0] for item in test_predictions])

# Guard against a silent mismatch between predictions and passengers.
assert len(test_predictions) == len(titanic_test), (
    "number of predictions does not match number of test rows")

# Pair predictions with passenger ids by position. Wrapping the predictions
# in a fresh pd.Series would align them by index label instead, which yields
# NaN (and a float column) whenever the test index is not exactly 0..n-1.
result = pd.DataFrame(
    {
        'PassengerId': titanic_test['PassengerId'].values,
        'Survived': test_predictions,
    },
    columns=['PassengerId', 'Survived'],
)
result.to_csv(path_or_buf="~/my_drive/kaggle/predict4.csv", index=False)