In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split 
import os
import warnings # Ignore warnings
warnings.filterwarnings('ignore')
# Show every column and every row when a DataFrame is displayed.
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)
np.set_printoptions(precision=2)

# Input data
df_train, df_test = (pd.read_csv(csv_path)
                     for csv_path in ('../input/train.csv', '../input/test.csv'))

# Any results you write to the current directory are saved as output.

In [None]:
# Display the describe() summary of the training frame as the cell output.
df_train.describe()

In [None]:
# Compare the Age distribution of the training set and the test set.
# Both curves are drawn on one explicit Axes and the legend is requested
# explicitly: the `label=` arguments alone do not make a legend appear, and
# without it the two curves cannot be told apart.
fig, ax = plt.subplots()
sns.kdeplot(df_train.Age.dropna(), label='df_train', ax=ax)
sns.kdeplot(df_test.Age.dropna(), label='df_test', ax=ax)
ax.set(title='Age distribution: train vs. test', xlabel='Age')
ax.legend();

In [None]:
# Compare the Fare distribution of the training set and the test set.
# Both curves are drawn on one explicit Axes and the legend is requested
# explicitly: the `label=` arguments alone do not make a legend appear, and
# without it the two curves cannot be told apart.
fig, ax = plt.subplots()
sns.kdeplot(df_train.Fare.dropna(), label='df_train', ax=ax)
sns.kdeplot(df_test.Fare.dropna(), label='df_test', ax=ax)
ax.set(title='Fare distribution: train vs. test', xlabel='Fare')
ax.legend();

In [None]:
# Share of female passengers in the training and test sets, to two decimals.
female_share_train = (df_train.Sex == 'female').sum() / len(df_train)
female_share_test = (df_test.Sex == 'female').sum() / len(df_test)
report_template = 'female raito in df_train: {:.2f},\nfemale raito in df_test: {:.2f}'
print(report_template.format(female_share_train, female_share_test))

In [None]:
# Column names, non-null counts and dtypes of both frames.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() only appended a stray "None".
df_train.info()
print()
df_test.info()

In [None]:
# Plain-text describe() summaries of both frames, separated by one blank line.
print(df_train.describe(), df_test.describe(), sep='\n\n')

In [None]:
#df_full = df_train.append(df_test, ignore_index=True)
#df_full.head()

In [None]:
# Heat map of the pairwise correlations between the numeric training columns.
# The text columns are excluded explicitly: recent pandas no longer drops them
# silently in DataFrame.corr() and raises on non-numeric data instead.
corr = df_train.select_dtypes(include='number').corr()
# Named `fig` rather than `_`, which IPython uses for the last cell output.
fig, ax = plt.subplots(figsize=(12, 12))
cmap = sns.diverging_palette(220, 10, as_cmap=True)
sns.heatmap(corr, cmap=cmap, square=True, cbar_kws={'shrink': .9},
            ax=ax, annot=True, annot_kws={'fontsize': 12});

In [None]:
# Parameter----Age
#df_train['Age'] = df_train.Age.fillna(round(df_train.Age.mean()))
#df_test['Age'] = df_test.Age.fillna(round(df_test.Age.mean()))
def age_fill(full_data):
    """Return the ``Age`` column of ``full_data`` with missing values imputed.

    Each missing age is replaced by a random integer drawn uniformly from
    ``[mean - std, mean + std)`` of the observed ages, and the whole column is
    cast to ``int``. The input frame itself is not modified; assign the result
    back (``df['Age'] = age_fill(df)``) to store it.

    Parameters
    ----------
    full_data : pandas.DataFrame
        Frame with a numeric ``Age`` column that may contain NaN.

    Returns
    -------
    pandas.Series
        Integer ages, same index as ``full_data``, without missing values.

    Raises
    ------
    ValueError
        If ages are missing but no age is observed to impute from.
    """
    age = full_data['Age'].copy()
    missing = age.isnull()
    n_missing = int(missing.sum())
    if n_missing:
        age_avg = age.mean()
        if pd.isna(age_avg):
            raise ValueError('Age column has no observed values to impute from')
        age_std = age.std()
        if pd.isna(age_std):
            # A single observed value has no sample standard deviation.
            age_std = 0.0
        # randint needs integer bounds with low < high.
        low = int(age_avg - age_std)
        high = max(int(age_avg + age_std), low + 1)
        # .loc on the private copy avoids chained assignment on the caller's frame.
        age.loc[missing] = np.random.randint(low, high, size=n_missing)
    return age.astype(int)
# Store the imputed Age column on both frames (train first, then test).
for frame in (df_train, df_test):
    frame['Age'] = age_fill(frame)
        
# Parameter----Cabin
# Keep only the first character of each cabin value; missing cabins are
# filled with 'N' first, so they end up as 'N'.
for frame in (df_train, df_test):
    cabin_filled = frame['Cabin'].fillna('N')
    frame['Cabin'] = cabin_filled.map(lambda cabin: cabin[0])

# Parameter----Name
## a map of more aggregated titles, written group by group and then inverted
## into the title -> group lookup used below
_TITLE_GROUPS = {
    "Officer": ("Capt", "Col", "Major", "Dr", "Rev"),
    "Royalty": ("Jonkheer", "Don", "Sir", "the Countess", "Dona", "Lady"),
    "Mrs":     ("Mme", "Ms", "Mrs"),
    "Miss":    ("Mlle", "Miss"),
    "Mr":      ("Mr",),
    "Master":  ("Master",),
}
Name_Dictionary = {title: group
                   for group, titles in _TITLE_GROUPS.items()
                   for title in titles}


def _extract_title(full_name):
    """Return the stripped text after the first comma, up to the next period."""
    after_comma = full_name.split(',')[1]
    return after_comma.split('.')[0].strip()


# Replace each name by its title, then by the aggregated title group.
for frame in (df_train, df_test):
    frame['Name'] = frame['Name'].map(_extract_title).map(Name_Dictionary)

# Parameter----PassengerId
# The identifier column is removed from both frames.
for frame in (df_train, df_test):
    del frame['PassengerId']

# Parameter----Ticket
_TICKET_PUNCTUATION = str.maketrans('', '', ',./ ')


def _ticket_prefix(raw_ticket):
    """Return the first two characters of the ticket's first
    whitespace-separated token, after removing ',', '.', '/' and spaces."""
    first_token = raw_ticket.split()[0]
    return first_token.translate(_TICKET_PUNCTUATION).strip()[:2]


for frame in (df_train, df_test):
    frame['Ticket'] = frame['Ticket'].map(_ticket_prefix)
def nb_rid(ticket):
    """Replace purely numeric ticket prefixes with the marker ``'Nb'``.

    Parameters
    ----------
    ticket : pandas.Series
        Ticket prefixes (strings).

    Returns
    -------
    pandas.Series
        New Series with the same index in which every value made up only of
        digits is replaced by ``'Nb'``; all other values, including
        non-string ones, are returned unchanged. The input is not modified.
    """
    # The isinstance guard keeps a missing (non-string) value from raising.
    return ticket.map(
        lambda prefix: 'Nb' if isinstance(prefix, str) and prefix.isdigit() else prefix)
# Collapse the numeric ticket prefixes on both frames (train first, then test).
for frame in (df_train, df_test):
    frame['Ticket'] = nb_rid(frame.Ticket)


In [None]:
# Stack the training rows on top of the test rows with a fresh 0..n-1 index.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
df_full = pd.concat([df_train, df_test], ignore_index=True)

In [None]:
# Parameter----Age
# Bucket Age into named bands: <=15 'Child', (15, 35] 'Younth' (sic),
# (35, 50] 'Adult', >50 'Old'; a value that falls in no band (NaN) becomes
# 'Error'. pd.cut does this in one vectorised pass. The previous per-row
# `df_full.Age[i] = ...` was chained assignment that wrote strings into a
# numeric column, which is not guaranteed to update df_full.
age_bands = pd.cut(df_full['Age'],
                   bins=[-np.inf, 15, 35, 50, np.inf],
                   labels=['Child', 'Younth', 'Adult', 'Old'])
df_full['Age'] = age_bands.astype(object).where(age_bands.notna(), 'Error')
        
# Parameter----Fare
# Bucket Fare into named bands: <=8 'Low', (8, 32] 'Normal', (32, 67] 'High',
# >67 'Very high'; a value that falls in no band (NaN) becomes 'Error'.
# pd.cut does this in one vectorised pass. The previous per-row
# `df_full.Fare[i] = ...` was chained assignment that wrote strings into a
# numeric column, which is not guaranteed to update df_full.
fare_bands = pd.cut(df_full['Fare'],
                    bins=[-np.inf, 8, 32, 67, np.inf],
                    labels=['Low', 'Normal', 'High', 'Very high'])
df_full['Fare'] = fare_bands.astype(object).where(fare_bands.notna(), 'Error')

In [None]:
# Parameter----Relation
#df_full['Relation'] = df_full['Parch'] + df_full['SibSp']
#df_full.Relation


In [None]:
# One-hot encode the combined frame with get_dummies' default column selection.
df_full = pd.get_dummies(df_full)

# Parch, Pclass and SibSp are encoded one column at a time into their own
# prefixed indicator frames; the source columns are then dropped from df_full.
Parch, Pclass, SibSp = (
    pd.get_dummies(df_full[column], prefix=column)
    for column in ('Parch', 'Pclass', 'SibSp')
)

df_full = df_full.drop(columns=['Parch', 'Pclass', 'SibSp'])


In [None]:
# Append the Parch / Pclass / SibSp indicator columns to the right of df_full.
indicator_frames = [Parch, Pclass, SibSp]
df_full = pd.concat([df_full, *indicator_frames], axis='columns')

In [None]:
# Split the combined frame back into its training and test rows. df_full was
# built as [train rows, test rows], so the first len(df_train) rows are the
# training rows; this replaces the hard-coded 891.
n_train = len(df_train)
# .copy() makes the slices independent frames, so pop() below does not act on
# a view of df_full (SettingWithCopyWarning).
df_train = df_full.iloc[:n_train].copy()
df_test = df_full.iloc[n_train:].copy()
df_train_y = df_train.pop('Survived')

# Hold out part of the labelled rows to estimate accuracy.
X_train, X_test, y_train, y_test = train_test_split(df_train, df_train_y, random_state=0)
# random_state makes the forest, and therefore the scores, reproducible.
rf = RandomForestClassifier(n_estimators=300, random_state=0).fit(X_train, y_train)

# The model is a random forest; the messages previously mislabelled it as GBDT.
print('Accuracy of random forest classifier on training set: {:.2f}'
      .format(rf.score(X_train, y_train)))
print('Accuracy of random forest classifier on test set: {:.2f}'
      .format(rf.score(X_test, y_test)))

In [None]:
# Predict survival for the test rows; 'Survived' is dropped from the features.
test_Y = rf.predict(df_test.drop('Survived', axis=1)).astype(int)
# PassengerId was deleted from the frames earlier, so the ids are rebuilt
# here. The end of the range now follows the number of predictions instead of
# the hard-coded 1310.
# NOTE(review): assumes the test ids are consecutive and start at 892 -
# confirm, or keep the PassengerId column from test.csv instead.
passenger_id = pd.Series(np.arange(892, 892 + len(test_Y)))
test = pd.DataFrame({'PassengerId': passenger_id, 'Survived': test_Y})
test.to_csv('titanic_pred.csv', index=False)
# Last expression of the cell, so the preview is actually displayed.
test.head()