In [1]:
import pandas as pd

train_df = pd.read_csv("./input/train.csv");
test_df = pd.read_csv("./input/test.csv");

In [2]:
train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
test_df.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


### 数据概览

#### 将文本转换数值

将性别(male, female)转换为(1, -1):

In [4]:
sex_map = {'male':1, 'female':-1}
train_df['Sex'] = train_df['Sex'].map(sex_map)
test_df['Sex'] = test_df['Sex'].map(sex_map)

Embark映射成1，2和3

In [5]:
embarked_map = {'S':1, 'C':2, 'Q':3}
train_df['Embarked'] = train_df['Embarked'].map(embarked_map)
test_df['Embarked'] = test_df['Embarked'].map(embarked_map)

将Name转换为Family

In [6]:
# train_df['Family'] = train_df.Name.str.extract("([a-zA-Z]*),")[0]
# test_df['Family'] = test_df.Name.str.extract("([a-zA-Z]*),")[0]

# family = pd.concat([train_df['Family'], test_df['Family']])
# name_unique = family.unique()
# name_map={}
# for i in range(name_unique.shape[0]):
#     name_map[name_unique[i]] = i

# train_df['Family'] = train_df['Family'].map(name_map)
# test_df['Family'] = test_df['Family'].map(name_map)

train_df.drop(columns=['Name'], inplace=True)
test_df.drop(columns=['Name'], inplace=True)

#### 删除无用数据

Ticket被我判定为无用数据，删掉

In [7]:
train_df.drop(columns=['Ticket'],inplace=True)
test_df.drop(columns=['Ticket'],inplace=True)

#### 空数据

In [8]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 10 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Sex            891 non-null int64
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null float64
dtypes: float64(3), int64(6), object(1)
memory usage: 69.7+ KB


In [9]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 9 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Sex            418 non-null int64
Age            332 non-null float64
SibSp          418 non-null int64
Parch          418 non-null int64
Fare           417 non-null float64
Cabin          91 non-null object
Embarked       418 non-null int64
dtypes: float64(2), int64(6), object(1)
memory usage: 29.5+ KB


cabin空数据太多，直接舍去。Embarked和Fare只有一个空数据，直接舍去。

In [10]:
train_df.drop(columns=['Cabin'],inplace=True)
test_df.drop(columns=['Cabin'],inplace=True)

train_df.dropna(subset=['Embarked'],inplace=True)

test_df.Fare.fillna(test_df.Fare.mean(),inplace=True)

Age的空数据处理

In [11]:
train_target_series = train_df.Survived

train_df.index = train_df.PassengerId
train_df.drop(columns=['PassengerId', 'Survived'], inplace=True)

test_df.index = test_df.PassengerId
test_df.drop(columns=['PassengerId'], inplace=True)

In [12]:
from sklearn import preprocessing

def fill_age_by_mean():
    total_age = pd.concat([train_df.Age, test_df.Age])
    age_mean = total_age.mean()
    return (train_df.fillna(age_mean), test_df.fillna(age_mean))

def fill_age_by_median():
    total_age = pd.concat([train_df.Age, test_df.Age])
    age_median = total_age.median()
    return (train_df.fillna(age_median), test_df.fillna(age_median))

def fill_age_by_mode():
    total_age = pd.concat([train_df.Age, test_df.Age])
    age_mode = total_age.mode()[0]
    return (train_df.fillna(age_mode), test_df.fillna(age_mode))


def fill_age_by_hot_deck():
    from sklearn import preprocessing
    train_df_1 = pd.DataFrame(preprocessing.StandardScaler().fit_transform(train_df), train_df.index, train_df.columns)
    test_df_1 = pd.DataFrame(preprocessing.StandardScaler().fit_transform(test_df), test_df.index, test_df.columns)

    train_df_1['Age'] =train_df['Age']
    test_df_1['Age'] =test_df['Age']
    
    import numpy as np
    tmp = np.linspace(0,100,10)
    train_df_1['Age']=pd.cut(train_df.Age, tmp, labels=False, retbins=True, right=False)[0]
    test_df_1['Age']=pd.cut(test_df.Age, tmp, labels=False, retbins=True, right=False)[0]

    total = pd.concat([train_df_1, test_df_1])
    total.dropna(subset=['Age'],inplace=True)
    
    age = total.Age
    total.drop(columns=['Age'],inplace=True)
    
    from sklearn.neighbors import KNeighborsClassifier
    knn = KNeighborsClassifier(n_neighbors=1)
    knn.fit(total, age)
    
    nan_age_df = train_df.loc[train_df.Age.isna()]
    nan_age_df.drop(columns=['Age'], inplace=True)
    age_predict_series = knn.predict(nan_age_df)
    
    for i in range(age_predict_series.shape[0]):
        train_df_1.at[nan_age_df.index[i], 'Age'] = age_predict_series[i]
    
    nan_age_df = test_df.loc[test_df.Age.isna()]
    nan_age_df.drop(columns=['Age'], inplace=True)
    age_predict_series = knn.predict(nan_age_df)
    
    for i in range(age_predict_series.shape[0]):
        test_df_1.at[nan_age_df.index[i], 'Age'] = age_predict_series[i]
    
    return (train_df_1, test_df_1)

In [13]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier

fill_methods = [
    fill_age_by_mean,
    fill_age_by_median,
    fill_age_by_mode,
    fill_age_by_hot_deck
]

clfs = [
    LogisticRegression(random_state=0),
    SVC(gamma='auto'),
    KNeighborsClassifier(n_neighbors=3),
    DecisionTreeClassifier(),
    RandomForestClassifier(n_estimators=100, max_depth=2, random_state=0),
    GaussianNB(),
    MLPClassifier(solver='lbfgs', alpha=1e-5,hidden_layer_sizes=(5, 2), random_state=1)
]

In [14]:
from sklearn.model_selection import train_test_split

best_clf = None
best_score = -1
best_fill_method = None
for fill_method in fill_methods:
    (train_X, test_X) = fill_method()
    X_train, X_test, y_train, y_test = train_test_split(train_X, train_target_series, test_size=0.25, random_state=0)
    
    for clf in clfs:
        if not isinstance(clf, (DecisionTreeClassifier, RandomForestClassifier)): 
            X_train = pd.DataFrame(preprocessing.StandardScaler().fit_transform(X_train), X_train.index, X_train.columns)
            X_test = pd.DataFrame(preprocessing.StandardScaler().fit_transform(X_test), X_test.index, X_test.columns)
        clf = clf.fit(X_train, y_train)
        score = clf.score(X_test, y_test)
        if score > best_score:
            best_score = score
            best_clf = clf
            best_fill_method = fill_method

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/index

In [15]:
print(best_fill_method)
print(best_score)
print(best_clf)

<function fill_age_by_median at 0x000000001817BD08>
0.7847533632286996
SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)


In [16]:
(train_X, test_X) = best_fill_method()

if not isinstance(clf, (DecisionTreeClassifier, RandomForestClassifier)): 
    train_X = pd.DataFrame(preprocessing.StandardScaler().fit_transform(train_X), train_X.index, train_X.columns)
    test_X = pd.DataFrame(preprocessing.StandardScaler().fit_transform(test_X), test_X.index, test_X.columns)

best_clf.fit(train_X, train_target_series)

result_df = pd.DataFrame({'PassengerId': test_df.index, 'Survived':best_clf.predict(test_X)})
result_df.to_csv('./output/result.csv', index= False)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
