In [39]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Import supplementary visualizations code visuals.py
# import visuals as vs

# Pretty display for notebooks
%matplotlib inline

In [77]:
# Load the dataset
data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

# Print the first few entries of the RMS Titanic data
data.info()
data.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 90.5+ KB


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35,0,0,373450,8.05,,S


In [78]:
output = data['Survived']
drop_cols = ['PassengerId', 'Name', 'Ticket', 'Cabin']
data_red = data.drop(drop_cols + ['Survived'], axis = 1)
test_red = test_data.drop(drop_cols, axis = 1)
data_red.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,male,22,1,0,7.25,S
1,1,female,38,1,0,71.2833,C
2,3,female,26,0,0,7.925,S
3,1,female,35,1,0,53.1,S
4,3,male,35,0,0,8.05,S


In [79]:
print data['Embarked'].unique()
print data['Sex'].unique()

['S' 'C' 'Q' nan]
['male' 'female']


In [80]:
def clean_data(data):
    # Age and embark as nan value
    # replace the nan in age by the mean of age
    # replace the embarked nan by a numerical value of 3.
    age_mean = data['Age'].mean()
    data['Age'][np.isnan(data['Age'])] = age_mean
    data['Sex'] = (data['Sex'] == 'male').astype('int')
    data['Embarked'] = data['Embarked'].map({'S' : 0, 'C': 1, 'Q': 2, np.nan : 3}).astype('int')
clean_data(data_red)
clean_data(test_red)
data_red.info()
data_red.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 0 to 890
Data columns (total 7 columns):
Pclass      891 non-null int64
Sex         891 non-null int64
Age         891 non-null float64
SibSp       891 non-null int64
Parch       891 non-null int64
Fare        891 non-null float64
Embarked    891 non-null int64
dtypes: float64(2), int64(5)
memory usage: 55.7 KB


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,1,22,1,0,7.25,0
1,1,0,38,1,0,71.2833,1
2,3,0,26,0,0,7.925,0
3,1,0,35,1,0,53.1,0
4,3,1,35,0,0,8.05,0


In [81]:
# Import train_test_split
from sklearn.cross_validation import train_test_split

# Split the 'features' and 'income' data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(data_red, 
                                                    output, 
                                                    test_size = 0.2, 
                                                    random_state = 0)

# Show the results of the split
print "Training set has {} samples.".format(X_train.shape[0])
print "Testing set has {} samples.".format(X_test.shape[0])

Training set has 712 samples.
Testing set has 179 samples.


In [82]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import make_scorer
from sklearn.metrics import fbeta_score, accuracy_score
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import ShuffleSplit

clf = AdaBoostClassifier()
parameters = {'n_estimators':[50, 100, 200], 'learning_rate':[0.1, 0.5, 1]}
scorer = make_scorer(fbeta_score, beta = 1)
cv_sets = ShuffleSplit(X_train.shape[0], n_iter = 10, test_size = 0.20, random_state = 0)

grid_obj = GridSearchCV(clf, param_grid=parameters, scoring=scorer, cv = cv_sets)

grid_fit = grid_obj.fit(X_train, y_train)

best_clf = grid_fit.best_estimator_

predictions = (clf.fit(X_train, y_train)).predict(X_test)
best_predictions = best_clf.predict(X_test)

# Report the before-and-afterscores
print "Unoptimized model\n------"
print "Accuracy score on testing data: {:.4f}".format(accuracy_score(y_test, predictions))
print "F-score on testing data: {:.4f}".format(fbeta_score(y_test, predictions, beta = 1))
print "\nOptimized Model\n------"
print "Final accuracy score on the testing data: {:.4f}".format(accuracy_score(y_test, best_predictions))
print "Final F-score on the testing data: {:.4f}".format(fbeta_score(y_test, best_predictions, beta = 1))

Unoptimized model
------
Accuracy score on testing data: 0.8212
F-score on testing data: 0.7647

Optimized Model
------
Final accuracy score on the testing data: 0.8101
Final F-score on the testing data: 0.7536


In [83]:
grid_fit.best_params_

{'learning_rate': 0.5, 'n_estimators': 100}

In [84]:
print test_red.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 418 entries, 0 to 417
Data columns (total 7 columns):
Pclass      418 non-null int64
Sex         418 non-null int64
Age         418 non-null float64
SibSp       418 non-null int64
Parch       418 non-null int64
Fare        417 non-null float64
Embarked    418 non-null int64
dtypes: float64(2), int64(5)
memory usage: 26.1 KB
None


In [85]:
# nan value in Fare column, replace it by average of Fare
fare_mean = test_red['Fare'].mean()
test_red['Fare'][np.isnan(test_red['Fare'])] = fare_mean

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()


In [89]:
sample = pd.read_csv('gender_submission.csv')
sample.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1


In [91]:
pred = clf.predict(test_red)
PassengerId = np.array(test_data['PassengerId']).astype(int)
my_solution = pd.DataFrame(zip(PassengerId, pred), columns = ["PassengerId", "Survived"])
my_solution.head()
my_solution.to_csv('my_solution_adaboost.csv')