泰坦尼克号数据分析

In [47]:
from sklearn.naive_bayes import BernoulliNB
import numpy as np
#Import 'tree' from scikit-learn library

%matplotlib inline

from sklearn import datasets
from sklearn import linear_model
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report
from sklearn.preprocessing import scale

import sklearn
print sklearn.__version__
0.18.1
In [48]:
import pandas as pd
train = pd.read_csv('/Users/xiha/github/cjc-gh-pages/data/tatanic_train.csv', sep = ",")
In [49]:
train.head()
Out[49]:
Unnamed: 0 PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S
1 1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C
2 2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S
3 3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S
4 4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S
In [50]:
#数据的清洗
train["Age"] = train["Age"].fillna(train["Age"].median())
#Convert the male and female groups to integer form
train["Sex"][train["Sex"] == "male"] = 0
train["Sex"][train["Sex"] == "female"] = 1

#Impute the Embarked variable
train["Embarked"] = train["Embarked"].fillna('S')
#Convert the Embarked classes to integer form
train["Embarked"][train["Embarked"] == "S"] = 0
train["Embarked"][train["Embarked"] == "C"] = 1
train["Embarked"][train["Embarked"] == "Q"] = 2
/Users/xiha/anaconda/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:4: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/Users/xiha/anaconda/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:5: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/Users/xiha/anaconda/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:10: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/Users/xiha/anaconda/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:11: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/Users/xiha/anaconda/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:12: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
In [51]:
#筛选新的数据集,

data_X = [[train.Pclass[i], train.Sex[i], train.Age[i], train.Fare[i]] for i in range(len(train))]#df数据集中 所有点击和恢复的数量
data_X[:3]
data_y = [train.Survived[i] for i in range(len(train))]#df数据集中 所有点击和恢复的数量
data_y[:7]
Out[51]:
[0, 1, 1, 1, 0, 0, 0]
In [79]:
#创建分类器
model = BernoulliNB()  
    
#用调用的是机器学习的方法进行训练集的划分!!!!!!!

from sklearn.cross_validation import train_test_split

Xs_train, Xs_test, y_train, y_test = train_test_split(data_X, data_y, test_size=0.2, random_state=36)
In [80]:
model.fit(Xs_train, y_train)#训练模型!!!
print'Variance score: %.2f' % model.score(Xs_test,y_test )#使用伯努利方法对预测集分析的可行度,是否适用这个模型
Variance score: 0.82
In [81]:
#y_true, y_pred
#predicted= model.predict(Xs_test) #一个大X预测一个小y
y_true, y_pred = y_test , model.predict(Xs_test)#这俩都是y
In [ ]:
 
In [82]:
print(classification_report(y_true, y_pred))#打印分类报告
             precision    recall  f1-score   support

          0       0.83      0.86      0.85       105
          1       0.79      0.76      0.77        74

avg / total       0.81      0.82      0.82       179

In [ ]:
 
In [ ]: