In [1]:
import os
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin #전처리 파이프라인 구현

TITANIC_PATH = os.path.join("datasets", "titanic")

def load_titanic_data(filename, titanic_path=TITANIC_PATH):#csv 객체 반환
    csv_path = os.path.join(titanic_path, filename)
    return pd.read_csv(csv_path)

class DataFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X[self.attribute_names]

if __name__=="__main__":
    train_data = load_titanic_data("train.csv")
    test_data = load_titanic_data("test.csv")
    y_test = load_titanic_data("gender_submission.csv")
    #train_data.head() #데이터 속성
    #train_data.info() #레이블 null값 보기
    #train_data.describe()#통계치
    #train_data["Survived"].value_counts() #target 확인
    #print(train_data["Pclass"].value_counts()) #범주형 데이터 확인
    #print(train_data["Sex"].value_counts())
    #print(train_data["Embarked"].value_counts())

In [2]:
train_data.head() #데이터 속성

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
train_data.info() #레이블 null값 보기

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


In [4]:
train_data.describe()#통계치

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [5]:
train_data["Survived"].value_counts() #target 확인

0    549
1    342
Name: Survived, dtype: int64

In [6]:
print(train_data["Pclass"].value_counts()) #범주형 데이터 확인
print(train_data["Sex"].value_counts())
print(train_data["Embarked"].value_counts())

3    491
1    216
2    184
Name: Pclass, dtype: int64
male      577
female    314
Name: Sex, dtype: int64
S    644
C    168
Q     77
Name: Embarked, dtype: int64


In [7]:
from sklearn.base import BaseEstimator, TransformerMixin #전처리 파이프라인 구현

class DataFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X[self.attribute_names]

In [8]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

num_pipeline = Pipeline([
    ("select_numeric", DataFrameSelector(["Age", "SibSp", "Parch", "Fare"])),
    ("imputer", SimpleImputer(strategy="median")),
])

print(num_pipeline.fit_transform(train_data))

[[22.      1.      0.      7.25  ]
 [38.      1.      0.     71.2833]
 [26.      0.      0.      7.925 ]
 ...
 [28.      1.      2.     23.45  ]
 [26.      0.      0.     30.    ]
 [32.      0.      0.      7.75  ]]


In [9]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder


class MostFrequentImputer(BaseEstimator, TransformerMixin):
    def fit(self, X , y=None):
        self.most_frequent_ = pd.Series([X[c].value_counts().index[0] for c in X], 
                                        index=X.columns)
        return self
    
    def transform(self, X, y=None):
        return X.fillna(self.most_frequent_)
    
cat_pipeline = Pipeline([
    ("select_cat", DataFrameSelector(["Pclass", "Sex", "Embarked"])),
    ("imputer", MostFrequentImputer()),
    ("cat_encoder", OneHotEncoder(sparse=False)),
])

print(cat_pipeline.fit_transform(train_data))

[[0. 0. 1. ... 0. 0. 1.]
 [1. 0. 0. ... 1. 0. 0.]
 [0. 0. 1. ... 0. 0. 1.]
 ...
 [0. 0. 1. ... 0. 0. 1.]
 [1. 0. 0. ... 1. 0. 0.]
 [0. 0. 1. ... 0. 1. 0.]]


In [10]:
class MostFrequentImputer(BaseEstimator, TransformerMixin):
    def fit(self, X , y=None):
        self.most_frequent_ = pd.Series([X[c].value_counts().index[0] for c in X], 
                                        index=X.columns)
        return self
    
    def transform(self, X, y=None):
        return X.fillna(self.most_frequent_)

In [11]:
from sklearn.pipeline import FeatureUnion

preprocess_pipeline = FeatureUnion(transformer_list=[
    ("num_pipeline", num_pipeline),
    ("cat_pipeline", cat_pipeline),
])

In [12]:
X_train = preprocess_pipeline.fit_transform(train_data)
y_train = train_data["Survived"]
print(X_train)
print(y_train)


[[22.  1.  0. ...  0.  0.  1.]
 [38.  1.  0. ...  1.  0.  0.]
 [26.  0.  0. ...  0.  0.  1.]
 ...
 [28.  1.  2. ...  0.  0.  1.]
 [26.  0.  0. ...  1.  0.  0.]
 [32.  0.  0. ...  0.  1.  0.]]
0      0
1      1
2      1
3      1
4      0
5      0
6      0
7      0
8      1
9      1
10     1
11     1
12     0
13     0
14     0
15     1
16     0
17     1
18     0
19     1
20     0
21     1
22     1
23     1
24     0
25     1
26     0
27     0
28     1
29     0
      ..
861    0
862    1
863    0
864    0
865    1
866    1
867    0
868    0
869    1
870    0
871    1
872    0
873    0
874    1
875    1
876    0
877    0
878    0
879    1
880    1
881    0
882    0
883    0
884    0
885    0
886    0
887    1
888    0
889    1
890    0
Name: Survived, Length: 891, dtype: int64


In [15]:
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

svm_scores = cross_val_score(svm_clf, X_train, y_train, cv=10)
print(svm_scores.mean())

X_test = preprocess_pipeline.transform(test_data)
y_pred = svm_clf.predict(X_test)

y_test = y_test["Survived"].values
print((y_test == y_pred).sum() / len(y_test))

0.7365250822835092
0.722488038277512


In [20]:


forest_clf = RandomForestClassifier(n_estimators=10, random_state=42)
forest_clf.fit(X_train, y_train)

forest_scores = cross_val_score(forest_clf, X_train, y_train, cv=10)
print(forest_scores.mean())

y_pred = forest_clf.predict(X_test)
print((y_test == y_pred).sum() / len(y_test))

0.8115690614005221
0.8253588516746412


In [14]:
svm_clf = SVC(gamma='auto')
svm_clf.fit(X_train, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [17]:
from sklearn.neighbors import KNeighborsClassifier #-최근접 이웃

knn = KNeighborsClassifier(n_neighbors = 3)
knn.fit(X_train, y_train)
knn_scores = cross_val_score(knn,X_train,y_train,cv=10)
print(knn_scores.mean())
Y_pred = knn.predict(X_test)
print((y_test == Y_pred).sum() / len(y_test))

0.729707467937805
0.6626794258373205


In [18]:
from sklearn.model_selection import KFold #K-Fole 교차검증
k_fold = KFold(n_splits=10, shuffle=True, random_state=0)

In [19]:
clf = RandomForestClassifier(n_estimators=13)
clf

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=13, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [21]:
scoring = 'accuracy'
score = cross_val_score(clf, X_train, y_train,cv=k_fold, n_jobs=1, scoring=scoring)
print(score.mean())
Y_pred = knn.predict(X_test)
print((y_test == Y_pred).sum() / len(y_test))

0.800187265917603
0.6626794258373205


80.36

In [22]:
#결정트리모델
from sklearn.tree import DecisionTreeClassifier

decision_tree = DecisionTreeClassifier(max_depth = 3)
decision_tree.fit(X_train, y_train)
decision_tree_scores = cross_val_score(decision_tree,X_train,y_train,cv=10)
print(decision_tree_scores.mean())
Y_pred = decision_tree.predict(X_test)
print((y_test == Y_pred).sum() / len(y_test))

0.8147891839745773
0.9665071770334929


In [23]:
from sklearn.model_selection import GridSearchCV
param_rfc = {'n_estimators':[100, 200, 300, 400, 500],
         'criterion':['entropy'], 'max_depth':[4,5,6]}
     
rfc=GridSearchCV(RandomForestClassifier(), param_grid=param_rfc)
rfc.fit(X_train, y_train)
print(rfc.best_estimator_)
print(rfc.best_params_)
print(rfc.best_score_)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=6, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=400, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
{'criterion': 'entropy', 'max_depth': 6, 'n_estimators': 400}
0.8260381593714927


In [48]:
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score , recall_score
from sklearn.preprocessing import StandardScaler
import numpy as np

scaler = StandardScaler()
sgd_clf = SGDClassifier(max_iter=1, random_state=42)
sgd_clf.fit(X_train,y_train)
X_train_scaled = scaler.fit_transform(X_train.astype(np.float64))
y_train_pred = cross_val_predict(sgd_clf, X_train, y_train,cv=3)
y_train_pred_no_cv = sgd_clf.predict(X_train)

print(cross_val_score(sgd_clf,X_train_scaled,y_train,cv=3,scoring="accuracy"))
#print(confusion_matrix(y_train, y_train_pred))
#print(confusion_matrix(y_train, y_train_pred_no_cv))
#print(precision_score(y_train, y_train_pred))
#print(recall_score(y_train,y_train_pred))

[0.63973064 0.61952862 0.64309764]


In [45]:
some_digit = X_train[700]
some_digit

array([ 18.   ,   1.   ,   0.   , 227.525,   1.   ,   0.   ,   0.   ,
         1.   ,   0.   ,   1.   ,   0.   ,   0.   ])