In [160]:
import pandas as pd
from lightgbm import LGBMClassifier
from sklearn.decomposition import PCA, NMF
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.svm import LinearSVC, SVC
from sklearn.tree import DecisionTreeClassifier
import matplotlib.pyplot as plt

%matplotlib inline

In [119]:
train_df = pd.read_csv('../data/titanic/train.csv', encoding='utf-8')
test_df = pd.read_csv('../data/titanic/test.csv', encoding='utf-8')

print(train_df.shape, test_df.shape)

(891, 12) (418, 11)


In [120]:
print(train_df.head().to_string())
print(test_df.head().to_string())

   PassengerId  Survived  Pclass                                               Name     Sex   Age  SibSp  Parch            Ticket     Fare Cabin Embarked
0            1         0       3                            Braund, Mr. Owen Harris    male  22.0      1      0         A/5 21171   7.2500   NaN        S
1            2         1       1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1      0          PC 17599  71.2833   C85        C
2            3         1       3                             Heikkinen, Miss. Laina  female  26.0      0      0  STON/O2. 3101282   7.9250   NaN        S
3            4         1       1       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1      0            113803  53.1000  C123        S
4            5         0       3                           Allen, Mr. William Henry    male  35.0      0      0            373450   8.0500   NaN        S
   PassengerId  Pclass                                          Name     Sex

In [121]:
train_df.info()
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Name           418 non-null object
Sex            418 non-null object
Age            332 non-null float64
SibSp          418 non-null int64
Parch          418 non-null int64
Ticket         418 non-null object
Fare           417 non-null float

In [122]:
train_df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [123]:
X_train = train_df.drop(['Survived', 'Name', 'Ticket', 'Cabin', 'Age'], axis=1)
y_train = train_df['Survived']

In [124]:
X_train = pd.get_dummies(X_train)

In [125]:
scaler = StandardScaler()
scaler.fit(X_train)
X_scaled = scaler.transform(X_train)

In [148]:
pca = PCA(n_components=6)
pca.fit(X_scaled)
X_pca = pca.transform(X_scaled)
print(pca.components_.shape)
print(X_pca.shape)

(10, 10)
(891, 10)


In [None]:
plt.matshow(pca.components_, cmap='viridis')

In [150]:
clf = LogisticRegression()
scores = cross_val_score(clf, X_pca, y_train, cv=6) # model, train, target, cross validation
print('cross-val-score \n{}'.format(scores))
print('cross-val-score.mean \n{:.3f}'.format(scores.mean()))

cross-val-score 
[0.75838926 0.77852349 0.81879195 0.77027027 0.77027027 0.80405405]
cross-val-score.mean 
0.783




In [151]:
clf = LinearSVC()
scores = cross_val_score(clf, X_pca, y_train, cv=6) # model, train, target, cross validation
print('cross-val-score \n{}'.format(scores))
print('cross-val-score.mean \n{:.3f}'.format(scores.mean()))



cross-val-score 
[0.79194631 0.7852349  0.83221477 0.77027027 0.76351351 0.81081081]
cross-val-score.mean 
0.792


In [153]:
for C in [0.001, 0.01, 0.1, 1, 10, 100]:
    lr_l1 = LogisticRegression(C=C, penalty="l1")
    scores = cross_val_score(lr_l1, X_pca, y_train, cv=6)
    print("C={:.3f} 인 l1 로지스틱 회귀의 테스트 정확도: {:.2f}".format(
          C, scores.mean()))

C=0.001 인 l1 로지스틱 회귀의 테스트 정확도: 0.62
C=0.010 인 l1 로지스틱 회귀의 테스트 정확도: 0.77
C=0.100 인 l1 로지스틱 회귀의 테스트 정확도: 0.79
C=1.000 인 l1 로지스틱 회귀의 테스트 정확도: 0.78
C=10.000 인 l1 로지스틱 회귀의 테스트 정확도: 0.78
C=100.000 인 l1 로지스틱 회귀의 테스트 정확도: 0.78




In [154]:
clf = DecisionTreeClassifier()
scores = cross_val_score(clf, X_pca, y_train, cv=6) # model, train, target, cross validation
print('cross-val-score \n{}'.format(scores))
print('cross-val-score.mean \n{:.3f}'.format(scores.mean()))

cross-val-score 
[0.63758389 0.63758389 0.66442953 0.68243243 0.65540541 0.76351351]
cross-val-score.mean 
0.673


In [155]:
clf = RandomForestClassifier()
scores = cross_val_score(clf, X_pca, y_train, cv=6) # model, train, target, cross validation
print('cross-val-score \n{}'.format(scores))
print('cross-val-score.mean \n{:.3f}'.format(scores.mean()))

cross-val-score 
[0.75838926 0.75838926 0.79194631 0.77702703 0.76351351 0.78378378]
cross-val-score.mean 
0.772




In [156]:
clf = GradientBoostingClassifier()
scores = cross_val_score(clf, X_pca, y_train, cv=6) # model, train, target, cross validation
print('cross-val-score \n{}'.format(scores))
print('cross-val-score.mean \n{:.3f}'.format(scores.mean()))

cross-val-score 
[0.75167785 0.75838926 0.83892617 0.77702703 0.76351351 0.80405405]
cross-val-score.mean 
0.782


In [157]:
clf = LGBMClassifier()
scores = cross_val_score(clf, X_pca, y_train, cv=6) # model, train, target, cross validation
print('cross-val-score \n{}'.format(scores))
print('cross-val-score.mean \n{:.3f}'.format(scores.mean()))

cross-val-score 
[0.75838926 0.7114094  0.68456376 0.74324324 0.75675676 0.80405405]
cross-val-score.mean 
0.743


In [158]:
clf = SVC()
scores = cross_val_score(clf, X_pca, y_train, cv=6) # model, train, target, cross validation
print('cross-val-score \n{}'.format(scores))
print('cross-val-score.mean \n{:.3f}'.format(scores.mean()))

cross-val-score 
[0.79865772 0.79194631 0.83892617 0.7972973  0.80405405 0.81756757]
cross-val-score.mean 
0.808




In [159]:
clf = MLPClassifier()
scores = cross_val_score(clf, X_pca, y_train, cv=6) # model, train, target, cross validation
print('cross-val-score \n{}'.format(scores))
print('cross-val-score.mean \n{:.3f}'.format(scores.mean()))



cross-val-score 
[0.79865772 0.75838926 0.85234899 0.79054054 0.76351351 0.80405405]
cross-val-score.mean 
0.795
