In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import mca
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC

import shap

In [None]:
# Load the training data and preview the first five rows.
df = pd.read_csv('train.csv')
df.head(5)

In [None]:
# Show the column labels of the raw training frame.
df.columns

# 大まかにデータを見る

In [None]:
# DataFrame.info() writes its summary to stdout and returns None, so it is
# called directly; wrapping it in print() only adds a stray "None" line.
df.info()
# Three random rows as a quick sanity check of the values.
df.sample(3)

In [None]:
# Keep only the columns that are likely to be useful.
# .copy() makes new_df an independent frame, so assigning to its columns
# afterwards does not raise pandas' SettingWithCopyWarning.
new_df = df[['Survived','Pclass','Sex','Age','SibSp','Parch','Fare','Embarked']].copy()

In [None]:
# Per column: True only when every value in that column is missing.
new_df.isna().all()

In [None]:
# Number of missing values in each column.
new_df.isna().sum()

# nanを埋める

In [None]:
# Replace missing ages with the median age of the column.
age_median = new_df['Age'].median()
new_df['Age'] = new_df['Age'].fillna(age_median)

In [None]:
# Frequency of each 'Embarked' value (value_counts excludes NaN by default).
new_df['Embarked'].value_counts()

In [None]:
# Replace missing 'Embarked' entries with the fixed value 'S'.
new_df['Embarked'] = new_df['Embarked'].fillna(value='S')

In [None]:
# One-hot encode the non-numeric columns; numeric columns pass through unchanged.
dummy_df = pd.get_dummies(data=new_df)

In [None]:
def preprocessing(df):
    """Select the model features, impute missing values and one-hot encode.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing at least the columns 'Pclass', 'Sex', 'Age',
        'SibSp', 'Parch', 'Fare' and 'Embarked'. It is not modified.

    Returns
    -------
    pandas.DataFrame
        The selected columns with missing 'Age' and 'Fare' values replaced
        by the respective column median and missing 'Embarked' values
        replaced by 'S', passed through ``pd.get_dummies``.

    Notes
    -----
    Prints the per-column count of remaining missing values as a check.
    """
    feature_cols = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
    # Explicit copy: assigning into a bare column selection raises
    # pandas' SettingWithCopyWarning.
    new_df = df[feature_cols].copy()
    new_df['Age'] = new_df['Age'].fillna(new_df['Age'].median())
    # Impute 'Fare' here as well so callers do not have to patch it
    # separately; this is a no-op when the column has no gaps.
    new_df['Fare'] = new_df['Fare'].fillna(new_df['Fare'].median())
    new_df['Embarked'] = new_df['Embarked'].fillna('S')
    print(new_df.isnull().sum())
    dummy_df = pd.get_dummies(new_df)
    return dummy_df

# データの描写
## MCA: Multiple Correspondence Analysis (mca)
主成分分析（PCA）を質的変数（カテゴリ変数）向けに拡張した手法（多重対応分析）。  

In [None]:
# Multiple correspondence analysis on the one-hot encoded frame.
mca_counts = mca.MCA(dummy_df, benzecri=False)

# Factor scores on the first two dimensions: `rows` for the rows of
# dummy_df, `cols` for its columns.
rows = mca_counts.fs_r(N=2)
cols = mca_counts.fs_c(N=2)

# Plot only the column scores, with each point labelled by its column name.
fig, ax = plt.subplots(figsize=(8, 8))
ax.axvline(x=0, linestyle='--', color='k')
ax.axhline(y=0, linestyle='--', color='k')

ax.scatter(cols[:, 0], cols[:, 1], c='r', marker='x')
labels = dummy_df.columns
for label, x, y in zip(labels, cols[:, 0], cols[:, 1]):
    ax.annotate(label, xy=(x, y))
ax.set(xlabel='Dimension 1', ylabel='Dimension 2', title='MCA column factor scores')
plt.show()

# 学習

In [None]:
# Read the training data and run it straight through preprocessing().
train_df = preprocessing(pd.read_csv('train.csv'))

In [None]:
# Load the test set and keep the feature columns.
test_df = pd.read_csv('test.csv')
# .copy() makes the selection an independent frame, so the column assignments
# below do not raise pandas' SettingWithCopyWarning.
test_df = test_df[['Pclass','Sex','Age','SibSp','Parch','Fare','Embarked']].copy()
test_df['Age'] = test_df['Age'].fillna(test_df['Age'].median())
test_df['Embarked'] = test_df['Embarked'].fillna('S')
# Remaining missing values per column.
print(test_df.isnull().sum())

In [None]:
# Replace missing fares in the test set with the median fare.
fare_median = test_df['Fare'].median()
test_df['Fare'] = test_df['Fare'].fillna(fare_median)

In [None]:
# One-hot encode the non-numeric columns of the test features.
test_X = pd.get_dummies(data=test_df)

In [None]:
# Target is the 'Survived' column; features are every other column.
y_s = dummy_df['Survived']
X_df = dummy_df.drop('Survived', axis=1)

In [None]:
# Hold out 20% of the rows, stratified on the target, with a fixed seed.
X_train_df, X_test_df, y_train_s, y_test_s = train_test_split(
    X_df,
    y_s,
    stratify=y_s,
    random_state=0,
    test_size=0.2,
)

## random forestを使用してみた

In [None]:
# Random forest hyper-parameters, with a fixed seed for reproducibility.
rf_params = dict(n_estimators=50, max_depth=40, random_state=42)
rf = RandomForestClassifier(**rf_params)
rf.fit(X_train_df, y_train_s)

In [None]:
# Accuracy of the random forest on the hold-out split.
y_pred = rf.predict(X_test_df)
accuracy_score(y_test_s, y_pred)

In [None]:
# Fit an SVC with default settings and score it on the same hold-out split.
svc = SVC().fit(X_train_df, y_train_s)
y_pred = svc.predict(X_test_df)
accuracy_score(y_test_s, y_pred)

### 各特徴量のimportance

## random forest

In [None]:
# Pair each feature name with its random-forest importance and print the
# pairs from most to least important.
importance = sorted(
    dict(zip(X_train_df.columns, rf.feature_importances_)).items(),
    key=lambda item: item[1],
    reverse=True,
)
for pair in importance:
    print(pair)

In [None]:
shap.initjs()

# Use Kernel SHAP, with the training features as background data, to explain
# the random forest's predicted probabilities on the hold-out split.
explainer = shap.KernelExplainer(rf.predict_proba, X_train_df, link="logit")
shap_values = explainer.shap_values(X_test_df, nsamples=100)

# Force plot for the first hold-out row and output index 0 of predict_proba
# (presumably Survived == 0 -- confirm with rf.classes_).
# NOTE(review): the indexing assumes shap_values is a list with one array per
# class; confirm this for the installed shap version.
shap.force_plot(explainer.expected_value[0], shap_values[0][0,:], X_test_df.iloc[0,:], link="logit")