# 1.Googleドライブのマウント

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# 2. モジュールのインポートとデータ準備

In [2]:
import pandas as pd
from pandas import DataFrame
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error


In [3]:
# titanic data csvファイルの読み込み
titanic_df = pd.read_csv('/content/drive/My Drive/study_ai_ml_google/data/titanic_train.csv')

In [4]:
# データセットの先頭５行表示
titanic_df.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [5]:
# データ構造を確認
titanic_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [6]:
#予測に不要と判断するカラムを削除
titanic_df.drop(['PassengerId', 'Name', 'Ticket', 'Cabin','Embarked'], axis=1, inplace=True)

In [7]:
titanic_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Sex       891 non-null    object 
 3   Age       714 non-null    float64
 4   SibSp     891 non-null    int64  
 5   Parch     891 non-null    int64  
 6   Fare      891 non-null    float64
dtypes: float64(2), int64(4), object(1)
memory usage: 48.9+ KB


In [8]:
# AGEがnanのデータは削除
titanic_df =  titanic_df.dropna(how='any')

In [9]:
titanic_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 714 entries, 0 to 890
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  714 non-null    int64  
 1   Pclass    714 non-null    int64  
 2   Sex       714 non-null    object 
 3   Age       714 non-null    float64
 4   SibSp     714 non-null    int64  
 5   Parch     714 non-null    int64  
 6   Fare      714 non-null    float64
dtypes: float64(2), int64(4), object(1)
memory usage: 44.6+ KB


In [10]:
# 性別カラムを追加　　0：女性　　1：男性
titanic_df['Gender'] = titanic_df['Sex'].map({'female': 0, 'male': 1}).astype(int)

In [11]:
titanic_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 714 entries, 0 to 890
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  714 non-null    int64  
 1   Pclass    714 non-null    int64  
 2   Sex       714 non-null    object 
 3   Age       714 non-null    float64
 4   SibSp     714 non-null    int64  
 5   Parch     714 non-null    int64  
 6   Fare      714 non-null    float64
 7   Gender    714 non-null    int64  
dtypes: float64(2), int64(5), object(1)
memory usage: 50.2+ KB


In [12]:
titanic_df.describe()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,Gender
count,714.0,714.0,714.0,714.0,714.0,714.0,714.0
mean,0.406162,2.236695,29.699118,0.512605,0.431373,34.694514,0.634454
std,0.49146,0.83825,14.526497,0.929783,0.853289,52.91893,0.481921
min,0.0,1.0,0.42,0.0,0.0,0.0,0.0
25%,0.0,1.0,20.125,0.0,0.0,8.05,0.0
50%,0.0,2.0,28.0,0.0,0.0,15.7417,1.0
75%,1.0,3.0,38.0,1.0,1.0,33.375,1.0
max,1.0,3.0,80.0,5.0,6.0,512.3292,1.0


In [13]:
# 'Survived'との相関係数を出力
titanic_df.corr()['Survived']   # Gender の相関が他と比べ非常に高い

Survived    1.000000
Pclass     -0.359653
Age        -0.077221
SibSp      -0.017358
Parch       0.093317
Fare        0.268189
Gender     -0.538826
Name: Survived, dtype: float64

# 3. [課題] 年齢と性別で生死を判別

In [14]:
#生死フラグのみのリストを作成
label =  titanic_df.loc[:,["Survived"]].values

In [15]:
#性別、年齢のリストを作成
data = titanic_df.loc[:, ["Age",'Gender']].values

In [16]:
# 学習データと検証データの分割 学習用:70% 検証用:30%
X_train, X_test, y_train, y_test = train_test_split(data, label, test_size = 0.3, random_state = 666)

In [17]:
# 学習データと検証データの件数を表示
print('X_train.shape: ', X_train.shape)
print('X_test.shape: ', X_test.shape)
print('y_train.shape: ', y_train.shape)
print('y_test.shape: ', y_test.shape)

X_train.shape:  (499, 2)
X_test.shape:  (215, 2)
y_train.shape:  (499, 1)
y_test.shape:  (215, 1)


In [18]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

In [19]:
# ロジスティック回帰モデルの生成
model=LogisticRegression(random_state=0, verbose=1)

In [20]:
# 学習用データで学習し、モデルを作成
model.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=0, solver='lbfgs', tol=0.0001, verbose=1,
                   warm_start=False)

In [21]:
# 作成したモデルから予測（学習用、検証用モデル使用）
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

In [22]:
# 学習用、検証用データに関して平均二乗誤差を出力
print('MSE Train : %.3f, Test : %.3f' % (mean_squared_error(y_train, y_train_pred), mean_squared_error(y_test, y_test_pred)))

# 学習用、検証用データに関してR^2を出力
print('R^2 Train : %.3f, Test : %.3f' % (model.score(X_train, y_train), model.score(X_test, y_test)))

MSE Train : 0.214, Test : 0.233
R^2 Train : 0.786, Test : 0.767


In [23]:
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

In [24]:
# 性能評価
print('confusion matrix = \n', confusion_matrix(y_true=y_test, y_pred=y_test_pred))
print('accuracy = ', accuracy_score(y_true=y_test, y_pred=y_test_pred))
print('precision = ', precision_score(y_true=y_test, y_pred=y_test_pred))
print('recall = ', recall_score(y_true=y_test, y_pred=y_test_pred))
print('f1 score = ', f1_score(y_true=y_test, y_pred=y_test_pred))

confusion matrix = 
 [[120  23]
 [ 27  45]]
accuracy =  0.7674418604651163
precision =  0.6617647058823529
recall =  0.625
f1 score =  0.6428571428571429


#### 予測

In [25]:
# 年齢：30歳　性別：男性　の生存可否予測
model.predict([[30,1]])  # 0: 生き残れない

array([0])

In [26]:
# 
model.predict_proba([[30,1]])

array([[0.77627898, 0.22372102]])

#### 【予測結果】　30歳男性は、生き残ることはできず、その確率は約７７．６％と予測された。