# ロジスティック回帰による分類予測

#### bank-fullのy列(定期預金の申込み可否)についての分類予測
#### 特徴量は、integer,string,"default"のみ使用する
#### integer列は標準化を行う
#### stringの列はindex化する
#### 精度評価として混同行列とAUCを選択

In [1]:
import numpy as np
import pandas as pd

In [2]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local").appName("logistic_regression").getOrCreate()
print(spark)

<pyspark.sql.session.SparkSession object at 0xffff61c8b010>


In [3]:
filename = "./data/bank-full.csv"
df = spark.read.csv(filename, header=True, inferSchema=True, sep=';')
df.show()

+---+------------+--------+---------+-------+-------+-------+----+-------+---+-----+--------+--------+-----+--------+--------+---+
|age|         job| marital|education|default|balance|housing|loan|contact|day|month|duration|campaign|pdays|previous|poutcome|  y|
+---+------------+--------+---------+-------+-------+-------+----+-------+---+-----+--------+--------+-----+--------+--------+---+
| 58|  management| married| tertiary|     no|   2143|    yes|  no|unknown|  5|  may|     261|       1|   -1|       0| unknown| no|
| 44|  technician|  single|secondary|     no|     29|    yes|  no|unknown|  5|  may|     151|       1|   -1|       0| unknown| no|
| 33|entrepreneur| married|secondary|     no|      2|    yes| yes|unknown|  5|  may|      76|       1|   -1|       0| unknown| no|
| 47| blue-collar| married|  unknown|     no|   1506|    yes|  no|unknown|  5|  may|      92|       1|   -1|       0| unknown| no|
| 33|     unknown|  single|  unknown|     no|      1|     no|  no|unknown|  5|  may

In [4]:
df.dtypes

[('age', 'int'),
 ('job', 'string'),
 ('marital', 'string'),
 ('education', 'string'),
 ('default', 'string'),
 ('balance', 'int'),
 ('housing', 'string'),
 ('loan', 'string'),
 ('contact', 'string'),
 ('day', 'int'),
 ('month', 'string'),
 ('duration', 'int'),
 ('campaign', 'int'),
 ('pdays', 'int'),
 ('previous', 'int'),
 ('poutcome', 'string'),
 ('y', 'string')]

In [5]:
# 目的変数の作成 yes:1, no:0
from pyspark.sql.functions import lit, when, col
df1 = df.withColumn("y1", when(col("y")=="yes", lit(1.0)).otherwise(lit(0.0)))
df1 = df1.withColumnRenamed("default", "def")
df1.show()

+---+------------+--------+---------+---+-------+-------+----+-------+---+-----+--------+--------+-----+--------+--------+---+---+
|age|         job| marital|education|def|balance|housing|loan|contact|day|month|duration|campaign|pdays|previous|poutcome|  y| y1|
+---+------------+--------+---------+---+-------+-------+----+-------+---+-----+--------+--------+-----+--------+--------+---+---+
| 58|  management| married| tertiary| no|   2143|    yes|  no|unknown|  5|  may|     261|       1|   -1|       0| unknown| no|0.0|
| 44|  technician|  single|secondary| no|     29|    yes|  no|unknown|  5|  may|     151|       1|   -1|       0| unknown| no|0.0|
| 33|entrepreneur| married|secondary| no|      2|    yes| yes|unknown|  5|  may|      76|       1|   -1|       0| unknown| no|0.0|
| 47| blue-collar| married|  unknown| no|   1506|    yes|  no|unknown|  5|  may|      92|       1|   -1|       0| unknown| no|0.0|
| 33|     unknown|  single|  unknown| no|      1|     no|  no|unknown|  5|  may|   

In [6]:
# stringの処理 (default)
from pyspark.ml.feature import StringIndexer
default_index = StringIndexer(inputCol="def", outputCol="default_index")

In [7]:
# assemble
from pyspark.ml.feature import VectorAssembler
assemble = VectorAssembler(inputCols=["age", "balance", "duration", "campaign", "previous", "default_index"], outputCol="features")

In [8]:
# 標準化
from pyspark.ml.feature import StandardScaler
scaler = StandardScaler(inputCol="features", outputCol="scaled_features")

In [9]:
# ロジスティック回帰
from pyspark.ml.classification import LogisticRegression
logistic_regression = LogisticRegression(featuresCol="scaled_features", labelCol="y1")

In [10]:
# パイプラインの登録
from pyspark.ml import Pipeline
pipeline = Pipeline(stages=[default_index, assemble, scaler, logistic_regression])

In [11]:
df = df1.select("age", "balance", "duration", "campaign", "previous", "def", "y", "y1")

In [12]:
# 訓練データとテストデータを分ける
train_df, test_df = df.randomSplit([0.7, 0.3], seed=1234)

In [13]:
train_df.dtypes

[('age', 'int'),
 ('balance', 'int'),
 ('duration', 'int'),
 ('campaign', 'int'),
 ('previous', 'int'),
 ('def', 'string'),
 ('y', 'string'),
 ('y1', 'double')]

In [14]:
# 訓練データによるモデリング
fit_model = pipeline.fit(train_df)

In [15]:
fit_model.stages[3].coefficients

DenseVector([0.0461, 0.1101, 0.9227, -0.4444, 0.2843, -0.0603])

In [16]:
fit_model.stages[3].intercept

-3.2359670441544637

In [17]:
# 訓練データを使って推論
pred_train = fit_model.transform(train_df)

In [18]:
pred_train.show()

+---+-------+--------+--------+--------+---+---+---+-------------+--------------------+--------------------+--------------------+--------------------+----------+
|age|balance|duration|campaign|previous|def|  y| y1|default_index|            features|     scaled_features|       rawPrediction|         probability|prediction|
+---+-------+--------+--------+--------+---+---+---+-------------+--------------------+--------------------+--------------------+--------------------+----------+
| 18|     35|     104|       2|       0| no| no|0.0|          0.0|[18.0,35.0,104.0,...|[1.69579509967235...|[3.06350348098030...|[0.95536194323606...|       0.0|
| 18|    108|      92|       1|       1| no|yes|1.0|          0.0|[18.0,108.0,92.0,...|[1.69579509967235...|[2.81052152711537...|[0.94324174651528...|       0.0|
| 18|    108|     169|       1|       0| no|yes|1.0|          0.0|[18.0,108.0,169.0...|[1.69579509967235...|[2.68263210486711...|[0.93599399206992...|       0.0|
| 18|    156|     298|      

In [19]:
pred_train.select("rawPrediction", "probability").show(truncate=False)

+----------------------------------------+-----------------------------------------+
|rawPrediction                           |probability                              |
+----------------------------------------+-----------------------------------------+
|[3.0635034809803074,-3.0635034809803074]|[0.9553619432360639,0.04463805676393606] |
|[2.81052152711537,-2.81052152711537]    |[0.9432417465152872,0.05675825348471275] |
|[2.682632104867112,-2.682632104867112]  |[0.9359939920699278,0.06400600793007216] |
|[1.7481311839187106,-1.7481311839187106]|[0.8517169345743542,0.14828306542564584] |
|[2.105961860368692,-2.105961860368692]  |[0.8914812910541504,0.10851870894584958] |
|[1.7413142898799685,-1.7413142898799685]|[0.8508539277745946,0.14914607222540543] |
|[2.3641117736081885,-2.3641117736081885]|[0.9140493896360995,0.0859506103639005]  |
|[2.3088349021877326,-2.3088349021877326]|[0.90960610335261,0.09039389664738995]   |
|[3.0716978019959584,-3.0716978019959584]|[0.9557100931299582,0.0

In [20]:
z = np.array([3.0635034809803074,-3.0635034809803074]) # z: 線形予測子
q = 1 / (1 + np.exp(-z)) # ロジスティック関数
print(q)

[0.95536194 0.04463806]


In [21]:
# 精度評価　（訓練データ） -> AUCを使用
from pyspark.ml.evaluation import BinaryClassificationEvaluator
evaluator = BinaryClassificationEvaluator(labelCol = "y1")
AUC = evaluator.evaluate(pred_train)
print(AUC)

0.8331547859811365


#### テストデータ

In [22]:
test_df.show()

+---+-------+--------+--------+--------+---+---+---+
|age|balance|duration|campaign|previous|def|  y| y1|
+---+-------+--------+--------+--------+---+---+---+
| 18|      3|     130|       2|       0| no|yes|1.0|
| 18|      5|     143|       2|       0| no| no|0.0|
| 18|    108|     167|       1|       0| no|yes|1.0|
| 19|    103|      96|       2|       2| no| no|0.0|
| 19|    108|     168|       1|       2| no|yes|1.0|
| 19|    108|     273|       2|       1| no|yes|1.0|
| 19|    134|     271|       2|       0| no|yes|1.0|
| 19|    179|      62|       3|       0| no| no|0.0|
| 19|    291|     291|       5|       0| no| no|0.0|
| 19|    329|     169|       1|       2| no|yes|1.0|
| 19|    329|     252|       2|       0| no|yes|1.0|
| 19|    526|     122|       3|       0| no| no|0.0|
| 19|   1803|     124|       1|       1| no| no|0.0|
| 20|   -322|      73|       4|       0| no| no|0.0|
| 20|     29|      85|       2|       0| no| no|0.0|
| 20|     76|     639|       2|       0| no|ye

In [23]:
test_df.count()

13562

In [24]:
# テストデータの推論
pred_test = fit_model.transform(test_df)

In [25]:
pred_test.show()

+---+-------+--------+--------+--------+---+---+---+-------------+--------------------+--------------------+--------------------+--------------------+----------+
|age|balance|duration|campaign|previous|def|  y| y1|default_index|            features|     scaled_features|       rawPrediction|         probability|prediction|
+---+-------+--------+--------+--------+---+---+---+-------------+--------------------+--------------------+--------------------+--------------------+----------+
| 18|      3|     130|       2|       0| no|yes|1.0|          0.0|[18.0,3.0,130.0,2...|[1.69579509967235...|[2.97025888993604...|[0.95121229283086...|       0.0|
| 18|      5|     143|       2|       0| no| no|0.0|          0.0|[18.0,5.0,143.0,2...|[1.69579509967235...|[2.92298874908348...|[0.94897122314169...|       0.0|
| 18|    108|     167|       1|       0| no|yes|1.0|          0.0|[18.0,108.0,167.0...|[1.69579509967235...|[2.68989335994989...|[0.93642763358003...|       0.0|
| 19|    103|      96|      

In [27]:
# 精度評価 (訓練データ) -> AUCを利用
from pyspark.ml.evaluation import BinaryClassificationEvaluator
evaluator = BinaryClassificationEvaluator(labelCol='y1')
AUC = evaluator.evaluate(pred_test)
print(AUC)

0.8321723113642512


#### 混同行列

In [29]:
from sklearn.metrics import confusion_matrix
y_true = pred_test.select("y1")
y_true = y_true.toPandas()
y_true

Unnamed: 0,y1
0,1.0
1,0.0
2,1.0
3,0.0
4,1.0
...,...
13557,0.0
13558,1.0
13559,1.0
13560,0.0


In [30]:
y_pred = pred_test.select("prediction")
y_pred = y_pred.toPandas()
y_pred

Unnamed: 0,prediction
0,0.0
1,0.0
2,0.0
3,0.0
4,0.0
...,...
13557,0.0
13558,1.0
13559,1.0
13560,0.0


In [32]:
class_name = [0, 1]
cnf_matrix = confusion_matrix(y_true, y_pred, labels=class_name)
cnf_matrix

array([[11749,   214],
       [ 1304,   295]])

In [33]:
tn, fp, fn, tp = cnf_matrix.flatten()
print(tn, fp, fn, tp)

11749 214 1304 295


In [34]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
print("accuracy:{}".format(accuracy_score(y_true, y_pred)))

accuracy:0.8880696062527651


In [35]:
print("precision:{}".format(precision_score(y_true, y_pred)))
print("recall:{}".format(recall_score(y_true, y_pred)))
print("f1:{}".format(f1_score(y_true, y_pred)))

precision:0.5795677799607073
recall:0.18449030644152595
f1:0.27988614800759015
