# 線形重回帰による数値予測

#### 1. データチェック
#### 2. データ作成
#### 3. モデリング
#### 4. 係数や精度指標の確認

In [None]:
from pyspark.sql import SparkSession
import psycopg2
import os
# Pass the PostgreSQL JDBC driver jar to the Spark launcher through --jars.
# Presumably required by the JDBC write/read cells further down; it is set here,
# before the SparkSession is created.
os.environ['PYSPARK_SUBMIT_ARGS'] = '--jars /usr/local/spark/jars/postgresql-42.7.3.jar pyspark-shell'

In [2]:
# Create (or reuse) the Spark session for this notebook, running with the
# "local" master under the application name "linear_regression".
spark = (
    SparkSession.builder
    .master("local")
    .appName("linear_regression")
    .getOrCreate()
)

In [3]:
# Load the semicolon-separated bank CSV: the first row supplies the column
# names and Spark infers each column's type from the data.
df = spark.read.csv(
    "./data/bank-full.csv",
    sep=";",
    header=True,
    inferSchema=True,
)
# Preview the loaded rows.
df.show()

+---+------------+--------+---------+-------+-------+-------+----+-------+---+-----+--------+--------+-----+--------+--------+---+
|age|         job| marital|education|default|balance|housing|loan|contact|day|month|duration|campaign|pdays|previous|poutcome|  y|
+---+------------+--------+---------+-------+-------+-------+----+-------+---+-----+--------+--------+-----+--------+--------+---+
| 58|  management| married| tertiary|     no|   2143|    yes|  no|unknown|  5|  may|     261|       1|   -1|       0| unknown| no|
| 44|  technician|  single|secondary|     no|     29|    yes|  no|unknown|  5|  may|     151|       1|   -1|       0| unknown| no|
| 33|entrepreneur| married|secondary|     no|      2|    yes| yes|unknown|  5|  may|      76|       1|   -1|       0| unknown| no|
| 47| blue-collar| married|  unknown|     no|   1506|    yes|  no|unknown|  5|  may|      92|       1|   -1|       0| unknown| no|
| 33|     unknown|  single|  unknown|     no|      1|     no|  no|unknown|  5|  may

In [4]:
# List every column together with the Spark type inferred from the CSV.
df.dtypes

[('age', 'int'),
 ('job', 'string'),
 ('marital', 'string'),
 ('education', 'string'),
 ('default', 'string'),
 ('balance', 'int'),
 ('housing', 'string'),
 ('loan', 'string'),
 ('contact', 'string'),
 ('day', 'int'),
 ('month', 'string'),
 ('duration', 'int'),
 ('campaign', 'int'),
 ('pdays', 'int'),
 ('previous', 'int'),
 ('poutcome', 'string'),
 ('y', 'string')]

In [5]:
# Open a psycopg2 connection to the PostgreSQL database used by this notebook.
# NOTE(review): the host and credentials are hard-coded here and again in the
# JDBC options further down; consider reading them from the environment.
conn = psycopg2.connect(
    host="jupyter-pyspark-postgres",
    user="user",
    password="password",
    database="jupyter_pyspark_db"
)

In [6]:
# Cursor used to run the DROP/CREATE statements in the following cells.
cur = conn.cursor()

In [7]:
# Reset on re-run: drop the PostgreSQL table if it already exists, so that
# re-executing the notebook does not keep appending duplicate rows.
cur.execute("DROP TABLE IF EXISTS bank_table")

In [8]:
# Create bank_table with one column per field of the CSV (integers as INTEGER,
# strings as VARCHAR(255)). The "default" column name is double-quoted in the
# DDL, presumably because DEFAULT is an SQL keyword.
cur.execute("""
   CREATE TABLE IF NOT EXISTS bank_table (
       age INTEGER,
       job VARCHAR(255),
       marital VARCHAR(255),
       education VARCHAR(255),
       "default" VARCHAR(255),
       balance INTEGER,
       housing VARCHAR(255),
       loan VARCHAR(255),
       contact VARCHAR(255),
       day INTEGER,
       month VARCHAR(255),
       duration INTEGER,
       campaign INTEGER,
       pdays INTEGER,
       previous INTEGER,
       poutcome VARCHAR(255),
       y VARCHAR(255)
   ) 
""")

In [9]:
# Commit the DROP/CREATE statements, then release the cursor and the connection.
conn.commit()
cur.close()
conn.close()

In [10]:
def insert_data(
    df,
    table_name,
    *,
    url="jdbc:postgresql://jupyter-pyspark-postgres:5432/jupyter_pyspark_db",
    user="user",
    password="password",
    mode="append",
):
    """Write a Spark DataFrame to a database table through the JDBC data source.

    Called as ``insert_data(df, table_name)`` it behaves as before: rows are
    appended to ``table_name`` in the notebook's PostgreSQL database. The
    keyword-only arguments allow another database, account or save mode to be
    targeted without editing the function.

    Args:
        df: Spark DataFrame whose rows are written.
        table_name: Destination table, passed as the JDBC ``dbtable`` option.
        url: JDBC connection URL.
        user: Database user name.
        password: Database password.
        mode: Save mode handed to the DataFrame writer; the default is
            ``"append"``.

    Returns:
        None. The write is performed when ``save()`` is called.
    """
    writer = (
        df.write
        .format("jdbc")
        .option("url", url)
        .option("dbtable", table_name)
        .option("user", user)
        .option("password", password)
        .mode(mode)
    )
    writer.save()

In [11]:
# Append the CSV-derived DataFrame to bank_table over JDBC.
insert_data(df, "bank_table")

In [12]:
# SQL used as the source of the JDBC read in the next cell (every row of bank_table).
query = "SELECT * FROM bank_table" 

In [13]:
# Read the table back from PostgreSQL over JDBC. The query is wrapped as an
# aliased sub-select and passed as the "dbtable" option. This rebinds `df`.
df = (
    spark.read
    .format("jdbc")
    .options(
        url="jdbc:postgresql://jupyter-pyspark-postgres:5432/jupyter_pyspark_db",
        dbtable=f"({query}) as tmp",
        user="user",
        password="password",
    )
    .load()
)

In [14]:
# Preview the rows read back from PostgreSQL.
df.show()

+---+------------+--------+---------+-------+-------+-------+----+-------+---+-----+--------+--------+-----+--------+--------+---+
|age|         job| marital|education|default|balance|housing|loan|contact|day|month|duration|campaign|pdays|previous|poutcome|  y|
+---+------------+--------+---------+-------+-------+-------+----+-------+---+-----+--------+--------+-----+--------+--------+---+
| 58|  management| married| tertiary|     no|   2143|    yes|  no|unknown|  5|  may|     261|       1|   -1|       0| unknown| no|
| 44|  technician|  single|secondary|     no|     29|    yes|  no|unknown|  5|  may|     151|       1|   -1|       0| unknown| no|
| 33|entrepreneur| married|secondary|     no|      2|    yes| yes|unknown|  5|  may|      76|       1|   -1|       0| unknown| no|
| 47| blue-collar| married|  unknown|     no|   1506|    yes|  no|unknown|  5|  may|      92|       1|   -1|       0| unknown| no|
| 33|     unknown|  single|  unknown|     no|      1|     no|  no|unknown|  5|  may

In [15]:
# Check the column types of the DataFrame read back over JDBC.
df.dtypes

[('age', 'int'),
 ('job', 'string'),
 ('marital', 'string'),
 ('education', 'string'),
 ('default', 'string'),
 ('balance', 'int'),
 ('housing', 'string'),
 ('loan', 'string'),
 ('contact', 'string'),
 ('day', 'int'),
 ('month', 'string'),
 ('duration', 'int'),
 ('campaign', 'int'),
 ('pdays', 'int'),
 ('previous', 'int'),
 ('poutcome', 'string'),
 ('y', 'string')]

In [16]:
# Data check: print summary statistics (count, mean, ...) for every column.
df.summary().show()

+-------+------------------+-------+--------+---------+-------+------------------+-------+-----+--------+-----------------+-----+-----------------+-----------------+------------------+------------------+--------+-----+
|summary|               age|    job| marital|education|default|           balance|housing| loan| contact|              day|month|         duration|         campaign|             pdays|          previous|poutcome|    y|
+-------+------------------+-------+--------+---------+-------+------------------+-------+-----+--------+-----------------+-----+-----------------+-----------------+------------------+------------------+--------+-----+
|  count|             45211|  45211|   45211|    45211|  45211|             45211|  45211|45211|   45211|            45211|45211|            45211|            45211|             45211|             45211|   45211|45211|
|   mean| 40.93621021432837|   NULL|    NULL|     NULL|   NULL|1362.2720576850766|   NULL| NULL|    NULL|15.80641879188693| 

In [17]:
# Data preparation
target = "balance"               # column to predict
features = ["age", "campaign"]   # explanatory columns

# Keep only the label and the explanatory columns for fitting.
linear_df = df.select("age", "balance", "campaign")

# NOTE(review): this projection omits the label column, and `train_df` is
# reassigned by the train/test split later in the notebook before it is used.
train_df = df.select(*features)

In [18]:
# Echo the list of explanatory column names.
features

['age', 'campaign']

In [19]:
# Data preparation stage: combine the explanatory columns into a single
# vector column named "features".
from pyspark.ml.feature import VectorAssembler

assemble = VectorAssembler(
    inputCols=features,
    outputCol="features",
)

In [20]:
# Modelling stage: multiple linear regression on the assembled "features"
# vector. The label column comes from `target` (set in the data-preparation
# cell) rather than repeating the column name as a second literal.
from pyspark.ml.regression import LinearRegression
clf = LinearRegression(featuresCol="features", labelCol=target)

In [21]:
# Pipeline setup: chain the feature-assembly stage and the regression stage,
# then fit both on the prepared data.
from pyspark.ml.pipeline import Pipeline
pipeline = Pipeline(stages=[assemble, clf])
model = pipeline.fit(linear_df)

In [22]:
# Run the fitted pipeline: adds the "features" and "prediction" columns.
# NOTE(review): this rebinds `df`. From here on it is the prediction output
# (age, balance, campaign, features, prediction), not the full table read
# from PostgreSQL.
df = model.transform(linear_df)
df.show()

+---+-------+--------+----------+------------------+
|age|balance|campaign|  features|        prediction|
+---+-------+--------+----------+------------------+
| 58|   2143|       1|[58.0,1.0]|1867.1309208276969|
| 44|     29|       1|[44.0,1.0]| 1474.315799038966|
| 33|      2|       1|[33.0,1.0]| 1165.675346204963|
| 47|   1506|       1|[47.0,1.0]| 1558.490467993694|
| 33|      1|       1|[33.0,1.0]| 1165.675346204963|
| 35|    231|       1|[35.0,1.0]| 1221.791792174782|
| 28|    447|       1|[28.0,1.0]|1025.3842312804163|
| 42|      2|       1|[42.0,1.0]|1418.1993530691473|
| 58|    121|       1|[58.0,1.0]|1867.1309208276969|
| 43|    593|       1|[43.0,1.0]|1446.2575760540567|
| 41|    270|       1|[41.0,1.0]|1390.1411300842378|
| 29|    390|       1|[29.0,1.0]|1053.4424542653258|
| 53|      6|       1|[53.0,1.0]|1726.8398059031501|
| 58|     71|       1|[58.0,1.0]|1867.1309208276969|
| 57|    162|       1|[57.0,1.0]|1839.0726978427876|
| 51|    229|       1|[51.0,1.0]|1670.72335993

In [23]:
# Inspect the fitted coefficients (stage 1 of the pipeline model is the regression).
model.stages[1].coefficients

DenseVector([28.0582, -14.7855])

In [24]:
# Intercept of the fitted regression.
model.stages[1].intercept

254.53947540939365

# 線形重回帰のモデリングと予測

#### 学習データとテストデータに分ける

In [25]:
# Confirm that `df` is a Spark DataFrame.
type(df)

pyspark.sql.dataframe.DataFrame

In [26]:
# Total number of rows before splitting.
df.count()

45211

In [27]:
# 70% of the row count, presumably as a reference for the size of the
# training split created in the next cell.
rate1 = df.count() * 0.7
rate1

31647.699999999997

In [28]:
# Randomly split the three modelling columns into training and test sets with
# weights 0.7 / 0.3, using a fixed seed.
train_df, test_df = (
    df.select("age", "balance", "campaign")
    .randomSplit([0.7, 0.3], seed=1)
)

In [29]:
# Preview the training split.
train_df.show()

+---+-------+--------+
|age|balance|campaign|
+---+-------+--------+
| 18|      3|       2|
| 18|      5|       2|
| 18|     35|       2|
| 18|    108|       1|
| 18|    156|       2|
| 18|    348|       4|
| 18|    438|       1|
| 18|    608|       1|
| 18|    608|       1|
| 18|   1944|       3|
| 19|      0|       3|
| 19|     60|       1|
| 19|     88|       1|
| 19|     96|       3|
| 19|    103|       2|
| 19|    103|       2|
| 19|    103|       2|
| 19|    108|       1|
| 19|    108|       2|
| 19|    134|       2|
+---+-------+--------+
only showing top 20 rows



In [30]:
# Number of rows that landed in the training split.
train_df.count()

31676

#### 学習データでモデリング

In [31]:
# Data preparation stage: name the label and the explanatory columns, then
# pack the explanatory columns into one "features" vector column.
from pyspark.ml.feature import VectorAssembler

features = ["age", "campaign"]
target = "balance"
assemble = VectorAssembler(
    inputCols=features,
    outputCol="features",
)

In [32]:
# Multiple linear regression stage. The label column comes from `target`
# (set in the previous cell) rather than repeating the column name as a
# second literal.
from pyspark.ml.regression import LinearRegression
clf = LinearRegression(featuresCol="features", labelCol=target)

In [33]:
# Register the stages in a pipeline and fit it on the training split only.
from pyspark.ml.pipeline import Pipeline
pipeline = Pipeline(stages=[assemble, clf])
model = pipeline.fit(train_df)

In [34]:
# Run the fitted pipeline on the training split to obtain in-sample predictions.
pred_train = model.transform(train_df)
pred_train.show()

+---+-------+--------+----------+-----------------+
|age|balance|campaign|  features|       prediction|
+---+-------+--------+----------+-----------------+
| 18|      3|       2|[18.0,2.0]| 710.845257267865|
| 18|      5|       2|[18.0,2.0]| 710.845257267865|
| 18|     35|       2|[18.0,2.0]| 710.845257267865|
| 18|    108|       1|[18.0,1.0]|723.7064702721568|
| 18|    156|       2|[18.0,2.0]| 710.845257267865|
| 18|    348|       4|[18.0,4.0]|685.1228312592813|
| 18|    438|       1|[18.0,1.0]|723.7064702721568|
| 18|    608|       1|[18.0,1.0]|723.7064702721568|
| 18|    608|       1|[18.0,1.0]|723.7064702721568|
| 18|   1944|       3|[18.0,3.0]|697.9840442635732|
| 19|      0|       3|[19.0,3.0]|726.5301363665997|
| 19|     60|       1|[19.0,1.0]|752.2525623751833|
| 19|     88|       1|[19.0,1.0]|752.2525623751833|
| 19|     96|       3|[19.0,3.0]|726.5301363665997|
| 19|    103|       2|[19.0,2.0]|739.3913493708915|
| 19|    103|       2|[19.0,2.0]|739.3913493708915|
| 19|    103

In [35]:
# Training RMSE, computed with scikit-learn.
from sklearn.metrics import mean_squared_error
import numpy as np
import pandas as pd

# Convert the Spark predictions to a pandas DataFrame so sklearn can consume them.
pred_train_pandas = pred_train.toPandas()
train_mse = mean_squared_error(
    pred_train_pandas["balance"],
    pred_train_pandas["prediction"],
)
# RMSE is the square root of the mean squared error.
np.sqrt(train_mse)

2954.7856615252667

In [36]:
# Coefficient table. The row labels are taken from the assembler stage's
# input columns, i.e. the order in which the features were packed into the
# vector, so each label lines up with its coefficient regardless of the
# column order of train_df.
train_cols = list(model.stages[0].getInputCols())
pd.DataFrame(
    index=train_cols,
    data=model.stages[1].coefficients.toArray(),
    columns=["coefficients"],
)

Unnamed: 0,coefficients
age,28.546092
campaign,-12.861213


#### テストデータによる予測

In [37]:
# Preview the held-out test split.
test_df.show()

+---+-------+--------+
|age|balance|campaign|
+---+-------+--------+
| 18|    108|       1|
| 18|    108|       1|
| 19|      0|       4|
| 19|      4|       1|
| 19|     27|      12|
| 19|     55|       2|
| 19|     56|       1|
| 19|    291|       5|
| 19|    329|       2|
| 19|    372|       3|
| 19|    424|       3|
| 19|    608|       1|
| 19|   1169|      18|
| 19|   1247|       1|
| 19|   1803|       1|
| 20|      0|       5|
| 20|     66|       2|
| 20|     88|       1|
| 20|    167|       1|
| 20|    215|       1|
+---+-------+--------+
only showing top 20 rows



In [38]:
# Number of rows in the test split.
test_df.count()

13535

In [39]:
# Apply the model fitted on the training split to the test split.
pred_test = model.transform(test_df)
pred_test.show()

+---+-------+--------+-----------+-----------------+
|age|balance|campaign|   features|       prediction|
+---+-------+--------+-----------+-----------------+
| 18|    108|       1| [18.0,1.0]|723.7064702721568|
| 18|    108|       1| [18.0,1.0]|723.7064702721568|
| 19|      0|       4| [19.0,4.0]|713.6689233623077|
| 19|      4|       1| [19.0,1.0]|752.2525623751833|
| 19|     27|      12|[19.0,12.0]|610.7792193279731|
| 19|     55|       2| [19.0,2.0]|739.3913493708915|
| 19|     56|       1| [19.0,1.0]|752.2525623751833|
| 19|    291|       5| [19.0,5.0]|700.8077103580159|
| 19|    329|       2| [19.0,2.0]|739.3913493708915|
| 19|    372|       3| [19.0,3.0]|726.5301363665997|
| 19|    424|       3| [19.0,3.0]|726.5301363665997|
| 19|    608|       1| [19.0,1.0]|752.2525623751833|
| 19|   1169|      18|[19.0,18.0]|533.6119413022222|
| 19|   1247|       1| [19.0,1.0]|752.2525623751833|
| 19|   1803|       1| [19.0,1.0]|752.2525623751833|
| 20|      0|       5| [20.0,5.0]|729.35380246

In [40]:
# Test RMSE, computed with scikit-learn.
# Convert the Spark predictions to a pandas DataFrame so sklearn can consume them.
pred_test_pandas = pred_test.toPandas()
test_mse = mean_squared_error(
    pred_test_pandas["balance"],
    pred_test_pandas["prediction"],
)
# RMSE is the square root of the mean squared error.
np.sqrt(test_mse)

3198.525889052522