In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
from pyspark.sql import SparkSession
import pandas as pd

In [3]:
# Start (or reuse) the Spark session used by every Spark cell below.
# NOTE(review): the application name is left empty, as in the original; consider
# giving it a descriptive name so the job is identifiable in the Spark UI.
spark = (
    SparkSession.builder
    .appName("")
    .getOrCreate()
)

In [4]:
# Load the five sheets of the CCPP workbook into a list of pandas DataFrames.
# Passing a list to `sheet_name` opens and parses the workbook once and returns
# a dict keyed by sheet name, instead of re-reading the file for every sheet.
sheet_names = ["Sheet" + str(i) for i in range(1, 6)]
sheets = pd.read_excel("../data/CCPP/Folds5x2_pp.xlsx", sheet_name=sheet_names)

# Index by name so the list is always in Sheet1..Sheet5 order.
dfs = [sheets[name] for name in sheet_names]

In [5]:
# Stack the five sheets into a single pandas DataFrame.
# `DataFrame.append` was deprecated in pandas 1.4 and removed in 2.0, and
# calling it in a loop re-copies the accumulated rows on every iteration;
# `pd.concat` builds the result in one pass.
# The per-sheet row labels are kept (no `ignore_index`), exactly as the chained
# `append` calls did, so the index repeats across sheets.
df = pd.concat(dfs)

In [6]:
# Display the combined pandas DataFrame (the notebook truncates the middle rows).
df

Unnamed: 0,AT,V,AP,RH,PE
0,14.96,41.76,1024.07,73.17,463.26
1,25.18,62.96,1020.04,59.08,444.37
2,5.11,39.40,1012.16,92.14,488.56
3,20.86,57.32,1010.24,76.64,446.48
4,10.82,37.50,1009.23,96.62,473.90
...,...,...,...,...,...
9563,15.12,48.92,1011.80,72.93,462.59
9564,33.41,77.95,1010.30,59.72,432.90
9565,15.99,43.34,1014.20,78.66,465.96
9566,17.65,59.87,1018.58,94.65,450.93


In [7]:
# List the column names of the combined pandas frame.
df.columns

Index(['AT', 'V', 'AP', 'RH', 'PE'], dtype='object')

In [8]:
from modules.utils import createSparkDfFromXlsx

In [9]:
# Convert the pandas frame into a Spark DataFrame using the project helper.
# NOTE(review): the helper's implementation is not visible here; presumably it
# wraps `spark.createDataFrame` — confirm in modules/utils.
data = createSparkDfFromXlsx(df, spark)

In [10]:
# Preview the first 5 rows of the Spark DataFrame.
data.show(5)

+-----+-----+-------+-----+------+
|   AT|    V|     AP|   RH|    PE|
+-----+-----+-------+-----+------+
|14.96|41.76|1024.07|73.17|463.26|
|25.18|62.96|1020.04|59.08|444.37|
| 5.11| 39.4|1012.16|92.14|488.56|
|20.86|57.32|1010.24|76.64|446.48|
|10.82| 37.5|1009.23|96.62| 473.9|
+-----+-----+-------+-----+------+
only showing top 5 rows



In [11]:
# Row count before any cleaning (all five sheets stacked).
data.count()

47840

In [12]:
# Remove rows that are exact duplicates across all columns.
# NOTE(review): the count drops from 47840 to 9527 here; presumably the five
# sheets hold the same observations in different orders — confirm against the
# dataset documentation.
data = data.dropDuplicates()

In [13]:
# Row count after de-duplication.
data.count()

9527

In [14]:
# Drop rows that contain a null value (default `dropna` arguments).
data = data.dropna()

In [15]:
# Row count after dropping nulls; compare with the count after de-duplication.
data.count()

9527

In [16]:
# Show the column names and types of the cleaned Spark DataFrame.
data.printSchema()

root
 |-- AT: double (nullable = true)
 |-- V: double (nullable = true)
 |-- AP: double (nullable = true)
 |-- RH: double (nullable = true)
 |-- PE: double (nullable = true)



# Chuyển dữ liệu

In [17]:
from pyspark.ml.stat import Correlation
from pyspark.ml.feature import VectorAssembler

In [18]:
# Pack the four predictor columns into a single vector column named 'features',
# which the pyspark.ml stages below take as input.
feature_cols = ['AT', 'V', 'AP', 'RH']
assembler = VectorAssembler(
    inputCols=feature_cols,
    outputCol='features',
)

In [19]:
# Add the assembled 'features' vector, then keep only it and the target 'PE'.
assembled = assembler.transform(data)
data = assembled.select('features', 'PE')

In [20]:
# Preview the first 3 rows of the assembled features and target.
data.show(3)

+--------------------+------+
|            features|    PE|
+--------------------+------+
|[24.54,60.29,1017...|447.67|
|[10.59,42.49,1009...|477.49|
|[26.7,66.56,1005....|430.21|
+--------------------+------+
only showing top 3 rows



# Scale

In [21]:
from pyspark.ml.feature import StandardScaler

In [22]:
# Configure a scaler that only mean-centers each feature: withMean=True
# subtracts the mean, withStd=False leaves the spread unchanged.
# NOTE(review): the scaled output (`scale_data`) is not consumed by the
# correlation or regression cells visible in this notebook — confirm whether
# the model was meant to train on it.
scaler = StandardScaler(
    inputCol='features',
    outputCol='scaled_features',
    withMean=True,
    withStd=False,
)

In [23]:
# Fit the scaler on the full assembled dataset.
scaler_model = scaler.fit(data)

In [24]:
# Apply the fitted scaler; this adds the 'scaled_features' column.
scale_data = scaler_model.transform(data)

In [25]:
# Keep only the centered feature vector and the target column.
scale_data = scale_data.select('scaled_features', 'PE')

In [26]:
# Preview the first 3 rows of the centered features.
scale_data.show(3)

+--------------------+------+
|     scaled_features|    PE|
+--------------------+------+
|[4.88177495538994...|447.67|
|[-9.0682250446100...|477.49|
|[7.04177495538994...|430.21|
+--------------------+------+
only showing top 3 rows



# Tính correlation

In [32]:
# Pearson correlation matrix of the (unscaled) 'features' vector column.
# The result DataFrame's first row, first cell holds the matrix.
corr_df = Correlation.corr(data, 'features', method='pearson')
pearson_corr = corr_df.collect()[0][0]

In [35]:
# Print the matrix as text, spelling any 'nan' entries as 'NaN'.
corr_text = str(pearson_corr)
print(corr_text.replace('nan', 'NaN'))

DenseMatrix([[ 1.        ,  0.84368857, -0.50822164, -0.54394686],
             [ 0.84368857,  1.        , -0.41571837, -0.31221399],
             [-0.50822164, -0.41571837,  1.        ,  0.10163098],
             [-0.54394686, -0.31221399,  0.10163098,  1.        ]])


> * AT và V có tương quan dương cao (hệ số Pearson ≈ 0.84)

# Tách dữ liệu thành train và test

In [41]:
# Rename the target column 'PE' to 'label' (the name the regression cell below
# relies on) and preview the first 20 rows.
data = data.withColumnRenamed(existing='PE', new='label')
data.show(n=20)

+--------------------+------+
|            features| label|
+--------------------+------+
|[24.54,60.29,1017...|447.67|
|[10.59,42.49,1009...|477.49|
|[26.7,66.56,1005....|430.21|
|[21.24,41.67,1012...|459.81|
|[27.74,74.78,1010...|436.87|
|[9.08,36.71,1025....|479.02|
|[25.06,65.46,1014...|443.03|
|[30.2,73.67,1006....|428.72|
|[16.53,46.18,1010...|458.67|
|[22.55,70.79,1006...|436.43|
|[17.66,60.08,1017...|456.62|
|[23.17,62.39,1008...|439.29|
|[23.89,65.18,1012...|438.97|
|[26.2,65.18,1011....|440.07|
|[26.01,74.33,1015...|435.82|
|[28.26,72.43,1006...| 426.2|
|[15.37,43.34,1014...|460.02|
|[15.4,38.73,1000....|469.18|
|[28.15,72.99,1007...|431.83|
|[23.78,49.3,1003....|439.83|
+--------------------+------+
only showing top 20 rows



In [42]:
# Split into 80% train / 20% test.
# A fixed seed makes the split — and therefore the fitted model and its metrics —
# repeatable across kernel restarts; without it every run evaluates on a
# different test set.
SPLIT_SEED = 42
train, test = data.randomSplit([0.8, 0.2], seed=SPLIT_SEED)

# Build model

In [43]:
from pyspark.ml.regression import LinearRegression

In [44]:
# Linear regression estimator. The column names are spelled out for clarity;
# they equal the estimator's defaults, so behavior is unchanged.
lr = LinearRegression(featuresCol='features', labelCol='label')

In [45]:
# Fit the regression on the training split.
lr_model = lr.fit(train)

In [46]:
# Score the held-out test split; this adds a 'prediction' column.
# NOTE(review): the variable name is misspelled ("preditions"); it is kept
# because the following cell refers to it by this name.
preditions_0 = lr_model.transform(test)

In [47]:
# Preview the first 5 test rows with label and prediction side by side.
preditions_0.show(5)

+--------------------+------+------------------+
|            features| label|        prediction|
+--------------------+------+------------------+
|[6.71,40.96,1022....|486.58| 480.7724716363177|
|[15.73,38.73,1002...|466.63|464.61261024512356|
|[30.95,73.06,1008...|431.77|427.08816241057093|
|[5.04,40.64,1021....|484.42|  483.802404961832|
|[6.57,42.07,1004....|483.86|482.43962391931274|
+--------------------+------+------------------+
only showing top 5 rows



# Đánh giá

In [48]:
# Evaluate the fitted model on the test split.
# Despite its name, `model_prediction` holds a metrics summary (r2,
# meanSquaredError and rootMeanSquaredError are read from it below), not the
# predictions themselves.
model_prediction = lr_model.evaluate(test)

In [49]:
# Coefficient of determination (R²) on the test split.
model_prediction.r2

0.9238525403528323

In [50]:
# Mean squared error on the test split.
model_prediction.meanSquaredError

21.950333659881906

In [51]:
# Root mean squared error on the test split.
model_prediction.rootMeanSquaredError

4.685118318664098

> **Nhận xét**
> * Kết quả rất tốt trên tập test: R² ≈ 0.92, RMSE ≈ 4.69