# APACHE SPARK

### Dzień 2

#### Spark SQL + Spark ML

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as f

In [2]:
spark = SparkSession.builder.appName('my_app').master("local[*]").getOrCreate()

### Wektory

In [3]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.stat import Correlation
from pyspark.ml import feature

**Dwa typy wektorów:**
- sparse - większość wartości to zera więc w celu optymalizacji zajmowanej pamięci podawane są tylko indeksy (wraz z wartościami) gdzie wartość != 0
- dense - podane są wszystkie wartości

In [4]:
data = [(Vectors.sparse(4, [(0, 1.0), (3, -2.0)]),"A",1),
        (Vectors.dense([4.0, 5.0, 0.0, 3.0]),"B",6),
        (Vectors.dense([6.0, 7.0, 0.0, 8.0]),"A",3),
        (Vectors.sparse(4, [(0, 9.0), (3, 1.0)]),"B",2)]
dummy_df = spark.createDataFrame(data, ["features", "categ", "num"])
dummy_df.show()

+--------------------+-----+---+
|            features|categ|num|
+--------------------+-----+---+
|(4,[0,3],[1.0,-2.0])|    A|  1|
|   [4.0,5.0,0.0,3.0]|    B|  6|
|   [6.0,7.0,0.0,8.0]|    A|  3|
| (4,[0,3],[9.0,1.0])|    B|  2|
+--------------------+-----+---+



In [5]:
dummy_df.printSchema()

root
 |-- features: vector (nullable = true)
 |-- categ: string (nullable = true)
 |-- num: long (nullable = true)



In [6]:
# macierz korelacji
print(Correlation.corr(dummy_df, "features").head()[0])

DenseMatrix([[1.        , 0.05564149,        nan, 0.40047142],
             [0.05564149, 1.        ,        nan, 0.91359586],
             [       nan,        nan, 1.        ,        nan],
             [0.40047142, 0.91359586,        nan, 1.        ]])


In [7]:
Correlation.corr(dummy_df, "features").head()[0][0,1]

0.055641488407465814

**Przechodzenie od kolumny kategorycznej do wektora**

In [8]:
indexer = feature.StringIndexer(inputCol="categ", outputCol="categIndex")

In [9]:
IDXmodel = indexer.fit(dummy_df)

In [10]:
dummy_df1 = IDXmodel.transform(dummy_df)
dummy_df1.show()

+--------------------+-----+---+----------+
|            features|categ|num|categIndex|
+--------------------+-----+---+----------+
|(4,[0,3],[1.0,-2.0])|    A|  1|       1.0|
|   [4.0,5.0,0.0,3.0]|    B|  6|       0.0|
|   [6.0,7.0,0.0,8.0]|    A|  3|       1.0|
| (4,[0,3],[9.0,1.0])|    B|  2|       0.0|
+--------------------+-----+---+----------+



In [11]:
OHencoder = feature.OneHotEncoderEstimator(inputCols=["categIndex"], outputCols=["categVect"])

In [12]:
OHmodel = OHencoder.fit(dummy_df1)

In [13]:
dummy_df2 = OHmodel.transform(dummy_df1)
dummy_df2.show()

+--------------------+-----+---+----------+-------------+
|            features|categ|num|categIndex|    categVect|
+--------------------+-----+---+----------+-------------+
|(4,[0,3],[1.0,-2.0])|    A|  1|       1.0|    (1,[],[])|
|   [4.0,5.0,0.0,3.0]|    B|  6|       0.0|(1,[0],[1.0])|
|   [6.0,7.0,0.0,8.0]|    A|  3|       1.0|    (1,[],[])|
| (4,[0,3],[9.0,1.0])|    B|  2|       0.0|(1,[0],[1.0])|
+--------------------+-----+---+----------+-------------+



**Łączenie zmiennych w wektory**

In [14]:
vectAssembler = feature.VectorAssembler(inputCols = ["features", "num", "categVect"], outputCol = "featuresFull")
dummy_df3 = vectAssembler.transform(dummy_df2)
dummy_df3.show(truncate=False)

+--------------------+-----+---+----------+-------------+--------------------------+
|features            |categ|num|categIndex|categVect    |featuresFull              |
+--------------------+-----+---+----------+-------------+--------------------------+
|(4,[0,3],[1.0,-2.0])|A    |1  |1.0       |(1,[],[])    |[1.0,0.0,0.0,-2.0,1.0,0.0]|
|[4.0,5.0,0.0,3.0]   |B    |6  |0.0       |(1,[0],[1.0])|[4.0,5.0,0.0,3.0,6.0,1.0] |
|[6.0,7.0,0.0,8.0]   |A    |3  |1.0       |(1,[],[])    |[6.0,7.0,0.0,8.0,3.0,0.0] |
|(4,[0,3],[9.0,1.0]) |B    |2  |0.0       |(1,[0],[1.0])|[9.0,0.0,0.0,1.0,2.0,1.0] |
+--------------------+-----+---+----------+-------------+--------------------------+



**Normalizacja / skalowanie zmiennych**

L2

In [16]:
normalizerL2 = feature.Normalizer(inputCol="featuresFull", outputCol="featuresNorm")

In [17]:
dummy_df3.select("featuresFull").show(truncate=False)
normalizerL2.transform(dummy_df3).select("featuresNorm").show(truncate=False)

+--------------------------+
|featuresFull              |
+--------------------------+
|[1.0,0.0,0.0,-2.0,1.0,0.0]|
|[4.0,5.0,0.0,3.0,6.0,1.0] |
|[6.0,7.0,0.0,8.0,3.0,0.0] |
|[9.0,0.0,0.0,1.0,2.0,1.0] |
+--------------------------+

+-----------------------------------------------------------------------------------------------------+
|featuresNorm                                                                                         |
+-----------------------------------------------------------------------------------------------------+
|[0.4082482904638631,0.0,0.0,-0.8164965809277261,0.4082482904638631,0.0]                              |
|[0.4288450139351179,0.5360562674188973,0.0,0.3216337604513384,0.6432675209026768,0.10721125348377948]|
|[0.47733437050543803,0.556890098923011,0.0,0.636445827340584,0.23866718525271902,0.0]                |
|[0.9649012813540153,0.0,0.0,0.10721125348377948,0.21442250696755896,0.10721125348377948]             |
+--------------------------------------

L1

In [18]:
normalizerL1 = feature.Normalizer(inputCol="featuresFull", outputCol="featuresNorm", p=1.0)
dummy_df3.select("featuresFull").show(truncate=False)
normalizerL1.transform(dummy_df3).select("featuresNorm").show(truncate=False)

+--------------------------+
|featuresFull              |
+--------------------------+
|[1.0,0.0,0.0,-2.0,1.0,0.0]|
|[4.0,5.0,0.0,3.0,6.0,1.0] |
|[6.0,7.0,0.0,8.0,3.0,0.0] |
|[9.0,0.0,0.0,1.0,2.0,1.0] |
+--------------------------+

+-------------------------------------------------------------------------------------------------------+
|featuresNorm                                                                                           |
+-------------------------------------------------------------------------------------------------------+
|[0.25,0.0,0.0,-0.5,0.25,0.0]                                                                           |
|[0.21052631578947367,0.2631578947368421,0.0,0.15789473684210525,0.3157894736842105,0.05263157894736842]|
|[0.25,0.2916666666666667,0.0,0.3333333333333333,0.125,0.0]                                             |
|[0.6923076923076923,0.0,0.0,0.07692307692307693,0.15384615384615385,0.07692307692307693]               |
+------------------------

Skalowanie

In [19]:
scaler = feature.StandardScaler(inputCol="featuresFull", outputCol="featuresScal")

In [20]:
scalerModel = scaler.fit(dummy_df3)

In [21]:
dummy_df3.select("featuresFull").show(truncate=False)
scalerModel.transform(dummy_df3).select("featuresScal").show(truncate=False)

+--------------------------+
|featuresFull              |
+--------------------------+
|[1.0,0.0,0.0,-2.0,1.0,0.0]|
|[4.0,5.0,0.0,3.0,6.0,1.0] |
|[6.0,7.0,0.0,8.0,3.0,0.0] |
|[9.0,0.0,0.0,1.0,2.0,1.0] |
+--------------------------+

+---------------------------------------------------------------------------------------------------+
|featuresScal                                                                                       |
+---------------------------------------------------------------------------------------------------+
|[0.2970442628930023,0.0,0.0,-0.47583095143088644,0.4629100498862757,0.0]                           |
|[1.1881770515720091,1.404878717372541,0.0,0.7137464271463296,2.7774602993176543,1.7320508075688774]|
|[1.7822655773580136,1.9668302043215575,0.0,1.9033238057235458,1.3887301496588271,0.0]              |
|[2.6733983660370204,0.0,0.0,0.23791547571544322,0.9258200997725514,1.7320508075688774]             |
+----------------------------------------------------

Skalowanie min-max *(w wyniku transformacji powstaje DenseVector)*

In [22]:
MMscaler = feature.MinMaxScaler(inputCol="featuresFull", outputCol="featuresScal")

In [23]:
MMscalerModel = MMscaler.fit(dummy_df3)

In [24]:
dummy_df3.select("featuresFull").show(truncate=False)
MMscalerModel.transform(dummy_df3).select("featuresScal").show(truncate=False)

+--------------------------+
|featuresFull              |
+--------------------------+
|[1.0,0.0,0.0,-2.0,1.0,0.0]|
|[4.0,5.0,0.0,3.0,6.0,1.0] |
|[6.0,7.0,0.0,8.0,3.0,0.0] |
|[9.0,0.0,0.0,1.0,2.0,1.0] |
+--------------------------+

+------------------------------------------+
|featuresScal                              |
+------------------------------------------+
|[0.0,0.0,0.5,0.0,0.0,0.0]                 |
|[0.375,0.7142857142857143,0.5,0.5,1.0,1.0]|
|[0.625,1.0,0.5,1.0,0.4,0.0]               |
|[1.0,0.0,0.5,0.3,0.2,1.0]                 |
+------------------------------------------+



**PCA**

In [25]:
pca = feature.PCA(k=3, inputCol="featuresFull", outputCol="featuresPCA")

In [26]:
PCAmodel = pca.fit(dummy_df3)

In [27]:
PCAmodel.transform(dummy_df3).select("featuresFull", "featuresPCA").show(truncate=False)

+--------------------------+----------------------------------------------------------+
|featuresFull              |featuresPCA                                               |
+--------------------------+----------------------------------------------------------+
|[1.0,0.0,0.0,-2.0,1.0,0.0]|[1.023488773489829,0.63446709320338,-1.7496631681306285]  |
|[4.0,5.0,0.0,3.0,6.0,1.0] |[-7.435741063599587,1.099445815146323,-5.386313460836334] |
|[6.0,7.0,0.0,8.0,3.0,0.0] |[-12.10504284268931,2.936501175508072,-1.1646019004907298]|
|[9.0,0.0,0.0,1.0,2.0,1.0] |[-3.233791315365656,8.019877441717323,-3.2765162157739773]|
+--------------------------+----------------------------------------------------------+



> **ZADANIE:**
- przygotuj poniższe dane
- usuń wiersze zawierające braki danych
- stwórz kolumnę zawierającą miesiąc wyciągnięty z kolumny `start_time`
- stwórz kolumnę zawierającą informację o godzinie o której wystąpiło wyporzyczenie
- stwórz kolumnę zawierającą informacje o przedziale wiekowym wyporzyczającego (przedziały: <20, 20-40, 40-60, 60<)
- zaokrąglij do jednego miejsca po przecinku wartości w kolumnach `start_station_longitude` oraz `start_station_latitude`
- usuń kolumny: `start_time`, `end_time`, `start_station_name`, `start_station_id`, `end_station_id`, `end_station_name`, `end_station_latitude`, `end_station_longitude`, `member_birth_year`, `bike_id`
- zmień nazwę kolumny `duration_sec` na `label`
- z pozostałych zmiennych stwórz kolumnę `features` zawierającą wektory
- wynikowemu DataFrameowi nadaj nazwę `goBike_processed`

In [46]:
goBike = spark.read.csv("./2017-fordgobike-tripdata.csv", header=True, inferSchema=True)

In [29]:
goBike.printSchema()

root
 |-- duration_sec: integer (nullable = true)
 |-- start_time: timestamp (nullable = true)
 |-- end_time: timestamp (nullable = true)
 |-- start_station_id: integer (nullable = true)
 |-- start_station_name: string (nullable = true)
 |-- start_station_latitude: double (nullable = true)
 |-- start_station_longitude: double (nullable = true)
 |-- end_station_id: integer (nullable = true)
 |-- end_station_name: string (nullable = true)
 |-- end_station_latitude: double (nullable = true)
 |-- end_station_longitude: double (nullable = true)
 |-- bike_id: integer (nullable = true)
 |-- user_type: string (nullable = true)
 |-- member_birth_year: integer (nullable = true)
 |-- member_gender: string (nullable = true)



In [79]:
goBike_proc = goBike.dropna()

In [80]:
goBike_proc = goBike_proc.withColumn('month',f.month('start_time'))

In [81]:
goBike_proc.columns

['duration_sec',
 'start_time',
 'end_time',
 'start_station_id',
 'start_station_name',
 'start_station_latitude',
 'start_station_longitude',
 'end_station_id',
 'end_station_name',
 'end_station_latitude',
 'end_station_longitude',
 'bike_id',
 'user_type',
 'member_birth_year',
 'member_gender',
 'month']

In [82]:
goBike_proc = goBike_proc.withColumn('hour',f.hour('start_time'))

In [83]:
goBike_proc = goBike_proc.withColumn('age',2018-goBike['member_birth_year'])

In [84]:
goBike_proc = goBike_proc.withColumn('age_range',f.when(goBike_proc['age']<20,'<20')\
                       .when((goBike_proc['age']>=20) & (goBike_proc['age']<40),'20-40')\
                       .when((goBike_proc['age']>=40) & (goBike_proc['age']<60),'40-60')\
                       .otherwise('60<'))

In [85]:
goBike_proc.columns

['duration_sec',
 'start_time',
 'end_time',
 'start_station_id',
 'start_station_name',
 'start_station_latitude',
 'start_station_longitude',
 'end_station_id',
 'end_station_name',
 'end_station_latitude',
 'end_station_longitude',
 'bike_id',
 'user_type',
 'member_birth_year',
 'member_gender',
 'month',
 'hour',
 'age',
 'age_range']

In [86]:
goBike_proc = goBike_proc.withColumn('start_station_latitude',f.round('start_station_latitude',1))

In [87]:
goBike_proc = goBike_proc.withColumn('start_station_longitude',f.round('start_station_longitude',1))

In [88]:
goBike_proc = goBike_proc.drop('start_time', 'end_time', 'start_station_name', 'start_station_id',
                  'end_station_id', 'end_station_name', 'end_station_latitude',
                  'end_station_longitude', 'member_birth_year', 'bike_id')

In [89]:
goBike_proc = goBike_proc.drop('age')

In [90]:
goBike_proc.columns

['duration_sec',
 'start_station_latitude',
 'start_station_longitude',
 'user_type',
 'member_gender',
 'month',
 'hour',
 'age_range']

In [91]:
goBike_proc = goBike_proc.withColumnRenamed('duration_sec','label')

In [92]:
goBike_proc.columns

['label',
 'start_station_latitude',
 'start_station_longitude',
 'user_type',
 'member_gender',
 'month',
 'hour',
 'age_range']

In [93]:
goBike_proc.show()

+-----+----------------------+-----------------------+----------+-------------+-----+----+---------+
|label|start_station_latitude|start_station_longitude| user_type|member_gender|month|hour|age_range|
+-----+----------------------+-----------------------+----------+-------------+-----+----+---------+
|80110|                  37.8|                 -122.4|  Customer|         Male|   12|  16|    20-40|
|78800|                  37.8|                 -122.4|  Customer|       Female|   12|  15|    40-60|
|43603|                  37.9|                 -122.3|Subscriber|       Female|   12|  14|    20-40|
| 4507|                  37.9|                 -122.3|  Customer|       Female|   12|  23|    20-40|
| 2183|                  37.8|                 -122.4|Subscriber|         Male|   12|  23|    20-40|
| 2170|                  37.8|                 -122.4|Subscriber|         Male|   12|  23|    20-40|
| 1544|                  37.8|                 -122.4|Subscriber|       Female|   12|  23| 

In [94]:
goBike_columns = goBike_proc.columns[1:]
for inputColName in goBike_columns:
    
    outputColName = inputColName+ '_index'

    indexer = feature.StringIndexer(inputCol=inputColName, outputCol=outputColName)
    idx_model = indexer.fit(goBike_proc)
    goBike_proc = idx_model.transform(goBike_proc)

In [95]:
goBike_columns

['start_station_latitude',
 'start_station_longitude',
 'user_type',
 'member_gender',
 'month',
 'hour',
 'age_range']

In [96]:
goBike_proc.columns

['label',
 'start_station_latitude',
 'start_station_longitude',
 'user_type',
 'member_gender',
 'month',
 'hour',
 'age_range',
 'start_station_latitude_index',
 'start_station_longitude_index',
 'user_type_index',
 'member_gender_index',
 'month_index',
 'hour_index',
 'age_range_index']

In [103]:
inputCols = goBike_proc.columns[8:]

In [104]:
inputCols

['start_station_latitude_index',
 'start_station_longitude_index',
 'user_type_index',
 'member_gender_index',
 'month_index',
 'hour_index',
 'age_range_index']

In [107]:
outputCols = [colname+'_vector'for colname in goBike_columns]

In [108]:
outputCols

['start_station_latitude_vector',
 'start_station_longitude_vector',
 'user_type_vector',
 'member_gender_vector',
 'month_vector',
 'hour_vector',
 'age_range_vector']

In [109]:
OHencoder = feature.OneHotEncoderEstimator(inputCols=inputCols
                                           , outputCols=outputCols)

In [110]:
goBike_proc = OHencoder.fit(goBike_proc).transform(goBike_proc)

In [111]:
goBike_proc.printSchema()

root
 |-- label: integer (nullable = true)
 |-- start_station_latitude: double (nullable = true)
 |-- start_station_longitude: double (nullable = true)
 |-- user_type: string (nullable = true)
 |-- member_gender: string (nullable = true)
 |-- month: integer (nullable = true)
 |-- hour: integer (nullable = true)
 |-- age_range: string (nullable = false)
 |-- start_station_latitude_index: double (nullable = false)
 |-- start_station_longitude_index: double (nullable = false)
 |-- user_type_index: double (nullable = false)
 |-- member_gender_index: double (nullable = false)
 |-- month_index: double (nullable = false)
 |-- hour_index: double (nullable = false)
 |-- age_range_index: double (nullable = false)
 |-- member_gender_vector: vector (nullable = true)
 |-- start_station_longitude_vector: vector (nullable = true)
 |-- start_station_latitude_vector: vector (nullable = true)
 |-- hour_vector: vector (nullable = true)
 |-- user_type_vector: vector (nullable = true)
 |-- month_vector: ve

In [113]:
vectAssembler = feature.VectorAssembler(inputCols=outputCols, outputCol = "features")
goBike_proc = vectAssembler.transform(goBike_proc)
goBike_proc.printSchema()

root
 |-- label: integer (nullable = true)
 |-- start_station_latitude: double (nullable = true)
 |-- start_station_longitude: double (nullable = true)
 |-- user_type: string (nullable = true)
 |-- member_gender: string (nullable = true)
 |-- month: integer (nullable = true)
 |-- hour: integer (nullable = true)
 |-- age_range: string (nullable = false)
 |-- start_station_latitude_index: double (nullable = false)
 |-- start_station_longitude_index: double (nullable = false)
 |-- user_type_index: double (nullable = false)
 |-- member_gender_index: double (nullable = false)
 |-- month_index: double (nullable = false)
 |-- hour_index: double (nullable = false)
 |-- age_range_index: double (nullable = false)
 |-- member_gender_vector: vector (nullable = true)
 |-- start_station_longitude_vector: vector (nullable = true)
 |-- start_station_latitude_vector: vector (nullable = true)
 |-- hour_vector: vector (nullable = true)
 |-- user_type_vector: vector (nullable = true)
 |-- month_vector: ve

In [114]:
goBike_proc = goBike_proc.select("label", "features")
goBike_proc.show(truncate=False)

+-----+-----------------------------------------------------+
|label|features                                             |
+-----+-----------------------------------------------------+
|80110|(42,[0,4,8,13,20,39],[1.0,1.0,1.0,1.0,1.0,1.0])      |
|78800|(42,[0,4,9,13,23,40],[1.0,1.0,1.0,1.0,1.0,1.0])      |
|43603|(42,[2,5,7,9,13,28,39],[1.0,1.0,1.0,1.0,1.0,1.0,1.0])|
|4507 |(42,[2,5,9,13,33,39],[1.0,1.0,1.0,1.0,1.0,1.0])      |
|2183 |(42,[0,4,7,8,13,33,39],[1.0,1.0,1.0,1.0,1.0,1.0,1.0])|
|2170 |(42,[0,4,7,8,13,33,39],[1.0,1.0,1.0,1.0,1.0,1.0,1.0])|
|1544 |(42,[0,4,7,9,13,33,39],[1.0,1.0,1.0,1.0,1.0,1.0,1.0])|
|1474 |(42,[0,4,7,8,13,33,39],[1.0,1.0,1.0,1.0,1.0,1.0,1.0])|
|1532 |(42,[0,4,7,13,33,39],[1.0,1.0,1.0,1.0,1.0,1.0])      |
|1216 |(42,[0,4,7,8,13,33,40],[1.0,1.0,1.0,1.0,1.0,1.0,1.0])|
|386  |(42,[1,6,7,8,13,33,39],[1.0,1.0,1.0,1.0,1.0,1.0,1.0])|
|422  |(42,[0,4,7,8,13,33,39],[1.0,1.0,1.0,1.0,1.0,1.0,1.0])|
|871  |(42,[0,4,7,8,13,33,39],[1.0,1.0,1.0,1.0,1.0,1.0,1.0])|
|733  |(

### Klasyfikacja

In [115]:
from pyspark.ml import classification

#### Dane

https://archive.ics.uci.edu/ml/datasets/adult

In [116]:
col_names = ["age", "workclass", "fnlwgt", "education", "education-num","marital-status", "occupation", 
             "relationship", "race", "sex", "capital-gain", "capital-loss", "hours-per-week", 
             "native-country", "earnings"]

In [117]:
df = spark.read.csv("./adult.data", header=False, inferSchema=True)

In [118]:
df = df.select(*[f.col(old).alias(new) for old, new in zip(df.columns, col_names)]).drop("fnlwgt").dropna("any")

In [120]:
df.printSchema()

root
 |-- age: integer (nullable = true)
 |-- workclass: string (nullable = true)
 |-- education: string (nullable = true)
 |-- education-num: double (nullable = true)
 |-- marital-status: string (nullable = true)
 |-- occupation: string (nullable = true)
 |-- relationship: string (nullable = true)
 |-- race: string (nullable = true)
 |-- sex: string (nullable = true)
 |-- capital-gain: double (nullable = true)
 |-- capital-loss: double (nullable = true)
 |-- hours-per-week: double (nullable = true)
 |-- native-country: string (nullable = true)
 |-- earnings: string (nullable = true)



> **ZADANIE:**
- przygotuj dane
- na podstawie kolumny `earnings` stwórz zmienną celu `label` z wartościami zakodowanymi jako 0 i 1
- stwórz (znormalizowaną -> L2) kolumnę `features` zawierającą wektory powstałe na podstawie pozostałych kolumn
- wynikowy DataFrame nazwij `df_processed`

In [121]:
indexer = feature.StringIndexer(inputCol='earnings',outputCol='label')
idx_model = indexer.fit(df)
df_processed = idx_model.transform(df)
df_processed.select('earnings','label').show()

+--------+-----+
|earnings|label|
+--------+-----+
|   <=50K|  0.0|
|   <=50K|  0.0|
|   <=50K|  0.0|
|   <=50K|  0.0|
|   <=50K|  0.0|
|   <=50K|  0.0|
|   <=50K|  0.0|
|    >50K|  1.0|
|    >50K|  1.0|
|    >50K|  1.0|
|    >50K|  1.0|
|    >50K|  1.0|
|   <=50K|  0.0|
|   <=50K|  0.0|
|    >50K|  1.0|
|   <=50K|  0.0|
|   <=50K|  0.0|
|   <=50K|  0.0|
|   <=50K|  0.0|
|    >50K|  1.0|
+--------+-----+
only showing top 20 rows



In [123]:
num_cols = [c for c,t in df.dtypes if t!='string']

In [124]:
num_cols

['age', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week']

In [125]:
categ_cols = [c for c,t in df.dtypes if t=='string' and c!= 'earnings']

In [126]:
categ_cols

['workclass',
 'education',
 'marital-status',
 'occupation',
 'relationship',
 'race',
 'sex',
 'native-country']

In [127]:
categ_cols_out = [c + 'Idx' for c in categ_cols]

In [128]:
categ_cols_out

['workclassIdx',
 'educationIdx',
 'marital-statusIdx',
 'occupationIdx',
 'relationshipIdx',
 'raceIdx',
 'sexIdx',
 'native-countryIdx']

In [130]:
categ_cols_vect = [c+'Vect' for c in categ_cols]
categ_cols_vect

['workclassVect',
 'educationVect',
 'marital-statusVect',
 'occupationVect',
 'relationshipVect',
 'raceVect',
 'sexVect',
 'native-countryVect']

In [131]:
for o, n in zip(categ_cols,categ_cols_out):
    indexer = feature.StringIndexer(inputCol=o,outputCol=n)
    df_processed = indexer.fit(df_processed).transform(df_processed)
    
df_processed.printSchema()

root
 |-- age: integer (nullable = true)
 |-- workclass: string (nullable = true)
 |-- education: string (nullable = true)
 |-- education-num: double (nullable = true)
 |-- marital-status: string (nullable = true)
 |-- occupation: string (nullable = true)
 |-- relationship: string (nullable = true)
 |-- race: string (nullable = true)
 |-- sex: string (nullable = true)
 |-- capital-gain: double (nullable = true)
 |-- capital-loss: double (nullable = true)
 |-- hours-per-week: double (nullable = true)
 |-- native-country: string (nullable = true)
 |-- earnings: string (nullable = true)
 |-- label: double (nullable = false)
 |-- workclassIdx: double (nullable = false)
 |-- educationIdx: double (nullable = false)
 |-- marital-statusIdx: double (nullable = false)
 |-- occupationIdx: double (nullable = false)
 |-- relationshipIdx: double (nullable = false)
 |-- raceIdx: double (nullable = false)
 |-- sexIdx: double (nullable = false)
 |-- native-countryIdx: double (nullable = false)



In [132]:
one_hot = feature.OneHotEncoderEstimator(inputCols=categ_cols_out,outputCols=categ_cols_vect)
one_hot_model = one_hot.fit(df_processed)
df_processed = one_hot_model.transform(df_processed)

In [133]:
vectAssembler = feature.VectorAssembler(inputCols=num_cols + categ_cols_vect, outputCol = "featuresRaw")
df_processed = vectAssembler.transform(df_processed)
df_processed.printSchema()

root
 |-- age: integer (nullable = true)
 |-- workclass: string (nullable = true)
 |-- education: string (nullable = true)
 |-- education-num: double (nullable = true)
 |-- marital-status: string (nullable = true)
 |-- occupation: string (nullable = true)
 |-- relationship: string (nullable = true)
 |-- race: string (nullable = true)
 |-- sex: string (nullable = true)
 |-- capital-gain: double (nullable = true)
 |-- capital-loss: double (nullable = true)
 |-- hours-per-week: double (nullable = true)
 |-- native-country: string (nullable = true)
 |-- earnings: string (nullable = true)
 |-- label: double (nullable = false)
 |-- workclassIdx: double (nullable = false)
 |-- educationIdx: double (nullable = false)
 |-- marital-statusIdx: double (nullable = false)
 |-- occupationIdx: double (nullable = false)
 |-- relationshipIdx: double (nullable = false)
 |-- raceIdx: double (nullable = false)
 |-- sexIdx: double (nullable = false)
 |-- native-countryIdx: double (nullable = false)
 |-- sex

In [134]:
df_processed = df_processed.select('label','featuresRaw')

In [135]:
df_processed.show(truncate=False)

+-----+---------------------------------------------------------------------------------------------+
|label|featuresRaw                                                                                  |
+-----+---------------------------------------------------------------------------------------------+
|0.0  |(99,[0,1,2,4,9,15,29,37,49,53,57,58],[39.0,13.0,2174.0,40.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0])|
|0.0  |(99,[0,1,4,6,15,28,36,48,53,57,58],[50.0,13.0,13.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0])         |
|0.0  |(99,[0,1,4,5,13,30,43,49,53,57,58],[38.0,9.0,40.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0])          |
|0.0  |(99,[0,1,4,5,18,28,43,48,54,57,58],[53.0,7.0,40.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0])          |
|0.0  |(99,[0,1,4,5,15,28,34,52,54,67],[28.0,13.0,40.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0])                |
|0.0  |(99,[0,1,4,5,16,28,36,52,53,58],[37.0,14.0,40.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0])                |
|0.0  |(99,[0,1,4,5,23,33,39,49,54,69],[49.0,5.0,16.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0]

In [136]:
normalizerL2 = feature.Normalizer(inputCol='featuresRaw',outputCol='features')
df_processed = normalizerL2.transform(df_processed)

**Ostatnie przygotowania**

In [137]:
df_processed.select("label", "features")
df_processed.cache()

DataFrame[label: double, featuresRaw: vector, features: vector]

In [138]:
df_train, df_eval = df_processed.randomSplit([0.7, 0.3], 42)
df_train.cache()
df_eval.cache()

DataFrame[label: double, featuresRaw: vector, features: vector]

In [139]:
print("Train:")
df_train.groupBy("label").count().show()
print("Eval:")
df_eval.groupBy("label").count().show()

Train:
+-----+-----+
|label|count|
+-----+-----+
|  0.0|17325|
|  1.0| 5516|
+-----+-----+

Eval:
+-----+-----+
|label|count|
+-----+-----+
|  0.0| 7395|
|  1.0| 2325|
+-----+-----+



#### Regresja logistyczna

In [140]:
lr = classification.LogisticRegression(maxIter=1000)

In [141]:
lrModel = lr.fit(df_train)

In [142]:
lrModel.coefficients

DenseVector([3.8701, 22.8622, 3.8378, 3.2818, 3.9356, -413.8915, -448.0723, -422.1727, 443.9573, -440.4298, -403.2291, -384.543, -25200.0653, 110.9606, 112.229, 86.6474, 86.8704, 102.4181, 92.9056, 69.4976, 125.9191, 145.5306, 105.5334, 132.1441, 120.3408, 102.4612, 162.5475, 191.279, 9.4367, -178.9696, -157.7918, -160.4869, -156.7689, -155.3884, 473.421, 435.0839, 485.0137, 438.3334, 455.6402, 387.3625, 413.1016, -452.5795, 434.9833, 389.2688, 359.6863, 476.3198, 476.573, -10657.3172, 25.814, 93.1622, 21.9667, 81.7868, 130.3396, 17.8532, 3.627, 32.3728, -8.9033, 60.7767, -638.2647, -673.1128, -662.9303, -642.1916, -615.2671, -606.4367, -733.2528, -683.5895, -701.0137, -608.9073, -619.0003, -617.3822, -745.3588, -678.7885, -588.4091, -766.985, -695.1488, -660.9015, -608.4271, -632.8563, -844.9404, -654.752, -614.6835, -690.134, -630.426, -711.624, -702.6153, -647.8128, -757.6483, -642.273, -608.7775, -676.6878, -671.0851, -536.749, -698.123, -672.4827, -548.5812, -1294.543, -1027.7077,

In [143]:
lrModel.intercept

-3.3305016364302653

In [144]:
trainingSummary = lrModel.summary
type(trainingSummary)

pyspark.ml.classification.BinaryLogisticRegressionTrainingSummary

In [146]:
trainingSummary.roc.show(12)

+--------------------+-------------------+
|                 FPR|                TPR|
+--------------------+-------------------+
|                 0.0|                0.0|
|0.002366522366522...|0.03208846990572879|
|0.004329004329004329| 0.0654459753444525|
|0.006695526695526...|0.09934735315445975|
|0.010158730158730159| 0.1312545322697607|
|0.013852813852813853| 0.1646120377084844|
|0.018066378066378067|0.19398114575779551|
|0.023896103896103898|0.21773023930384336|
|0.030591630591630593|0.23477157360406092|
| 0.03544011544011544|0.25725163161711384|
| 0.03913419913419913|0.28263234227701234|
| 0.04242424242424243|0.30982596084118924|
+--------------------+-------------------+
only showing top 12 rows



In [164]:
import pandas as pd

In [160]:
roc = trainingSummary.roc.toPandas()
roc.head()

In [147]:
trainingSummary.pr.show(12)

+-------------------+------------------+
|             recall|         precision|
+-------------------+------------------+
|                0.0|0.8119266055045872|
|0.03208846990572879|0.8119266055045872|
| 0.0654459753444525|0.8279816513761468|
|0.09934735315445975|0.8253012048192772|
| 0.1312545322697607|0.8044444444444444|
| 0.1646120377084844|0.7909407665505227|
|0.19398114575779551|0.7736804049168474|
|0.21773023930384336|0.7436532507739938|
|0.23477157360406092|0.7095890410958904|
|0.25725163161711384|0.6979832759468766|
|0.28263234227701234|0.6969155118462226|
|0.30982596084118924|0.6992635024549918|
+-------------------+------------------+
only showing top 12 rows



In [148]:
trainingSummary.areaUnderROC

0.8859564619571872

In [149]:
trainingSummary.accuracy

0.8330633509916379

In [157]:
trainingSummary.predictions.show()

+-----+--------------------+--------------------+--------------------+--------------------+----------+
|label|         featuresRaw|            features|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------------+--------------------+--------------------+----------+
|  0.0|(99,[0,1,2,4,5,13...|(99,[0,1,2,4,5,13...|[-0.5326152931958...|[0.36990711925322...|       1.0|
|  0.0|(99,[0,1,2,4,5,13...|(99,[0,1,2,4,5,13...|[-0.5422916390315...|[0.36765464983348...|       1.0|
|  0.0|(99,[0,1,2,4,5,13...|(99,[0,1,2,4,5,13...|[-0.5369032760625...|[0.36890825276866...|       1.0|
|  0.0|(99,[0,1,2,4,5,13...|(99,[0,1,2,4,5,13...|[-0.5328495625693...|[0.36985251838594...|       1.0|
|  0.0|(99,[0,1,2,4,5,13...|(99,[0,1,2,4,5,13...|[-0.5301023238588...|[0.37049302293326...|       1.0|
|  0.0|(99,[0,1,2,4,5,13...|(99,[0,1,2,4,5,13...|[-0.5244270557529...|[0.37181762404802...|       1.0|
|  0.0|(99,[0,1,2,4,5,13...|(99,[0,1,2,4,5,13...|[-0.5457358633782...|[0.

In [158]:
# robienie predykcji
lrModel.transform(df_eval).show()

+-----+--------------------+--------------------+--------------------+--------------------+----------+
|label|         featuresRaw|            features|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------------+--------------------+--------------------+----------+
|  0.0|(99,[0,1,2,4,5,13...|(99,[0,1,2,4,5,13...|[-0.5550248183449...|[0.36469940040252...|       1.0|
|  0.0|(99,[0,1,2,4,5,13...|(99,[0,1,2,4,5,13...|[-0.5497673793233...|[0.36591838044796...|       1.0|
|  0.0|(99,[0,1,2,4,5,13...|(99,[0,1,2,4,5,13...|[-0.5146135736853...|[0.37411262046736...|       1.0|
|  0.0|(99,[0,1,2,4,5,13...|(99,[0,1,2,4,5,13...|[-0.5406064354740...|[0.36804652119683...|       1.0|
|  0.0|(99,[0,1,2,4,5,13...|(99,[0,1,2,4,5,13...|[-0.5359201849731...|[0.36913716055732...|       1.0|
|  0.0|(99,[0,1,2,4,5,13...|(99,[0,1,2,4,5,13...|[-0.5296713361219...|[0.37059354692584...|       1.0|
|  0.0|(99,[0,1,2,4,5,13...|(99,[0,1,2,4,5,13...|[-0.5399712276195...|[0.

#### SVM

In [165]:
svm = classification.LinearSVC(maxIter=1000)

In [166]:
svmModel = svm.fit(df_train)

In [167]:
svmModel.coefficients

DenseVector([0.5464, -2.25, 0.9865, 0.9352, 0.9094, -18.2998, -47.7782, -23.2682, -20.7003, -36.9507, -9.8468, -5.9827, -9115.8618, -32.3725, -13.4524, 41.5331, 51.4585, -1.0196, -56.5471, -6.5672, -60.3234, -71.1134, 71.3113, -59.8117, -45.0727, 80.8709, -77.5236, -71.6502, 9.2555, -100.7607, -82.9508, -86.004, -72.6694, -84.2539, 8.0431, -40.1744, 15.9848, -26.0134, -18.3245, -58.4196, -48.1061, -53.9702, -42.5604, -55.7703, -83.724, 16.2717, -13.3639, -10034.93, 2.1817, 8.9888, -4.9709, 2.0847, 42.7932, -17.6904, -20.6371, -10.4557, -44.5836, 23.5189, -30.139, -59.4584, -47.6346, -28.9485, -12.2711, 3.7366, -97.9839, -76.3819, -74.2153, -27.3692, -29.5461, -7.62, -141.7936, -57.6334, -6.0892, -129.0479, -62.9243, -35.3484, -29.9881, -34.882, -193.6455, -43.2119, -6.6556, -57.6113, -24.515, -80.2634, -70.3928, -29.9802, -105.3916, -38.16, -29.176, -42.8359, -61.0756, 20.4857, -59.2, -37.893, 45.2887, -42.9918, -4.4866, -62.678, -46.7934])

In [168]:
svmModel.intercept

0.012549090354290436

In [169]:
# robienie predykcji
svmModel.transform(df_eval).show()

+-----+--------------------+--------------------+--------------------+----------+
|label|         featuresRaw|            features|       rawPrediction|prediction|
+-----+--------------------+--------------------+--------------------+----------+
|  0.0|(99,[0,1,2,4,5,13...|(99,[0,1,2,4,5,13...|[-0.9975813959826...|       1.0|
|  0.0|(99,[0,1,2,4,5,13...|(99,[0,1,2,4,5,13...|[-0.9969645450623...|       1.0|
|  0.0|(99,[0,1,2,4,5,13...|(99,[0,1,2,4,5,13...|[-0.9774627603486...|       1.0|
|  0.0|(99,[0,1,2,4,5,13...|(99,[0,1,2,4,5,13...|[-0.9770307564409...|       1.0|
|  0.0|(99,[0,1,2,4,5,13...|(99,[0,1,2,4,5,13...|[-0.9863595140940...|       1.0|
|  0.0|(99,[0,1,2,4,5,13...|(99,[0,1,2,4,5,13...|[-0.9722553244195...|       1.0|
|  0.0|(99,[0,1,2,4,5,13...|(99,[0,1,2,4,5,13...|[-0.9827942816762...|       1.0|
|  0.0|(99,[0,1,2,4,5,13...|(99,[0,1,2,4,5,13...|[-0.9750923285196...|       1.0|
|  0.0|(99,[0,1,2,4,5,13...|(99,[0,1,2,4,5,13...|[-0.9989263543336...|       1.0|
|  0.0|(99,[0,1,

#### Drzewo decyzyjne

In [170]:
tree = classification.DecisionTreeClassifier()

In [171]:
treeModel = tree.fit(df_train)

In [172]:
treeModel.depth

5

In [173]:
treeModel.numNodes

51

In [174]:
print(treeModel.toDebugString)

DecisionTreeClassificationModel (uid=DecisionTreeClassifier_4edc941bea9e320cb377) of depth 5 with 51 nodes
  If (feature 28 <= 5.0000477491839864E-6)
   If (feature 2 <= 0.9999686642218393)
    If (feature 58 <= 0.017105892635409112)
     If (feature 36 <= 5.000048478205031E-6)
      If (feature 34 <= 5.000047925439046E-6)
       Predict: 0.0
      Else (feature 34 > 5.000047925439046E-6)
       Predict: 0.0
     Else (feature 36 > 5.000048478205031E-6)
      If (feature 16 <= 5.0000477491839864E-6)
       Predict: 0.0
      Else (feature 16 > 5.0000477491839864E-6)
       Predict: 0.0
    Else (feature 58 > 0.017105892635409112)
     If (feature 58 <= 0.01889653676089597)
      If (feature 1 <= 0.25946625563059866)
       Predict: 0.0
      Else (feature 1 > 0.25946625563059866)
       Predict: 0.0
     Else (feature 58 > 0.01889653676089597)
      If (feature 52 <= 5.000048809214692E-6)
       Predict: 0.0
      Else (feature 52 > 5.000048809214692E-6)
       Predict: 0.0
   Else (fe

In [175]:
# robienie predykcji
treeModel.transform(df_eval).show()

+-----+--------------------+--------------------+---------------+--------------------+----------+
|label|         featuresRaw|            features|  rawPrediction|         probability|prediction|
+-----+--------------------+--------------------+---------------+--------------------+----------+
|  0.0|(99,[0,1,2,4,5,13...|(99,[0,1,2,4,5,13...|  [291.0,778.0]|[0.27221702525724...|       1.0|
|  0.0|(99,[0,1,2,4,5,13...|(99,[0,1,2,4,5,13...|  [291.0,778.0]|[0.27221702525724...|       1.0|
|  0.0|(99,[0,1,2,4,5,13...|(99,[0,1,2,4,5,13...|[3312.0,1906.0]|[0.63472594863932...|       0.0|
|  0.0|(99,[0,1,2,4,5,13...|(99,[0,1,2,4,5,13...|[3312.0,1906.0]|[0.63472594863932...|       0.0|
|  0.0|(99,[0,1,2,4,5,13...|(99,[0,1,2,4,5,13...|[3312.0,1906.0]|[0.63472594863932...|       0.0|
|  0.0|(99,[0,1,2,4,5,13...|(99,[0,1,2,4,5,13...|[3312.0,1906.0]|[0.63472594863932...|       0.0|
|  0.0|(99,[0,1,2,4,5,13...|(99,[0,1,2,4,5,13...|[3312.0,1906.0]|[0.63472594863932...|       0.0|
|  0.0|(99,[0,1,2,4,

#### Las losowy

In [176]:
forest = classification.RandomForestClassifier()

In [177]:
forestModel = forest.fit(df_train)

In [178]:
forestModel.featureImportances

SparseVector(99, {0: 0.0631, 1: 0.0518, 2: 0.0895, 3: 0.0101, 4: 0.0791, 5: 0.0568, 6: 0.0004, 7: 0.0, 8: 0.001, 9: 0.0001, 10: 0.0001, 11: 0.0002, 13: 0.0045, 14: 0.0102, 15: 0.0424, 16: 0.0117, 17: 0.0002, 18: 0.0006, 19: 0.0, 20: 0.0, 22: 0.0066, 23: 0.0009, 25: 0.0036, 27: 0.0001, 28: 0.2122, 29: 0.0522, 30: 0.005, 31: 0.0003, 32: 0.0003, 33: 0.0002, 34: 0.0156, 35: 0.0009, 36: 0.0156, 37: 0.0009, 38: 0.0016, 39: 0.0007, 40: 0.0007, 41: 0.0003, 42: 0.0005, 43: 0.0002, 44: 0.0007, 45: 0.0, 46: 0.0001, 47: 0.0, 48: 0.0487, 49: 0.0583, 50: 0.0579, 51: 0.0051, 52: 0.0098, 53: 0.0345, 54: 0.001, 55: 0.0, 57: 0.0201, 58: 0.0224, 59: 0.0004, 61: 0.0001, 63: 0.0001, 68: 0.0001, 70: 0.0, 72: 0.0001, 77: 0.0, 79: 0.0001})

In [179]:
print(forestModel.toDebugString)

RandomForestClassificationModel (uid=RandomForestClassifier_4f9c8d0589e5bf86c865) with 20 trees
  Tree 0 (weight 1.0):
    If (feature 1 <= 0.008381356259882379)
     If (feature 53 <= 1.3701970608427918E-4)
      If (feature 4 <= 0.01011784535495797)
       If (feature 31 <= 1.310428424916332E-4)
        If (feature 58 <= 2.2797597737226661E-4)
         Predict: 1.0
        Else (feature 58 > 2.2797597737226661E-4)
         Predict: 0.0
       Else (feature 31 > 1.310428424916332E-4)
        Predict: 0.0
      Else (feature 4 > 0.01011784535495797)
       If (feature 61 <= 5.050827408727963E-4)
        If (feature 1 <= 0.00506405620468612)
         Predict: 0.0
        Else (feature 1 > 0.00506405620468612)
         Predict: 1.0
       Else (feature 61 > 5.050827408727963E-4)
        Predict: 1.0
     Else (feature 53 > 1.3701970608427918E-4)
      If (feature 1 <= 0.00506405620468612)
       If (feature 30 <= 5.000048490955403E-6)
        If (feature 29 <= 5.000048109194339E-6)
     

In [180]:
# robienie predykcji
forestModel.transform(df_eval).show()

+-----+--------------------+--------------------+--------------------+--------------------+----------+
|label|         featuresRaw|            features|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------------+--------------------+--------------------+----------+
|  0.0|(99,[0,1,2,4,5,13...|(99,[0,1,2,4,5,13...|[8.71612484810445...|[0.43580624240522...|       1.0|
|  0.0|(99,[0,1,2,4,5,13...|(99,[0,1,2,4,5,13...|[11.3447370731309...|[0.56723685365654...|       0.0|
|  0.0|(99,[0,1,2,4,5,13...|(99,[0,1,2,4,5,13...|[11.0264036521381...|[0.55132018260690...|       0.0|
|  0.0|(99,[0,1,2,4,5,13...|(99,[0,1,2,4,5,13...|[12.5339123728945...|[0.62669561864472...|       0.0|
|  0.0|(99,[0,1,2,4,5,13...|(99,[0,1,2,4,5,13...|[7.16539506575374...|[0.35826975328768...|       1.0|
|  0.0|(99,[0,1,2,4,5,13...|(99,[0,1,2,4,5,13...|[13.5000416785361...|[0.67500208392680...|       0.0|
|  0.0|(99,[0,1,2,4,5,13...|(99,[0,1,2,4,5,13...|[10.6340571708955...|[0.

#### Naiwny Bayes

In [181]:
bayes = classification.NaiveBayes()

In [182]:
bayesModel = bayes.fit(df_train)

In [183]:
# robienie predykcji
bayesModel.transform(df_eval).show()

+-----+--------------------+--------------------+--------------------+--------------------+----------+
|label|         featuresRaw|            features|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------------+--------------------+--------------------+----------+
|  0.0|(99,[0,1,2,4,5,13...|(99,[0,1,2,4,5,13...|[-3.9875780822150...|[0.35339109611658...|       1.0|
|  0.0|(99,[0,1,2,4,5,13...|(99,[0,1,2,4,5,13...|[-3.9880889064113...|[0.35331502703991...|       1.0|
|  0.0|(99,[0,1,2,4,5,13...|(99,[0,1,2,4,5,13...|[-3.9863076333199...|[0.35327313050075...|       1.0|
|  0.0|(99,[0,1,2,4,5,13...|(99,[0,1,2,4,5,13...|[-3.9984933850906...|[0.35372166489299...|       1.0|
|  0.0|(99,[0,1,2,4,5,13...|(99,[0,1,2,4,5,13...|[-3.9846414456152...|[0.35331132700155...|       1.0|
|  0.0|(99,[0,1,2,4,5,13...|(99,[0,1,2,4,5,13...|[-4.0013894679563...|[0.35381887987975...|       1.0|
|  0.0|(99,[0,1,2,4,5,13...|(99,[0,1,2,4,5,13...|[-3.9919795325544...|[0.

#### MLP

In [184]:
mlp = classification.MultilayerPerceptronClassifier(maxIter=1000, layers=[99,40,2])

In [185]:
mlpModel = mlp.fit(df_train)

In [186]:
mlpModel.layers

[99, 40, 2]

In [189]:
mlpModel.weights[:10]

array([-9.37342176,  5.24022958, -0.2048186 ,  1.90923399, 16.19743984,
       -4.04024183,  3.92450798,  9.87348124, -5.88395962,  4.69120601])

In [188]:
# robienie predykcji
mlpModel.transform(df_eval).show()

+-----+--------------------+--------------------+--------------------+--------------------+----------+
|label|         featuresRaw|            features|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------------+--------------------+--------------------+----------+
|  0.0|(99,[0,1,2,4,5,13...|(99,[0,1,2,4,5,13...|[-0.5811421300434...|[0.47972066233029...|       1.0|
|  0.0|(99,[0,1,2,4,5,13...|(99,[0,1,2,4,5,13...|[-0.7583820434175...|[0.38838489597416...|       1.0|
|  0.0|(99,[0,1,2,4,5,13...|(99,[0,1,2,4,5,13...|[-0.8488801900349...|[0.34240341682730...|       1.0|
|  0.0|(99,[0,1,2,4,5,13...|(99,[0,1,2,4,5,13...|[0.05131542428461...|[0.76615027382497...|       0.0|
|  0.0|(99,[0,1,2,4,5,13...|(99,[0,1,2,4,5,13...|[-0.7490751440012...|[0.39331208214607...|       1.0|
|  0.0|(99,[0,1,2,4,5,13...|(99,[0,1,2,4,5,13...|[0.14573002303140...|[0.79747595395275...|       0.0|
|  0.0|(99,[0,1,2,4,5,13...|(99,[0,1,2,4,5,13...|[-0.3183341342204...|[0.

#### Ewaluacja

In [190]:
from pyspark.ml import evaluation

In [191]:
evaluator = evaluation.BinaryClassificationEvaluator()

In [192]:
# AUC - regresja
evaluator.evaluate(lrModel.transform(df_eval))

0.8806548161718887

In [193]:
# AUC - SVM
evaluator.evaluate(svmModel.transform(df_eval))

0.8783956320384639

In [194]:
# AUC - drzewo decyzyjne
evaluator.evaluate(treeModel.transform(df_eval))

0.7414201109438955

In [195]:
# AUC - las losowy
evaluator.evaluate(forestModel.transform(df_eval))

0.8864271558085397

In [196]:
# AUC - NB
evaluator.evaluate(bayesModel.transform(df_eval))

0.6897198775691118

In [197]:
# AUC - MLP
evaluator.evaluate(mlpModel.transform(df_eval))

0.9006790406188291

> **ZADANIE:**
- napisz funkcję do obliczania `accuracy`
- oblicz `accuracy` powyższych modeli

In [217]:
def calculate_acc(df,label='label',predictions='prediction'):
    temp = df.select(f.when(df[label] == df[predictions],1).otherwise(0).alias('same'))
    return temp.select(f.sum('same') / f.count('same')).collect()[0][0]

In [218]:
calculate_acc(lrModel.transform(df_eval))

0.828395061728395

In [219]:
calculate_acc(svmModel.transform(df_eval))

0.8165637860082304

In [220]:
calculate_acc(treeModel.transform(df_eval))

0.8326131687242798

In [223]:
calculate_acc(forestModel.transform(df_eval))

0.81440329218107

In [221]:
calculate_acc(bayesModel.transform(df_eval))

0.779835390946502

In [222]:
calculate_acc(mlpModel.transform(df_eval))

0.8507201646090535

> **ZADANIE:**
- popraw `accuracy` dwóch modeli

In [231]:
tree2 = classification.DecisionTreeClassifier(maxDepth=20)
tree2Model = tree2.fit(df_eval)
calculate_acc(tree2Model.transform(df_eval))

0.9565843621399177

### Regresja

In [232]:
from pyspark.ml import regression

https://archive.ics.uci.edu/ml/datasets/wine+quality

In [233]:
wine_red = spark.read.csv("./winequality-red.csv", header=True, inferSchema=True, sep=";") \
.withColumn("type", f.lit(0))
wine_white = spark.read.csv("./winequality-white.csv", header=True, inferSchema=True, sep=";") \
.withColumn("type", f.lit(1))

In [234]:
wine = wine_red.union(wine_white)

In [235]:
wine.printSchema()

root
 |-- fixed acidity: double (nullable = true)
 |-- volatile acidity: double (nullable = true)
 |-- citric acid: double (nullable = true)
 |-- residual sugar: double (nullable = true)
 |-- chlorides: double (nullable = true)
 |-- free sulfur dioxide: double (nullable = true)
 |-- total sulfur dioxide: double (nullable = true)
 |-- density: double (nullable = true)
 |-- pH: double (nullable = true)
 |-- sulphates: double (nullable = true)
 |-- alcohol: double (nullable = true)
 |-- quality: integer (nullable = true)
 |-- type: integer (nullable = false)



> **ZADANIE:**
- przygotuj dane
- usuń wiersze zawierające braki danych
- zmień nazwę kolumny `quality` na `label`
- z pozostałych zmiennych stwórz (znormalizowaną -> L2) kolumnę `features` zawierającą wektory
- wynikowemu DataFrameowi nadaj nazwę `wine_processed`

In [237]:
wine_proc = wine.dropna()

In [238]:
wine_proc = wine_proc.withColumnRenamed('quality','label')

In [239]:
to_features = [x for x in wine_proc.columns if x!='label']

In [240]:
to_features

['fixed acidity',
 'volatile acidity',
 'citric acid',
 'residual sugar',
 'chlorides',
 'free sulfur dioxide',
 'total sulfur dioxide',
 'density',
 'pH',
 'sulphates',
 'alcohol',
 'type']

In [241]:
vectAssembler = feature.VectorAssembler(inputCols=to_features,outputCol='featuresRaw')
wine_proc = vectAssembler.transform(wine_proc)

In [242]:
wine_proc.printSchema()

root
 |-- fixed acidity: double (nullable = true)
 |-- volatile acidity: double (nullable = true)
 |-- citric acid: double (nullable = true)
 |-- residual sugar: double (nullable = true)
 |-- chlorides: double (nullable = true)
 |-- free sulfur dioxide: double (nullable = true)
 |-- total sulfur dioxide: double (nullable = true)
 |-- density: double (nullable = true)
 |-- pH: double (nullable = true)
 |-- sulphates: double (nullable = true)
 |-- alcohol: double (nullable = true)
 |-- label: integer (nullable = true)
 |-- type: integer (nullable = false)
 |-- featuresRaw: vector (nullable = true)



In [243]:
wine_proc = wine_proc.select('label','featuresRaw')

In [245]:
wine_proc.show(truncate=False)

+-----+-------------------------------------------------------------+
|label|featuresRaw                                                  |
+-----+-------------------------------------------------------------+
|5    |[7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,0.0]   |
|5    |[7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,0.0]   |
|5    |[7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,0.0]  |
|6    |[11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,0.0] |
|5    |[7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,0.0]   |
|5    |[7.4,0.66,0.0,1.8,0.075,13.0,40.0,0.9978,3.51,0.56,9.4,0.0]  |
|5    |[7.9,0.6,0.06,1.6,0.069,15.0,59.0,0.9964,3.3,0.46,9.4,0.0]   |
|7    |[7.3,0.65,0.0,1.2,0.065,15.0,21.0,0.9946,3.39,0.47,10.0,0.0] |
|7    |[7.8,0.58,0.02,2.0,0.073,9.0,18.0,0.9968,3.36,0.57,9.5,0.0]  |
|5    |[7.5,0.5,0.36,6.1,0.071,17.0,102.0,0.9978,3.35,0.8,10.5,0.0] |
|5    |[6.7,0.58,0.08,1.8,0.097,15.0,65.0,0.9959,3.28,0.54,9.2,0.0] |
|5    |[7.5,0.5,0.36

In [246]:
normalizerL2 = feature.Normalizer(inputCol='featuresRaw',outputCol='features')
wine_proc = normalizerL2.transform(wine_proc)

**Ostatnie przygotowania**

In [248]:
wine_proc = wine_proc.select("label", "features")
wine_proc.cache()

DataFrame[label: int, features: vector]

In [249]:
wine_train, wine_eval = wine_proc.randomSplit([0.7, 0.3], 42)
wine_train.cache()
wine_eval.cache()

DataFrame[label: int, features: vector]

In [250]:
print("Train:")
wine_train.describe("label").show()
print("Eval:")
wine_eval.describe("label").show()

Train:
+-------+------------------+
|summary|             label|
+-------+------------------+
|  count|              4517|
|   mean| 5.823555457161833|
| stddev|0.8707903543967717|
|    min|                 3|
|    max|                 9|
+-------+------------------+

Eval:
+-------+------------------+
|summary|             label|
+-------+------------------+
|  count|              1980|
|   mean| 5.806565656565657|
| stddev|0.8789592145780594|
|    min|                 3|
|    max|                 9|
+-------+------------------+



#### Regresja liniowa

In [251]:
reg = regression.LinearRegression(maxIter=500)

In [252]:
regModel = reg.fit(wine_train)

In [253]:
regModel.coefficients

DenseVector([3.8418, -56.3036, -24.2871, 0.9227, -8.3934, 1.8789, 2.2861, -216.6933, 11.7471, 35.7788, 15.7523, 14.5111])

In [254]:
regModel.intercept

2.9791554291006177

In [255]:
trainSummary = regModel.summary
type(trainSummary)

pyspark.ml.regression.LinearRegressionTrainingSummary

In [256]:
trainSummary.meanAbsoluteError

0.6070159588661677

In [257]:
trainSummary.meanSquaredError

0.6100217545957918

In [258]:
trainSummary.r2

0.1953365761377699

In [259]:
# predykcje
regModel.transform(wine_eval).show()

+-----+--------------------+------------------+
|label|            features|        prediction|
+-----+--------------------+------------------+
|    3|[0.17139301481654...| 6.091566856245242|
|    3|[0.19204691859334...| 4.999511709938429|
|    3|[0.23007240776161...|  5.10141394260246|
|    4|[0.06855000973539...| 6.061041407353274|
|    4|[0.10919117310000...| 5.199518645603752|
|    4|[0.17294433939616...| 5.298011408448441|
|    4|[0.17939070456973...| 5.018712122759601|
|    4|[0.18015969919215...| 6.116064734248514|
|    4|[0.21575017067129...| 5.841256084907588|
|    4|[0.22889809493949...|4.7550460461408735|
|    4|[0.37968825800476...| 4.525409230081105|
|    4|[0.38380547348192...| 5.378271079214495|
|    4|[0.40082064094380...| 5.102049588099196|
|    4|[0.42005797449029...| 4.157118623588285|
|    4|[0.44196806195907...| 3.885127237056893|
|    5|[0.04013119353917...| 5.787728679803472|
|    5|[0.04641692699571...| 5.793069088194853|
|    5|[0.05155454668014...| 5.452729064

#### Drzewo regresyjne

In [262]:
tree_reg = regression.DecisionTreeRegressor()

In [263]:
tree_regModel = tree_reg.fit(wine_train)

In [264]:
print(tree_regModel.toDebugString)

DecisionTreeRegressionModel (uid=DecisionTreeRegressor_436b9ad98cb5bc744f87) of depth 5 with 63 nodes
  If (feature 11 <= 0.005981100769664005)
   If (feature 2 <= 0.01038735168085701)
    If (feature 1 <= 0.0012625764093651968)
     If (feature 0 <= 0.029903597626480302)
      If (feature 9 <= 0.0036444818927591762)
       Predict: 5.484848484848484
      Else (feature 9 > 0.0036444818927591762)
       Predict: 7.5
     Else (feature 0 > 0.029903597626480302)
      If (feature 10 <= 0.042667593985501885)
       Predict: 6.580645161290323
      Else (feature 10 > 0.042667593985501885)
       Predict: 5.983539094650205
    Else (feature 1 > 0.0012625764093651968)
     If (feature 5 <= 0.3844948325343825)
      If (feature 4 <= 2.0048726168222158E-4)
       Predict: 5.849056603773585
      Else (feature 4 > 2.0048726168222158E-4)
       Predict: 5.375444839857651
     Else (feature 5 > 0.3844948325343825)
      If (feature 2 <= 0.005587119265081934)
       Predict: 5.623809523809523
    

In [265]:
# predykcje
tree_regModel.transform(wine_eval).show()

+-----+--------------------+-----------------+
|label|            features|       prediction|
+-----+--------------------+-----------------+
|    3|[0.17139301481654...|6.190476190476191|
|    3|[0.19204691859334...|5.623809523809523|
|    3|[0.23007240776161...|              5.2|
|    4|[0.06855000973539...|5.375444839857651|
|    4|[0.10919117310000...|5.375444839857651|
|    4|[0.17294433939616...|5.375444839857651|
|    4|[0.17939070456973...|5.375444839857651|
|    4|[0.18015969919215...|6.190476190476191|
|    4|[0.21575017067129...|6.190476190476191|
|    4|[0.22889809493949...|5.375444839857651|
|    4|[0.37968825800476...|5.375444839857651|
|    4|[0.38380547348192...|5.764705882352941|
|    4|[0.40082064094380...|5.375444839857651|
|    4|[0.42005797449029...|5.375444839857651|
|    4|[0.44196806195907...|5.973684210526316|
|    5|[0.04013119353917...|5.623809523809523|
|    5|[0.04641692699571...|5.623809523809523|
|    5|[0.05155454668014...|5.375444839857651|
|    5|[0.053

#### Las regresyjny

In [266]:
forest_reg = regression.RandomForestRegressor()

In [267]:
forest_regModel = forest_reg.fit(wine_train)

In [268]:
forest_regModel.featureImportances

SparseVector(12, {0: 0.0374, 1: 0.1596, 2: 0.0975, 3: 0.0387, 4: 0.1463, 5: 0.1061, 6: 0.0881, 7: 0.031, 8: 0.0365, 9: 0.0296, 10: 0.1022, 11: 0.127})

In [269]:
print(forest_regModel.toDebugString)

RandomForestRegressionModel (uid=RandomForestRegressor_4abbb37429e7d0be6a74) with 20 trees
  Tree 0 (weight 1.0):
    If (feature 4 <= 4.5712570706627864E-4)
     If (feature 7 <= 0.0061386172083658114)
      If (feature 1 <= 0.0012625764093651968)
       If (feature 7 <= 0.004499396722741482)
        If (feature 11 <= 9.495152315151297E-4)
         Predict: 7.0
        Else (feature 11 > 9.495152315151297E-4)
         Predict: 5.551020408163265
       Else (feature 7 > 0.004499396722741482)
        If (feature 11 <= 0.004552962709769541)
         Predict: 6.769230769230769
        Else (feature 11 > 0.004552962709769541)
         Predict: 6.016949152542373
      Else (feature 1 > 0.0012625764093651968)
       If (feature 8 <= 0.0204315793657856)
        If (feature 1 <= 0.00309988206516071)
         Predict: 5.5200594353640415
        Else (feature 1 > 0.00309988206516071)
         Predict: 4.583333333333333
       Else (feature 8 > 0.0204315793657856)
        If (feature 0 <= 0.03749

In [270]:
# predykcje
forest_regModel.transform(wine_eval).show()

+-----+--------------------+------------------+
|label|            features|        prediction|
+-----+--------------------+------------------+
|    3|[0.17139301481654...|  5.87573522854329|
|    3|[0.19204691859334...| 5.573961877470248|
|    3|[0.23007240776161...| 5.643094034704598|
|    4|[0.06855000973539...|  5.54376955856807|
|    4|[0.10919117310000...| 5.336567183974165|
|    4|[0.17294433939616...| 5.601295221610741|
|    4|[0.17939070456973...| 5.570312375716478|
|    4|[0.18015969919215...|  5.87573522854329|
|    4|[0.21575017067129...| 5.882569123955524|
|    4|[0.22889809493949...| 5.606756612638398|
|    4|[0.37968825800476...| 5.418331137587314|
|    4|[0.38380547348192...| 5.859575702446252|
|    4|[0.40082064094380...| 5.340488708043235|
|    4|[0.42005797449029...| 5.168782279392363|
|    4|[0.44196806195907...| 5.609951964410016|
|    5|[0.04013119353917...| 5.862683710805095|
|    5|[0.04641692699571...|  5.41776605502368|
|    5|[0.05155454668014...|5.3081961471

#### Ewaluacja

In [275]:
evaluator_reg = evaluation.RegressionEvaluator()

In [272]:
# rmse - regresja
evaluator_reg.evaluate(regModel.transform(wine_eval))

0.8018337270809242

In [273]:
# rmse - drzewo
evaluator_reg.evaluate(tree_regModel.transform(wine_eval))

0.7989242424919465

In [274]:
# rmse - las
evaluator_reg.evaluate(forest_regModel.transform(wine_eval))

0.7691585204568716

> **ZADANIE:**
- oblicz `MSE` oraz `R^2` powyższych modeli

In [278]:
eval_mse = evaluation.RegressionEvaluator(metricName='mse')

In [279]:
eval_mse.evaluate(regModel.transform(wine_eval))

0.642937325884486

In [282]:
eval_r2 = evaluation.RegressionEvaluator(metricName='r2')

In [283]:
eval_r2.evaluate(regModel.transform(wine_eval))

0.16737281038223728

In [284]:
eval_r2.evaluate(tree_regModel.transform(wine_eval))

0.17340428747925596

In [286]:
eval_r2.evaluate(forest_regModel.transform(wine_eval))

0.23385025756696087

> **ZADANIE:**
- popraw `R^2` jednego modelu

In [288]:
forestreg2 = regression.RandomForestRegressor(maxDepth=10,numTrees=50)
forestModel = forestreg2.fit(wine_eval)

In [289]:
eval_r2.evaluate(forestModel.transform(wine_eval))

0.7020642261059933

### Wybór najlepszych parametrów

In [290]:
from pyspark.ml import tuning

In [291]:
reg2 = regression.LinearRegression()

In [292]:
grid = tuning.ParamGridBuilder() \
.addGrid(reg2.maxIter, [100, 500, 1000]) \
.addGrid(reg2.regParam, [0.0, 0.1, 0.2]).build()

In [293]:
reg_eval = evaluation.RegressionEvaluator()

In [294]:
cv = tuning.CrossValidator(estimator=reg2, estimatorParamMaps=grid, evaluator=reg_eval, parallelism=2)

In [295]:
cvModel = cv.fit(wine_train)

In [323]:
paramRef = cvModel.bestModel.getParam('regParam')

In [325]:
cvModel.bestModel.getOrDefault(paramRef)

0.0

In [296]:
cvModel.avgMetrics

[0.7848730693485754,
 0.8170377287842591,
 0.8242241130428464,
 0.7848730693485754,
 0.8170377287842591,
 0.8242241130428464,
 0.7848730693485754,
 0.8170377287842591,
 0.8242241130428464]

In [297]:
reg_eval.evaluate(cvModel.transform(wine_eval))

0.8018337270809242

### Pipeline

- Transformer - algorytm przekształcający wejściowy DataFrame w inny DataFrame, np. model tworzący nowy DF zawierający predykcje (transform)
- Estymator - algorytm który na podstawie DataFrameu tworzy transformer (fit)

> Przykład:

>**forest_reg = regression.RandomForestRegressor()**    <- *utworzenie estymatora*

>**forest_regModel = forest_reg.fit(wine_train)**   <- *wywołanie metody `fit` estymatora w celu utworzenia transformera*

>**forest_regModel.transform(wine_eval).show()**  <- *wywołanie metody `transform` transformera w celu wygenerowania nowego DF*

- Pipeline - szeregowe połączenie transformerów i estymatorów w celu utworzenia przepływu (workflow)

In [312]:
from pyspark.ml import Pipeline

In [313]:
# przygotowanie estymatorów/transformerów
cols = [x for x in wine_red.columns if x != "quality"]
vectA = feature.VectorAssembler(inputCols = cols, outputCol = "featuresRaw") # transformer
normL2 = feature.Normalizer(inputCol="featuresRaw", outputCol="features") # estymator
forestReg = regression.RandomForestRegressor(labelCol="quality") # estymator

In [316]:
cols

['fixed acidity',
 'volatile acidity',
 'citric acid',
 'residual sugar',
 'chlorides',
 'free sulfur dioxide',
 'total sulfur dioxide',
 'density',
 'pH',
 'sulphates',
 'alcohol',
 'type']

In [314]:
# utworzenie estymatora
pipeline = Pipeline(stages=[vectA, normL2, forestReg])

In [315]:
# utworzenie transformera
pipelineModel = pipeline.fit(wine_red)

In [317]:
# transformacja (predykcja)
pipelineModel.transform(wine_white).drop(*cols).show()

+-------+--------------------+--------------------+------------------+
|quality|         featuresRaw|            features|        prediction|
+-------+--------------------+--------------------+------------------+
|      6|[7.0,0.27,0.36,20...|[0.03944595512349...| 5.431161966795079|
|      6|[6.3,0.3,0.34,1.6...|[0.04726589062278...| 5.325026738315898|
|      6|[8.1,0.28,0.4,6.9...|[0.07890794092680...| 5.645205586036605|
|      6|[7.2,0.23,0.32,8....|[0.03741101012377...| 5.439273599554599|
|      6|[7.2,0.23,0.32,8....|[0.03741101012377...| 5.439273599554599|
|      6|[8.1,0.28,0.4,6.9...|[0.07890794092680...| 5.645205586036605|
|      6|[6.2,0.32,0.16,7....|[0.04429921726352...| 5.439273599554599|
|      6|[7.0,0.27,0.36,20...|[0.03944595512349...| 5.431161966795079|
|      6|[6.3,0.3,0.34,1.6...|[0.04726589062278...| 5.325026738315898|
|      6|[8.1,0.22,0.43,1....|[0.06100986750292...|5.4239064296019714|
|      5|[8.1,0.27,0.41,1....|[0.12333482598102...| 5.775792588615204|
|     

### Pipeline + wybór najlepszych parametrów

In [326]:
cols = [x for x in wine_red.columns if x != "quality"]
vectA = feature.VectorAssembler()#inputCols = cols, outputCol = "featuresRaw")
norm = feature.Normalizer()#inputCol="featuresRaw", outputCol="features")
forestReg = regression.RandomForestRegressor()
pipe = Pipeline(stages=[vectA, norm, forestReg])

In [327]:
# baseOn - sposob podawania stalego parametru do wszystkich kombinacji
paramGrid = tuning.ParamGridBuilder() \
.baseOn([vectA.inputCols, cols]) \#podanie do grida stałych parametrów, ale można też przy inicjalizacji metod
.baseOn([vectA.outputCol, 'featuresRaw']) \
.baseOn([norm.inputCol, 'featuresRaw']) \
.baseOn([norm.outputCol, 'features']) \
.addGrid(norm.p, [1.0, 2.0]) \
.baseOn([forestReg.labelCol, 'quality']) \
.addGrid(forestReg.maxDepth, [5, 6, 7, 8]) \
.build()

In [328]:
regr_eval = evaluation.RegressionEvaluator(labelCol= "quality")

In [329]:
# utworzenie estymatora
crossval = tuning.CrossValidator(estimator=pipe,
                                 estimatorParamMaps=paramGrid,
                                 evaluator=regr_eval,
                                 numFolds=4, 
                                 parallelism=2)

In [330]:
# utworzenie transformera
crossvalModel = crossval.fit(wine_red)

In [331]:
# transformacja (predykcja)
crossvalModel.transform(wine_white).drop(*cols).show()

+-------+--------------------+--------------------+-----------------+
|quality|         featuresRaw|            features|       prediction|
+-------+--------------------+--------------------+-----------------+
|      6|[7.0,0.27,0.36,20...|[0.02717117061166...|5.492129629629629|
|      6|[6.3,0.3,0.34,1.6...|[0.03708652934839...|6.156203703703704|
|      6|[8.1,0.28,0.4,6.9...|[0.05109600940166...|5.794769962431433|
|      6|[7.2,0.23,0.32,8....|[0.02719098951032...|5.517750257997935|
|      6|[7.2,0.23,0.32,8....|[0.02719098951032...|5.517750257997935|
|      6|[8.1,0.28,0.4,6.9...|[0.05109600940166...|5.794769962431433|
|      6|[6.2,0.32,0.16,7....|[0.03179978037635...|5.374074074074074|
|      6|[7.0,0.27,0.36,20...|[0.02717117061166...|5.492129629629629|
|      6|[6.3,0.3,0.34,1.6...|[0.03708652934839...|6.156203703703704|
|      6|[8.1,0.22,0.43,1....|[0.04403183773669...|5.719166666666666|
|      5|[8.1,0.27,0.41,1....|[0.07956480995797...|5.821956361162924|
|      5|[8.6,0.23,0

In [332]:
regr_eval.evaluate(crossvalModel.transform(wine_white))

0.9329579422389

> **ZADANIE:**
- stwórz model jak najlepiej przewidujący liczbę pierścieni (wiek) mięczaków
- do problemu można podejść jak do regresji lub jak do klasyfikacji
- wszystkie chwyty dozwolone

http://archive.ics.uci.edu/ml/datasets/Abalone

In [337]:
colNames = ["Sex", "Length", "Diameter", "Height", "Whole_weight", "Shucked_weight", 
            "Viscera_weight", "Shell_weight", "Rings"]

In [338]:
abalone = spark.read.csv("./abalone.data", header=False, inferSchema=True)

In [339]:
abalone = abalone.select(*[f.col(old).alias(new) for old, new in zip(abalone.columns, colNames)])

In [340]:
abalone.show()

+---+------+--------+------+------------+--------------+--------------+------------+-----+
|Sex|Length|Diameter|Height|Whole_weight|Shucked_weight|Viscera_weight|Shell_weight|Rings|
+---+------+--------+------+------------+--------------+--------------+------------+-----+
|  M| 0.455|   0.365| 0.095|       0.514|        0.2245|         0.101|        0.15|   15|
|  M|  0.35|   0.265|  0.09|      0.2255|        0.0995|        0.0485|        0.07|    7|
|  F|  0.53|    0.42| 0.135|       0.677|        0.2565|        0.1415|        0.21|    9|
|  M|  0.44|   0.365| 0.125|       0.516|        0.2155|         0.114|       0.155|   10|
|  I|  0.33|   0.255|  0.08|       0.205|        0.0895|        0.0395|       0.055|    7|
|  I| 0.425|     0.3| 0.095|      0.3515|         0.141|        0.0775|        0.12|    8|
|  F|  0.53|   0.415|  0.15|      0.7775|         0.237|        0.1415|        0.33|   20|
|  F| 0.545|   0.425| 0.125|       0.768|         0.294|        0.1495|        0.26|   16|

In [453]:
abalone = abalone.dropna()

In [454]:
# code string values
indexer = feature.StringIndexer(inputCol='Sex', outputCol='Sexidx')
idx_model = indexer.fit(abalone)
abalone_proc = idx_model.transform(abalone)

In [455]:
# set label column
abalone_proc = abalone_proc.withColumnRenamed('Rings','label')

In [456]:
abalone_proc.show()

+---+------+--------+------+------------+--------------+--------------+------------+-----+------+
|Sex|Length|Diameter|Height|Whole_weight|Shucked_weight|Viscera_weight|Shell_weight|label|Sexidx|
+---+------+--------+------+------------+--------------+--------------+------------+-----+------+
|  M| 0.455|   0.365| 0.095|       0.514|        0.2245|         0.101|        0.15|   15|   0.0|
|  M|  0.35|   0.265|  0.09|      0.2255|        0.0995|        0.0485|        0.07|    7|   0.0|
|  F|  0.53|    0.42| 0.135|       0.677|        0.2565|        0.1415|        0.21|    9|   2.0|
|  M|  0.44|   0.365| 0.125|       0.516|        0.2155|         0.114|       0.155|   10|   0.0|
|  I|  0.33|   0.255|  0.08|       0.205|        0.0895|        0.0395|       0.055|    7|   1.0|
|  I| 0.425|     0.3| 0.095|      0.3515|         0.141|        0.0775|        0.12|    8|   1.0|
|  F|  0.53|   0.415|  0.15|      0.7775|         0.237|        0.1415|        0.33|   20|   2.0|
|  F| 0.545|   0.425

In [457]:
# one hot encoding for categorical column
one_hot = feature.OneHotEncoderEstimator(inputCols=['Sexidx'],outputCols=['sex_vect'])

In [458]:
abalone_proc = one_hot.fit(abalone_proc).transform(abalone_proc)

In [459]:
abalone_proc.show()

+---+------+--------+------+------------+--------------+--------------+------------+-----+------+-------------+
|Sex|Length|Diameter|Height|Whole_weight|Shucked_weight|Viscera_weight|Shell_weight|label|Sexidx|     sex_vect|
+---+------+--------+------+------------+--------------+--------------+------------+-----+------+-------------+
|  M| 0.455|   0.365| 0.095|       0.514|        0.2245|         0.101|        0.15|   15|   0.0|(2,[0],[1.0])|
|  M|  0.35|   0.265|  0.09|      0.2255|        0.0995|        0.0485|        0.07|    7|   0.0|(2,[0],[1.0])|
|  F|  0.53|    0.42| 0.135|       0.677|        0.2565|        0.1415|        0.21|    9|   2.0|    (2,[],[])|
|  M|  0.44|   0.365| 0.125|       0.516|        0.2155|         0.114|       0.155|   10|   0.0|(2,[0],[1.0])|
|  I|  0.33|   0.255|  0.08|       0.205|        0.0895|        0.0395|       0.055|    7|   1.0|(2,[1],[1.0])|
|  I| 0.425|     0.3| 0.095|      0.3515|         0.141|        0.0775|        0.12|    8|   1.0|(2,[1],

In [460]:
abalone.printSchema()

root
 |-- Sex: string (nullable = true)
 |-- Length: double (nullable = true)
 |-- Diameter: double (nullable = true)
 |-- Height: double (nullable = true)
 |-- Whole_weight: double (nullable = true)
 |-- Shucked_weight: double (nullable = true)
 |-- Viscera_weight: double (nullable = true)
 |-- Shell_weight: double (nullable = true)
 |-- Rings: integer (nullable = true)



In [461]:
abalone.dtypes

[('Sex', 'string'),
 ('Length', 'double'),
 ('Diameter', 'double'),
 ('Height', 'double'),
 ('Whole_weight', 'double'),
 ('Shucked_weight', 'double'),
 ('Viscera_weight', 'double'),
 ('Shell_weight', 'double'),
 ('Rings', 'int')]

In [462]:
num_cols = [c for c,t in abalone.dtypes if t != 'string' and c != 'Rings']

In [463]:
num_cols

['Length',
 'Diameter',
 'Height',
 'Whole_weight',
 'Shucked_weight',
 'Viscera_weight',
 'Shell_weight']

In [464]:
#create vectors with all variables
vectAssembler = feature.VectorAssembler(inputCols=num_cols, outputCol = "featuresRaw")
abalone_proc = vectAssembler.transform(abalone_proc)
abalone_proc.printSchema()

root
 |-- Sex: string (nullable = true)
 |-- Length: double (nullable = true)
 |-- Diameter: double (nullable = true)
 |-- Height: double (nullable = true)
 |-- Whole_weight: double (nullable = true)
 |-- Shucked_weight: double (nullable = true)
 |-- Viscera_weight: double (nullable = true)
 |-- Shell_weight: double (nullable = true)
 |-- label: integer (nullable = true)
 |-- Sexidx: double (nullable = false)
 |-- sex_vect: vector (nullable = true)
 |-- featuresRaw: vector (nullable = true)



In [466]:
scaler = feature.StandardScaler(inputCol='featuresRaw',outputCol='featuresRaw2')
scalerModel = scaler.fit(abalone_proc)

In [467]:
abalone_proc = scalerModel.transform(abalone_proc)

In [468]:
vectAssembler2 = feature.VectorAssembler(inputCols=['featuresRaw2','sex_vect'],outputCol='features')
abalone_proc = vectAssembler2.transform(abalone_proc)

In [469]:
abalone_proc.printSchema()

root
 |-- Sex: string (nullable = true)
 |-- Length: double (nullable = true)
 |-- Diameter: double (nullable = true)
 |-- Height: double (nullable = true)
 |-- Whole_weight: double (nullable = true)
 |-- Shucked_weight: double (nullable = true)
 |-- Viscera_weight: double (nullable = true)
 |-- Shell_weight: double (nullable = true)
 |-- label: integer (nullable = true)
 |-- Sexidx: double (nullable = false)
 |-- sex_vect: vector (nullable = true)
 |-- featuresRaw: vector (nullable = true)
 |-- featuresRaw2: vector (nullable = true)
 |-- features: vector (nullable = true)



In [470]:
abalone_proc = abalone_proc.select("label", "features")
abalone_proc.cache()

DataFrame[label: int, features: vector]

In [471]:
abalone_train, abalone_eval = abalone_proc.randomSplit([0.7, 0.3], 42)
abalone_train.cache()
abalone_eval.cache()

DataFrame[label: int, features: vector]

In [472]:
reg = regression.LinearRegression(maxIter=500)

In [473]:
regModel = reg.fit(abalone_train)

In [474]:
regModel.transform(abalone_train)

DataFrame[label: int, features: vector, prediction: double]

In [475]:
trainSummary = regModel.summary

In [476]:
trainSummary.r2

0.5402893599826477

In [477]:
trainSummary.meanSquaredError

4.727508285144403

In [478]:
regforest = regression.RandomForestRegressor(maxDepth=10,numTrees=30)
forestModel = regforest.fit(abalone_train)

In [480]:
forestModel.transform(abalone_train).show()

+-----+--------------------+-----------------+
|label|            features|       prediction|
+-----+--------------------+-----------------+
|    1|[0.62451645478688...|4.363302147053985|
|    3|[1.16576404893552...|4.363302147053985|
|    3|[1.29066733989289...|4.552328544469287|
|    3|[1.33230177021202...|4.363302147053985|
|    3|[1.37393620053115...|4.363302147053985|
|    3|[1.37393620053115...|4.363302147053985|
|    3|[1.49883949148852...|4.552328544469287|
|    3|[1.49883949148852...|4.363302147053985|
|    3|[1.58210835212677...|4.363302147053985|
|    3|[1.62374278244590...|4.363302147053985|
|    3|[1.74864607340328...|4.552328544469287|
|    3|[1.79028050372240...|4.363302147053985|
|    4|[1.08249518829726...|4.363302147053985|
|    4|[1.12412961861639...|4.363302147053985|
|    4|[1.16576404893552...|4.363302147053985|
|    4|[1.29066733989289...|4.363302147053985|
|    4|[1.33230177021202...|4.363302147053985|
|    4|[1.33230177021202...|4.363302147053985|
|    4|[1.373