# 1-1 성적 데이터로 DataFrame 생성

In [3]:
# Raw grade records: each entry is a single "name, subject, score" string,
# laid out here with one student per line.
marks = [
    "김하나, English, 100", "김하나, Math, 80",
    "임하나, English, 70", "임하나, Math, 100",
    "김갑돌, English, 82.3", "김갑돌, Math, 98.5",
]

In [4]:
import pyspark

# Default Spark configuration; nothing is overridden for this assignment.
myConf = pyspark.SparkConf()

# Build the session for the notebook. getOrCreate() hands back an already
# running session when there is one instead of starting a second one.
spark = (
    pyspark.sql.SparkSession.builder
    .master('local')               # run Spark locally on this machine
    .config(conf=myConf)
    .appName('AssignmentWeek7')
    .getOrCreate()
)

21/10/17 19:41:36 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [5]:
from pyspark.sql import Row

def _toScoreRow(line):
    """Parse one raw "name, subject, score" record into a Row.

    Every comma-separated field is stripped of surrounding whitespace.
    The raw records put a space after each comma, so without the strip the
    Subject value would be stored as " English" rather than "English" and
    an exact-match filter or group-by on the subject would misbehave.

    Args:
        line: A string holding at least three comma-separated fields.

    Returns:
        Row(Name=str, Subject=str, Score=float).

    Raises:
        IndexError: If the record has fewer than three fields.
        ValueError: If the third field is not a valid number.
    """
    fields = [field.strip() for field in line.split(',')]
    return Row(Name=fields[0], Subject=fields[1], Score=float(fields[2]))


# Distribute the raw records and parse each one in a single pass.
scoreRdd = spark.sparkContext.parallelize(marks).map(_toScoreRow)

# The schema (string, string, double) is inferred from the Row objects.
scoreDf = spark.createDataFrame(scoreRdd)
scoreDf.printSchema()
scoreDf.show()

                                                                                

root
 |-- Name: string (nullable = true)
 |-- Subject: string (nullable = true)
 |-- Score: double (nullable = true)

+------+--------+-----+
|  Name| Subject|Score|
+------+--------+-----+
|김하나| English|100.0|
|김하나|    Math| 80.0|
|임하나| English| 70.0|
|임하나|    Math|100.0|
|김갑돌| English| 82.3|
|김갑돌|    Math| 98.5|
+------+--------+-----+



# 1-2 zscore 컬럼을 생성.

- zscore를 계산하려면, 평균과 표준편차를 알아야 한다. <br>

- `withColumn()` 계산식 안에서 `F.mean()`, `F.stddev()` 같은 집계 함수를 직접 사용하면 오류가 발생한다. 평균과 표준편차를 먼저 따로 구한 뒤, 그 값을 계산식에서 사용해야 한다.

In [6]:
import pandas as pd
import numpy as np
from scipy.stats import zscore

# Reference calculation on the driver: bring the whole DataFrame into pandas
# and let SciPy standardise the Score column.
# Note: scipy.stats.zscore defaults to ddof=0 (population standard deviation),
# whereas the Spark "zscore" column built later in this notebook divides by
# the sample standard deviation (ddof=1), so the two sets of values differ.
PscorePd = scoreDf.toPandas()
zscore(PscorePd["Score"].to_numpy())

array([ 0.98810773, -0.72537388, -1.58211469,  0.98810773, -0.5283235 ,
        0.85959661])

In [7]:
import pyspark.sql.functions as F
from pyspark.sql.types import FloatType

import numpy as np

# Compute the mean and the sample standard deviation (ddof=1) inside Spark.
# Only a single one-row result reaches the driver, instead of the whole Score
# column being collected through toPandas().
stats = scoreDf.agg(
    F.mean("Score").alias("mean"),
    F.stddev("Score").alias("std"),   # stddev == stddev_samp (ddof=1)
).first()
u = stats["mean"]
s = stats["std"]

# The aggregates are now plain Python numbers, so they can be used as literals
# in an ordinary column expression. Native column arithmetic avoids the
# per-row Python UDF round trip and leaves a null Score as a null zscore
# instead of raising inside the UDF. The cast keeps the column type at
# FloatType, as before.
zscoreCol = ((F.col("Score") - F.lit(u)) / F.lit(s)).cast(FloatType())
scoreDf2 = scoreDf.withColumn("zscore", zscoreCol)
scoreDf2.show()

+------+--------+-----+-----------+
|  Name| Subject|Score|     zscore|
+------+--------+-----+-----------+
|김하나| English|100.0| 0.90201485|
|김하나|    Math| 80.0|-0.66217273|
|임하나| English| 70.0| -1.4442666|
|임하나|    Math|100.0| 0.90201485|
|김갑돌| English| 82.3|-0.48229116|
|김갑돌|    Math| 98.5| 0.78470075|
+------+--------+-----+-----------+



# 1-3 cdf 컬럼을 생성.

- `scipy.stats.norm.cdf()`의 반환값은 NumPy 실수형이므로, UDF에서 `FloatType`으로 돌려주려면 파이썬 `float`로 변환해 주어야 한다.<br>

- cdf는 평균=0, 표준편차=1을 기본 값으로 누적확률을 계산한다.

In [8]:
from scipy.stats import norm

def _standardNormalCdf(z):
    """Return the standard normal cumulative probability of a z-score.

    Args:
        z: A z-score, or None when the Spark column value is null.

    Returns:
        The cumulative probability as a built-in float, or None when z is
        None so that a null input stays null instead of raising a TypeError
        inside the UDF.
    """
    if z is None:
        return None
    # norm.cdf returns a NumPy scalar; Spark's FloatType needs a Python float.
    return float(norm.cdf(z))


cdfUdf = F.udf(_standardNormalCdf, FloatType())

In [9]:
# Append a "cdf" column holding the cumulative probability of each z-score.
zscoreColumn = F.col("zscore")
scoreDf3 = scoreDf2.withColumn("cdf", cdfUdf(zscoreColumn))
scoreDf3.show()

+------+--------+-----+-----------+-----------+
|  Name| Subject|Score|     zscore|        cdf|
+------+--------+-----+-----------+-----------+
|김하나| English|100.0| 0.90201485|  0.8164755|
|김하나|    Math| 80.0|-0.66217273| 0.25393027|
|임하나| English| 70.0| -1.4442666|0.074332014|
|임하나|    Math|100.0| 0.90201485|  0.8164755|
|김갑돌| English| 82.3|-0.48229116| 0.31479958|
|김갑돌|    Math| 98.5| 0.78470075|  0.7836855|
+------+--------+-----+-----------+-----------+



In [10]:
# Shut down the Spark session now that the last result has been displayed.
spark.stop()