In [9]:
from pyspark.sql import SparkSession

# Obtain the session through getOrCreate(), with "PySpark Example" as the application name.
spark = (
    SparkSession.builder
    .appName("PySpark Example")
    .getOrCreate()
)

# RDD 분산 데이터

In [4]:
# Create an RDD (a distributed data object) from a local Python list.
rdd = spark.sparkContext.parallelize([1,2,3,4])
# Echo the RDD object itself; this shows its repr, not its elements.
rdd

ParallelCollectionRDD[2] at readRDDFromFile at PythonRDD.scala:274

In [5]:
# Fetch up to the first 5 elements; the RDD holds only 4, so all of them come back.
rdd.take(5)

                                                                                

[1, 2, 3, 4]

In [6]:
# Square every element. map() yields a new RDD, so the echo below shows the
# RDD object rather than the squared values.
squared_rdd = rdd.map(lambda n: n * n)
squared_rdd

PythonRDD[5] at RDD at PythonRDD.scala:53

In [7]:
# Fetch up to the first 5 squared values as a Python list.
squared_rdd.take(5)

[1, 4, 9, 16]

In [8]:
# Bring every element back as a Python list (fine here because the RDD is tiny).
squared_rdd.collect()

[1, 4, 9, 16]

# 데이터프레임 객체

In [10]:
# Sample (name, value) records; only column names are given, so Spark infers the types.
data = [
    ('Alice', 1),
    ('Bob', 2),
    ('Charlie', 3),
]
df = spark.createDataFrame(data, schema=['Name', 'Value'])
df

DataFrame[Name: string, Value: bigint]

In [11]:
# Print the DataFrame rows as a text table.
df.show()

+-------+-----+
|   Name|Value|
+-------+-----+
|  Alice|    1|
|    Bob|    2|
|Charlie|    3|
+-------+-----+



# RDBMS 데이터 - 테이블

In [13]:
# Register df under the temporary view name 'people' so SQL text can refer to it.
df.createOrReplaceTempView('people')

In [16]:
# Query text: every column of the 'people' rows whose Value is greater than 2.
select_sql = 'SELECT * FROM people WHERE Value > 2'

In [17]:
# Run the SQL string through the session and print the resulting DataFrame.
result_sql = spark.sql(select_sql)
result_sql.show()

+-------+-----+
|   Name|Value|
+-------+-----+
|Charlie|    3|
+-------+-----+



In [18]:
# DataFrame 생성 -> select DF 추출
data2 = [('Apple', 2000), ('Grape', 5000), ('Banana', 6000)]
df2 = spark.createDataFrame(data2, ['Fruit', 'Price'])
df2

DataFrame[Fruit: string, Price: bigint]

In [19]:
# Print the fruit/price rows as a text table.
df2.show()

+------+-----+
| Fruit|Price|
+------+-----+
| Apple| 2000|
| Grape| 5000|
|Banana| 6000|
+------+-----+



In [28]:
# Register df2 under the temporary view name 't' so SQL text can refer to it.
df2.createOrReplaceTempView('t')

In [29]:
# Query text: every column of the 't' rows whose Price is below 5000.
select_sql2 = 'SELECT * FROM t WHERE Price < 5000'

In [33]:
# Run the SQL string through the session and print the resulting DataFrame.
result_sql2 = spark.sql(select_sql2)
result_sql2.show()

+-----+-----+
|Fruit|Price|
+-----+-----+
|Apple| 2000|
+-----+-----+



In [34]:
!pip install numpy

Collecting numpy
  Downloading numpy-1.24.4-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.6 kB)
Downloading numpy-1.24.4-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.3/17.3 MB[0m [31m143.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: numpy
Successfully installed numpy-1.24.4


# MLlib

In [35]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression

In [39]:
# Pack the 'Price' column into a single vector column named 'features'.
assembler = VectorAssembler(inputCols=['Price'], outputCol='features')
vector_df = assembler.transform(df2)
# NOTE(review): 'Price' is both the only feature and the label, so the model is fit to
# reproduce its own input. Presumably this is just an API walkthrough; confirm before
# reusing, and pick a label column that is not among the features.
lr = LinearRegression(featuresCol='features', labelCol='Price')
model = lr.fit(vector_df)

24/12/03 11:34:52 WARN Instrumentation: [02c3cd33] regParam is zero, which might cause numerical instability and overfitting.
24/12/03 11:34:53 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeSystemBLAS
24/12/03 11:34:53 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeRefBLAS
24/12/03 11:34:53 WARN LAPACK: Failed to load implementation from: com.github.fommil.netlib.NativeSystemLAPACK
24/12/03 11:34:53 WARN LAPACK: Failed to load implementation from: com.github.fommil.netlib.NativeRefLAPACK


In [40]:
# Score the same rows the model was fitted on; the result gains a 'prediction' column.
pred = model.transform(vector_df)
pred.show()

+------+-----+--------+-----------------+
| Fruit|Price|features|       prediction|
+------+-----+--------+-----------------+
| Apple| 2000|[2000.0]|2000.000000000003|
| Grape| 5000|[5000.0]|4999.999999999999|
|Banana| 6000|[6000.0]|5999.999999999997|
+------+-----+--------+-----------------+

