In [1]:
# Print a greeting (quick check that the kernel is executing cells).
print('hello')

hello


In [2]:
# Shell escape: show the path of the `python` executable found on PATH.
! which python

/opt/conda/bin/python


In [3]:
# Shell escape: print the version of the `python` interpreter found on PATH.
!python --version

Python 3.11.6


In [4]:
# Shell escape: print the version of the installed Java runtime.
!java -version

openjdk version "17.0.8.1" 2023-08-24
OpenJDK Runtime Environment (build 17.0.8.1+1-Ubuntu-0ubuntu122.04)
OpenJDK 64-Bit Server VM (build 17.0.8.1+1-Ubuntu-0ubuntu122.04, mixed mode, sharing)


In [5]:
# Shell escape: show the path of the `java` executable found on PATH.
!which java

/usr/bin/java


In [6]:
pip show pyspark

Name: pyspark
Version: 3.5.0
Summary: Apache Spark Python API
Home-page: https://github.com/apache/spark/tree/master/python
Author: Spark Developers
Author-email: dev@spark.apache.org
License: http://www.apache.org/licenses/LICENSE-2.0
Location: /usr/local/spark/python
Requires: py4j
Required-by: 
Note: you may need to restart the kernel to use updated packages.


In [7]:
# SPARK_HOME: look the variable up in the process environment.
# os.environ.get() returns None instead of raising when the variable is unset.
import os
os.environ.get('SPARK_HOME')

'/usr/local/spark'

In [8]:
# JAVA_HOME: look the variable up in the process environment.
# NOTE(review): this cell shows no output, which suggests the lookup returned
# None (JAVA_HOME not set in this kernel) - confirm.
os.environ.get('JAVA_HOME')

In [9]:
# PYTHONPATH: look the variable up in the process environment (None if unset).
os.environ.get('PYTHONPATH')

'/usr/local/spark/python/lib/py4j-0.10.9.7-src.zip:/usr/local/spark/python:'

In [10]:
import pyspark

In [11]:
from pyspark.sql import SparkSession

# Create the SparkSession object; the builder calls are chained.
spark = (
    SparkSession.builder
    .appName('pyspark example1')
    .getOrCreate()
)

In [12]:
# Bare expression: display the session object as the cell result.
spark

In [13]:
# Stop the session created above.
spark.stop()

In [14]:
spark = SparkSession.builder.appName('pyspark example1').getOrCreate() # if a session already exists, getOrCreate() returns that existing one as-is

In [15]:
# Bare expression: display the session object as the cell result.
spark

In [16]:
# Sample records as a plain Python list of (name, value) tuples.
data = [('Alice', 1), ('Bob', 2), ('Charlie', 3)]
type(data)

list

In [17]:
# Create a DataFrame object -> this is NOT a pandas DataFrame;
# it is PySpark's distributed DataFrame object.
# The second argument supplies the column names.
data1 = spark.createDataFrame(data, ['Name', 'Value'])
data1

DataFrame[Name: string, Value: bigint]

data1.show()  # show() is a method of the DataFrame

In [18]:
# Keep only the rows whose Name column equals "Bob" and print them.
data1.filter(data1.Name == "Bob").show()

+----+-----+
|Name|Value|
+----+-----+
| Bob|    2|
+----+-----+



In [19]:
# Keep only the rows whose Value column is greater than 2 and print them.
data1.filter(data1.Value > 2).show()

+-------+-----+
|   Name|Value|
+-------+-----+
|Charlie|    3|
+-------+-----+



In [20]:
data1.createOrReplaceTempView('people')  # create a view named 'people' that the SQL queries below read from

In [21]:
spark.sql('select * from people').show()  # query the view with a SQL statement

+-------+-----+
|   Name|Value|
+-------+-----+
|  Alice|    1|
|    Bob|    2|
|Charlie|    3|
+-------+-----+



In [22]:
# SQL equivalent of the Name == "Bob" DataFrame filter.
# NOTE(review): standard SQL writes string literals with single quotes; the
# double-quoted form worked in this session (see the output below).
spark.sql('select * from people where Name="Bob"').show()

+----+-----+
|Name|Value|
+----+-----+
| Bob|    2|
+----+-----+



In [23]:
# SQL equivalent of the Value > 2 DataFrame filter.
spark.sql('select * from people where Value > 2').show()

+-------+-----+
|   Name|Value|
+-------+-----+
|Charlie|    3|
+-------+-----+



# Create an RDD object

In [24]:
# Obtain the session for the RDD section.
# NOTE(review): no spark.stop() appears between the previous getOrCreate() and
# this cell, so this presumably returns the already-running session - confirm.
spark = SparkSession.builder.appName('pyspark example1').getOrCreate()

In [25]:
rdd = spark.sparkContext.parallelize([1, 2, 3, 4, 5]) # RDD object that is managed in parallel
rdd

ParallelCollectionRDD[15] at readRDDFromFile at PythonRDD.scala:289

In [26]:
rdd.take(5)  # returns elements of the RDD for display (the count argument is required!)

[1, 2, 3, 4, 5]

In [27]:
# map operation: used to compute on the values of an RDD;
# the lambda is applied to each element and the result is a new RDD.
squared_rdd = rdd.map(lambda x: x*x)
squared_rdd

PythonRDD[18] at RDD at PythonRDD.scala:53

In [28]:
rdd.take(3)  # the source RDD is left as it was by the map above

[1, 2, 3]

In [29]:
# First three elements of the squared RDD.
squared_rdd.take(3)

[1, 4, 9]

In [30]:
# Fetch every element at once (the whole RDD) as a Python list.
squared_rdd.collect()

[1, 4, 9, 16, 25]

# MLlib

In [31]:
from pyspark.ml.regression import LinearRegression

In [32]:
import numpy as np

In [33]:
from pyspark.ml.feature import VectorAssembler  # turns feature columns into a vector column

In [34]:
# (name, age) records and the Spark DataFrame built from them.
data_age = [
    ('Alice', 17),
    ('Bob', 30),
    ('Charlie', 23),
]
data2 = spark.createDataFrame(data_age, schema=['Name', 'Age'])
data2

DataFrame[Name: string, Age: bigint]

In [35]:
# Pack the 'Age' column into a vector column named 'feature'.
assembler = VectorAssembler(
    inputCols=['Age'],
    outputCol='feature',
)
vector_df = assembler.transform(data2)
vector_df

DataFrame[Name: string, Age: bigint, feature: vector]

In [36]:
# Fit a linear regression that reads its features from 'feature' and its label
# from 'Age'.
# NOTE(review): the 'feature' vector was assembled from the 'Age' column, so the
# feature and the label are the same quantity; the fit is an identity mapping
# and only demonstrates the API.
lr = LinearRegression(featuresCol = 'feature', labelCol = 'Age')
model = lr.fit(vector_df)

In [37]:
# Apply the fitted model; the result gains a 'prediction' column (see output).
pred = model.transform(vector_df)
pred

DataFrame[Name: string, Age: bigint, feature: vector, prediction: double]

In [38]:
# Print the rows together with their predictions.
pred.show()

+-------+---+-------+------------------+
|   Name|Age|feature|        prediction|
+-------+---+-------+------------------+
|  Alice| 17| [17.0]|16.999999999999975|
|    Bob| 30| [30.0]|30.000000000000032|
|Charlie| 23| [23.0]|              23.0|
+-------+---+-------+------------------+



In [40]:
# Stop the session used by the sections above.
spark.stop()

# Streaming

In [39]:
from pyspark.sql.functions import explode
from pyspark.sql.functions import split

In [41]:
# Obtain a session for the streaming section.
spark = SparkSession.builder.appName('pyspark example1').getOrCreate()

In [None]:
# Define a streaming source that reads from a socket at localhost:9999.
lines = (
    spark.readStream
    .format('socket')
    .option('host', 'localhost')
    .option('port', 9999)
    .load()  # STREAMING DATA
)

In [None]:
# Split each incoming line on single spaces and explode the resulting array so
# that every word becomes its own row, in a column named 'word'.
# (The original line was missing the closing parenthesis of select(), which
# made the cell a SyntaxError.)
words = lines.select(
    explode(split(lines.value, ' ')).alias('word')
)

In [None]:
# Bare expression: display the `words` object as the cell result.
words

In [42]:
# Stop the session at the end of the notebook.
# NOTE(review): no streaming query is started anywhere in the visible cells
# (only the source and the `words` transformation are defined).
spark.stop()