# PySpark Session 구축

In [1]:
import os

from pyspark.sql import SparkSession
# Location of the sample CSV used throughout the notebook.
# The original absolute path is kept as the default so existing behaviour is unchanged;
# set the TEST1_CSV_PATH environment variable to point at the file on another machine.
test1 = os.environ.get("TEST1_CSV_PATH", "C:/Users/Galaxy/Desktop/test1.csv")

In [2]:
# Build the Spark session for this notebook (getOrCreate reuses an existing one if present).
spark = (
    SparkSession.builder
    .appName('DataFrame')
    .getOrCreate()
)



In [3]:
# - in-memory
# When running locally there is, by default, a single master node.

# Bare expression: the session object becomes this cell's output.
spark

In [4]:
# Read the dataset; the 'header' option makes the first CSV row the column names.
df_pyspark = (
    spark.read
    .option('header', 'true')
    .csv(test1)
)
df_pyspark.show()

+--------+---+----------+
|    Name|age|Experience|
+--------+---+----------+
|   Krish| 31|        10|
|Sudhansh| 30|         8|
|   Sunny| 29|         4|
+--------+---+----------+



In [5]:
# Check the schema
df_pyspark.printSchema() # age and Experience hold integers but are recognised as string

root
 |-- Name: string (nullable = true)
 |-- age: string (nullable = true)
 |-- Experience: string (nullable = true)



In [6]:
# Passing inferSchema=True makes Spark derive column types from the data,
# so the numeric columns are no longer reported as string.
df_pyspark = (
    spark.read
    .option('header', 'true')
    .csv(test1, inferSchema=True)
)
df_pyspark.printSchema()
df_pyspark.show()

root
 |-- Name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- Experience: integer (nullable = true)

+--------+---+----------+
|    Name|age|Experience|
+--------+---+----------+
|   Krish| 31|        10|
|Sudhansh| 30|         8|
|   Sunny| 29|         4|
+--------+---+----------+



In [7]:
# Read the CSV with header and inferSchema passed directly as keyword arguments to csv().
df_pyspark = spark.read.csv(
    test1,
    header=True,
    inferSchema=True,
)
df_pyspark.show()
# Last expression: the cell output is the type of the object returned by the reader.
type(df_pyspark)

+--------+---+----------+
|    Name|age|Experience|
+--------+---+----------+
|   Krish| 31|        10|
|Sudhansh| 30|         8|
|   Sunny| 29|         4|
+--------+---+----------+



pyspark.sql.dataframe.DataFrame

# 인덱싱 칼럼 / 로우

In [8]:
# List of column names, shown as the cell output.
df_pyspark.columns

['Name', 'age', 'Experience']

In [9]:
# (column name, type name) pairs for every column.
df_pyspark.dtypes

[('Name', 'string'), ('age', 'int'), ('Experience', 'int')]

In [10]:
# Column projection: first a single column, then two columns together.
df_pyspark.select('Name').show()
df_pyspark.select('Name', 'Experience').show()

+--------+
|    Name|
+--------+
|   Krish|
|Sudhansh|
|   Sunny|
+--------+

+--------+----------+
|    Name|Experience|
+--------+----------+
|   Krish|        10|
|Sudhansh|         8|
|   Sunny|         4|
+--------+----------+



In [11]:
# Summary statistics (count, mean, stddev, min, max) for each column.
df_pyspark.describe().show()

+-------+-----+----+-----------------+
|summary| Name| age|       Experience|
+-------+-----+----+-----------------+
|  count|    3|   3|                3|
|   mean| null|30.0|7.333333333333333|
| stddev| null| 1.0|3.055050463303893|
|    min|Krish|  29|                4|
|    max|Sunny|  31|               10|
+-------+-----+----+-----------------+



# Adding Columns / Dropping Columns

In [17]:
# Add a derived column holding Experience + 2.
# NOTE(review): the column name is spelled 'Experiecne'; it is left unchanged here
# because the later drop cell removes the column by this exact name.
df_pyspark = df_pyspark.withColumn(
    'Experiecne After 2 year',
    df_pyspark['Experience'] + 2,
)

In [18]:
# Display the frame, which now includes the added column.
df_pyspark.show()

+--------+---+----------+-----------------------+
|    Name|age|Experience|Experiecne After 2 year|
+--------+---+----------+-----------------------+
|   Krish| 31|        10|                     12|
|Sudhansh| 30|         8|                     10|
|   Sunny| 29|         4|                      6|
+--------+---+----------+-----------------------+



In [21]:
### Dropping columns in a DataFrame
# The name (including the 'Experiecne' spelling) must match the column added earlier.
df_pyspark = df_pyspark.drop('Experiecne After 2 year')

In [22]:
# Display the frame after the drop; only the original three columns remain.
df_pyspark.show()

+--------+---+----------+
|    Name|age|Experience|
+--------+---+----------+
|   Krish| 31|        10|
|Sudhansh| 30|         8|
|   Sunny| 29|         4|
+--------+---+----------+



In [23]:
# Rename a column. The result is only displayed, not assigned back,
# so df_pyspark itself keeps the 'Name' column.
(
    df_pyspark
    .withColumnRenamed('Name', 'New Name')
    .show()
)

+--------+---+----------+
|New Name|age|Experience|
+--------+---+----------+
|   Krish| 31|        10|
|Sudhansh| 30|         8|
|   Sunny| 29|         4|
+--------+---+----------+

