In [1]:
import pandas as pd
import findspark
# Let findspark locate the Spark installation so that the `pyspark` imports in
# the following cells can be resolved; keep this call ahead of those imports.
findspark.init()

In [2]:
from pyspark.sql import SparkSession

# Build (or reuse) a session on a local master with 4 worker threads.
# NOTE(review): with a local[N] master the executors presumably run inside the
# driver JVM, so "spark.executor.memory" may have no effect here — confirm.
spark = (
    SparkSession.builder
    .master("local[4]")
    .appName("Dataframe-Giriş")
    .config("spark.driver.memory", "2g")
    .config("spark.executor.memory", "4g")
    .getOrCreate()
)

# Handle to the underlying SparkContext; the RDD examples below use it
# through sc.parallelize(...).
sc = spark.sparkContext

## DataFrame from a list ##

In [5]:
from pyspark.sql import Row

# Distribute a small list of integers and wrap every element in a
# single-field Row, so the RDD can later be turned into a DataFrame.
list_rdd = (
    sc.parallelize([1, 2, 3, 4, 5, 6, 5, 4])
    .map(lambda number: Row(number))
)
# Peek at the first three elements.
list_rdd.take(3)

[<Row(1)>, <Row(2)>, <Row(3)>]

In [6]:
# Convert the RDD of Rows into a DataFrame whose single column is named
# "rakamlar"; createDataFrame(rdd, names) is what rdd.toDF(names) delegates to.
df_from_list = spark.createDataFrame(list_rdd, ["rakamlar"])
df_from_list.show()

+--------+
|rakamlar|
+--------+
|       1|
|       2|
|       3|
|       4|
|       5|
|       6|
|       5|
|       4|
+--------+



In [11]:
# Values 10, 15, ..., 95: each one is wrapped in a one-element tuple so that
# toDF can treat it as a row with a single column named "range".
df_from_range = (
    sc.parallelize(range(10, 100, 5))
    .map(lambda value: (value,))
    .toDF(["range"])
)
df_from_range.show(4)

+-----+
|range|
+-----+
|   10|
|   15|
|   20|
|   25|
+-----+
only showing top 4 rows



In [12]:
from pyspark.sql.types import IntegerType

# Same range as above, but handed straight to the session with an atomic
# IntegerType schema instead of going through an explicit RDD.
df_from_range2 = spark.createDataFrame(
    range(10, 100, 5),
    schema=IntegerType(),
)

## DataFrame from a file ##


In [13]:
# First attempt: read the CSV with the reader's default options.
# The file is semicolon-separated and has a header line, so with the defaults
# every line ends up in a single string column `_c0` (header included).
# The path uses forward slashes so it resolves on Windows and POSIX alike;
# the previous "sources\\OnlineRetail.csv" only worked on Windows.
df_from_file = spark.read.csv("sources/OnlineRetail.csv")
df_from_file.show()


+--------------------+
|                 _c0|
+--------------------+
|InvoiceNo;StockCo...|
|536365;85123A;WHI...|
|536365;71053;WHIT...|
|536365;84406B;CRE...|
|536365;84029G;KNI...|
|536365;84029E;RED...|
|536365;22752;SET ...|
|536365;21730;GLAS...|
|536366;22633;HAND...|
|536366;22632;HAND...|
|536367;84879;ASSO...|
|536367;22745;POPP...|
|536367;22748;POPP...|
|536367;22749;FELT...|
|536367;22310;IVOR...|
|536367;84969;BOX ...|
|536367;22623;BOX ...|
|536367;22622;BOX ...|
|536367;21754;HOME...|
|536367;21755;LOVE...|
+--------------------+
only showing top 20 rows



In [15]:
# Read the retail CSV properly: semicolon separator, first line as column
# names, and column types inferred from the data.
# The path uses forward slashes so it resolves on Windows and POSIX alike;
# the previous "sources\\OnlineRetail.csv" only worked on Windows.
# NOTE: UnitPrice is written with a decimal comma (e.g. "2,55") and
# InvoiceDate looks like "1.12.2010 08:26", so inferSchema keeps both columns
# as strings; they need explicit conversion before numeric or date operations.
df_from_file = (
    spark.read
    .option("sep", ";")
    .option("header", "True")
    .option("inferSchema", "True")
    .csv("sources/OnlineRetail.csv")
)

df_from_file.show(3)

+---------+---------+--------------------+--------+---------------+---------+----------+--------------+
|InvoiceNo|StockCode|         Description|Quantity|    InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------------+--------+---------------+---------+----------+--------------+
|   536365|   85123A|WHITE HANGING HEA...|       6|1.12.2010 08:26|     2,55|     17850|United Kingdom|
|   536365|    71053| WHITE METAL LANTERN|       6|1.12.2010 08:26|     3,39|     17850|United Kingdom|
|   536365|   84406B|CREAM CUPID HEART...|       8|1.12.2010 08:26|     2,75|     17850|United Kingdom|
+---------+---------+--------------------+--------+---------------+---------+----------+--------------+
only showing top 3 rows



In [16]:
# Print the column names, inferred types and nullability as a tree.
df_from_file.printSchema()

root
 |-- InvoiceNo: string (nullable = true)
 |-- StockCode: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- InvoiceDate: string (nullable = true)
 |-- UnitPrice: string (nullable = true)
 |-- CustomerID: integer (nullable = true)
 |-- Country: string (nullable = true)



In [17]:
# Show the Python type of the object returned by the reader.
type(df_from_file)

pyspark.sql.dataframe.DataFrame

In [19]:
# Restrict the Spark DataFrame to five rows first, then convert only that
# small slice into a pandas DataFrame.
df_pd = (
    df_from_file
    .limit(5)
    .toPandas()
)

type(df_pd)

pandas.core.frame.DataFrame

In [25]:
# Display the first rows of the pandas copy (it holds at most five rows).
df_pd.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,1.12.2010 08:26,255,17850,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,1.12.2010 08:26,339,17850,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,1.12.2010 08:26,275,17850,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,1.12.2010 08:26,339,17850,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,1.12.2010 08:26,339,17850,United Kingdom


In [21]:
# Row count of the pandas copy; matches the limit(5) used to build it.
len(df_pd)

5

In [26]:
# Project two columns and print the first ten rows.
(
    df_from_file
    .select("InvoiceNo", "StockCode")
    .show(10)
)

+---------+---------+
|InvoiceNo|StockCode|
+---------+---------+
|   536365|   85123A|
|   536365|    71053|
|   536365|   84406B|
|   536365|   84029G|
|   536365|   84029E|
|   536365|    22752|
|   536365|    21730|
|   536366|    22633|
|   536366|    22632|
|   536367|    84879|
+---------+---------+
only showing top 10 rows



In [27]:
# Order by InvoiceNo (ascending by default) and print the first ten rows.
(
    df_from_file
    .sort("InvoiceNo")
    .show(10)
)

+---------+---------+--------------------+--------+---------------+---------+----------+--------------+
|InvoiceNo|StockCode|         Description|Quantity|    InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------------+--------+---------------+---------+----------+--------------+
|   536365|    71053| WHITE METAL LANTERN|       6|1.12.2010 08:26|     3,39|     17850|United Kingdom|
|   536365|   84029E|RED WOOLLY HOTTIE...|       6|1.12.2010 08:26|     3,39|     17850|United Kingdom|
|   536365|    21730|GLASS STAR FROSTE...|       6|1.12.2010 08:26|     4,25|     17850|United Kingdom|
|   536365|   84029G|KNITTED UNION FLA...|       6|1.12.2010 08:26|     3,39|     17850|United Kingdom|
|   536365|   84406B|CREAM CUPID HEART...|       8|1.12.2010 08:26|     2,75|     17850|United Kingdom|
|   536365|    22752|SET 7 BABUSHKA NE...|       2|1.12.2010 08:26|     7,65|     17850|United Kingdom|
|   536365|   85123A|WHITE HANGING HEA...|       6|1.12.2010 08:

In [28]:
# Print the physical plan Spark would execute for the sort above.
(
    df_from_file
    .sort("InvoiceNo")
    .explain()
)

== Physical Plan ==
*(1) Sort [InvoiceNo#111 ASC NULLS FIRST], true, 0
+- Exchange rangepartitioning(InvoiceNo#111 ASC NULLS FIRST, 200), true, [id=#166]
   +- FileScan csv [InvoiceNo#111,StockCode#112,Description#113,Quantity#114,InvoiceDate#115,UnitPrice#116,CustomerID#117,Country#118] Batched: false, DataFilters: [], Format: CSV, Location: InMemoryFileIndex[file:/C:/Users/ertug/Desktop/Programming Languages/bigdata/spark/sources/Online..., PartitionFilters: [], PushedFilters: [], ReadSchema: struct<InvoiceNo:string,StockCode:string,Description:string,Quantity:int,InvoiceDate:string,UnitP...


