In [1]:
import findspark
findspark.init()
from pyspark.sql import SparkSession
# Build (or reuse, if one is already running) a local SparkSession that
# uses 4 worker threads, with the memory settings given below.
spark = (
    SparkSession.builder
    .master("local[4]")
    .appName("Csv-Üzeri-SQL")
    .config("spark.executor.memory", "4g")
    .config("spark.driver.memory", "2g")
    .getOrCreate()
)

# Read the CSV file and preview it (the InvoiceDate column is used below) #

In [7]:
# Load the semicolon-delimited retail CSV: the first row supplies the column
# names and Spark infers each column's type from the data.
# The relative path uses a forward slash so the notebook runs on Linux/macOS
# as well as Windows (a backslash is a path separator on Windows only).
df = (
    spark.read
    .option("header", "True")
    .option("inferSchema", "True")
    .option("sep", ";")
    .csv("sources/OnlineRetail.csv")
)

# Preview the first five rows.
df.show(5)

+---------+---------+--------------------+--------+---------------+---------+----------+--------------+
|InvoiceNo|StockCode|         Description|Quantity|    InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------------+--------+---------------+---------+----------+--------------+
|   536365|   85123A|WHITE HANGING HEA...|       6|1.12.2010 08:26|     2,55|     17850|United Kingdom|
|   536365|    71053| WHITE METAL LANTERN|       6|1.12.2010 08:26|     3,39|     17850|United Kingdom|
|   536365|   84406B|CREAM CUPID HEART...|       8|1.12.2010 08:26|     2,75|     17850|United Kingdom|
|   536365|   84029G|KNITTED UNION FLA...|       6|1.12.2010 08:26|     3,39|     17850|United Kingdom|
|   536365|   84029E|RED WOOLLY HOTTIE...|       6|1.12.2010 08:26|     3,39|     17850|United Kingdom|
+---------+---------+--------------------+--------+---------------+---------+----------+--------------+
only showing top 5 rows



In [26]:
# Datetime pattern of the raw InvoiceDate strings, e.g. "1.12.2010 08:26".
existing_format = "dd.MM.yyyy HH:mm"

## Date Operations ##

In [29]:
from pyspark.sql import functions as F
# Switch this session to the legacy (pre-Spark-3) datetime parser.
# NOTE(review): presumably required because the raw values carry a
# non-zero-padded day ("1.12.2010") while the pattern uses "dd" — confirm.
spark.sql("set spark.sql.legacy.timeParserPolicy=LEGACY")

# Strip surrounding whitespace from the raw InvoiceDate string, then parse it
# with `existing_format` into a date column (normal_tarih) and a timestamp
# column (standart_ts).
# The chain is wrapped in parentheses instead of ending on a trailing
# backslash, so it does not rely on a blank line to terminate the statement.
df2 = (
    df
    .withColumn("InvoiceDate", F.trim(F.col("InvoiceDate")))
    .withColumn("normal_tarih", F.to_date(F.col("InvoiceDate"), existing_format))
    .withColumn("standart_ts", F.to_timestamp(F.col("InvoiceDate"), existing_format))
)

df2.show(10)

+---------+---------+--------------------+--------+---------------+---------+----------+--------------+------------+-------------------+
|InvoiceNo|StockCode|         Description|Quantity|    InvoiceDate|UnitPrice|CustomerID|       Country|normal_tarih|        standart_ts|
+---------+---------+--------------------+--------+---------------+---------+----------+--------------+------------+-------------------+
|   536365|   85123A|WHITE HANGING HEA...|       6|1.12.2010 08:26|     2,55|     17850|United Kingdom|  2010-12-01|2010-12-01 08:26:00|
|   536365|    71053| WHITE METAL LANTERN|       6|1.12.2010 08:26|     3,39|     17850|United Kingdom|  2010-12-01|2010-12-01 08:26:00|
|   536365|   84406B|CREAM CUPID HEART...|       8|1.12.2010 08:26|     2,75|     17850|United Kingdom|  2010-12-01|2010-12-01 08:26:00|
|   536365|   84029G|KNITTED UNION FLA...|       6|1.12.2010 08:26|     3,39|     17850|United Kingdom|  2010-12-01|2010-12-01 08:26:00|
|   536365|   84029E|RED WOOLLY HOTTIE...

In [32]:
# Output patterns: day-first with slashes, and month-first with dashes.
format_tr = "dd/MM/yyyy HH:mm:ss"
format_eng = "MM-dd-yyyy HH:mm:ss"

# Render the parsed timestamp as a string in each pattern (TSTR, TSENG) and
# as seconds since the Unix epoch (unix_time).
# NOTE(review): the unix_time values depend on the Spark session time zone.
# The chain is wrapped in parentheses instead of ending on a trailing
# backslash, so it does not rely on a blank line to terminate the statement.
df3 = (
    df2
    .withColumn("TSTR", F.date_format(F.col("standart_ts"), format_tr))
    .withColumn("TSENG", F.date_format(F.col("standart_ts"), format_eng))
    .withColumn("unix_time", F.unix_timestamp(F.col("standart_ts")))
)

df3.select("TSTR", "TSENG", "unix_time").show(10)

+-------------------+-------------------+----------+
|               TSTR|              TSENG| unix_time|
+-------------------+-------------------+----------+
|01/12/2010 08:26:00|12-01-2010 08:26:00|1291184760|
|01/12/2010 08:26:00|12-01-2010 08:26:00|1291184760|
|01/12/2010 08:26:00|12-01-2010 08:26:00|1291184760|
|01/12/2010 08:26:00|12-01-2010 08:26:00|1291184760|
|01/12/2010 08:26:00|12-01-2010 08:26:00|1291184760|
|01/12/2010 08:26:00|12-01-2010 08:26:00|1291184760|
|01/12/2010 08:26:00|12-01-2010 08:26:00|1291184760|
|01/12/2010 08:28:00|12-01-2010 08:28:00|1291184880|
|01/12/2010 08:28:00|12-01-2010 08:28:00|1291184880|
|01/12/2010 08:34:00|12-01-2010 08:34:00|1291185240|
+-------------------+-------------------+----------+
only showing top 10 rows



In [33]:

# Date arithmetic on the parsed timestamp:
#   bir_yil - the timestamp's date shifted forward by 365 days
#   yil     - the calendar year of the timestamp
#   fark    - number of days between bir_yil and the original timestamp
# NOTE(review): 365 days is not a full calendar year when the span crosses
# a leap day — confirm a fixed 365-day offset is what is wanted here.
df4 = (
    df2
    .withColumn("bir_yil", F.date_add(F.col("standart_ts"), 365))
    .withColumn("yil", F.year(F.col("standart_ts")))
    .withColumn("fark", F.datediff(F.col("bir_yil"), F.col("standart_ts")))
)

df4.show(10)

+---------+---------+--------------------+--------+---------------+---------+----------+--------------+------------+-------------------+----------+----+----+
|InvoiceNo|StockCode|         Description|Quantity|    InvoiceDate|UnitPrice|CustomerID|       Country|normal_tarih|        standart_ts|   bir_yil| yil|fark|
+---------+---------+--------------------+--------+---------------+---------+----------+--------------+------------+-------------------+----------+----+----+
|   536365|   85123A|WHITE HANGING HEA...|       6|1.12.2010 08:26|     2,55|     17850|United Kingdom|  2010-12-01|2010-12-01 08:26:00|2011-12-01|2010| 365|
|   536365|    71053| WHITE METAL LANTERN|       6|1.12.2010 08:26|     3,39|     17850|United Kingdom|  2010-12-01|2010-12-01 08:26:00|2011-12-01|2010| 365|
|   536365|   84406B|CREAM CUPID HEART...|       8|1.12.2010 08:26|     2,75|     17850|United Kingdom|  2010-12-01|2010-12-01 08:26:00|2011-12-01|2010| 365|
|   536365|   84029G|KNITTED UNION FLA...|       6|1