# Create Spark session

In [93]:
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.types import StructField, StructType, IntegerType, StringType, FloatType, DateType
from pyspark.sql import functions as F

# Assemble the Spark configuration: application name plus a fixed UI port.
my_conf = (
    SparkConf()
    .set("spark.app.name", "My Application")
    .set("spark.ui.port", "4051")
)

# Obtain the session for this notebook from the configuration above.
spark = SparkSession.builder.config(conf=my_conf).getOrCreate()

# Restrict Spark's own logging to ERROR-level messages.
spark.sparkContext.setLogLevel("ERROR")

In [15]:
# Read the log file into an RDD (not a Python list) with one string per line.

# Alternative input path, currently disabled (presumably the full log; the
# active path below points at the sample file):
#DATASET_PATH = 's3://data-engg-suman/dataset/bigLog2.txt'

DATASET_PATH = 's3://data-engg-suman/dataset/bigLog_sample.txt'

rdd1 = spark.sparkContext.textFile(DATASET_PATH)

In [16]:
# Peek at the first four raw lines to confirm the "LEVEL,timestamp" layout.
rdd1.take(4)

['DEBUG,2015-2-6 16:24:07',
 'WARN,2016-7-26 18:54:43',
 'INFO,2012-10-18 14:35:19',
 'DEBUG,2012-4-26 14:26:50']

In [40]:
from datetime import datetime

def line_parser(line: str) -> tuple:
    """Parse one raw log line into a ``(level, timestamp)`` pair.

    Lines are expected to look like ``"DEBUG,2015-2-6 16:24:07"``: a log
    level, a comma, then a timestamp in ``%Y-%m-%d %H:%M:%S`` form (month
    and day need not be zero-padded).

    Args:
        line: A single line of the log file.

    Returns:
        A tuple of the level string (the text before the first comma,
        returned unchanged) and the timestamp parsed into a ``datetime``.

    Raises:
        IndexError: If the line contains no comma.
        ValueError: If the timestamp field does not match the expected format.
    """
    fields = line.split(",")

    # Trim surrounding whitespace from the timestamp only: strptime rejects
    # leading/trailing blanks (e.g. "WARN, 2016-7-26 18:54:43"), while the
    # level is passed through exactly as it appears in the file.
    timestamp_text = fields[1].strip()
    timestamp = datetime.strptime(timestamp_text, "%Y-%m-%d %H:%M:%S")

    return fields[0], timestamp

# Apply line_parser to every raw line, producing (level, datetime) tuples.
rdd2 = rdd1.map(line_parser)
    

In [47]:
# Turn the parsed (level, datetime) tuples into a two-column DataFrame
# and preview its first three rows.
df1 = spark.createDataFrame(rdd2, schema=['level', 'datetime'])
df1.show(n=3)

+-----+-------------------+
|level|           datetime|
+-----+-------------------+
|DEBUG|2015-02-06 16:24:07|
| WARN|2016-07-26 18:54:43|
| INFO|2012-10-18 14:35:19|
+-----+-------------------+
only showing top 3 rows



In [56]:
# Register df1 as a temporary view so it can be queried with Spark SQL.
df1.createOrReplaceTempView('my_logging_table')

# Per log level: collect every timestamp into a list and count them, sorted by level.
# truncate=True shortens long cell values (such as the collected lists) when printing.
spark.sql('SELECT level, collect_list(datetime), count(datetime) AS count FROM my_logging_table GROUP BY level ORDER BY level').show(truncate=True)

+-----+----------------------+-----+
|level|collect_list(datetime)|count|
+-----+----------------------+-----+
|DEBUG|  [2015-02-06 16:24...|   52|
|ERROR|  [2015-06-28 19:25...|    3|
|FATAL|  [2015-11-21 23:17...|    1|
| INFO|  [2012-10-18 14:35...|   29|
| WARN|  [2016-07-26 18:54...|   14|
+-----+----------------------+-----+



In [58]:
# Show each entry's level next to the full month name ("MMMM" pattern) of its timestamp.
spark.sql('SELECT level, date_format(datetime, "MMMM") FROM my_logging_table').show(truncate=True)

+-----+---------------------------+
|level|date_format(datetime, MMMM)|
+-----+---------------------------+
|DEBUG|                   February|
| WARN|                       July|
| INFO|                    October|
|DEBUG|                      April|
|DEBUG|                  September|
| INFO|                     August|
| INFO|                      April|
|DEBUG|                       July|
|DEBUG|                       July|
| INFO|                    January|
|DEBUG|                    January|
|DEBUG|                    January|
|DEBUG|                       July|
|DEBUG|                        May|
|DEBUG|                      March|
| INFO|                     August|
| WARN|                    January|
| INFO|                       June|
| INFO|                    January|
|DEBUG|                       July|
+-----+---------------------------+
only showing top 20 rows



In [63]:
# Same projection as a DataFrame, with the month-name column aliased to `month`;
# preview the first two rows.
df2 = spark.sql('SELECT level, date_format(datetime, "MMMM") AS month FROM my_logging_table')
df2.show(2)

+-----+--------+
|level|   month|
+-----+--------+
|DEBUG|February|
| WARN|    July|
+-----+--------+
only showing top 2 rows



In [64]:
# Register the (level, month) DataFrame as a second temporary view and
# confirm it is queryable by previewing three rows.
df2.createOrReplaceTempView('my_logging_table_2')
spark.sql('SELECT * FROM my_logging_table_2').show(3)


+-----+--------+
|level|   month|
+-----+--------+
|DEBUG|February|
| WARN|    July|
| INFO| October|
+-----+--------+
only showing top 3 rows



In [67]:
# Count log entries for every (level, month) combination.
# The aggregate has no alias, so its column is displayed as count(1); the query
# has no ORDER BY, so the row order of the printed result is not guaranteed.
spark.sql('SELECT level, month, count(*) FROM my_logging_table_2 GROUP BY level, month').show()


+-----+---------+--------+
|level|    month|count(1)|
+-----+---------+--------+
| INFO|September|       2|
|ERROR|     June|       2|
|DEBUG| November|       3|
| INFO|  October|       3|
| INFO| December|       1|
| WARN|  January|       1|
| INFO|    March|       3|
|DEBUG|  January|       7|
| INFO|    April|       2|
|DEBUG|    March|       3|
|DEBUG|    April|       3|
|DEBUG|   August|       4|
| INFO|     June|       2|
|DEBUG| December|       3|
| INFO|      May|       2|
|DEBUG| February|       3|
| INFO|   August|       4|
|DEBUG|     June|       4|
| INFO|  January|       2|
| WARN|     July|       1|
+-----+---------+--------+
only showing top 20 rows



In [184]:
# Stop the Spark session created at the top of the notebook.
spark.stop()