In [1]:
import findspark
# findspark.init() runs before the pyspark import below; presumably it is what
# makes the local Spark installation importable in this kernel.
findspark.init()
from pyspark.sql import SparkSession

# Build (or reuse) the session for this notebook. The spark-nlp package is pulled
# in through spark.jars.packages when the session is created.
spark = (
    SparkSession.builder
    .appName("spark-nlp")
    .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.11:2.4.5")
    .getOrCreate()
)

# Handle on the SparkContext that backs the session.
sc = spark.sparkContext

In [2]:
from pyspark.sql.functions import col, countDistinct, desc, year, month, dayofmonth, hour, dayofweek, to_date, to_timestamp
import datetime
from pyspark.sql.types import DateType

In [3]:
# Load every CSV file under the InfOpEnglish prefix into one DataFrame.
# No header or schema options are passed, so the columns get Spark's default
# names (_c0, _c1, ...); the first few rows are shown in the next cell to
# confirm the format.
INFO_OPS_CSV_GLOB = "s3://502finalprojbucky/InfOpEnglish/*.csv"
infoDF = spark.read.csv(INFO_OPS_CSV_GLOB)

In [4]:
# Preview the first five rows to check the layout of the loaded columns.
infoDF.show(5)

+---+---+---+--------------------+----------------+
|_c0|_c1|_c2|                 _c3|             _c4|
+---+---+---+--------------------+----------------+
| AL| en| en|Pain is nothing c...|2014-12-15 17:18|
| AL| en| en|HOW CAN YOU NOT LOVE|2014-12-23 13:36|
| AL| en| en|RT @TheMinks: Who...|2016-11-30 16:43|
| AL| en| en|RT @f46e654ff3f1f...|2016-09-19 09:49|
| AL| en| en|RT @lesmarie99017...|2016-12-06 07:24|
+---+---+---+--------------------+----------------+
only showing top 5 rows



In [5]:
# Parse the "yyyy-MM-dd HH:mm" strings in _c4 into a timestamp column (dt_stamp)
# for the date-based summaries below. to_timestamp() already returns a timestamp
# column, so no additional .cast("timestamp") is needed.
# NOTE(review): values that do not match the pattern are expected to come back as
# null rather than raise -- confirm for the Spark version in use.
infoDF = infoDF.withColumn('dt_stamp', to_timestamp(col('_c4'), "yyyy-MM-dd HH:mm"))
infoDF.show(5)

+---+---+---+--------------------+----------------+-------------------+
|_c0|_c1|_c2|                 _c3|             _c4|           dt_stamp|
+---+---+---+--------------------+----------------+-------------------+
| AL| en| en|Pain is nothing c...|2014-12-15 17:18|2014-12-15 17:18:00|
| AL| en| en|HOW CAN YOU NOT LOVE|2014-12-23 13:36|2014-12-23 13:36:00|
| AL| en| en|RT @TheMinks: Who...|2016-11-30 16:43|2016-11-30 16:43:00|
| AL| en| en|RT @f46e654ff3f1f...|2016-09-19 09:49|2016-09-19 09:49:00|
| AL| en| en|RT @lesmarie99017...|2016-12-06 07:24|2016-12-06 07:24:00|
+---+---+---+--------------------+----------------+-------------------+
only showing top 5 rows



In [6]:
# Keep only the rows whose dt_stamp is not null (rows, not columns, are dropped),
# then count how many rows remain for the summary statistics.
infoDF = infoDF.where(col("dt_stamp").isNotNull())
infoDF.count()

8536158

In [7]:
# Summary statistics by calendar date: number of rows per date, busiest dates first.
# dt_stamp is already a timestamp column, so to_date() is called without a format
# string (a format only matters when parsing strings), and the grouping is done
# directly on the derived date instead of selecting an unused dt_stamp column.
dateCounts = (
    infoDF
    .groupBy(to_date("dt_stamp").alias("date"))
    .count()
    .orderBy(desc("count"))
)
dateCounts.show(5)

+----------+-----+
|      date|count|
+----------+-----+
|2015-03-18|32066|
|2017-01-30|31241|
|2017-01-27|30028|
|2017-01-31|27414|
|2017-01-26|26415|
+----------+-----+
only showing top 5 rows



In [9]:
# Write the per-date counts as CSV from a single partition (one part file inside
# the "infoOpsDateCounts2.csv" output directory).
# - repartition(1) shuffles the rows, so the ordering from the earlier orderBy is
#   not guaranteed to survive; sortWithinPartitions re-applies it inside the single
#   partition so the file is written sorted by count, descending.
# - mode="overwrite" replaces any previous output at this path, so re-running the
#   notebook does not fail because the path already exists.
(
    dateCounts
    .repartition(1)
    .sortWithinPartitions(desc("count"))
    .write.csv("infoOpsDateCounts2.csv", sep=',', mode="overwrite")
)

In [11]:
# Summary statistics by day of the week: number of rows per weekday, busiest first.
# dayofweek() yields an integer per row; per the Spark docs it runs from
# 1 (Sunday) through 7 (Saturday).
dow_expr = dayofweek("dt_stamp").alias("dow")
dowCounts = (
    infoDF
    .groupBy(dow_expr)
    .count()
    .orderBy(desc("count"))
)
dowCounts.show(5)

+---+-------+
|dow|  count|
+---+-------+
|  4|1354785|
|  2|1286553|
|  3|1244059|
|  5|1238643|
|  6|1227444|
+---+-------+
only showing top 5 rows



In [12]:
# Write the per-weekday counts as CSV from a single partition (one part file
# inside the "infoOpsDOWCounts2.csv" output directory).
# - repartition(1) shuffles the rows, so the ordering from the earlier orderBy is
#   not guaranteed to survive; sortWithinPartitions re-applies it inside the single
#   partition so the file is written sorted by count, descending.
# - mode="overwrite" replaces any previous output at this path, so re-running the
#   notebook does not fail because the path already exists.
(
    dowCounts
    .repartition(1)
    .sortWithinPartitions(desc("count"))
    .write.csv("infoOpsDOWCounts2.csv", sep=',', mode="overwrite")
)

In [33]:
# Shut down the Spark session now that both summaries have been written.
# Per the PySpark docs, this also stops the underlying SparkContext.
spark.stop()

In [34]:
# sc was taken from spark.sparkContext, so it is the same context the session
# wraps. NOTE(review): if spark.stop() has already run, this call should be a
# redundant no-op -- confirm and consider dropping this cell.
sc.stop()