In [21]:
import findspark
findspark.init()

import pyspark
from pyspark import SparkContext, SQLContext, Row, SparkConf
from datetime import datetime

# Start Spark Context and sqlContext

In [8]:
# Configure a local-mode Spark application with a descriptive name.
conf = SparkConf().setAppName("nasa_data_analysis").setMaster("local")
# getOrCreate() keeps this cell re-runnable: constructing a second SparkContext
# in the same kernel raises an error, whereas getOrCreate() returns the context
# that is already running (in which case `conf` is not re-applied).
sc = SparkContext.getOrCreate(conf=conf)

In [9]:
# SQL entry point bound to the SparkContext created above.
# NOTE(review): in PySpark, `RDD.toDF` only becomes available once a
# SQLContext/SparkSession exists - presumably why this is created even though
# `sqlContext` is not referenced again in the visible cells; confirm.
sqlContext = SQLContext(sparkContext=sc)

# Load Data 

In [10]:
# Read the raw access log as an RDD of text lines (one element per log line).
# The path is relative to the notebook's working directory.
rdd_data = sc.textFile(name="../data/access_log_Aug95")

# Take the first element to see the structure of a line

In [12]:
# first() returns the first element directly (no one-element list to index)
# and raises a descriptive error if the RDD is empty.
sample = rdd_data.first()
sample

'in24.inetnebr.com - - [01/Aug/1995:00:00:01 -0400] "GET /shuttle/missions/sts-68/news/sts-68-mcc-05.txt HTTP/1.0" 200 1839'

# Define the function that maps a log line to a DataFrame row

In [16]:
def split_line(line):
    """Parse one access-log line into its individual fields.

    The expected layout is::

        host - - [dd/Mon/yyyy:HH:MM:SS +zzzz] "request" code bytes

    Args:
        line: A single raw line of the access log.

    Returns:
        list: ``[host, date, request, code, transfered_bytes]``. ``date`` is a
        timezone-aware ``datetime``; every other field is kept as a string.

    Raises:
        IndexError: If the line has no ``[...]`` timestamp section.
        ValueError: If the timestamp does not match the expected format, the
            line has no quoted request, or the text after the request does not
            consist of exactly two fields (status code and byte count).
    """
    # maxsplit=1 avoids splitting the whole line when only the head is needed.
    host = line.split(" ", 1)[0]
    timestamp = line.split("[", 1)[1].split("]", 1)[0]
    date = datetime.strptime(timestamp, "%d/%b/%Y:%H:%M:%S %z")

    # The request spans from the FIRST to the LAST double quote, so a URL that
    # itself contains a quote character is kept whole instead of being cut off.
    first_quote = line.find('"')
    last_quote = line.rfind('"')
    if first_quote == last_quote:
        # Covers both "no quote at all" and "only one quote".
        raise ValueError("malformed log line (no quoted request): %r" % line)
    request = line[first_quote + 1:last_quote]

    # Whitespace-agnostic split: tolerates repeated or trailing whitespace
    # around the status code and the byte count.
    code, transfered_bytes = line[last_quote + 1:].split()

    return [host, date, request, code, transfered_bytes]

#### References for converting a string to a datetime

https://stackoverflow.com/questions/466345/converting-string-into-datetime

https://docs.python.org/3/library/datetime.html#strftime-and-strptime-behavior

In [None]:
# Convert the RDD to a DataFrame

In [17]:
# Column names, in the same order as the fields returned by split_line.
header = [
    "host",
    "date",
    "request",
    "code",
    "transfered_bytes",
]

In [19]:
# Parse every raw log line into its list of fields.
maped = rdd_data.map(f=split_line)

In [22]:
# Build the DataFrame from the parsed rows; `header` supplies the column names
# and Spark infers the column types from the data.
df_data = maped.toDF(schema=header)

In [23]:
# Preview the first 10 rows (long values are truncated in the printed table).
df_data.show(n=10)

+--------------------+-------------------+--------------------+----+----------------+
|                host|               date|             request|code|transfered_bytes|
+--------------------+-------------------+--------------------+----+----------------+
|   in24.inetnebr.com|1995-08-01 01:00:01|GET /shuttle/miss...| 200|            1839|
|     uplherc.upl.com|1995-08-01 01:00:07|      GET / HTTP/1.0| 304|               0|
|     uplherc.upl.com|1995-08-01 01:00:08|GET /images/ksclo...| 304|               0|
|     uplherc.upl.com|1995-08-01 01:00:08|GET /images/MOSAI...| 304|               0|
|     uplherc.upl.com|1995-08-01 01:00:08|GET /images/USA-l...| 304|               0|
|ix-esc-ca2-07.ix....|1995-08-01 01:00:09|GET /images/launc...| 200|            1713|
|     uplherc.upl.com|1995-08-01 01:00:10|GET /images/WORLD...| 304|               0|
|slppp6.intermind.net|1995-08-01 01:00:10|GET /history/skyl...| 200|            1687|
|piweba4y.prodigy.com|1995-08-01 01:00:10|GET /images/

# Number of unique hosts 

In [25]:
# Count the distinct values of the `host` column.
(
    df_data
    .select("host")
    .distinct()
    .count()
)

75060

# Number of 404 errors

In [None]:
# Keep only the requests answered with HTTP status 404, then count them.
is_not_found = df_data["code"] == 404
df_404 = df_data.where(is_not_found)
df_404.count()

In [28]:
# Top 5 URLs with the most errors

# Stop Context 

In [6]:
# Shut down the SparkContext now that the analysis is finished.
sc.stop()