In [1]:
from pyspark.context import SparkContext
from pyspark.sql.context import SQLContext
from pyspark.sql.session import SparkSession
    
sc = SparkContext()
sqlContext = SQLContext(sc)
spark = SparkSession(sc)

In [66]:
import re
import pandas as pd
import numpy as np

### Basic regular expression

In [3]:
m = re.finditer(r'.*?(spark).*?', "I'm searching for a spark in PySpark", re.I)
for match in m:
    print(match, match.start(), match.end())

<re.Match object; span=(0, 25), match="I'm searching for a spark"> 0 25
<re.Match object; span=(25, 36), match=' in PySpark'> 25 36


## Download data
1) Download it if they do not exist locally.

In [12]:
path_to_data_dir = "./data/"
if not os.path.exists(path_to_data_dir):
    os.mkdir(path_to_data_dir)
    print("Created data directory")

if not os.path.exists("./data/NASA_access_log_Jul95.gz"):
    !wget ftp://ita.ee.lbl.gov/traces/NASA_access_log_Jul95.gz 
    !mv ./NASA_access_log_Jul95.gz ./data/

if not os.path.exists("./data/NASA_access_log_Aug95.gz"):
    !wget ftp://ita.ee.lbl.gov/traces/NASA_access_log_Aug95.gz 
    !mv ./NASA_access_log_Aug95.gz ./data/

if not os.path.exists("./data/clarknet_access_log_Aug28.gz"):
    !wget ftp://ita.ee.lbl.gov/traces/clarknet_access_log_Aug28.gz 
    !mv ./clarknet_access_log_Aug28.gz ./data/

if not os.path.exists("./data/clarknet_access_log_Sep4.gz"):
    !wget ftp://ita.ee.lbl.gov/traces/clarknet_access_log_Sep4.gz
    !mv ./clarknet_access_log_Sep4.gz ./data/

--2020-04-15 15:12:59--  ftp://ita.ee.lbl.gov/traces/clarknet_access_log_Aug28.gz
           => ‘clarknet_access_log_Aug28.gz’
Resolving ita.ee.lbl.gov (ita.ee.lbl.gov)... 131.243.2.164
Connecting to ita.ee.lbl.gov (ita.ee.lbl.gov)|131.243.2.164|:21... connected.
Logging in as anonymous ...
Login incorrect.
mv: rename ./clarknet_access_log_Aug28.gz to ./data/clarknet_access_log_Aug28.gz: No such file or directory
--2020-04-15 15:12:59--  ftp://ita.ee.lbl.gov/traces/clarknet_access_log_Sep4.gz
           => ‘clarknet_access_log_Sep4.gz’
Resolving ita.ee.lbl.gov (ita.ee.lbl.gov)... 131.243.2.164
Connecting to ita.ee.lbl.gov (ita.ee.lbl.gov)|131.243.2.164|:21... connected.
Logging in as anonymous ...
Login incorrect.
mv: rename ./clarknet_access_log_Sep4.gz to ./data/clarknet_access_log_Sep4.gz: No such file or directory


## Parse Nasa Log files

In [6]:
import glob
import os

raw_data_files = glob.glob('./data/*.gz')
raw_data_files

['./data/NASA_access_log_Jul95.gz', './data/NASA_access_log_Aug95.gz']

In [7]:
base_df = spark.read.text(raw_data_files)
base_df.printSchema()

root
 |-- value: string (nullable = true)



In [8]:
type(base_df)

pyspark.sql.dataframe.DataFrame

In [9]:
os.environ["SPARK_HOME"] = "/opt/spark-2.4.5-bin-hadoop2.7"
#os.environ["PYSPARK_PYTHON"]="/usr/local/bin/python3"
print(os.environ.get('SPARK_HOME'))
print(os.environ.get('PATH'))
base_df_rdd = base_df.rdd
type(base_df_rdd)

/opt/spark-2.4.5-bin-hadoop2.7
/Users/rojan/Kathmandu/CodeSpace/Github/SparkTutorial/pyspark_venv/bin:/usr/local/bin:/usr/bin:/Users/rojan/anaconda3/bin:/Users/rojan/anaconda3/condabin:/Users/rojan/.rvm/gems/ruby-2.6.1/bin:/Users/rojan/.rvm/gems/ruby-2.6.1@global/bin:/Users/rojan/.rvm/rubies/ruby-2.6.1/bin:/opt/spark-2.4.5-bin-hadoop2.7/bin:/Users/rojan/.gem/ruby/2.3.0/bin:/Users/rojan/Rojan/Work/Codes/nsSNP/scripts:/Users/rojan/Rojan/Work/AlbertEinstein/ProgramDevelopment/pphore:/Users/rojan/Rojan/Work/AlbertEinstein/ProgramDevelopment/pharmacophore2pdb:/Users/rojan/Rojan/Work/AlbertEinstein/ProgramDevelopment/compare_two_vectors_correl:/Users/rojan/Rojan/Work/AlbertEinstein/ProgramDevelopment/MDElapsedTime:/Users/rojan/Rojan/Work/AlbertEinstein/ProgramDevelopment/MDTrj:/usr/local/bin:/usr/bin:/bin:/usr/sbin:/sbin:/opt/X11/bin:/Users/rojan/Kathmandu/Work/DataScience/softwares/bowtie2-2.3.4.3-macos-x86_64:/usr/local/Cellar/gcc/8.2.0/bin:/Users/rojan/.rvm/bin:/Users/rojan/Kathmandu/Code

pyspark.rdd.RDD

In [10]:
base_df.show(2, truncate=False)

+-------------------------------------------------------------------------------------------------+
|value                                                                                            |
+-------------------------------------------------------------------------------------------------+
|199.72.81.55 - - [01/Jul/1995:00:00:01 -0400] "GET /history/apollo/ HTTP/1.0" 200 6245           |
|unicomp6.unicomp.net - - [01/Jul/1995:00:00:06 -0400] "GET /shuttle/countdown/ HTTP/1.0" 200 3985|
+-------------------------------------------------------------------------------------------------+
only showing top 2 rows



In [11]:
# !pip3 install findspark
# os.environ.get('SPARK_HOME')
import findspark
findspark.init()

In [12]:
base_df_rdd.take(2)

[Row(value='199.72.81.55 - - [01/Jul/1995:00:00:01 -0400] "GET /history/apollo/ HTTP/1.0" 200 6245'),
 Row(value='unicomp6.unicomp.net - - [01/Jul/1995:00:00:06 -0400] "GET /shuttle/countdown/ HTTP/1.0" 200 3985')]

### Data Wrangling

In [13]:
sample_logs = [item['value'] for item in base_df.take(2)]
sample_logs

['199.72.81.55 - - [01/Jul/1995:00:00:01 -0400] "GET /history/apollo/ HTTP/1.0" 200 6245',
 'unicomp6.unicomp.net - - [01/Jul/1995:00:00:06 -0400] "GET /shuttle/countdown/ HTTP/1.0" 200 3985']

## Extracting host names

In [14]:
host_pattern = r'(^\S+\.[\S+\.]+\S+)\s'
hosts = [re.search(host_pattern, item).group(1)
           if re.search(host_pattern, item)
           else 'no match'
           for item in sample_logs]
hosts

['199.72.81.55', 'unicomp6.unicomp.net']

## Extracting timestamps

In [15]:
ts_pattern = r'\[(\d{2}/\w{3}/\d{4}:\d{2}:\d{2}:\d{2} -\d{4})]'
timestamps = [re.search(ts_pattern, item).group(1) for item in sample_logs]
timestamps

['01/Jul/1995:00:00:01 -0400', '01/Jul/1995:00:00:06 -0400']

## Extracting HTTP Request Method, URIs and Protocol

In [16]:
method_uri_protocol_pattern = r'\"(\S+)\s(\S+)\s*(\S*)\"'
method_uri_protocol = [re.search(method_uri_protocol_pattern, item).groups()
               if re.search(method_uri_protocol_pattern, item)
               else 'no match'
              for item in sample_logs]
method_uri_protocol

[('GET', '/history/apollo/', 'HTTP/1.0'),
 ('GET', '/shuttle/countdown/', 'HTTP/1.0')]

## Extracting HTTP Status Codes

In [17]:
status_pattern = r'\s(\d{3})\s'
status = [re.search(status_pattern, item).group(1) for item in sample_logs]
status

['200', '200']

## Extracting HTTP Response Content Size

In [18]:
content_size_pattern = r'\s(\d+)$'
content_size = [re.search(content_size_pattern, item).group(1) for item in sample_logs]
print(content_size)

['6245', '3985']


In [19]:
from pyspark.sql.functions import regexp_extract

logs_df = base_df.select(regexp_extract('value', host_pattern, 1).alias('host'),
                         regexp_extract('value', ts_pattern, 1).alias('timestamp'),
                         regexp_extract('value', method_uri_protocol_pattern, 1).alias('method'),
                         regexp_extract('value', method_uri_protocol_pattern, 2).alias('endpoint'),
                         regexp_extract('value', method_uri_protocol_pattern, 3).alias('protocol'),
                         regexp_extract('value', status_pattern, 1).cast('integer').alias('status'),
                         regexp_extract('value', content_size_pattern, 1).cast('integer').alias('content_size'))
logs_df.show(2, truncate=True)
print((logs_df.count, len(logs_df.columns)))

+--------------------+--------------------+------+-------------------+--------+------+------------+
|                host|           timestamp|method|           endpoint|protocol|status|content_size|
+--------------------+--------------------+------+-------------------+--------+------+------------+
|        199.72.81.55|01/Jul/1995:00:00...|   GET|   /history/apollo/|HTTP/1.0|   200|        6245|
|unicomp6.unicomp.net|01/Jul/1995:00:00...|   GET|/shuttle/countdown/|HTTP/1.0|   200|        3985|
+--------------------+--------------------+------+-------------------+--------+------+------------+
only showing top 2 rows

(<bound method DataFrame.count of DataFrame[host: string, timestamp: string, method: string, endpoint: string, protocol: string, status: int, content_size: int]>, 7)


## Finding Missing Values

In [20]:
base_df.filter(base_df['value'].isNull())

DataFrame[value: string]

In [21]:
bad_rows_df = logs_df.filter(logs_df['host'].isNull()| 
                             logs_df['timestamp'].isNull() | 
                             logs_df['method'].isNull() |
                             logs_df['endpoint'].isNull() |
                             logs_df['status'].isNull() |
                             logs_df['content_size'].isNull()|
                             logs_df['protocol'].isNull())

In [22]:
bad_rows_df.show(2)

+--------------------+--------------------+------+--------------------+--------+------+------------+
|                host|           timestamp|method|            endpoint|protocol|status|content_size|
+--------------------+--------------------+------+--------------------+--------+------+------------+
|dd15-062.compuser...|01/Jul/1995:00:01...|   GET|/news/sci.space.s...|HTTP/1.0|   404|        null|
|     dynip42.efn.org|01/Jul/1995:00:02...|   GET|           /software|HTTP/1.0|   302|        null|
+--------------------+--------------------+------+--------------------+--------+------+------------+
only showing top 2 rows



In [23]:
bad_rows_df.columns

['host',
 'timestamp',
 'method',
 'endpoint',
 'protocol',
 'status',
 'content_size']

In [24]:
from pyspark.sql.functions import col
from pyspark.sql.functions import sum as spark_sum

def count_null(col_name):
    return spark_sum(col(col_name).isNull().cast('integer')).alias(col_name)

# Build up a list of column expressions, one per column.
exprs = [count_null(col_name) for col_name in logs_df.columns]

# Run the aggregation. The *exprs converts the list of expressions into
# variable function arguments.
logs_df.agg(*exprs).show()

+----+---------+------+--------+--------+------+------------+
|host|timestamp|method|endpoint|protocol|status|content_size|
+----+---------+------+--------+--------+------+------------+
|   0|        0|     0|       0|       0|     1|       33905|
+----+---------+------+--------+--------+------+------------+



In [25]:
null_status_df = base_df.filter(~base_df['value'].rlike(r'\s(\d{3})\s'))
null_status_df.show(1)

+--------+
|   value|
+--------+
|alyssa.p|
+--------+



In [26]:
bad_status_df = null_status_df.select(regexp_extract('value', host_pattern, 1).alias('host'),
                                      regexp_extract('value', ts_pattern, 1).alias('timestamp'),
                                      regexp_extract('value', method_uri_protocol_pattern, 1).alias('method'),
                                      regexp_extract('value', method_uri_protocol_pattern, 2).alias('endpoint'),
                                      regexp_extract('value', method_uri_protocol_pattern, 3).alias('protocol'),
                                      regexp_extract('value', status_pattern, 1).cast('integer').alias('status'),
                                      regexp_extract('value', content_size_pattern, 1).cast('integer').alias('content_size'))
bad_status_df.show(truncate=False)

+----+---------+------+--------+--------+------+------------+
|host|timestamp|method|endpoint|protocol|status|content_size|
+----+---------+------+--------+--------+------+------------+
|    |         |      |        |        |null  |null        |
+----+---------+------+--------+--------+------+------------+



In [27]:
bad_status_df.show(2)

+----+---------+------+--------+--------+------+------------+
|host|timestamp|method|endpoint|protocol|status|content_size|
+----+---------+------+--------+--------+------+------------+
|    |         |      |        |        |  null|        null|
+----+---------+------+--------+--------+------+------------+



In [28]:
logs_df.show(2)


+--------------------+--------------------+------+-------------------+--------+------+------------+
|                host|           timestamp|method|           endpoint|protocol|status|content_size|
+--------------------+--------------------+------+-------------------+--------+------+------------+
|        199.72.81.55|01/Jul/1995:00:00...|   GET|   /history/apollo/|HTTP/1.0|   200|        6245|
|unicomp6.unicomp.net|01/Jul/1995:00:00...|   GET|/shuttle/countdown/|HTTP/1.0|   200|        3985|
+--------------------+--------------------+------+-------------------+--------+------+------------+
only showing top 2 rows



In [29]:
logs_df = logs_df[logs_df['status'].isNotNull()]
logs_df.show(10)

+--------------------+--------------------+------+--------------------+--------+------+------------+
|                host|           timestamp|method|            endpoint|protocol|status|content_size|
+--------------------+--------------------+------+--------------------+--------+------+------------+
|        199.72.81.55|01/Jul/1995:00:00...|   GET|    /history/apollo/|HTTP/1.0|   200|        6245|
|unicomp6.unicomp.net|01/Jul/1995:00:00...|   GET| /shuttle/countdown/|HTTP/1.0|   200|        3985|
|      199.120.110.21|01/Jul/1995:00:00...|   GET|/shuttle/missions...|HTTP/1.0|   200|        4085|
|  burger.letters.com|01/Jul/1995:00:00...|   GET|/shuttle/countdow...|HTTP/1.0|   304|           0|
|      199.120.110.21|01/Jul/1995:00:00...|   GET|/shuttle/missions...|HTTP/1.0|   200|        4179|
|  burger.letters.com|01/Jul/1995:00:00...|   GET|/images/NASA-logo...|HTTP/1.0|   304|           0|
|  burger.letters.com|01/Jul/1995:00:00...|   GET|/shuttle/countdow...|HTTP/1.0|   200|    

In [30]:
exprs = [count_null(col_name) for col_name in logs_df.columns]
logs_df.agg(*exprs).show(2)

+----+---------+------+--------+--------+------+------------+
|host|timestamp|method|endpoint|protocol|status|content_size|
+----+---------+------+--------+--------+------+------------+
|   0|        0|     0|       0|       0|     0|       33904|
+----+---------+------+--------+--------+------+------------+



## Handling nulls in HTTP content size

In [31]:
null_content_size_df = base_df.filter(~base_df['value'].rlike(r'\s\d+$'))
null_content_size_df.show(1, truncate=True)

+--------------------+
|               value|
+--------------------+
|dd15-062.compuser...|
+--------------------+
only showing top 1 row



In [32]:
null_content_size_df.take(2)

[Row(value='dd15-062.compuserve.com - - [01/Jul/1995:00:01:12 -0400] "GET /news/sci.space.shuttle/archive/sci-space-shuttle-22-apr-1995-40.txt HTTP/1.0" 404 -'),
 Row(value='dynip42.efn.org - - [01/Jul/1995:00:02:14 -0400] "GET /software HTTP/1.0" 302 -')]

In [33]:
logs_df = logs_df.na.fill({'content_size': 0})

In [34]:
exprs = [count_null(col_name) for col_name in logs_df.columns]
logs_df.agg(*exprs).show()

+----+---------+------+--------+--------+------+------------+
|host|timestamp|method|endpoint|protocol|status|content_size|
+----+---------+------+--------+--------+------+------------+
|   0|        0|     0|       0|       0|     0|           0|
+----+---------+------+--------+--------+------+------------+



## Handling Temporal Fields (Timestamp)

In [35]:
from pyspark.sql.functions import udf
from pyspark.sql.types import TimestampType
from datetime import datetime

month_str_ints = {'Jan': 1, 'Feb': 2, 'Mar':3, 'Apr':4 , 'May': 5, 'Jun':6, 
             'Jul': 7, 'Aug': 8, 'Sep':9, 'Oct':10, 'Nov':11, 'Dec':12 }

def parse_clf_time(text):
    print(text)
    # NOTE: We're ignoring the time zones here, might need to be handled depending on the problem you are solving

    # datetime.strptime(datetime_str, '%m/%d/%y %H:%M:%S')
    
    a = "{0:04d}/{1:02d}/{2:02d} {3:02d}:{4:02d}:{5:02d}".format(
      int(text[7:11]),
      month_str_ints[text[3:6]],
      int(text[0:2]),
      int(text[12:14]),
      int(text[15:17]),
      int(text[18:20]))
    return a
  

In [36]:
logs_df.printSchema()

root
 |-- host: string (nullable = true)
 |-- timestamp: string (nullable = true)
 |-- method: string (nullable = true)
 |-- endpoint: string (nullable = true)
 |-- protocol: string (nullable = true)
 |-- status: integer (nullable = true)
 |-- content_size: integer (nullable = false)



In [37]:
sample_ts = [item['host'] for item in logs_df.select('host').take(2)]
sample_ts

['199.72.81.55', 'unicomp6.unicomp.net']

In [38]:
sample_ts = [item['timestamp'] for item in logs_df.select('timestamp').take(2)]
sample_ts

['01/Jul/1995:00:00:01 -0400', '01/Jul/1995:00:00:06 -0400']

In [39]:
d1 = logs_df

In [40]:
[parse_clf_time(item) for item in sample_ts]

01/Jul/1995:00:00:01 -0400
01/Jul/1995:00:00:06 -0400


['1995/07/01 00:00:01', '1995/07/01 00:00:06']

In [41]:
logs_df.show(2)

+--------------------+--------------------+------+-------------------+--------+------+------------+
|                host|           timestamp|method|           endpoint|protocol|status|content_size|
+--------------------+--------------------+------+-------------------+--------+------+------------+
|        199.72.81.55|01/Jul/1995:00:00...|   GET|   /history/apollo/|HTTP/1.0|   200|        6245|
|unicomp6.unicomp.net|01/Jul/1995:00:00...|   GET|/shuttle/countdown/|HTTP/1.0|   200|        3985|
+--------------------+--------------------+------+-------------------+--------+------+------------+
only showing top 2 rows



In [42]:
logs_df.printSchema()

root
 |-- host: string (nullable = true)
 |-- timestamp: string (nullable = true)
 |-- method: string (nullable = true)
 |-- endpoint: string (nullable = true)
 |-- protocol: string (nullable = true)
 |-- status: integer (nullable = true)
 |-- content_size: integer (nullable = false)



In [48]:
from pyspark.sql.functions import unix_timestamp
udf_parse_time = udf(parse_clf_time)
a = logs_df.select('*', unix_timestamp(udf_parse_time(logs_df['timestamp']), "yyyy/MM/dd HH:mm:ss")
                                .cast(TimestampType())
                                .alias('time')).drop('timestamp')

In [49]:
a.show()

+--------------------+------+--------------------+--------+------+------------+-------------------+
|                host|method|            endpoint|protocol|status|content_size|               time|
+--------------------+------+--------------------+--------+------+------------+-------------------+
|        199.72.81.55|   GET|    /history/apollo/|HTTP/1.0|   200|        6245|1995-07-01 00:00:01|
|unicomp6.unicomp.net|   GET| /shuttle/countdown/|HTTP/1.0|   200|        3985|1995-07-01 00:00:06|
|      199.120.110.21|   GET|/shuttle/missions...|HTTP/1.0|   200|        4085|1995-07-01 00:00:09|
|  burger.letters.com|   GET|/shuttle/countdow...|HTTP/1.0|   304|           0|1995-07-01 00:00:11|
|      199.120.110.21|   GET|/shuttle/missions...|HTTP/1.0|   200|        4179|1995-07-01 00:00:11|
|  burger.letters.com|   GET|/images/NASA-logo...|HTTP/1.0|   304|           0|1995-07-01 00:00:12|
|  burger.letters.com|   GET|/shuttle/countdow...|HTTP/1.0|   200|           0|1995-07-01 00:00:12|


In [50]:
logs_df.show()

+--------------------+--------------------+------+--------------------+--------+------+------------+
|                host|           timestamp|method|            endpoint|protocol|status|content_size|
+--------------------+--------------------+------+--------------------+--------+------+------------+
|        199.72.81.55|01/Jul/1995:00:00...|   GET|    /history/apollo/|HTTP/1.0|   200|        6245|
|unicomp6.unicomp.net|01/Jul/1995:00:00...|   GET| /shuttle/countdown/|HTTP/1.0|   200|        3985|
|      199.120.110.21|01/Jul/1995:00:00...|   GET|/shuttle/missions...|HTTP/1.0|   200|        4085|
|  burger.letters.com|01/Jul/1995:00:00...|   GET|/shuttle/countdow...|HTTP/1.0|   304|           0|
|      199.120.110.21|01/Jul/1995:00:00...|   GET|/shuttle/missions...|HTTP/1.0|   200|        4179|
|  burger.letters.com|01/Jul/1995:00:00...|   GET|/images/NASA-logo...|HTTP/1.0|   304|           0|
|  burger.letters.com|01/Jul/1995:00:00...|   GET|/shuttle/countdow...|HTTP/1.0|   200|    

In [51]:
a.printSchema()

root
 |-- host: string (nullable = true)
 |-- method: string (nullable = true)
 |-- endpoint: string (nullable = true)
 |-- protocol: string (nullable = true)
 |-- status: integer (nullable = true)
 |-- content_size: integer (nullable = false)
 |-- time: timestamp (nullable = true)



In [52]:
a.show()

+--------------------+------+--------------------+--------+------+------------+-------------------+
|                host|method|            endpoint|protocol|status|content_size|               time|
+--------------------+------+--------------------+--------+------+------------+-------------------+
|        199.72.81.55|   GET|    /history/apollo/|HTTP/1.0|   200|        6245|1995-07-01 00:00:01|
|unicomp6.unicomp.net|   GET| /shuttle/countdown/|HTTP/1.0|   200|        3985|1995-07-01 00:00:06|
|      199.120.110.21|   GET|/shuttle/missions...|HTTP/1.0|   200|        4085|1995-07-01 00:00:09|
|  burger.letters.com|   GET|/shuttle/countdow...|HTTP/1.0|   304|           0|1995-07-01 00:00:11|
|      199.120.110.21|   GET|/shuttle/missions...|HTTP/1.0|   200|        4179|1995-07-01 00:00:11|
|  burger.letters.com|   GET|/images/NASA-logo...|HTTP/1.0|   304|           0|1995-07-01 00:00:12|
|  burger.letters.com|   GET|/shuttle/countdow...|HTTP/1.0|   200|           0|1995-07-01 00:00:12|


# Explortory Data Analysis (EDA)

## HTTP Status
First, unique HTTP status is determined and then find their distribution.



In [53]:
df_status_freq = (logs_df
                     .groupBy('status')
                     .count()
                     .sort('status')
                     .cache())

In [67]:
df_pd_status_freq = (df_status_freq
                         .toPandas()
                         .sort_values(by=['count'], ascending=False))

df_pd_status_freq['percent'] = df_pd_status_freq['count']/sum(df_pd_status_freq['count'])
df_pd_status_freq['log(count)'] = np.log(df_pd_status_freq['count'])
df_pd_status_freq


Unnamed: 0,status,count,percent,log(count)
0,200,3100524,0.895688,14.947082
2,304,266773,0.077066,12.494153
1,302,73070,0.021109,11.199173
5,404,20899,0.006037,9.947457
4,403,225,6.5e-05,5.4161
6,500,65,1.9e-05,4.174387
7,501,41,1.2e-05,3.713572
3,400,15,4e-06,2.70805


In [65]:
df_pd_status_freq.head()

Unnamed: 0,status,count,percent
0,200,3100524,0.895688
2,304,266773,0.077066
1,302,73070,0.021109
5,404,20899,0.006037
4,403,225,6.5e-05


## Frequent host

Identify the top ten host and also compute their proportion.

In [69]:
logs_df.show(2)

+--------------------+--------------------+------+-------------------+--------+------+------------+
|                host|           timestamp|method|           endpoint|protocol|status|content_size|
+--------------------+--------------------+------+-------------------+--------+------+------------+
|        199.72.81.55|01/Jul/1995:00:00...|   GET|   /history/apollo/|HTTP/1.0|   200|        6245|
|unicomp6.unicomp.net|01/Jul/1995:00:00...|   GET|/shuttle/countdown/|HTTP/1.0|   200|        3985|
+--------------------+--------------------+------+-------------------+--------+------+------------+
only showing top 2 rows



In [85]:
df_host_freq = (logs_df
                     .groupBy('host')
                     .count()
                     .cache())
df_pd_host_freq = (df_host_freq
                        .toPandas()
                        .sort_values(by=['count'], ascending=False))
df_pd_host_freq["Percentage"] = df_pd_host_freq["count"] / sum(df_pd_host_freq["count"])
df_pd_host_freq["Log(count)"] = np.log(df_pd_host_freq["count"])

In [86]:
df_pd_host_freq.head(3)

Unnamed: 0,host,count,Percentage,Log(count)
116548,piweba3y.prodigy.com,21988,0.006352,9.998252
124250,piweba4y.prodigy.com,16437,0.004748,9.70729
40617,piweba1y.prodigy.com,12825,0.003705,9.459152


In [91]:
mean    = np.mean(df_pd_host_freq['count'])
std     = np.std(df_pd_host_freq['count'])
df_pd_host_freq['zscore'] = (df_pd_host_freq['count'] - mean)/std
df_pd_host_freq.head()

Unnamed: 0,host,count,Percentage,Log(count),zscore
116548,piweba3y.prodigy.com,21988,0.006352,9.998252,144.486587
124250,piweba4y.prodigy.com,16437,0.004748,9.70729,107.968417
40617,piweba1y.prodigy.com,12825,0.003705,9.459152,84.206279
87525,edams.ksc.nasa.gov,11964,0.003456,9.389657,78.542048
52461,163.206.89.4,9697,0.002801,9.179572,63.628215


## What method is most frequently used?

In [100]:
df_method = (logs_df
                     .groupBy('method')
                     .count()
                     .cache())
df_pd_method = (df_method
                        .toPandas()
                        .sort_values(by=['count'], ascending=False))
df_pd_method["Percentage"] = 100.00 * (df_pd_method["count"] / sum(df_pd_method["count"]))
df_pd_method["Log(count)"] = np.log(df_pd_method["count"])

In [101]:
df_pd_method.head()

Unnamed: 0,method,count,Percentage,Log(count)
3,GET,3451720,99.714237,15.054383
2,HEAD,7915,0.228651,8.976515
4,,1753,0.050641,7.469084
0,POST,222,0.006413,5.402677
1,�|t�9ð'À|u,2,5.8e-05,0.693147


Most popular method is GET followed by HEAD. However, 99% of times GET method is used. Superisingly, POST method is almost never used.  

In [102]:
df_protocol = (logs_df
                     .groupBy('protocol')
                     .count()
                     .cache())
df_protocol = (df_protocol
                        .toPandas()
                        .sort_values(by=['count'], ascending=False))
df_protocol["Percentage"] = 100.00 * (df_protocol["count"] / sum(df_protocol["count"]))
df_protocol["Log(count)"] = np.log(df_protocol["count"])

In [103]:
df_protocol.head()

Unnamed: 0,protocol,count,Percentage,Log(count)
4,HTTP/1.0,3454716,99.800786,15.055251
2,,6599,0.190634,8.794673
3,HTTP/V1.0,279,0.00806,5.631212
0,HTTP/*,13,0.000376,2.564949
5,STS-69</a><p>,4,0.000116,1.386294
