In [151]:
from pyspark.mllib.linalg import DenseVector
from pyspark.mllib.random import RandomRDDs

from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import col,udf,when
from pyspark.sql import Window

from pyspark.sql.types import StringType,IntegerType

from urllib.parse import urlparse
import gc
import os
import pandas as pd
import numpy as np

from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

spark.conf.set("spark.sql.caseSensitive", "true")

In [10]:
df = spark.read.json('/Users/amirrahmani/Downloads/7park/2016/09/01/00/US/CR/*')
df.registerTempTable("df")
df.printSchema()


root
 |-- request: struct (nullable = true)
 |    |-- documentReferer: string (nullable = true)
 |    |-- error: string (nullable = true)
 |    |-- eventId: string (nullable = true)
 |    |-- frameId: long (nullable = true)
 |    |-- fromCache: boolean (nullable = true)
 |    |-- ip: string (nullable = true)
 |    |-- is_online: boolean (nullable = true)
 |    |-- mainFrameRequestId: string (nullable = true)
 |    |-- method: string (nullable = true)
 |    |-- navigationId: long (nullable = true)
 |    |-- openerTabId: long (nullable = true)
 |    |-- parentFrameId: long (nullable = true)
 |    |-- redirectUrl: string (nullable = true)
 |    |-- requestHeaders: struct (nullable = true)
 |    |    |-- Accept: string (nullable = true)
 |    |    |-- Accept-Encoding: string (nullable = true)
 |    |    |-- Accept-Language: string (nullable = true)
 |    |    |-- Avail-Dictionary: string (nullable = true)
 |    |    |-- Content-Type: string (nullable = true)
 |    |    |-- DNT: string (nul

In [310]:
dg = sqlContext.sql("""
    SELECT request.*,request.requestHeaders.*,request.responseHeaders.*,server_request.*,server_request.user_map.*,
    request.requestHeaders.`Content-Type` as requestHeaders_content_type,request.responseHeaders.`Content-Type` as responseHeader_content_type,
    request.requestHeaders.Origin as requestHeaders_Origin,request.responseHeaders.Origin as responseHeader_Origin,
    request.requestHeaders.Referer as requestHeaders_Referer,request.responseHeaders.Referer as responseHeader_Referer,
    request.requestHeaders.`User-Agent` as requestHeaders_User_Agent,request.responseHeaders.`User-Agent` as responseHeader_User_Agent
    FROM df
""").drop('requestHeaders'
         ).drop('responseHeaders'
               ).drop('server_request'
                     ).drop('user_map'
                           ).drop('Content-Type'
                                 ).drop('Origin'
                                       ).drop('Referer'
                                             ).drop('User-Agent'
                                                   )


print('Number of rows: {:d} \nNumber of columns: {:d}'.format(dg.count(),len(dg.columns)))

Number of rows: 12130 
Number of columns: 586


### Download the data base from http://geolite.maxmind.com/download/geoip/database/GeoLite2-City.tar.gz

In [311]:
def ip_to_zip(ip):
    from geolite2 import geolite2
    if ip == None: return None
    reader = geolite2.reader()
    ip_location = reader.get(ip)
    if ip_location and 'postal' in ip_location.keys():
        try:
            return int(ip_location['postal']['code'])
        except:
            return None
    else:
        return None
    

In [312]:
ip_to_zip_udf = udf(ip_to_zip,IntegerType())
dg = dg.withColumn('zipcode',when(dg.ip.isNotNull(),ip_to_zip_udf(dg.ip)).otherwise(dg.ip))


In [345]:
x = 'requestId'
print(dg.filter((col(x).isNull()) ).count()
#       ,dg.filter((col(x)=='POST')).count()
      ,dg.select(x).distinct().count()
     )


dg.select(x).distinct().show()
dg.select(x).show(20,False)

11316 722
+---------+
|requestId|
+---------+
|   154770|
|   155038|
|   144656|
|     4032|
|   151794|
|   149803|
|   155978|
|   151757|
|   153905|
|   152798|
|   154726|
|   145031|
|   146640|
|   146829|
|   152779|
|   144636|
|   147599|
|   156289|
|   155721|
|   151838|
+---------+
only showing top 20 rows

+---------+
|requestId|
+---------+
|null     |
|null     |
|null     |
|null     |
|null     |
|null     |
|null     |
|null     |
|null     |
|null     |
|null     |
|null     |
|null     |
|null     |
|null     |
|null     |
|null     |
|null     |
|null     |
|null     |
+---------+
only showing top 20 rows



In [313]:
for x,y in dg.dtypes:
    print(x,y)

documentReferer string
error string
eventId string
frameId bigint
fromCache boolean
ip string
is_online boolean
mainFrameRequestId string
method string
navigationId bigint
openerTabId bigint
parentFrameId bigint
redirectUrl string
requestId string
requestType string
statusCode bigint
statusLine string
tabId bigint
timeStamp double
type string
url string
windowName string
windowTitle string
Accept string
Accept-Encoding string
Accept-Language string
Avail-Dictionary string
DNT string
Save-Data string
Upgrade-Insecure-Requests string
X-Chrome-Connected string
X-Chrome-UMA-Enabled string
X-Client-Data string
X-Requested-With string
X-Rlz-String string
YouTube-Safety-Mode string
AMServer string
ANServer string
Accept-Ranges string
Access-Control-Allow-Credentials string
Access-Control-Allow-Headers string
Access-Control-Allow-Methods string
Access-Control-Allow-Origin string
Access-Control-Expose-Headers string
Access-Control-Max-Age string
Access-Control-Request-Headers string
Age string


puma string
tiger string
requestHeaders_content_type string
responseHeader_content_type string
requestHeaders_Origin string
responseHeader_Origin string
requestHeaders_Referer string
responseHeader_Referer string
requestHeaders_User_Agent string
responseHeader_User_Agent string
zipcode string


In [None]:


def url_parser(url):
    return urlparse(url).hostname

url_parser_udf = udf(url_parser,StringType())
dg = dg.withColumn('url',url_parser_udf(dg.documentReferer))
dg = dg.withColumn('url_redirect',url_parser_udf(dg.redirectUrl))
dg = dg.withColumn('url_redirect',url_parser_udf(dg.redirectUrl))
dg.withColumn('url_match',when(df.url==df.url_redirect,1).otherwise(0))

In [44]:
w = Window.partitionBy('ip')
dg = dg.withColumn('ip_count',F.count(dg.ip).over(w))

In [15]:
c = dg.count()
null_drop = []
distinct_high = []
col_number = 0
for X,y in dg.dtypes:
    col_number += 1
    x = '`' + X + '`'
    null_count = dg.filter((col(x).isNull()) | (col(x) == '')).count()
    if y == 'string':
        distinct_count = dg.select(x).distinct().count()
        if distinct_count > 100:
            distinct_high.append(x)
    else:
        distinct_count = 0
    if null_count*1./c > 0.2:
        null_drop.append(x)
    print('{:3d} {:40s} {:8d} {:6.2f}%, {:8d}'.format(col_number,X,null_count,null_count*100./c,distinct_count))

documentReferer                              3767 (31.06%),      756
error                                       12082 (99.60%),        2
eventId                                       864 ( 7.12%),    10951
frameId                                        50 ( 0.41%),        0
fromCache                                      50 ( 0.41%),        0
ip                                            613 ( 5.05%),     1738
is_online                                   12082 (99.60%),        0
mainFrameRequestId                          12080 (99.59%),        5
method                                         50 ( 0.41%),        3
navigationId                                  914 ( 7.54%),        0
openerTabId                                 12118 (99.90%),        0
parentFrameId                                  50 ( 0.41%),        0
redirectUrl                                 10475 (86.36%),     1094
requestId                                   11316 (93.29%),      722
requestType                       

Rtss                                        12125 (99.96%),        4
SERVERID                                    12115 (99.88%),        4
SET-COOKIE                                  12096 (99.72%),       35
SPIisLatency                                12128 (99.98%),        2
SPRequestDuration                           12128 (99.98%),        3
SPRequestGuid                               12128 (99.98%),        3
Server                                       6041 (49.80%),      178
Set-cookie                                  12116 (99.88%),       10
Smug-CDN                                    12127 (99.98%),        3
Status                                      12103 (99.78%),        4
Strict-Transport-Security                   11796 (97.25%),       27
Surrogate-Key                               12082 (99.60%),        2
Timing-Allow-Origin                         12020 (99.09%),        3
Transfer-Encoding                           10772 (88.80%),        2
Transfer-encoding                 

X-GateIds                                   12128 (99.98%),        2
X-Goblin-Server                             12129 (99.99%),        2
X-HTML-Minification-Powered-By              12129 (99.99%),        2
X-HW                                        12059 (99.41%),       22
X-Hit                                       12126 (99.97%),        2
X-Host                                      12080 (99.59%),        4
X-Hostname                                  12127 (99.98%),        4
X-Identity                                  12129 (99.99%),        2
X-Iinfo                                     12126 (99.97%),        5
X-Instart-CacheKeyMod                       12119 (99.91%),        2
X-Instart-Request-ID                        12118 (99.90%),       13
X-Instart-Streaming                         12119 (99.91%),        4
X-Key                                       12128 (99.98%),        3
X-Language                                  12128 (99.98%),        2
X-Lzd-App-Name                    

X-Unbounce-PageId                           12129 (99.99%),        2
X-Unbounce-Variant                          12129 (99.99%),        2
X-Unbounce-VisitorID                        12129 (99.99%),        2
X-Upstream                                  12129 (99.99%),        2
X-User-Agent                                12127 (99.98%),        2
X-UserSessionId                             12124 (99.95%),        7
X-VIA-AKAMAI                                12121 (99.93%),        2
X-VServer                                   12126 (99.97%),        5
X-Varnish                                   12078 (99.57%),       52
X-Varnish-Cache                             12122 (99.93%),        4
X-Varnish-Cache-Hits                        12127 (99.98%),        2
X-Varnish-Grace                             12127 (99.98%),        2
X-Varnish-RemainingGrace                    12127 (99.98%),        2
X-Varnish-RemainingTTL                      12127 (99.98%),        2
X-Varnish-Seen-By                 

x-analytics                                 12100 (99.75%),       20
x-apm-hostname                              12129 (99.99%),        2
x-aspnet-version                            12129 (99.99%),        2
x-atlas                                     12129 (99.99%),        2
x-auto-login                                12124 (99.95%),        4
x-blog-page                                 12129 (99.99%),        2
x-buzzfeed                                  12127 (99.98%),        4
x-buzzfeed-debug                            12127 (99.98%),        3
x-cache                                     12040 (99.26%),       33
x-cache-hits                                12127 (99.98%),        2
x-cache-status                              12100 (99.75%),        3
x-cached                                    12128 (99.98%),        2
x-ccc                                       12129 (99.99%),        2
x-chromium-appcache-fallback-override       12128 (99.98%),        2
x-cid                             

In [267]:
keep_cols = []
for x in dg.columns:
    x = '`' + x + '`'
    if x not in null_drop:
        keep_cols.append(x)
        
dg = dg.select(keep_cols)

In [268]:
dg.columns

['eventId',
 'frameId',
 'fromCache',
 'ip',
 'method',
 'navigationId',
 'parentFrameId',
 'requestType',
 'statusCode',
 'statusLine',
 'tabId',
 'timeStamp',
 'type',
 'url',
 'windowName',
 'windowTitle',
 'Accept',
 'Accept-Encoding',
 'Accept-Language',
 'Upgrade-Insecure-Requests',
 'accept_language',
 'country_code',
 'partner_id',
 'request_unixtime',
 'software_id',
 'user_agent',
 'user_guid',
 'x_forwarded_for',
 'requestHeaders_Referer',
 'requestHeaders_User_Agent']

In [57]:
from pyspark.sql import types as t

dg = dg.withColumn("TimestampType",F.to_timestamp(dg["request_unixtime"]))

In [58]:
def rdd_map_t_row(one_row,model_lr):

    # one_row is of type lines type=<class 'pyspark.sql.types.Row'>
    row_dict = one_row.asDict()
    pd_df = pd.DataFrame([row_dict]) # has to be a list object
    print("ONE:ROW:now with PANDAS DF={}".format(pd_df))
    try:
        y_pred = model_lr.predict(pd_df)
        print("ONE:ROW:PD_DF_Y_PRED={}".format(y_pred))
    except:
        print("error:MODEL:PREDICT")

    ret_row = one_row # access the elements like one_row.col1, one_row.col2  where col1 is the name of the column
#     try:
#         # cannto do spark.createDataFrame(pd_df)  because we have to return the rdd.row object
#         ret_pd_dict = pd_df.to_dict(orient='records')
#         print("ONE:ROW:PD:DICT=type={0}::dict={1}".format(type(ret_pd_dict),ret_pd_dict))
#         ret_pd_dict = ret_pd_dict[1] # get the SECOND because first has header --first and the only element we need since we create 1 row
#         ret_row = Row(**ret_pd_dict)
#     except:
#         print("error:SPARK:PD:TO:DF")
    ret_pd_dict = pd_df.to_dict(orient='records')
    print("ONE:ROW:PD:DICT=type={0}::dict={1}".format(type(ret_pd_dict),ret_pd_dict))
    print(ret_pd_dict)
    ret_pd_dict = ret_pd_dict[0] # get the SECOND because first has header --first and the only element we need since we create 1 row
#     ret_row = Row(**ret_pd_dict)
    return ret_row

In [47]:
new_rdd = df.rdd.map(lambda x: rdd_map_t_row(x,model_lr))

In [59]:
x = rdd_map_t_row(df.rdd.first(),model_lr)

ONE:ROW:now with PANDAS DF=         _1        _2        _3        _4
0  0.851571  0.124906  0.667121  0.354662
ONE:ROW:PD_DF_Y_PRED=[9.56496148]
ONE:ROW:PD:DICT=type=<class 'list'>::dict=[{'_1': 0.8515712526498084, '_2': 0.12490555818176752, '_3': 0.6671208952627372, '_4': 0.35466243926679875}]
[{'_1': 0.8515712526498084, '_2': 0.12490555818176752, '_3': 0.6671208952627372, '_4': 0.35466243926679875}]


In [60]:
x

Row(_1=0.8515712526498084, _2=0.12490555818176752, _3=0.6671208952627372, _4=0.35466243926679875)

In [49]:
new_rdd.first()

Row(_1=0.8515712526498084, _2=0.12490555818176752, _3=0.6671208952627372, _4=0.35466243926679875)

In [42]:
import numpy as np
from sklearn.linear_model import LinearRegression
X = np.array([[1, 1,1,1], [1, 2,2,3], [2, 2,4,5], [2, 3,6,7]])
# y = 1 * x_0 + 2 * x_1 + 3
y = np.dot(X, np.array([1, 2,5,6])) + 3
model_lr = LinearRegression().fit(X, y)

In [43]:
model_lr.predict(pandas_df)

array([ 9.56496148,  7.85476856,  8.15746101, 14.09041434, 10.36096284,
        9.17911023,  6.96177506, 11.24547773])