In [1]:
from pyspark.mllib.linalg import DenseVector
from pyspark.mllib.random import RandomRDDs

from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import col,udf,when,regexp_extract
from pyspark.sql import Window


from pyspark.sql.types import StringType,IntegerType

from urllib.parse import urlparse
import gc
import os
import pandas as pd
import numpy as np
import time

from IPython.core.display import display, HTML
# SparkConf is used below but was never imported, so a fresh kernel raised
# NameError on the `conf = SparkConf()` line. It lives in the top-level pyspark package.
from pyspark import SparkConf

# Stretch the notebook's cell container to the full browser width.
display(HTML("<style>.container { width:100% !important; }</style>"))

# Resource settings for the 'clickstream' application.
# spark.sql.caseSensitive makes column-name resolution case sensitive
# (presumably needed because header names in the JSON can differ only by case -- TODO confirm).
# Options the author left disabled: spark.driver.memory=10g,
# spark.dynamicAllocation.enabled=true.
conf = (
    SparkConf()
    .setAppName('clickstream')
    .set('spark.driver.memoryOverhead', '2g')
    .set('spark.executor.memory', '12g')
    .set("spark.executor.instances", "8")
    .set('spark.executor.cores', '1')
    .set("spark.sql.caseSensitive", "true")
)

# getOrCreate() reuses an already-running session if the kernel has one.
spark = SparkSession.builder.appName('clickstream').config(conf=conf).getOrCreate()

# spark = SparkSession.builder.config("spark.driver.memory","12g").config('spark.driver.memoryOverhead', '4g').config("spark.driver.cores","6").appName("Charge Model").getOrCreate()


# spark.conf.set("spark.sql.caseSensitive", "true")

In [12]:
t0 = time.time()

# Glob of the raw JSON files. The author's original location stays the default;
# set CLICKSTREAM_JSON_GLOB to point the notebook at data stored elsewhere.
json_glob = os.environ.get(
    'CLICKSTREAM_JSON_GLOB',
    r'C:\Users\N716307\Downloads\TweetCount-master\2016\09\2*\*\*\*\*\*',
)
dg = spark.read.json(json_glob)
# createOrReplaceTempView supersedes the deprecated registerTempTable and can be
# re-run without failing on an existing view.
dg.createOrReplaceTempView("dg")

# Flatten the nested `request`, header and `server_request` structs into top-level
# columns and give a few header fields explicit, unambiguous aliases.
# spark.sql is used because this notebook never defines a `sqlContext`.
df = spark.sql("""
    SELECT request.*,request.requestHeaders.*,request.responseHeaders.*,server_request.*,
    request.requestHeaders.`Content-Type` as requestHeaders_content_type,request.responseHeaders.`Content-Type` as responseHeader_content_type,
    request.requestHeaders.`Upgrade-Insecure-Requests` as requestHeaders_Upgrade_Insecure_Requests,request.responseHeaders.`Upgrade-Insecure-Requests` as responseHeader_Upgrade_Insecure_Requests,
    request.requestHeaders.Origin as requestHeaders_Origin,
    request.requestHeaders.Referer as requestHeaders_Referer,
    request.requestHeaders.`User-Agent` as requestHeaders_User_Agent
    FROM dg
""").drop(
    # struct columns that were expanded above, plus the un-prefixed header
    # columns that now exist under the explicit aliases
    'requestHeaders',
    'responseHeaders',
    'server_request',
    'Content-Type',
    'Origin',
    'Referer',
    'User-Agent',
)

print(len(df.columns),df.count())

t1 = time.time()
print('Command took {:.2f} seconds'.format(t1-t0))

# `dg` is deliberately NOT deleted here: later cells (printSchema, column
# profiling, ...) still read it, and deleting it made them fail with NameError.

In [13]:
# Print the schema of the raw JSON DataFrame `dg` as a tree (needs `dg` to still be in scope).
dg.printSchema()

root
 |-- request: struct (nullable = true)
 |    |-- documentReferer: string (nullable = true)
 |    |-- error: string (nullable = true)
 |    |-- eventId: string (nullable = true)
 |    |-- frameId: long (nullable = true)
 |    |-- fromCache: boolean (nullable = true)
 |    |-- ip: string (nullable = true)
 |    |-- is_online: boolean (nullable = true)
 |    |-- mainFrameRequestId: string (nullable = true)
 |    |-- method: string (nullable = true)
 |    |-- navigationId: long (nullable = true)
 |    |-- openerTabId: long (nullable = true)
 |    |-- parentFrameId: long (nullable = true)
 |    |-- redirectUrl: string (nullable = true)
 |    |-- requestHeaders: struct (nullable = true)
 |    |    |-- Accept: string (nullable = true)
 |    |    |-- Accept-Encoding: string (nullable = true)
 |    |    |-- Accept-Language: string (nullable = true)
 |    |    |-- Allow-Chrome-SignIn: string (nullable = true)
 |    |    |-- Avail-Dictionary: string (nullable = true)
 |    |    |-- Cache-Con

In [None]:
# Show rows whose X-Amazon-Internal-Page-Type header is neither null nor empty.
# The original one-liner never closed the filter(...) call, which is a SyntaxError.
df.select('X-Amazon-Internal-Page-Type').filter(
    (df['X-Amazon-Internal-Page-Type'].isNotNull())
    & (df['X-Amazon-Internal-Page-Type'] != '')
).show()

+---------------------------+
|X-Amazon-Internal-Page-Type|
+---------------------------+
|                           |
|                           |
|                           |
|                           |
|                           |
|                           |
|                           |
|                           |
|                           |
|                           |
|                           |
|                           |
|                           |
|                           |
|                           |
|                           |
|                           |
|                           |
|                           |
|                           |
+---------------------------+
only showing top 20 rows



In [None]:
# Inspect referrer / redirect / IP context for up to 200 rows that carry the
# X-Amazon-Internal-Page-Type header.
(
    df.select(
        'documentReferer',
        'redirectUrl',
        'X-Amazon-Internal-Page-Type',
        'requestHeaders_Referer',
        'ip',
        'request_unixtime',
    )
    .where(df['X-Amazon-Internal-Page-Type'].isNotNull())
    .show(200)
)

In [4]:
# Window over all rows that share the same ip value.
w = Window.partitionBy('ip')
# ip_count = number of rows in the row's ip partition, repeated on every row.
df = df.withColumn(
    'ip_count',
    F.count(F.col('ip')).over(w),
)

In [5]:
def ip_to_zip(ip):
    """Return the postal code recorded for an IP address as an int, or None.

    Args:
        ip: IP address string, or None.

    Returns:
        ``int(record['postal']['code'])`` for the record the GeoLite2 reader
        returns for ``ip``. None when ``ip`` is None, when the reader rejects
        the address, when there is no record, when the record has no postal
        code, or when the code is not numeric. Because the result is an int,
        leading zeros of a code are not preserved.
    """
    # Guard first so a null input never needs the geolite2 package at all.
    if ip is None:
        return None

    # Imported inside the function, as in the original (presumably so the
    # import happens in whichever process runs the UDF -- TODO confirm).
    from geolite2 import geolite2

    try:
        ip_location = geolite2.reader().get(ip)
    except (ValueError, TypeError):
        # The reader rejected the value as an address; treat it as "unknown"
        # instead of letting one malformed value abort the whole job.
        return None

    if not ip_location:
        return None

    try:
        return int(ip_location['postal']['code'])
    except (KeyError, TypeError, ValueError):
        # No postal entry, no code inside it, or a non-numeric code.
        return None
    

In [6]:
# Wrap ip_to_zip as a Spark UDF that yields an integer column.
ip_to_zip_udf = udf(ip_to_zip,IntegerType())
# Rows with a null ip get a null zipcode: when() without otherwise() yields null.
# The original used .otherwise(df.ip), which put a string branch next to the
# integer UDF branch and so forced Spark to reconcile two different types.
df = df.withColumn('zipcode',when(df.ip.isNotNull(),ip_to_zip_udf(df.ip)))


In [7]:
df = (
    df
    # Group 1: the first word inside the User-Agent's parenthesised section.
    .withColumn(
        'requestHeaders_User_Agent_platform',
        regexp_extract(
            col('requestHeaders_User_Agent'),
            r"\((\w+)([^\)]+)\)",
            1,
        ),
    )
    # Group 2: the address-like token that follows `email="`; overwrites the column in place.
    .withColumn(
        'google-accounts-signin',
        regexp_extract(
            col('google-accounts-signin'),
            r"(email=\")([-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*))",
            2,
        ),
    )
)


In [8]:
# Replace both time columns, in place, with the result of F.to_timestamp.
df = (
    df
    .withColumn("request_unixtime", F.to_timestamp(F.col("request_unixtime")))
    .withColumn("timeStamp", F.to_timestamp(F.col("timeStamp")))
)

In [35]:
# Link: keep the word that follows `rel=` in the Link header.
# Starts from `df`, not `dg`: the next statement needs responseHeader_content_type,
# which only the flattened `df` defines, and building from `dg` would also throw
# away every column added to `df` in earlier cells.
# The pattern is now a raw string so `\w` is not an invalid string escape.
df = df.withColumn('Link_rel', regexp_extract(col('Link'), r'(.)(rel=)(\w+)',3))

# response header content type: keep only the text before the first ';'
df = df.withColumn('responseHeader_content_type', regexp_extract(col('responseHeader_content_type'), r"(.*?);",1))

# user agent: overwrite with the first word inside its parenthesised section
df = df.withColumn('requestHeaders_User_Agent', regexp_extract(col('requestHeaders_User_Agent'), r"\((\w+)([^\)]+)\)",1))


In [36]:
# Overwrite each URL-like column with capture group 2 of the pattern below
# (the host-like part after an optional leading "www.").
url_pattern = r"(www\.)?([-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*?))"
url_columns = ['documentReferer','redirectUrl','url','requestHeaders_Referer','Link']
for url_col in url_columns:
    extracted = regexp_extract(col(url_col), url_pattern, 2)
    df = df.withColumn(url_col, extracted)

In [31]:
# Profile every column of `dg`: count null/empty values and, for string columns,
# distinct values. Prints one line per column and fills two lists of
# back-quoted column names that later cells can use.
c = dg.count()
null_drop = []       # columns where more than 20% of rows are null or ''
distinct_high = []   # string columns with more than 100 distinct values

# All null/empty counts come from ONE aggregation job. The original ran a
# separate filter().count() job per column, which made this cell very slow.
missing_exprs = [
    F.sum(
        when(col('`' + name + '`').isNull() | (col('`' + name + '`') == ''), 1).otherwise(0)
    ).alias('missing_{}'.format(position))
    for position, name in enumerate(dg.columns)
]
missing_counts = dg.agg(*missing_exprs).first() if missing_exprs else None

for col_number, (X, y) in enumerate(dg.dtypes, start=1):
    x = '`' + X + '`'
    # sum() over zero rows is null, so fall back to 0
    null_count = (missing_counts[col_number - 1] or 0) if missing_counts is not None else 0
    if y == 'string':
        distinct_count = dg.select(x).distinct().count()
        if distinct_count > 100:
            distinct_high.append(x)
    else:
        distinct_count = 0
    # Guard against an empty DataFrame, which used to raise ZeroDivisionError.
    null_percent = null_count*100./c if c else 0.0
    if c and null_count*1./c > 0.2:
        null_drop.append(x)
    print('{:3d} {:40s} {:8d} {:6.2f}%, {:8d}'.format(col_number,X,null_count,null_percent,distinct_count))

  1 documentReferer                              1416  21.07%,      426
  2 error                                        6683  99.45%,        2
  3 eventId                                         0   0.00%,     6565
  4 frameId                                         0   0.00%,        0
  5 fromCache                                       0   0.00%,        0
  6 ip                                            395   5.88%,     1004
  7 is_online                                    6683  99.45%,        0
  8 method                                          0   0.00%,        2
  9 navigationId                                    3   0.04%,        0
 10 openerTabId                                  6714  99.91%,        0
 11 parentFrameId                                   0   0.00%,        0
 12 redirectUrl                                  5824  86.67%,      557
 13 requestType                                     0   0.00%,        1
 14 statusCode                                     37   0.55%,  

KeyboardInterrupt: 

In [267]:
# Keep every column whose back-quoted name was not flagged in null_drop.
keep_cols = [
    quoted
    for quoted in ('`' + name + '`' for name in dg.columns)
    if quoted not in null_drop
]

dg = dg.select(keep_cols)

In [225]:
# Add a column named "TimestampType" holding F.to_timestamp of request_unixtime.
request_ts = F.to_timestamp(F.col("request_unixtime"))
dg = dg.withColumn("TimestampType", request_ts)

In [58]:
def rdd_map_t_row(one_row,model_lr):

    # one_row is of type lines type=<class 'pyspark.sql.types.Row'>
    row_dict = one_row.asDict()
    pd_df = pd.DataFrame([row_dict]) # has to be a list object
    print("ONE:ROW:now with PANDAS DF={}".format(pd_df))
    try:
        y_pred = model_lr.predict(pd_df)
        print("ONE:ROW:PD_DF_Y_PRED={}".format(y_pred))
    except:
        print("error:MODEL:PREDICT")

    ret_row = one_row # access the elements like one_row.col1, one_row.col2  where col1 is the name of the column
#     try:
#         # cannto do spark.createDataFrame(pd_df)  because we have to return the rdd.row object
#         ret_pd_dict = pd_df.to_dict(orient='records')
#         print("ONE:ROW:PD:DICT=type={0}::dict={1}".format(type(ret_pd_dict),ret_pd_dict))
#         ret_pd_dict = ret_pd_dict[1] # get the SECOND because first has header --first and the only element we need since we create 1 row
#         ret_row = Row(**ret_pd_dict)
#     except:
#         print("error:SPARK:PD:TO:DF")
    ret_pd_dict = pd_df.to_dict(orient='records')
    print("ONE:ROW:PD:DICT=type={0}::dict={1}".format(type(ret_pd_dict),ret_pd_dict))
    print(ret_pd_dict)
    ret_pd_dict = ret_pd_dict[0] # get the SECOND because first has header --first and the only element we need since we create 1 row
#     ret_row = Row(**ret_pd_dict)
    return ret_row

In [47]:
# Map every row of df through rdd_map_t_row with the fitted model
# (model_lr must already exist in the kernel when this cell runs).
new_rdd = df.rdd.map(
    lambda row: rdd_map_t_row(row, model_lr)
)

In [59]:
# Run the mapper on just the first row, on the driver, as a quick check.
x = rdd_map_t_row(one_row=df.rdd.first(), model_lr=model_lr)

ONE:ROW:now with PANDAS DF=         _1        _2        _3        _4
0  0.851571  0.124906  0.667121  0.354662
ONE:ROW:PD_DF_Y_PRED=[9.56496148]
ONE:ROW:PD:DICT=type=<class 'list'>::dict=[{'_1': 0.8515712526498084, '_2': 0.12490555818176752, '_3': 0.6671208952627372, '_4': 0.35466243926679875}]
[{'_1': 0.8515712526498084, '_2': 0.12490555818176752, '_3': 0.6671208952627372, '_4': 0.35466243926679875}]


In [60]:
# Display the value rdd_map_t_row returned for the first row.
x

Row(_1=0.8515712526498084, _2=0.12490555818176752, _3=0.6671208952627372, _4=0.35466243926679875)

In [49]:
# Fetch the first element of the mapped RDD.
new_rdd.first()

Row(_1=0.8515712526498084, _2=0.12490555818176752, _3=0.6671208952627372, _4=0.35466243926679875)

In [42]:
import numpy as np
from sklearn.linear_model import LinearRegression

# Toy training set: four samples with four features each.
X = np.array([[1, 1,1,1], [1, 2,2,3], [2, 2,4,5], [2, 3,6,7]])
# Targets follow y = 1*x_0 + 2*x_1 + 5*x_2 + 6*x_3 + 3
y = X.dot(np.array([1, 2,5,6])) + 3
# fit() returns the estimator itself, so fitting in a separate statement is equivalent.
model_lr = LinearRegression()
model_lr.fit(X, y)

In [43]:
# Predict with the fitted model on a pandas frame.
# NOTE(review): `pandas_df` is not defined in any visible cell -- presumably a pandas
# copy of the 4-column feature DataFrame; confirm and define it before this cell.
model_lr.predict(pandas_df)

array([ 9.56496148,  7.85476856,  8.15746101, 14.09041434, 10.36096284,
        9.17911023,  6.96177506, 11.24547773])