## set up

In [6]:
evaluation = True
evaluation_verbose = False

OUTPUT_BUCKET_FOLDER = "gs://capstone-02/output/"
DATA_BUCKET_FOLDER = "gs://capstone-02/data/"

In [3]:
from IPython.display import display

In [4]:
from pyspark.sql.types import *
import pyspark.sql.functions as F
from pyspark.ml.linalg import Vectors, SparseVector, VectorUDT

In [1]:
import numpy as np
import scipy.sparse

In [2]:
import math
import datetime
import time
import itertools

In [5]:
import pickle

In [6]:
import random
random.seed(42)

In [7]:
import pandas as pd
%matplotlib inline

In [8]:
start_time = time.time()

In [9]:
# 해시 함수를 가진 모듈 hashlib
import hashlib
def hashstr(s, nr_bins):
    return int(hashlib.md5(s.encode('utf8')).hexdigest(), 16)%(nr_bins-1)+1

## train_feature_vectors_exported_df

In [10]:
# train_feature_vectors 불러오기
train_feature_vectors_exported_df = spark.read.parquet("gs://capstone-01/output/train_feature_vectors_integral_eval")
%time train_feature_vectors_exported_df.take(3)

CPU times: user 8 ms, sys: 0 ns, total: 8 ms
Wall time: 11.9 s


[Row(uuid=u'10005a0add15f6', display_id=5686397, ad_id=61941, document_id_event=2135921, document_id=1017869, label=1, feature_vector=SparseVector(103, {0: 1.0, 3: 11370.0, 4: 15083.0, 5: 0.0, 6: 3.0, 7: 173.0, 8: 0.4069, 9: 0.5623, 10: 0.2288, 11: 0.3902, 12: 0.5131, 13: 0.2002, 14: 0.2691, 15: 0.3531, 16: 0.095, 17: 0.3902, 18: 0.5131, 19: 0.2002, 20: 0.3902, 21: 0.5131, 22: 0.2002, 26: 0.3902, 27: 0.5131, 28: 0.2002, 29: 0.3902, 30: 0.5131, 31: 0.2002, 32: 0.3826, 33: 0.2833, 34: 0.1084, 35: 0.3962, 36: 0.2833, 37: 0.1122, 38: 0.2738, 39: 0.0012, 40: 0.0003, 41: 0.2801, 42: 0.0012, 43: 0.0003, 44: 0.2298, 45: 0.0622, 46: 0.0143, 47: 0.2336, 48: 0.0595, 49: 0.0139, 59: 0.0, 60: 0.0004, 61: 0.0, 62: 0.0, 63: 0.0, 64: 0.0, 68: 2413.0, 69: 1403.0, 70: 1610.0, 72: 108.0, 75: 194.0, 76: 15.0, 81: 440.0, 82: 4016.0, 83: 1702.0, 84: 1707.0, 86: 137.0, 95: 723.0, 96: 4194.0, 97: 18595452.0, 98: 745661.0, 99: 33260.0, 100: 3.0, 101: 2.0, 102: 1.0})),
 Row(uuid=u'10005a0add15f6', display_id=56

## Feature Vector

feature vector header 만들기

In [11]:
bool_feature_names = ['event_weekend',
                      'user_has_already_viewed_doc']

In [12]:
int_feature_names = ['user_views',
                    'ad_views',
                    'doc_views',
                    'doc_event_days_since_published',
                    'doc_event_hour',
                    'doc_ad_days_since_published', 
                    ]

In [13]:
float_feature_names = [                                
                'pop_ad_id',       
                'pop_ad_id_conf',   
                'pop_ad_id_conf_multipl', 
                'pop_document_id',                
                'pop_document_id_conf',
                'pop_document_id_conf_multipl',
                'pop_publisher_id',
                'pop_publisher_id_conf',
                'pop_publisher_id_conf_multipl',
                'pop_advertiser_id',
                'pop_advertiser_id_conf',
                'pop_advertiser_id_conf_multipl',
                'pop_campain_id',
                'pop_campain_id_conf',
                'pop_campain_id_conf_multipl',
                'pop_doc_event_doc_ad',
                'pop_doc_event_doc_ad_conf',
                'pop_doc_event_doc_ad_conf_multipl',
                'pop_source_id',  
                'pop_source_id_conf',
                'pop_source_id_conf_multipl',
                'pop_source_id_country',
                'pop_source_id_country_conf',
                'pop_source_id_country_conf_multipl',
                'pop_entity_id',    
                'pop_entity_id_conf',
                'pop_entity_id_conf_multipl',
                'pop_entity_id_country',
                'pop_entity_id_country_conf',
                'pop_entity_id_country_conf_multipl',
                'pop_topic_id', 
                'pop_topic_id_conf',
                'pop_topic_id_conf_multipl',
                'pop_topic_id_country',
                'pop_topic_id_country_conf',
                'pop_topic_id_country_conf_multipl',
                'pop_category_id', 
                'pop_category_id_conf',
                'pop_category_id_conf_multipl',
                'pop_category_id_country',
                'pop_category_id_country_conf',
                'pop_category_id_country_conf_multipl',
                'user_doc_ad_sim_categories',    
                'user_doc_ad_sim_categories_conf',
                'user_doc_ad_sim_categories_conf_multipl',
                'user_doc_ad_sim_topics',    
                'user_doc_ad_sim_topics_conf',
                'user_doc_ad_sim_topics_conf_multipl',
                'user_doc_ad_sim_entities',                    
                'user_doc_ad_sim_entities_conf',
                'user_doc_ad_sim_entities_conf_multipl',
                'doc_event_doc_ad_sim_categories',    
                'doc_event_doc_ad_sim_categories_conf',
                'doc_event_doc_ad_sim_categories_conf_multipl',
                'doc_event_doc_ad_sim_topics',    
                'doc_event_doc_ad_sim_topics_conf',
                'doc_event_doc_ad_sim_topics_conf_multipl',
                'doc_event_doc_ad_sim_entities',                    
                'doc_event_doc_ad_sim_entities_conf',
                'doc_event_doc_ad_sim_entities_conf_multipl'
               ]

In [14]:
TRAFFIC_SOURCE_FV='traffic_source'
EVENT_HOUR_FV='event_hour'
EVENT_COUNTRY_FV = 'event_country'
EVENT_COUNTRY_STATE_FV = 'event_country_state'
EVENT_GEO_LOCATION_FV = 'event_geo_location'
EVENT_PLATFORM_FV = 'event_platform'
AD_ADVERTISER_FV = 'ad_advertiser'
DOC_AD_SOURCE_ID_FV='doc_ad_source_id'
DOC_AD_PUBLISHER_ID_FV='doc_ad_publisher_id'
DOC_EVENT_SOURCE_ID_FV='doc_event_source_id'
DOC_EVENT_PUBLISHER_ID_FV='doc_event_publisher_id'
DOC_AD_CATEGORY_ID_FV='doc_ad_category_id'
DOC_AD_TOPIC_ID_FV='doc_ad_topic_id'
DOC_AD_ENTITY_ID_FV='doc_ad_entity_id'
DOC_EVENT_CATEGORY_ID_FV='doc_event_category_id'
DOC_EVENT_TOPIC_ID_FV='doc_event_topic_id'
DOC_EVENT_ENTITY_ID_FV='doc_event_entity_id'

In [15]:
category_feature_names_integral = ['ad_advertiser',
 'doc_ad_category_id_1',
 'doc_ad_category_id_2',
 'doc_ad_category_id_3',
 'doc_ad_topic_id_1',
 'doc_ad_topic_id_2',
 'doc_ad_topic_id_3',
 'doc_ad_entity_id_1', 
 'doc_ad_entity_id_2', 
 'doc_ad_entity_id_3', 
 'doc_ad_entity_id_4', 
 'doc_ad_entity_id_5', 
 'doc_ad_entity_id_6', 
 'doc_ad_publisher_id',
 'doc_ad_source_id', 
 'doc_event_category_id_1',
 'doc_event_category_id_2',
 'doc_event_category_id_3',
 'doc_event_topic_id_1',
 'doc_event_topic_id_2',
 'doc_event_topic_id_3',
 'doc_event_entity_id_1',
 'doc_event_entity_id_2',
 'doc_event_entity_id_3',
 'doc_event_entity_id_4',
 'doc_event_entity_id_5',
 'doc_event_entity_id_6',
 'doc_event_publisher_id',
 'doc_event_source_id', 
 'event_country',
 'event_country_state',
 'event_geo_location',
 'event_hour',
 'event_platform',
 'traffic_source']

In [16]:
feature_vector_labels_integral = bool_feature_names + int_feature_names + float_feature_names + \
                                 category_feature_names_integral

In [17]:
integral_headers = ['label', 'display_id', 'ad_id', 'doc_id', 'doc_event_id', 'is_leak'] + feature_vector_labels_integral
    
with open("train_feature_vectors_integral_eval.csv"+".header", 'w') as output:
    output.writelines('\n'.join(integral_headers))

In [18]:
len(feature_vector_labels_integral)

103

In [19]:
def sparse_vector_to_csv_with_nulls_row(additional_column_values, vec, num_columns):    
    return ','.join([str(value) for value in additional_column_values] + 
                     list([ '{:.5}'.format(vec[x]) if x in vec.indices else '' for x in range(vec.size) ])[:num_columns]) \
            .replace('.0,',',')

## train_feature_vectors_integral_csv_rdd 만들기

In [61]:
train_feature_vectors_integral_csv_rdd = train_feature_vectors_exported_df.select(
     'label', 'display_id', 'ad_id', 'document_id', 'document_id_event', 'feature_vector').withColumn('is_leak', F.lit(-1)) \
     .rdd.map(lambda x: sparse_vector_to_csv_with_nulls_row([x['label'], x['display_id'], x['ad_id'], x['document_id'], x['document_id_event'], x['is_leak']], 
                                                  x['feature_vector'], len(integral_headers)))

In [21]:
%time train_feature_vectors_integral_csv_rdd.take(3)

CPU times: user 12 ms, sys: 0 ns, total: 12 ms
Wall time: 4.23 s


['1,5686397,61941,1017869,2135921,-1,1,,,1.137e+04,1.5083e+04,0,3,173,0.40695,0.56225,0.22881,0.39017,0.51313,0.20021,0.2691,0.35314,0.095031,0.39017,0.51313,0.20021,0.39017,0.51313,0.20021,,,,0.39017,0.51313,0.20021,0.39017,0.51313,0.20021,0.38264,0.28331,0.10841,0.39616,0.28331,0.11224,0.27381,0.0011525,0.00031556,0.28009,0.0011638,0.00032598,0.22983,0.0622,0.014295,0.23363,0.059455,0.01389,,,,,,,,,,0,0.00042512,0,0,1.1111e-05,0,,,,2413,1403,1610,,108,,,194,15,,,,,440,4016,1702,1707,,137,,,,,,,,,723,4194,1.8595e+07,7.4566e+05,3.326e+04,3,2,1.0',
 '0,5686397,174547,1439845,2135921,-1,1,,,8.7111e+04,1.0612e+05,0,3,124,0.091332,0.68484,0.062548,0.083101,0.49831,0.04141,,,,0.045525,0.41999,0.01912,0.086491,0.54912,0.047494,,,,0.053356,0.44458,0.023721,0.056284,0.45694,0.025718,0.093465,0.075913,0.0070952,0.096561,0.077794,0.0075119,0.16458,0.00021974,3.6165e-05,0.16291,0.00023781,3.8742e-05,0.20646,0.13393,0.027652,0.21204,0.13264,0.028124,,,,,,,,,,0,0.00042512,0,0,1.1111e-05,0,,,,2151,1

In [33]:
%time train_feature_vectors_integral_csv_rdd.map(lambda x: len(x)).collect()

CPU times: user 4.37 s, sys: 1.61 s, total: 5.98 s
Wall time: 24min 27s


[549,
 547,
 475,
 481,
 543,
 530,
 609,
 603,
 642,
 541,
 538,
 593,
 602,
 521,
 560,
 563,
 484,
 579,
 486,
 562,
 568,
 544,
 560,
 558,
 559,
 564,
 546,
 586,
 559,
 564,
 562,
 598,
 559,
 560,
 498,
 554,
 472,
 550,
 582,
 581,
 593,
 615,
 565,
 600,
 580,
 595,
 504,
 570,
 528,
 615,
 583,
 516,
 532,
 515,
 526,
 534,
 519,
 518,
 538,
 511,
 554,
 538,
 574,
 553,
 510,
 530,
 547,
 475,
 550,
 563,
 551,
 529,
 573,
 578,
 524,
 560,
 546,
 544,
 498,
 574,
 495,
 547,
 568,
 560,
 560,
 542,
 591,
 559,
 578,
 564,
 529,
 553,
 569,
 601,
 567,
 510,
 561,
 559,
 556,
 569,
 587,
 553,
 603,
 560,
 556,
 568,
 593,
 595,
 596,
 563,
 578,
 568,
 569,
 581,
 578,
 550,
 558,
 568,
 579,
 537,
 563,
 567,
 548,
 588,
 520,
 573,
 607,
 554,
 576,
 578,
 551,
 541,
 509,
 514,
 531,
 598,
 584,
 520,
 566,
 516,
 565,
 584,
 555,
 574,
 581,
 454,
 542,
 540,
 523,
 596,
 641,
 593,
 635,
 582,
 587,
 611,
 562,
 582,
 598,
 574,
 573,
 548,
 498,
 498,
 547,
 497,
 525

## just for rdd.show()

단순히 rdd.show()를 위해 모든 필드의 type을 string으로 셋팅

In [21]:
train_schema = StructType(
                    [StructField("label", StringType(), True),
                    StructField("display_id", StringType(), True),                    
                    StructField("ad_id", StringType(), True),
                    StructField("doc_id", StringType(), True),
                    StructField("doc_event_id", StringType(), True),                    
                    StructField("is_leak", StringType(), True),
                    StructField("event_weekend", StringType(), True),
                    StructField("user_has_already_viewed_doc", StringType(), True),                    
                    StructField("user_views", StringType(), True),
                    StructField("ad_views", StringType(), True),
                    StructField("doc_views", StringType(), True),                    
                    StructField("doc_event_days_since_published", StringType(), True),
                    StructField("doc_event_hour", StringType(), True),
                    StructField("doc_ad_days_since_published", StringType(), True),                    
                    StructField("pop_ad_id", StringType(), True),
                    StructField("pop_ad_id_conf", StringType(), True),
                    StructField("pop_ad_id_conf_multipl", StringType(), True),                    
                    StructField("pop_document_id", StringType(), True),
                    StructField("pop_document_id_conf", StringType(), True),
                    StructField("pop_document_id_conf_multipl", StringType(), True),                   
                    StructField("pop_publisher_id", StringType(), True),
                    StructField("pop_publisher_id_conf", StringType(), True),
                    StructField("pop_publisher_id_conf_multipl", StringType(), True),                    
                    StructField("pop_advertiser_id", StringType(), True),
                    StructField("pop_advertiser_id_conf", StringType(), True),
                    StructField("pop_advertiser_id_conf_multipl", StringType(), True),                    
                    StructField("pop_campain_id", StringType(), True),
                    StructField("pop_campain_id_conf", StringType(), True),
                    StructField("pop_campain_id_conf_multipl", StringType(), True),                    
                    StructField("pop_doc_event_doc_ad", StringType(), True), 
                    StructField("pop_doc_event_doc_ad_conf", StringType(), True),
                    StructField("pop_doc_event_doc_ad_conf_multipl", StringType(), True),
                    StructField("pop_source_id", StringType(), True),                    
                    StructField("pop_source_id_conf", StringType(), True),
                    StructField("pop_source_id_conf_multipl", StringType(), True),
                    StructField("pop_source_id_country", StringType(), True),
                    StructField("pop_source_id_country_conf", StringType(), True),                    
                    StructField("pop_source_id_country_conf_multipl", StringType(), True),
                    StructField("pop_entity_id", StringType(), True),
                    StructField("pop_entity_id_conf", StringType(), True),                
                    StructField("pop_entity_id_conf_multipl", StringType(), True),
                    StructField("pop_entity_id_country", StringType(), True),
                    StructField("pop_entity_id_country_conf", StringType(), True),
                    StructField("pop_entity_id_country_conf_multipl", StringType(), True),
                    StructField("pop_topic_id", StringType(), True),
                    StructField("pop_topic_id_conf", StringType(), True),                    
                    StructField("pop_topic_id_conf_multipl", StringType(), True),
                    StructField("pop_topic_id_country", StringType(), True),
                    StructField("pop_topic_id_country_conf", StringType(), True),                    
                    StructField("pop_topic_id_country_conf_multipl", StringType(), True),   
                    StructField("pop_category_id", StringType(), True),
                    StructField("pop_category_id_conf", StringType(), True),
                    StructField("pop_category_id_conf_multipl", StringType(), True),
                    StructField("pop_category_id_country", StringType(), True),
                    StructField("pop_category_id_country_conf", StringType(), True),                    
                    StructField("pop_category_id_country_conf_multipl", StringType(), True),
                    StructField("user_doc_ad_sim_categories", StringType(), True),
                    StructField("user_doc_ad_sim_categories_conf", StringType(), True),                    
                    StructField("user_doc_ad_sim_categories_conf_multipl", StringType(), True),
                    StructField("user_doc_ad_sim_topics", StringType(), True),  
                    StructField("user_doc_ad_sim_topics_conf", StringType(), True),
                    StructField("user_doc_ad_sim_topics_conf_multipl", StringType(), True),
                    StructField("user_doc_ad_sim_entities", StringType(), True),
                    StructField("user_doc_ad_sim_entities_conf", StringType(), True),                    
                    StructField("user_doc_ad_sim_entities_conf_multipl", StringType(), True),
                    StructField("doc_event_doc_ad_sim_categories", StringType(), True),
                    StructField("doc_event_doc_ad_sim_categories_conf", StringType(), True),                    
                    StructField("doc_event_doc_ad_sim_categories_conf_multipl", StringType(), True),
                    StructField("doc_event_doc_ad_sim_topics", StringType(), True),
                    StructField("doc_event_doc_ad_sim_topics_conf", StringType(), True),   
                    StructField("doc_event_doc_ad_sim_topics_conf_multipl", StringType(), True),
                    StructField("doc_event_doc_ad_sim_entities", StringType(), True),
                    StructField("doc_event_doc_ad_sim_entities_conf", StringType(), True),                    
                    StructField("doc_event_doc_ad_sim_entities_conf_multipl", StringType(), True),
                    StructField("ad_advertiser", StringType(), True),
                    StructField("doc_ad_category_id_1", StringType(), True),                    
                    StructField("doc_ad_category_id_2", StringType(), True),
                    StructField("doc_ad_category_id_3", StringType(), True),
                    StructField("doc_ad_topic_id_1", StringType(), True),
                    StructField("doc_ad_topic_id_2", StringType(), True),   
                    StructField("doc_ad_topic_id_3", StringType(), True),
                    StructField("doc_ad_entity_id_1", StringType(), True),                    
                    StructField("doc_ad_entity_id_2", StringType(), True),
                    StructField("doc_ad_entity_id_3", StringType(), True),
                    StructField("doc_ad_entity_id_4", StringType(), True),                    
                    StructField("doc_ad_entity_id_5", StringType(), True),
                    StructField("doc_ad_entity_id_6", StringType(), True),
                    StructField("doc_ad_publisher_id", StringType(), True),
                    StructField("doc_ad_source_id", StringType(), True),
                    StructField("doc_event_category_id_1", StringType(), True),  
                    StructField("doc_event_category_id_2", StringType(), True),                    
                    StructField("doc_event_category_id_3", StringType(), True),
                    StructField("doc_event_topic_id_1", StringType(), True),
                    StructField("doc_event_topic_id_2", StringType(), True),                    
                    StructField("doc_event_topic_id_3", StringType(), True),
                    StructField("doc_event_entity_id_1", StringType(), True),
                    StructField("doc_event_entity_id_2", StringType(), True),
                    StructField("doc_event_entity_id_3", StringType(), True),
                    StructField("doc_event_entity_id_4", StringType(), True),
                    StructField("doc_event_entity_id_5", StringType(), True),           
                    StructField("doc_event_entity_id_6", StringType(), True),
                    StructField("doc_event_publisher_id", StringType(), True),
                    StructField("doc_event_source_id", StringType(), True),                    
                    StructField("event_country", StringType(), True),
                    StructField("event_country_state", StringType(), True),
                    StructField("event_geo_location", StringType(), True),
                    StructField("event_hour", StringType(), True),
                    StructField("event_platform", StringType(), True),
                    StructField("traffic_source", StringType(), True)]
                    )

train_feature_vectors_integral_csv_rdd_str = train_feature_vectors_integral_csv_rdd.map(lambda x : x.split(","))

In [47]:
train_feature_vectors_integral_csv_rdd_str.take(1)

[['1',
  '5686397',
  '61941',
  '1017869',
  '2135921',
  '-1',
  '1',
  '',
  '',
  '1.137e+04',
  '1.5083e+04',
  '0',
  '3',
  '173',
  '0.40695',
  '0.56225',
  '0.22881',
  '0.39017',
  '0.51313',
  '0.20021',
  '0.2691',
  '0.35314',
  '0.095031',
  '0.39017',
  '0.51313',
  '0.20021',
  '0.39017',
  '0.51313',
  '0.20021',
  '',
  '',
  '',
  '0.39017',
  '0.51313',
  '0.20021',
  '0.39017',
  '0.51313',
  '0.20021',
  '0.38264',
  '0.28331',
  '0.10841',
  '0.39616',
  '0.28331',
  '0.11224',
  '0.27381',
  '0.0011525',
  '0.00031556',
  '0.28009',
  '0.0011638',
  '0.00032598',
  '0.22983',
  '0.0622',
  '0.014295',
  '0.23363',
  '0.059455',
  '0.01389',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '',
  '0',
  '0.00042512',
  '0',
  '0',
  '1.1111e-05',
  '0',
  '',
  '',
  '',
  '2413',
  '1403',
  '1610',
  '',
  '108',
  '',
  '',
  '194',
  '15',
  '',
  '',
  '',
  '',
  '440',
  '4016',
  '1702',
  '1707',
  '',
  '137',
  '',
  '',
  '',
  '',
  '',
  '',
  '',


In [22]:
SparkS = SparkSession.builder.getOrCreate()
train_df_str = SparkS.createDataFrame(train_feature_vectors_integral_csv_rdd_str, train_schema)

In [None]:
# 모든 필드를 string으로 선언하면, rdd.show() 했을 때 에러가 안 나고 잘 보인다
train_df_str.show()

## real rdd

모든 필드를 string이 아닌 원래 type으로 셋팅한다

In [51]:
train_feature_vectors_integral_csv_rdd = train_feature_vectors_exported_df.select(
     'label', 'display_id', 'ad_id', 'document_id', 'document_id_event', 'feature_vector').withColumn('is_leak', F.lit(-1)) \
     .rdd.map(lambda x: sparse_vector_to_csv_with_nulls_row([x['label'], x['display_id'], x['ad_id'], x['document_id'], x['document_id_event'], x['is_leak']], 
                                                  x['feature_vector'], len(integral_headers)))

In [52]:
train_feature_vectors_integral_csv_rdd = train_feature_vectors_integral_csv_rdd.map(lambda x : x.split(","))

train_schema = StructType(
                    [StructField("label", IntegerType(), True),
                    StructField("display_id", IntegerType(), True),                    
                    StructField("ad_id", IntegerType(), True),
                    StructField("doc_id", IntegerType(), True),
                    StructField("doc_event_id", IntegerType(), True),                    
                    StructField("is_leak", IntegerType(), True),
                    StructField("event_weekend", IntegerType(), True),
                    StructField("user_has_already_viewed_doc", IntegerType(), True),                    
                    StructField("user_views", IntegerType(), True),
                    StructField("ad_views", IntegerType(), True),
                    StructField("doc_views", IntegerType(), True),                    
                    StructField("doc_event_days_since_published", IntegerType(), True),
                    StructField("doc_event_hour", IntegerType(), True),
                    StructField("doc_ad_days_since_published", IntegerType(), True),                    
                    StructField("pop_ad_id", FloatType(), True),
                    StructField("pop_ad_id_conf", FloatType(), True),
                    StructField("pop_ad_id_conf_multipl", FloatType(), True),                    
                    StructField("pop_document_id", FloatType(), True),
                    StructField("pop_document_id_conf", FloatType(), True),
                    StructField("pop_document_id_conf_multipl", FloatType(), True),                   
                    StructField("pop_publisher_id", FloatType(), True),
                    StructField("pop_publisher_id_conf", FloatType(), True),
                    StructField("pop_publisher_id_conf_multipl", FloatType(), True),                    
                    StructField("pop_advertiser_id", FloatType(), True),
                    StructField("pop_advertiser_id_conf", FloatType(), True),
                    StructField("pop_advertiser_id_conf_multipl", FloatType(), True),                    
                    StructField("pop_campain_id", FloatType(), True),
                    StructField("pop_campain_id_conf", FloatType(), True),
                    StructField("pop_campain_id_conf_multipl", FloatType(), True),                    
                    StructField("pop_doc_event_doc_ad", FloatType(), True), 
                    StructField("pop_doc_event_doc_ad_conf", FloatType(), True),
                    StructField("pop_doc_event_doc_ad_conf_multipl", FloatType(), True),
                    StructField("pop_source_id", FloatType(), True),                    
                    StructField("pop_source_id_conf", FloatType(), True),
                    StructField("pop_source_id_conf_multipl", FloatType(), True),
                    StructField("pop_source_id_country", FloatType(), True),
                    StructField("pop_source_id_country_conf", FloatType(), True),                    
                    StructField("pop_source_id_country_conf_multipl", FloatType(), True),
                    StructField("pop_entity_id", FloatType(), True),
                    StructField("pop_entity_id_conf", FloatType(), True),                
                    StructField("pop_entity_id_conf_multipl", FloatType(), True),
                    StructField("pop_entity_id_country", FloatType(), True),
                    StructField("pop_entity_id_country_conf", FloatType(), True),
                    StructField("pop_entity_id_country_conf_multipl", FloatType(), True),
                    StructField("pop_topic_id", FloatType(), True),
                    StructField("pop_topic_id_conf", FloatType(), True),                    
                    StructField("pop_topic_id_conf_multipl", FloatType(), True),
                    StructField("pop_topic_id_country", FloatType(), True),
                    StructField("pop_topic_id_country_conf", FloatType(), True),                    
                    StructField("pop_topic_id_country_conf_multipl", FloatType(), True),   
                    StructField("pop_category_id", FloatType(), True),
                    StructField("pop_category_id_conf", FloatType(), True),
                    StructField("pop_category_id_conf_multipl", FloatType(), True),
                    StructField("pop_category_id_country", FloatType(), True),
                    StructField("pop_category_id_country_conf", FloatType(), True),                    
                    StructField("pop_category_id_country_conf_multipl", FloatType(), True),
                    StructField("user_doc_ad_sim_categories", FloatType(), True),
                    StructField("user_doc_ad_sim_categories_conf", FloatType(), True),                    
                    StructField("user_doc_ad_sim_categories_conf_multipl", FloatType(), True),
                    StructField("user_doc_ad_sim_topics", FloatType(), True),  
                    StructField("user_doc_ad_sim_topics_conf", FloatType(), True),
                    StructField("user_doc_ad_sim_topics_conf_multipl", FloatType(), True),
                    StructField("user_doc_ad_sim_entities", FloatType(), True),
                    StructField("user_doc_ad_sim_entities_conf", FloatType(), True),                    
                    StructField("user_doc_ad_sim_entities_conf_multipl", FloatType(), True),
                    StructField("doc_event_doc_ad_sim_categories", FloatType(), True),
                    StructField("doc_event_doc_ad_sim_categories_conf", FloatType(), True),                    
                    StructField("doc_event_doc_ad_sim_categories_conf_multipl", FloatType(), True),
                    StructField("doc_event_doc_ad_sim_topics", FloatType(), True),
                    StructField("doc_event_doc_ad_sim_topics_conf", FloatType(), True),   
                    StructField("doc_event_doc_ad_sim_topics_conf_multipl", FloatType(), True),
                    StructField("doc_event_doc_ad_sim_entities", FloatType(), True),
                    StructField("doc_event_doc_ad_sim_entities_conf", FloatType(), True),                    
                    StructField("doc_event_doc_ad_sim_entities_conf_multipl", FloatType(), True),
                    StructField("ad_advertiser", IntegerType(), True),
                    StructField("doc_ad_category_id_1", IntegerType(), True),                    
                    StructField("doc_ad_category_id_2", IntegerType(), True),
                    StructField("doc_ad_category_id_3", IntegerType(), True),
                    StructField("doc_ad_topic_id_1", IntegerType(), True),
                    StructField("doc_ad_topic_id_2", IntegerType(), True),   
                    StructField("doc_ad_topic_id_3", IntegerType(), True),
                    StructField("doc_ad_entity_id_1", IntegerType(), True),                    
                    StructField("doc_ad_entity_id_2", IntegerType(), True),
                    StructField("doc_ad_entity_id_3", IntegerType(), True),
                    StructField("doc_ad_entity_id_4", IntegerType(), True),                    
                    StructField("doc_ad_entity_id_5", IntegerType(), True),
                    StructField("doc_ad_entity_id_6", IntegerType(), True),
                    StructField("doc_ad_publisher_id", IntegerType(), True),
                    StructField("doc_ad_source_id", IntegerType(), True),
                    StructField("doc_event_category_id_1", IntegerType(), True),  
                    StructField("doc_event_category_id_2", IntegerType(), True),                    
                    StructField("doc_event_category_id_3", IntegerType(), True),
                    StructField("doc_event_topic_id_1", IntegerType(), True),
                    StructField("doc_event_topic_id_2", IntegerType(), True),                    
                    StructField("doc_event_topic_id_3", IntegerType(), True),
                    StructField("doc_event_entity_id_1", IntegerType(), True),
                    StructField("doc_event_entity_id_2", IntegerType(), True),
                    StructField("doc_event_entity_id_3", IntegerType(), True),
                    StructField("doc_event_entity_id_4", IntegerType(), True),
                    StructField("doc_event_entity_id_5", IntegerType(), True),           
                    StructField("doc_event_entity_id_6", IntegerType(), True),
                    StructField("doc_event_publisher_id", IntegerType(), True),
                    StructField("doc_event_source_id", IntegerType(), True),                    
                    StructField("event_country", IntegerType(), True),
                    StructField("event_country_state", IntegerType(), True),
                    StructField("event_geo_location", IntegerType(), True),
                    StructField("event_hour", IntegerType(), True),
                    StructField("event_platform", IntegerType(), True),
                    StructField("traffic_source", IntegerType(), True)])

In [53]:
SparkS = SparkSession.builder.getOrCreate()
train_df = SparkS.createDataFrame(train_feature_vectors_integral_csv_rdd, train_schema)

In [54]:
train_df.show(3)
# 에러 메시지 -> TypeError: field label: IntegerType can not accept object '1' in type <type 'str'>

Py4JJavaError: An error occurred while calling o919.showString.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 23.0 failed 4 times, most recent failure: Lost task 0.3 in stage 23.0 (TID 233, cluster-1129-w-2.us-east1-b.c.numeric-oarlock-223904.internal, executor 26): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/worker.py", line 230, in main
    process()
  File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/worker.py", line 225, in process
    serializer.dump_stream(func(split_index, iterator), outfile)
  File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/serializers.py", line 372, in dump_stream
    vs = list(itertools.islice(iterator, batch))
  File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/util.py", line 55, in wrapper
    return f(*args, **kwargs)
  File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/sql/session.py", line 673, in prepare
  File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/sql/types.py", line 1421, in verify
  File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/sql/types.py", line 1402, in verify_struct
  File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/sql/types.py", line 1421, in verify
  File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/sql/types.py", line 1347, in verify_integer
  File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/sql/types.py", line 1310, in verify_acceptable_types
TypeError: field label: IntegerType can not accept object '1' in type <type 'str'>

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:298)
	at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRunner.scala:438)
	at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRunner.scala:421)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:252)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:439)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:389)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:253)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:247)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$25.apply(RDD.scala:830)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$25.apply(RDD.scala:830)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
	at org.apache.spark.scheduler.Task.run(Task.scala:109)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:345)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:748)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1602)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1590)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1589)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1589)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:831)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:831)
	at scala.Option.foreach(Option.scala:257)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:831)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1823)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1772)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1761)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:642)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2034)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2055)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2074)
	at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:363)
	at org.apache.spark.sql.execution.CollectLimitExec.executeCollect(limit.scala:38)
	at org.apache.spark.sql.Dataset.org$apache$spark$sql$Dataset$$collectFromPlan(Dataset.scala:3273)
	at org.apache.spark.sql.Dataset$$anonfun$head$1.apply(Dataset.scala:2484)
	at org.apache.spark.sql.Dataset$$anonfun$head$1.apply(Dataset.scala:2484)
	at org.apache.spark.sql.Dataset$$anonfun$52.apply(Dataset.scala:3254)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:77)
	at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3253)
	at org.apache.spark.sql.Dataset.head(Dataset.scala:2484)
	at org.apache.spark.sql.Dataset.take(Dataset.scala:2698)
	at org.apache.spark.sql.Dataset.showString(Dataset.scala:254)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)
Caused by: org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/worker.py", line 230, in main
    process()
  File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/worker.py", line 225, in process
    serializer.dump_stream(func(split_index, iterator), outfile)
  File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/serializers.py", line 372, in dump_stream
    vs = list(itertools.islice(iterator, batch))
  File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/util.py", line 55, in wrapper
    return f(*args, **kwargs)
  File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/sql/session.py", line 673, in prepare
  File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/sql/types.py", line 1421, in verify
  File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/sql/types.py", line 1402, in verify_struct
  File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/sql/types.py", line 1421, in verify
  File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/sql/types.py", line 1347, in verify_integer
  File "/usr/lib/spark/python/lib/pyspark.zip/pyspark/sql/types.py", line 1310, in verify_acceptable_types
TypeError: field label: IntegerType can not accept object '1' in type <type 'str'>

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:298)
	at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRunner.scala:438)
	at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRunner.scala:421)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:252)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:439)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:389)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:253)
	at org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:247)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$25.apply(RDD.scala:830)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$25.apply(RDD.scala:830)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
	at org.apache.spark.scheduler.Task.run(Task.scala:109)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:345)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	... 1 more


In [30]:
# 스파크 데이터프레임으로 바뀐 건 확실한데, show가 안 된다ㅠㅠ
type(train_df)

pyspark.sql.dataframe.DataFrame

## examples

rdd를 (머신러닝에 넣기 좋게) 잘 변형한 예

In [49]:
temp_df = spark.createDataFrame([Row(V4366=0.0, V4460=0.232, V4916=-0.017, V1495=-0.104, V1639=0.005, V1967=-0.008, V3049=0.177, V3746=-0.675, V3869=-3.451, V524=0.004, V5409=0), Row(V4366=0.0, V4460=0.111, V4916=-0.003, V1495=-0.137, V1639=0.001, V1967=-0.01, V3049=0.01, V3746=-0.867, V3869=-2.759, V524=0.0, V5409=0), Row(V4366=0.0, V4460=-0.391, V4916=-0.003, V1495=-0.155, V1639=-0.006, V1967=-0.019, V3049=-0.706, V3746=0.166, V3869=0.189, V524=0.001, V5409=0), Row(V4366=0.0, V4460=0.098, V4916=-0.012, V1495=-0.108, V1639=0.005, V1967=-0.002, V3049=0.033, V3746=-0.787, V3869=-0.926, V524=0.002, V5409=0), Row(V4366=0.0, V4460=0.026, V4916=-0.004, V1495=-0.139, V1639=0.003, V1967=-0.006, V3049=-0.045, V3746=-0.208, V3869=-0.782, V524=0.001, V5409=0)])
temp_df.show()

+------+------+------+------+------+------+-----+------+------+-----+-----+
| V1495| V1639| V1967| V3049| V3746| V3869|V4366| V4460| V4916| V524|V5409|
+------+------+------+------+------+------+-----+------+------+-----+-----+
|-0.104| 0.005|-0.008| 0.177|-0.675|-3.451|  0.0| 0.232|-0.017|0.004|    0|
|-0.137| 0.001| -0.01|  0.01|-0.867|-2.759|  0.0| 0.111|-0.003|  0.0|    0|
|-0.155|-0.006|-0.019|-0.706| 0.166| 0.189|  0.0|-0.391|-0.003|0.001|    0|
|-0.108| 0.005|-0.002| 0.033|-0.787|-0.926|  0.0| 0.098|-0.012|0.002|    0|
|-0.139| 0.003|-0.006|-0.045|-0.208|-0.782|  0.0| 0.026|-0.004|0.001|    0|
+------+------+------+------+------+------+-----+------+------+-----+-----+



In [50]:
trainingData=temp_df.rdd.map(lambda x:(Vectors.dense(x[0:-1]), x[-1])).toDF(["features", "label"])
trainingData.show()

+--------------------+-----+
|            features|label|
+--------------------+-----+
|[-0.104,0.005,-0....|    0|
|[-0.137,0.001,-0....|    0|
|[-0.155,-0.006,-0...|    0|
|[-0.108,0.005,-0....|    0|
|[-0.139,0.003,-0....|    0|
+--------------------+-----+



## configuration

스파크 데이터프레임에서 판다스 데이터프레임으로 변환할 때, maxResultSize가 부족하다고 해서 아래와 같이 size를 늘렸는데, 그래도 안 돌아간다

In [23]:
print(conf.get("spark.driver.maxResultSize"))

3328m


In [33]:
conf.set("spark.driver.maxResultSize", "5000m")

<pyspark.conf.SparkConf at 0x7f50b1ad8310>

In [34]:
print(conf.get("spark.driver.maxResultSize"))

5000m


In [None]:
# another code for configuration
# conf = (SparkConf().set("spark.driver.maxResultSize", "5g"))
# sc = SparkContext(conf=conf)

## spark logistic regression

In [38]:
from pyspark.ml.classification import LogisticRegression

# Load training data
training = spark.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")

lr = LogisticRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)

# Fit the model
lrModel = lr.fit(training)

# Print the coefficients and intercept for logistic regression
print("Coefficients: " + str(lrModel.coefficients))
print("Intercept: " + str(lrModel.intercept))

# We can also use the multinomial family for binary classification
mlr = LogisticRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8, family="multinomial")

# Fit the model
mlrModel = mlr.fit(training)

# Print the coefficients and intercepts for logistic regression with multinomial family
print("Multinomial coefficients: " + str(mlrModel.coefficientMatrix))
print("Multinomial intercepts: " + str(mlrModel.interceptVector))

IllegalArgumentException: u'Field "features" does not exist.\nAvailable fields: label, display_id, ad_id, doc_id, doc_event_id, is_leak, event_weekend, user_has_already_viewed_doc, user_views, ad_views, doc_views, doc_event_days_since_published, doc_event_hour, doc_ad_days_since_published, pop_ad_id, pop_ad_id_conf, pop_ad_id_conf_multipl, pop_document_id, pop_document_id_conf, pop_document_id_conf_multipl, pop_publisher_id, pop_publisher_id_conf, pop_publisher_id_conf_multipl, pop_advertiser_id, pop_advertiser_id_conf, pop_advertiser_id_conf_multipl, pop_campain_id, pop_campain_id_conf, pop_campain_id_conf_multipl, pop_doc_event_doc_ad, pop_doc_event_doc_ad_conf, pop_doc_event_doc_ad_conf_multipl, pop_source_id, pop_source_id_conf, pop_source_id_conf_multipl, pop_source_id_country, pop_source_id_country_conf, pop_source_id_country_conf_multipl, pop_entity_id, pop_entity_id_conf, pop_entity_id_conf_multipl, pop_entity_id_country, pop_entity_id_country_conf, pop_entity_id_country_conf_multipl, pop_topic_id, pop_topic_id_conf, pop_topic_id_conf_multipl, pop_topic_id_country, pop_topic_id_country_conf, pop_topic_id_country_conf_multipl, pop_category_id, pop_category_id_conf, pop_category_id_conf_multipl, pop_category_id_country, pop_category_id_country_conf, pop_category_id_country_conf_multipl, user_doc_ad_sim_categories, user_doc_ad_sim_categories_conf, user_doc_ad_sim_categories_conf_multipl, user_doc_ad_sim_topics, user_doc_ad_sim_topics_conf, user_doc_ad_sim_topics_conf_multipl, user_doc_ad_sim_entities, user_doc_ad_sim_entities_conf, user_doc_ad_sim_entities_conf_multipl, doc_event_doc_ad_sim_categories, doc_event_doc_ad_sim_categories_conf, doc_event_doc_ad_sim_categories_conf_multipl, doc_event_doc_ad_sim_topics, doc_event_doc_ad_sim_topics_conf, doc_event_doc_ad_sim_topics_conf_multipl, doc_event_doc_ad_sim_entities, doc_event_doc_ad_sim_entities_conf, doc_event_doc_ad_sim_entities_conf_multipl, ad_advertiser, doc_ad_category_id_1, doc_ad_category_id_2, doc_ad_category_id_3, doc_ad_topic_id_1, doc_ad_topic_id_2, doc_ad_topic_id_3, doc_ad_entity_id_1, doc_ad_entity_id_2, doc_ad_entity_id_3, doc_ad_entity_id_4, doc_ad_entity_id_5, doc_ad_entity_id_6, doc_ad_publisher_id, doc_ad_source_id, doc_event_category_id_1, doc_event_category_id_2, doc_event_category_id_3, doc_event_topic_id_1, doc_event_topic_id_2, doc_event_topic_id_3, doc_event_entity_id_1, doc_event_entity_id_2, doc_event_entity_id_3, doc_event_entity_id_4, doc_event_entity_id_5, doc_event_entity_id_6, doc_event_publisher_id, doc_event_source_id, event_country, event_country_state, event_geo_location, event_hour, event_platform, traffic_source'