In [1]:
# necessary imports
#make sure we don't override time
import time as time_
import pandas
import numpy
import warnings

from collections import Counter, defaultdict

In [2]:
def ignore_warn(*args, **kwargs):
    pass
warnings.warn = ignore_warn

def millis():
    return int(round(time_.time() * 1000))


FILE_PATH = "Filtered_Data.csv"
SEPARATOR = ","

CLICK_THRESHOLD_FOR_FEATURE = 1000
CLICK_THRESHOLD_FOR_DOMAIN = 50
CONVERSION_THRESHOLD_FOR_FEATURE = 1000
CONVERSION_THRESHOLD_FOR_DOMAIN = 5
PUBLISHER_DOMAIN = 'enriched_publisher_domain'

MAX_ROWS = 1500
MISCELLANEOUS = 'MISC'
FEATURE_NAMES = ['ad_advertiser_id', 'analyzer_name', 'click_browser', 'enriched_derived_device', 'enriched_derived_os', 'raw_sync_partner_id']

# This line if temporarily for debugging
web_urls = ['www.msn.com', 'www.collegescholarships.org', 'gotquestion.com', 'homebidz.co', 'searchads.local.com', 'www.dg,coupon.com']

# imported the CSV data
dataframe = pandas.read_csv(FILE_PATH, sep = SEPARATOR)


# Drop the unused column
if 'Unnamed: 0' in dataframe.columns:
    dataframe.drop('Unnamed: 0', axis = 1, inplace = True)

    
# Remove the same prefix from all columns names
dataframe.columns  = [ column_name.split('.')[1] for column_name in dataframe.columns ]

# This line only for debugging
# dataframe = dataframe.loc[dataframe['enriched_publisher_domain'].isin(web_urls)]

pandas.set_option('display.max_rows', MAX_ROWS)
dataframe.head()

Unnamed: 0,ts,impression_system_date,hour,conversion_status,cc_id_null_ind,ad_id,click_id,cc_id,hour_of_day,raw_publisher_id,...,click_conf_gmt_offset,click_state,click_city,ad_bid,ad_cpa_goal,ad_market_place_id,click_click_status,kmean_category_name,partner_id,version
0,20190106,2019-01-06,1,1,0,138154,646739319951006038,546739320882001606,20,8CUDMGV55,...,-500,ga,atlanta,0.15,,1027,1,missing,8PRHGG6T9,1
1,20190106,2019-01-06,1,1,0,138154,646739319951006038,546739320882001606,20,8CUDMGV55,...,-500,ga,atlanta,0.15,,1027,1,missing,8PRHGG6T9,1
2,20190106,2019-01-06,2,0,0,138154,646740715393002311,346740715723006848,20,8CUDMGV55,...,-600,al,birmingham,0.15,,1027,1,missing,8PRHGG6T9,1
3,20190106,2019-01-06,4,0,0,138154,246748175931002645,446748176329007771,23,8CUDMGV55,...,-500,in,ft wayne,0.15,,1027,1,missing,8PRHGG6T9,1
4,20190106,2019-01-06,6,0,0,128699,246754980029002839,646754981260002933,22,8CUX8874N,...,-800,ca,mountain view,3.394224,,1027,1,missing,8PRHGG6T9,1


In [3]:
# removing all data having single values in entire columns
list_of_single_value_columns = []
for column in dataframe.columns:
    if(dataframe[column].nunique() == 1):
        dataframe.drop(column, axis = 1, inplace = True)
        list_of_single_value_columns.append(column)

print (list_of_single_value_columns)

['cc_id_null_ind', 'serving_key', 'ad_partner_id', 'ad_market_place_id', 'click_click_status', 'kmean_category_name', 'version']


In [4]:
# Remove the timestamp data and impression system date data - results not dependent on the time frame
dataframe.drop('ts', axis = 1, inplace = True)
dataframe.drop('impression_system_date', axis = 1, inplace = True)

# Server hour do not matters
dataframe.drop('hour', axis = 1, inplace = True)

# Data have only 3 values of *au* region, and conversion status 0 for all of them  --- temp commented
dataframe.drop('enriched_country', axis = 1, inplace = True)

# Data is similar to cvr_base_data_table.click_city
dataframe.drop('impression_city', axis = 1, inplace = True)

# Ad-Bid is the cost which is irrespective of conversion from user side
dataframe.drop('ad_bid', axis = 1, inplace = True)

# Ad CPA goal do not affects the conversion status
dataframe.drop('ad_cpa_goal', axis = 1, inplace = True)

# Click Conf GMT Offset do not affects the conversion status
dataframe.drop('click_conf_gmt_offset', axis = 1, inplace = True)

# Date Time do not affects and is too granuler to looku[, thus do not affects the conversion status
dataframe.drop('click_user_datetime', axis = 1, inplace = True)

# Publisher Domain do not affects the conversion status -- this is what we need to find
# dataframe.drop('enriched_publisher_domain', axis = 1, inplace = True)

# Impression ISP Name do not affects the conversion status
dataframe.drop('impression_isp_name', axis = 1, inplace = True)

# below data is too granular to predict something
dataframe.drop('ad_keyword', axis = 1, inplace = True)
dataframe.drop('click_city', axis = 1, inplace = True)
dataframe.drop('raw_sub_publisher_id', axis = 1, inplace = True)
dataframe.drop('category_name', axis = 1, inplace = True)
dataframe.drop('click_id', axis = 1, inplace = True)
dataframe.drop('cc_id', axis = 1, inplace = True)
dataframe.drop('raw_publisher_id', axis = 1, inplace = True)
dataframe.drop('raw_sub_sub_publisher_id', axis = 1, inplace = True)

In [5]:
# Function for aggregation on basis of click, conversion and CVR(conversion ratio)
func = {'conversion_status' : {
    'conversion' : 'sum',
    'click' : 'count'
}}

In [6]:
# Iterate through above columns list and 'misc' the data which is too sparse to use 
# for classification or modelling
def mark_msic_values(cur_dataframe, feature_name, click_threshold, conversion_threshold):
    cur_dfg = cur_dataframe.groupby(feature_name).agg(func).reset_index()
    cur_dfg.columns = cur_dfg.columns.droplevel(0)
    cur_dfg['cvr'] = cur_dfg['conversion'] / cur_dfg['click']
    declareMisc = []
    for index, row in cur_dfg.iterrows():
        if(row['click'] < click_threshold or row['conversion'] < conversion_threshold):
            declareMisc.append(row[''])
    print (declareMisc)
    for field_value in declareMisc:
        cur_dataframe.loc[cur_dataframe[feature_name] == field_value, feature_name] = MISCELLANEOUS
    return cur_dataframe

In [7]:
# Iterate through above columns list and 'misc' the data which is too sparse to use 
# for classification or modelling
for cur_column in FEATURE_NAMES:
    dataframe = mark_msic_values(dataframe, cur_column, CLICK_THRESHOLD_FOR_FEATURE, CONVERSION_THRESHOLD_FOR_FEATURE)
dataframe = mark_msic_values(dataframe, PUBLISHER_DOMAIN, CLICK_THRESHOLD_FOR_DOMAIN, CONVERSION_THRESHOLD_FOR_DOMAIN)

[1351.0, 1352.0, 1375.0, 1387.0, 1454.0, 1458.0, 1492.0, 1517.0, 1518.0, 1521.0, 1608.0, 1619.0, 1630.0, 1633.0, 1637.0, 1640.0, 1641.0]
['broadanalyserv2', 'categoryanalyser', 'contextualanalyser', 'forcedanalyser', 'nofilteranalyser', 'phraseanalyser']
['android,browser', 'android,httpurlconnection', 'applewebkit', 'blackberry,browser', 'chromium', 'edge', 'ie,mobile', 'internet,explorer', 'maxthon', 'mozilla,firefox', 'mozilla,like,unknown', 'nokia', 'opera', 'opera,mini', 'silk', 'ucbrowser', 'unknown']
['tablet', 'unknown,device']
['linux', 'mac', 'unknown,os']
[2.0, 4.0, 11.0]
['03.aba,architects.com', '1061669235,mopub,app.imnapp', '1079life.com', '10offers.net', '1600daily.com', '2016discounts.com', '2017prices.com', '2018discounts.com', '2018prices.com', '2019bmwcars.com', '2019fordrangerraptor.com', '302324249,mopub,app.imnapp', '315241195,mopub,app.imnapp', '319881193,mopub,app.imnapp', '367506176,mopub,app.imnapp', '372648912,mopub,app.imnapp', '4nissanfanz.com', 'a.msn.com

In [8]:
# Now, we need to remove the outliers for certain feature colums
# We will use standard method for removing the outliers
# called as Tukeys method for determining the data need to be removed
# from the original data
# As outliers should be never considered for analysis

In [9]:
# classfiy Miscellaneous points
def get_index_of_outliers(cur_dataframe, feature_name, cur_outlier_indexes):
    cur_dfg = cur_dataframe.groupby(feature_name).agg(func).reset_index()
    cur_dfg.columns = cur_dfg.columns.droplevel(0)
    cur_dfg['cvr'] = cur_dfg['conversion'] / cur_dfg['click']
    range1 = numpy.percentile(cur_dfg['cvr'], 25)
    range2 = numpy.percentile(cur_dfg['cvr'], 75)
    step = 1.5 * (range2 - range1)
    lower_limit = range1 - step
    upper_limit = range2 + step
    cur_indexes = cur_dfg[(cur_dfg['cvr'] < lower_limit) | (cur_dfg['cvr'] > upper_limit)].index
    
    if(len(cur_indexes) == 0):
        return cur_outlier_indexes
    
    label_names = set()
    for idx in cur_indexes:
        label_names.add(cur_dfg.iloc[idx][''])
    
    for feature_value in label_names:
        indexes_in_original_dataframe = cur_dataframe[cur_dataframe[feature_name] == feature_value].index.tolist()
        for index_in_cur_df in indexes_in_original_dataframe:
            cur_outlier_indexes.add(index_in_cur_df)
    return cur_outlier_indexes

In [11]:
final_outlier_indexes = set()

for feature in FEATURE_NAMES:
    final_outlier_indexes = get_index_of_outliers(dataframe, feature, final_outlier_indexes)

final_outlier_indexes = list(final_outlier_indexes)
dataframe = dataframe.drop(final_outlier_indexes, axis = 0).reset_index(drop = True)

print (final_outlier_indexes)

[]


In [46]:
epsilon = float(.1)

def equal_dataframes(cur_df1, cur_df2):
    if(len(cur_df1) != len(cur_df2)):
        return False
    for index, row in cur_df1.iterrows():
        if(abs(cur_df1.iloc[index]['cvr'] - cur_df2.iloc[index]['cvr']) > epsilon):
            return False
    return True

def calcualte_mean(current_dataframe, idx):
    feature_map_sum = defaultdict(float)
    feature_map_count = defaultdict(float)
    feature_map_mean = {}
    idx = int(idx)
    for index, row in current_dataframe.iterrows():
        feature_map_count[row[idx]] = feature_map_count[row[idx]] + int(1)
        feature_map_sum[row[idx]] = feature_map_sum[row[idx]] + row['cvr']
    for key in feature_map_count.keys():
        try:
            feature_map_mean[key] = (1.0 / ((feature_map_sum[key] / feature_map_count[key])))
        except:
            feature_map_mean[key] = 0.0
            print ('Zero Value dectected', key)
    return feature_map_mean
    
def get_normalized_score(original_dataframe, main_feature, bias_feature):
    ct = int(0)
    df_list = []
    original_dfg = original_dataframe.groupby([main_feature, bias_feature]).agg(func).reset_index()
    original_dfg.columns = original_dfg.columns.droplevel(0)
    original_dfg['cvr'] = original_dfg['conversion'] / original_dfg['click']

    current_dfg = original_dfg.copy()
    last_dfg = pandas.DataFrame()
    while(True):        
        last_dfg = current_dfg.copy()
        
        mainF_map_mean = calcualte_mean(current_dfg, 0)
        biasF_map_mean = calcualte_mean(current_dfg, 1)
        
        cur_copy = original_dfg.copy()
        current_dfg['cvr'] = cur_copy.apply(lambda row : float(row['cvr'] * mainF_map_mean[row['']]), axis = 1)
#         for index, row in current_dfg.iterrows():
            
#             current_dfg.set_value(index, 'cvr', float(original_dfg.loc[index][4] * mainF_map_mean[original_dfg.loc[index][0]]))
        
        df_list.append(current_dfg.copy())
        if(equal_dataframes(current_dfg, last_dfg)):
            return df_list, calcualte_mean(current_dfg, 0)
        if(ct == 1000):
            return df_list, calcualte_mean(current_dfg, 0)
        ct += 1
        print (ct)
    return df_list, calcualte_mean(current_dfg, 0)

In [47]:
df_set, map_val = get_normalized_score(dataframe, 'enriched_publisher_domain', 'ad_advertiser_id')

TypeError: ("'Series' objects are mutable, thus they cannot be hashed", 'occurred at index 0')

In [26]:
final_df = dataframe.groupby('enriched_publisher_domain').agg(func).reset_index()
final_df.columns = final_df.columns.droplevel(0)
final_df['cvr'] = final_df['conversion'] / final_df['click']
final_df['Actual_score'] = float(0.0)


for index, row in final_df.iterrows():
    final_df.at[index, 'Actual_score'] = float(map_val[final_df.loc[index][0]])
final_df.sort_values(by = ['Actual_score'], ascending = False)

Unnamed: 0,Unnamed: 1,conversion,click,cvr,Actual_score
0,247wallst.com,13,1006,0.012922,154.615385
75,www.housinglist.com,13,1712,0.007593,131.692308
39,related.homebidz.co,16,1776,0.009009,111.0
20,healthorigins.com,10,811,0.01233,81.1
48,searchanswers.net,5,348,0.014368,69.6
52,ssofficelocations.org,5,336,0.014881,67.2
80,www.newretirement.com,8,268,0.029851,66.75
62,www.cardrewards.net,15,962,0.015593,64.133333
30,m.nasdaq.com,21,1311,0.016018,62.428571
2,allfinance411.com,26,1559,0.016677,59.961538
