In [184]:
# necessary imports
#make sure we don't override time
import time as time_
import pandas
import numpy
import warnings

from collections import Counter, defaultdict

In [185]:
def ignore_warn(*args, **kwargs):
    """No-op stand-in for ``warnings.warn``: accepts any arguments, returns None."""
    return None

# Point the warnings.warn attribute at the no-op above, so calls made through
# that attribute no longer emit anything.
warnings.warn = ignore_warn

def millis():
    """Return the current epoch time, rounded to whole milliseconds, as an int."""
    seconds_since_epoch = time_.time()
    milliseconds = round(seconds_since_epoch * 1000)
    return int(milliseconds)


# --- Input file --------------------------------------------------------------
FILE_PATH = "Filtered_Data.csv"
SEPARATOR = ","

# --- Sparsity thresholds -----------------------------------------------------
# A value whose click count or conversion count is below the matching
# threshold is replaced by MISCELLANEOUS (see mark_msic_values).
CLICK_THRESHOLD_FOR_FEATURE = 1000
CLICK_THRESHOLD_FOR_DOMAIN = 10
CONVERSION_THRESHOLD_FOR_FEATURE = 100
CONVERSION_THRESHOLD_FOR_DOMAIN = 5
PUBLISHER_DOMAIN = 'enriched_publisher_domain'

# Value passed to pandas' 'display.max_rows' option below
MAX_ROWS = 1500
MISCELLANEOUS = 'MISC'
FEATURE_NAMES = ['ad_advertiser_id', 'analyzer_name', 'click_browser', 'enriched_derived_device', 'enriched_derived_os', 'raw_sync_partner_id']

# This list is temporary and only used for debugging
web_urls = ['www.msn.com', 'www.collegescholarships.org', 'gotquestion.com', 'homebidz.co', 'searchads.local.com', 'www.dg,coupon.com']

# Import the CSV data
dataframe = pandas.read_csv(FILE_PATH, sep = SEPARATOR)


# Drop the unused column
if 'Unnamed: 0' in dataframe.columns:
    dataframe.drop('Unnamed: 0', axis = 1, inplace = True)


# Remove the shared prefix (everything up to the first '.') from all column
# names. Splitting once and keeping the last piece leaves a name without a
# dot untouched instead of raising IndexError, and keeps any further dots
# that belong to the column name itself.
dataframe.columns = [ column_name.split('.', 1)[-1] for column_name in dataframe.columns ]

# Debugging only: restrict the data to the domains listed in web_urls
# dataframe = dataframe.loc[dataframe['enriched_publisher_domain'].isin(web_urls)]

pandas.set_option('display.max_rows', MAX_ROWS)
dataframe.head()

Unnamed: 0,ts,impression_system_date,hour,conversion_status,cc_id_null_ind,ad_id,click_id,cc_id,hour_of_day,raw_publisher_id,...,click_conf_gmt_offset,click_state,click_city,ad_bid,ad_cpa_goal,ad_market_place_id,click_click_status,kmean_category_name,partner_id,version
0,20190106,2019-01-06,1,1,0,138154,646739319951006038,546739320882001606,20,8CUDMGV55,...,-500,ga,atlanta,0.15,,1027,1,missing,8PRHGG6T9,1
1,20190106,2019-01-06,1,1,0,138154,646739319951006038,546739320882001606,20,8CUDMGV55,...,-500,ga,atlanta,0.15,,1027,1,missing,8PRHGG6T9,1
2,20190106,2019-01-06,2,0,0,138154,646740715393002311,346740715723006848,20,8CUDMGV55,...,-600,al,birmingham,0.15,,1027,1,missing,8PRHGG6T9,1
3,20190106,2019-01-06,4,0,0,138154,246748175931002645,446748176329007771,23,8CUDMGV55,...,-500,in,ft wayne,0.15,,1027,1,missing,8PRHGG6T9,1
4,20190106,2019-01-06,6,0,0,128699,246754980029002839,646754981260002933,22,8CUX8874N,...,-800,ca,mountain view,3.394224,,1027,1,missing,8PRHGG6T9,1


In [186]:
# Remove every column that holds one single distinct value across all rows;
# the names of the removed columns are printed for reference.
list_of_single_value_columns = [
    column for column in dataframe.columns
    if dataframe[column].nunique() == 1
]
dataframe.drop(list_of_single_value_columns, axis = 1, inplace = True)

print (list_of_single_value_columns)

['cc_id_null_ind', 'serving_key', 'ad_partner_id', 'ad_market_place_id', 'click_click_status', 'kmean_category_name', 'version']


In [187]:
# Columns removed before the analysis, listed in removal order together with
# the reason each one is discarded.
# Note: 'enriched_publisher_domain' is deliberately NOT dropped -- it is the
# feature this notebook sets out to evaluate.
columns_to_remove = [
    # Timestamp and impression system date: results do not depend on the time frame
    'ts',
    'impression_system_date',
    # Server hour does not matter
    'hour',
    # Data has only 3 rows for the *au* region, all with conversion status 0
    'enriched_country',
    # Similar to cvr_base_data_table.click_city
    'impression_city',
    # Ad bid is a cost that is independent of conversion on the user side
    'ad_bid',
    # Ad CPA goal does not affect the conversion status
    'ad_cpa_goal',
    # Click conf GMT offset does not affect the conversion status
    'click_conf_gmt_offset',
    # Date-time is too granular to look up, so it does not affect the conversion status
    'click_user_datetime',
    # Impression ISP name does not affect the conversion status
    'impression_isp_name',
    # The columns below are too granular to predict anything
    'ad_keyword',
    'click_city',
    'raw_sub_publisher_id',
    'category_name',
    'click_id',
    'cc_id',
    'raw_publisher_id',
    'raw_sub_sub_publisher_id',
]

for column_name in columns_to_remove:
    dataframe.drop(column_name, axis = 1, inplace = True)

In [188]:
# Aggregation spec used as groupby(...).agg(func). For the 'conversion_status'
# column it produces two output columns:
#   'conversion' -> sum of conversion_status
#   'click'      -> count of conversion_status entries
# CVR (conversion ratio) is derived by the callers as conversion / click.
# The spec is a list of (output name, aggregation) pairs rather than a nested
# dict: the nested-dict renaming form raises SpecificationError since pandas 1.0,
# while the pair form yields the same column layout.
func = {'conversion_status' : [
    ('conversion', 'sum'),
    ('click', 'count')
]}

In [189]:
# Replace the values of one feature that are too sparse to use for
# classification or modelling with the MISCELLANEOUS bucket
def mark_msic_values(cur_dataframe, feature_name, click_threshold, conversion_threshold):
    """Bucket sparse values of ``feature_name`` into ``MISCELLANEOUS``.

    The frame is grouped by ``feature_name`` and aggregated with the
    module-level ``func`` spec, giving ``conversion`` and ``click`` per value.
    Every value whose click count is below ``click_threshold`` OR whose
    conversion count is below ``conversion_threshold`` is overwritten with
    ``MISCELLANEOUS`` in ``cur_dataframe``.

    Parameters:
        cur_dataframe: frame containing ``feature_name`` and the column(s)
            that ``func`` aggregates. It is modified in place.
        feature_name: name of the column to clean.
        click_threshold: minimum number of clicks for a value to be kept.
        conversion_threshold: minimum number of conversions for a value to be kept.

    Returns:
        The same ``cur_dataframe`` object, after modification.

    Side effects:
        Prints the first 10 rows of the per-value summary and the list of
        values that were bucketed.
    """
    cur_dfg = cur_dataframe.groupby(feature_name).agg(func).reset_index()
    # Dropping the top column level leaves the grouping column labelled ''.
    cur_dfg.columns = cur_dfg.columns.droplevel(0)
    cur_dfg['cvr'] = cur_dfg['conversion'] / cur_dfg['click']

    print (cur_dfg.head(10))

    # Select the sparse values with a boolean mask instead of iterrows():
    # iterrows() converts each row to a single dtype, which turned integer
    # labels into floats (1351 -> 1351.0).
    is_sparse = (cur_dfg['click'] < click_threshold) | (cur_dfg['conversion'] < conversion_threshold)
    declareMisc = cur_dfg.loc[is_sparse, ''].tolist()
    print (declareMisc)

    if declareMisc:
        # The bucket label is a string; make the column object-typed first so
        # writing it into a numeric column is an explicit, supported operation.
        if cur_dataframe[feature_name].dtype != object:
            cur_dataframe[feature_name] = cur_dataframe[feature_name].astype(object)
        rows_to_mark = cur_dataframe[feature_name].isin(declareMisc)
        cur_dataframe.loc[rows_to_mark, feature_name] = MISCELLANEOUS
    return cur_dataframe

In [190]:
# Bucket sparse values into MISCELLANEOUS: first every column in FEATURE_NAMES
# with the feature-level thresholds, then the publisher domain with its own
# domain-level thresholds.
sparsity_rules = [
    (feature_column, CLICK_THRESHOLD_FOR_FEATURE, CONVERSION_THRESHOLD_FOR_FEATURE)
    for feature_column in FEATURE_NAMES
]
sparsity_rules.append((PUBLISHER_DOMAIN, CLICK_THRESHOLD_FOR_DOMAIN, CONVERSION_THRESHOLD_FOR_DOMAIN))

for rule_column, min_clicks, min_conversions in sparsity_rules:
    dataframe = mark_msic_values(dataframe, rule_column, min_clicks, min_conversions)
dataframe

         conversion  click       cvr
0  1351           7    252  0.027778
1  1352         354   3192  0.110902
2  1375           0   4468  0.000000
3  1387          22  30042  0.000732
4  1454           0   3199  0.000000
5  1458         800  21280  0.037594
6  1461        1412   4666  0.302615
7  1492         576   2985  0.192965
8  1517           0   1890  0.000000
9  1518         685   3869  0.177048
[1351.0, 1375.0, 1387.0, 1454.0, 1517.0, 1521.0, 1630.0, 1633.0, 1640.0]
                       conversion   click       cvr
0     broadanalyserv2          12     233  0.051502
1    categoryanalyser          42     352  0.119318
2  contextualanalyser         233    1469  0.158611
3      forcedanalyser           6    1904  0.003151
4   keywordanalyserv2        6521  104031  0.062683
5    nofilteranalyser           1    8809  0.000114
6      phraseanalyser         486    7537  0.064482
['broadanalyserv2', 'categoryanalyser', 'forcedanalyser', 'nofilteranalyser']
                          

Unnamed: 0,conversion_status,ad_id,hour_of_day,enriched_derived_device,enriched_derived_os,enriched_publisher_domain,ad_adgroup_id,ad_advertiser_id,ad_campaign_id,impression_state,impression_asn_code,impression_connection_type,analyzer_name,click_browser,raw_sync_partner_id,click_state,partner_id
0,1,138154,20,mobile,android,"www.dg,coupon.com",94663,1518,86975,ga,22394,mobile,keywordanalyserv2,"google,chrome",1,ga,8PRHGG6T9
1,1,138154,20,mobile,android,"www.dg,coupon.com",94663,1518,86975,ga,22394,mobile,keywordanalyserv2,"google,chrome",1,ga,8PRHGG6T9
2,0,138154,20,mobile,android,"www.dg,coupon.com",94663,1518,86975,al,21928,mobile,keywordanalyserv2,"google,chrome",1,al,8PRHGG6T9
3,0,138154,23,mobile,android,"www.dg,coupon.com",94663,1518,86975,in,7922,cable,keywordanalyserv2,"google,chrome",1,in,8PRHGG6T9
4,0,128699,22,mobile,android,MISC,81679,1458,73823,ca,15169,broadband,keywordanalyserv2,"google,chrome",1,ca,8PRHGG6T9
5,0,150368,22,mobile,android,MISC,99434,1637,87260,ca,15169,broadband,keywordanalyserv2,"google,chrome",1,ca,8PRHGG6T9
6,0,138155,23,mobile,android,"www.dg,coupon.com",94663,1518,86975,ca,7018,xdsl,keywordanalyserv2,"google,chrome",1,ca,8PRHGG6T9
7,1,138155,7,mobile,android,"www.dg,coupon.com",94663,1518,86975,tx,21928,mobile,keywordanalyserv2,"google,chrome",1,tx,8PRHGG6T9
8,0,138155,8,mobile,android,"www.dg,coupon.com",94663,1518,86975,in,22394,mobile,keywordanalyserv2,"google,chrome",1,in,8PRHGG6T9
9,0,150711,8,desktop,windows,MISC,100095,1641,87285,tx,11343,broadband,keywordanalyserv2,"mozilla,firefox",1,tx,8PRN625DH


In [191]:
# Next, we need to remove the outliers for certain feature columns.
# We use a standard method for removing outliers,
# known as Tukey's method, to determine which data needs to be removed
# from the original data,
# because outliers should not be considered in the analysis.

In [192]:
# Collect the row labels that belong to outlier values of one feature
def get_index_of_outliers(cur_dataframe, feature_name, cur_outlier_indexes, iqr_multiplier = 3.0):
    """Add the index labels of rows whose feature value is a CVR outlier.

    The frame is grouped by ``feature_name`` and aggregated with the
    module-level ``func`` spec; ``cvr = conversion / click`` is computed per
    value. A value is an outlier when its cvr lies below
    ``Q1 - iqr_multiplier * (Q3 - Q1)`` or above
    ``Q3 + iqr_multiplier * (Q3 - Q1)``, where Q1/Q3 are the 25th/75th
    percentiles of the per-value cvr.

    Parameters:
        cur_dataframe: frame containing ``feature_name`` and the column(s)
            that ``func`` aggregates. It is not modified.
        feature_name: name of the column to inspect.
        cur_outlier_indexes: set of index labels collected so far. It is
            updated in place.
        iqr_multiplier: width of the fences in units of the interquartile
            range. Defaults to 3.0, the value that was previously hard-coded.

    Returns:
        The same ``cur_outlier_indexes`` set, extended with the index labels
        of every row of ``cur_dataframe`` whose value is an outlier.

    Side effects:
        Prints the set of outlier values when there is at least one.
    """
    cur_dfg = cur_dataframe.groupby(feature_name).agg(func).reset_index()
    # Dropping the top column level leaves the grouping column labelled ''.
    cur_dfg.columns = cur_dfg.columns.droplevel(0)
    cur_dfg['cvr'] = cur_dfg['conversion'] / cur_dfg['click']

    # No groups means no percentiles can be computed and nothing can be an outlier.
    if cur_dfg.empty:
        return cur_outlier_indexes

    range1 = numpy.percentile(cur_dfg['cvr'], 25)
    range2 = numpy.percentile(cur_dfg['cvr'], 75)
    step = iqr_multiplier * (range2 - range1)
    lower_limit = range1 - step
    upper_limit = range2 + step

    is_outlier = (cur_dfg['cvr'] < lower_limit) | (cur_dfg['cvr'] > upper_limit)
    if not is_outlier.any():
        return cur_outlier_indexes

    # Read the labels straight from the key column: going through a whole-row
    # lookup would coerce integer labels to floats.
    label_names = set(cur_dfg.loc[is_outlier, ''])
    print (label_names)

    rows_with_outlier_value = cur_dataframe[feature_name].isin(list(label_names))
    cur_outlier_indexes.update(cur_dataframe.index[rows_with_outlier_value].tolist())
    return cur_outlier_indexes

In [193]:
# Gather the index labels of all outlier rows across the feature columns,
# then drop those rows and renumber the remaining ones from 0.
collected_outlier_rows = set()
for feature in FEATURE_NAMES:
    collected_outlier_rows = get_index_of_outliers(dataframe, feature, collected_outlier_rows)

final_outlier_indexes = list(collected_outlier_rows)
rows_kept = dataframe.drop(final_outlier_indexes, axis = 0)
dataframe = rows_kept.reset_index(drop = True)


{'internet,explorer'}


In [231]:
# Largest absolute difference at which two values still count as equal
epsilon = float(.1)

def equal_dataframes(cur_df1, cur_df2, tolerance = None):
    """Tell whether two frames have pairwise-close ``cvr`` columns.

    Rows are compared by position (first with first, second with second, ...),
    whatever the index labels of either frame are.

    Parameters:
        cur_df1, cur_df2: frames with a ``cvr`` column.
        tolerance: largest allowed absolute difference per row. Defaults to
            the module-level ``epsilon``.

    Returns:
        False when the frames differ in length or any pair of cvr values
        differs by more than ``tolerance``; True otherwise (including when
        both frames are empty).
    """
    if tolerance is None:
        tolerance = epsilon
    if(len(cur_df1) != len(cur_df2)):
        return False
    if(len(cur_df1) == 0):
        return True
    # Compare the raw arrays so that index labels play no role; using the
    # labels as positions fails for any index that is not 0..n-1.
    gaps = numpy.abs(cur_df1['cvr'].values - cur_df2['cvr'].values)
    return not bool((gaps > tolerance).any())
def _equal_maps(map1, map2):
    if(len(map1) != len(map2)):
        return False
    k1 = map1.keys()
    k2 = map2.keys()
    
    for key in map1.keys():
        if(abs(map1[key] - map2[key]) > epsilon):
            return False
    return True

def _calculate_mean(current_dfg):
    print ('Inside _calculate_mean')
    f1_sum = defaultdict(float)
    f2_sum = defaultdict(float)
    f1_count = defaultdict(int)
    f2_count = defaultdict(int)
    
    print ('Inside _calculate_mean: Going in for loop')
    for index, row in current_dfg.iterrows():
        f1_sum[current_dfg.loc[index][0]] += row['cvr']
        f1_count[current_dfg.loc[index][0]] += 1
        f2_sum[current_dfg.loc[index][1]] += row['cvr']
        f2_count[current_dfg.loc[index][1]] += 1
    
    f1_mean = defaultdict(float)
    f2_mean = defaultdict(float)
    print ('Inside _calculate_mean: Calculating mean for both values')
    for key in f1_sum.keys():
        f1_mean[key] = f1_sum[key] / f1_count[key]
    for key in f2_sum.keys():
        f2_mean[key] = f2_sum[key] / f2_count[key]
    
    print ('Inside _calculate_mean: Done')
    return f1_mean, f2_mean

def _normalize_score(cur_dataframe, main_feature, bias_feature, max_iterations = 1000):
    """Iteratively derive one score per ``main_feature`` value.

    The frame is grouped by ``(main_feature, bias_feature)`` with the
    module-level ``func`` spec and ``cvr = conversion / click`` is computed
    per pair. Every main-feature value starts with a multiplier of 1.0.
    Each round then:

    1. rescales the original cvr of every pair to
       ``cvr * multiplier[main] / mean_cvr[main]`` (a pair whose main-feature
       mean is 0 keeps its original cvr and a message is printed);
    2. averages the rescaled cvr per bias-feature value;
    3. averages those bias-feature averages over the pairs of each
       main-feature value.

    The result of step 3 becomes the next multiplier. Iteration stops when
    ``_equal_maps`` reports that it matches the current multiplier.

    Parameters:
        cur_dataframe: frame containing both feature columns and the
            column(s) that ``func`` aggregates. It is not modified.
        main_feature: name of the column to score.
        bias_feature: name of the column whose influence is averaged out.
        max_iterations: maximum number of rounds; ``None`` removes the limit.

    Returns:
        ``defaultdict(float)`` mapping each main-feature value to the
        multiplier that was current when convergence was detected.

    Raises:
        RuntimeError: if no convergence is reached within ``max_iterations``
            rounds.

    Side effects:
        Prints progress messages.
    """
    print ('Inside _normalize_score')

    grouping_columns = [main_feature, bias_feature]
    orginal_dfg = cur_dataframe.groupby(grouping_columns).agg(func).reset_index()
    orginal_dfg.columns = orginal_dfg.columns.droplevel(0)
    orginal_dfg['cvr'] = orginal_dfg['conversion'] / orginal_dfg['click']
    # Column positions are now:
    #   0 main feature, 1 bias feature, 2 conversion, 3 click, 4 cvr
    # Both key columns lose their name in droplevel, so they are read by position.
    main_keys = orginal_dfg.iloc[:, 0].tolist()
    bias_keys = orginal_dfg.iloc[:, 1].tolist()
    base_cvr = orginal_dfg['cvr'].tolist()

    print ('Inside _normalize_score: Calculating mean of basic variables')
    # Only the per-main-feature mean is used below.
    mainF_mean, _ = _calculate_mean(orginal_dfg)

    print ('Inside _normalize_score: Putting 1 values in multiplier')
    multiplier = defaultdict(float)
    for main_key in main_keys:
        multiplier[main_key] = 1.0

    print ('Inside _normalize_score: now going in while loop')
    rounds_done = 0
    while(True):
        if max_iterations is not None and rounds_done >= max_iterations:
            raise RuntimeError(
                '_normalize_score did not converge within %d iterations' % max_iterations
            )
        rounds_done += 1

        # Step 1: new_value = original_value * multiplier / mean of the main feature
        new_cvr = []
        for main_key, original_value in zip(main_keys, base_cvr):
            main_mean = mainF_mean[main_key]
            if main_mean == 0:
                # Division is impossible; keep the original value for this pair.
                print ('Zero value came unexpected')
                new_cvr.append(original_value)
            else:
                new_cvr.append(original_value * multiplier[main_key] / main_mean)

        # Step 2: average of the new values per bias-feature value
        sum_of_ad_id = defaultdict(float)
        count_of_ad_id = defaultdict(int)
        for bias_key, value in zip(bias_keys, new_cvr):
            sum_of_ad_id[bias_key] += value
            count_of_ad_id[bias_key] += 1

        average_of_ad_id = defaultdict(float)
        for key in sum_of_ad_id.keys():
            average_of_ad_id[key] = sum_of_ad_id[key] / count_of_ad_id[key]

        # Step 3: average of the bias-feature averages per main-feature value
        sum_of_domain = defaultdict(float)
        count_domain = defaultdict(int)
        for main_key, bias_key in zip(main_keys, bias_keys):
            sum_of_domain[main_key] += average_of_ad_id[bias_key]
            count_domain[main_key] += 1
        mean_of_domain = defaultdict(float)
        for key in sum_of_domain.keys():
            mean_of_domain[key] = sum_of_domain[key] / count_domain[key]

        # Compare the new values with the old ones
        if(_equal_maps(mean_of_domain, multiplier)):
            break
        multiplier = mean_of_domain
    return multiplier

In [232]:
# Compute the normalised score map with the publisher domain as the main
# feature and the advertiser id as the bias feature
map_val = _normalize_score(dataframe, 'enriched_publisher_domain', 'ad_advertiser_id')

Zero value came unexpected
Zero value came unexpected
Zero value came unexpected
Zero value came unexpected
Zero value came unexpected
Zero value came unexpected
Zero value came unexpected
Zero value came unexpected
Zero value came unexpected
Zero value came unexpected
Zero value came unexpected
Zero value came unexpected
Zero value came unexpected
Zero value came unexpected
Zero value came unexpected
Zero value came unexpected
Zero value came unexpected
Zero value came unexpected
Zero value came unexpected
Zero value came unexpected
Zero value came unexpected
Zero value came unexpected
Zero value came unexpected
Zero value came unexpected
Zero value came unexpected
Zero value came unexpected
Zero value came unexpected
Zero value came unexpected
Zero value came unexpected
Zero value came unexpected
Zero value came unexpected
Zero value came unexpected
Zero value came unexpected
Zero value came unexpected
Zero value came unexpected
Zero value came unexpected
Zero value came unexpected
Z

Zero value came unexpected
Zero value came unexpected
Zero value came unexpected
Zero value came unexpected
Zero value came unexpected
Zero value came unexpected
Zero value came unexpected
Zero value came unexpected
Zero value came unexpected
Zero value came unexpected
Zero value came unexpected
Zero value came unexpected
Zero value came unexpected
Zero value came unexpected
Zero value came unexpected
Zero value came unexpected
Zero value came unexpected
Zero value came unexpected
Zero value came unexpected
Zero value came unexpected
Zero value came unexpected
Zero value came unexpected
Zero value came unexpected
Zero value came unexpected
Zero value came unexpected
Zero value came unexpected
Zero value came unexpected
Zero value came unexpected
Zero value came unexpected
Zero value came unexpected
Zero value came unexpected
Zero value came unexpected
Zero value came unexpected
Zero value came unexpected
Zero value came unexpected
Zero value came unexpected
Zero value came unexpected
Z

Zero value came unexpected
Zero value came unexpected
Zero value came unexpected
Zero value came unexpected
Zero value came unexpected
Zero value came unexpected
Zero value came unexpected
Zero value came unexpected
Zero value came unexpected
Zero value came unexpected
Zero value came unexpected
Zero value came unexpected
Zero value came unexpected
Zero value came unexpected
Zero value came unexpected
Zero value came unexpected
Zero value came unexpected
Zero value came unexpected
Zero value came unexpected
Zero value came unexpected
Zero value came unexpected
Zero value came unexpected
Zero value came unexpected
Zero value came unexpected
Zero value came unexpected
Zero value came unexpected
Zero value came unexpected
Zero value came unexpected
Zero value came unexpected
Zero value came unexpected
Zero value came unexpected
Zero value came unexpected
Zero value came unexpected
Zero value came unexpected
Zero value came unexpected
Zero value came unexpected
Zero value came unexpected
Z

In [None]:
# Per-domain conversion / click totals and CVR
final_df = dataframe.groupby('enriched_publisher_domain').agg(func).reset_index()
final_df.columns = final_df.columns.droplevel(0)
final_df['cvr'] = final_df['conversion'] / final_df['click']

# Attach the score from map_val to every domain. The domain is the first
# column of final_df; it is read by position in one pass over the column
# rather than through a per-row integer lookup, which newer pandas deprecates.
final_df['Actual_score'] = [float(map_val[domain]) for domain in final_df.iloc[:, 0]]

final_df.sort_values(by = ['Actual_score'], ascending = True)