In [62]:
# necessary imports
import pandas
import numpy
import warnings
from collections import Counter, defaultdict

In [63]:
def ignore_warn(*args, **kwargs):
    """No-op stand-in for ``warnings.warn``: accepts any arguments and does nothing."""
    return None

# Route every ``warnings.warn`` call to the no-op above for the rest of the session.
setattr(warnings, 'warn', ignore_warn)

# ---- Configuration ----

# Input CSV file and its field separator.
FILE_PATH = "Filtered_Data.csv"
SEPARATOR = ","

# Minimum click count a value needs in order to keep its own label.
CLICK_THRESHOLD_FOR_FEATURE = 1000
CLICK_THRESHOLD_FOR_DOMAIN = 50

# Minimum conversion count a value needs in order to keep its own label.
CONVERSION_THRESHOLD_FOR_FEATURE = 1000
CONVERSION_THRESHOLD_FOR_DOMAIN = 15

# Value used for pandas' 'display.max_rows' option.
MAX_ROWS = 1500

# Label written over values that are too sparse to keep.
MISCELLANEOUS = 'misc'

# Feature columns processed by the sparse-value and outlier steps.
FEATURE_NAMES = [
    'ad_advertiser_id',
    'analyzer_name',
    'click_browser',
    'enriched_derived_device',
    'enriched_derived_os',
    'raw_sync_partner_id',
]

# Load the CSV data into a DataFrame.
dataframe = pandas.read_csv(FILE_PATH, sep = SEPARATOR)

# Drop the unused 'Unnamed: 0' column when the file contains it
# (presumably an index column saved by an earlier export -- TODO confirm).
if 'Unnamed: 0' in dataframe.columns:
    dataframe = dataframe.drop('Unnamed: 0', axis = 1)

# Remove the shared "<prefix>." part from every column name.
# A name without a dot is kept as-is instead of raising IndexError.
dataframe.columns = [
    column_name.split('.')[1] if '.' in column_name else column_name
    for column_name in dataframe.columns
]

pandas.set_option('display.max_rows', MAX_ROWS)
dataframe.head()

Unnamed: 0,ts,impression_system_date,hour,conversion_status,cc_id_null_ind,ad_id,click_id,cc_id,hour_of_day,raw_publisher_id,...,click_conf_gmt_offset,click_state,click_city,ad_bid,ad_cpa_goal,ad_market_place_id,click_click_status,kmean_category_name,partner_id,version
0,20190106,2019-01-06,1,1,0,138154,646739319951006038,546739320882001606,20,8CUDMGV55,...,-500,ga,atlanta,0.15,,1027,1,missing,8PRHGG6T9,1
1,20190106,2019-01-06,1,1,0,138154,646739319951006038,546739320882001606,20,8CUDMGV55,...,-500,ga,atlanta,0.15,,1027,1,missing,8PRHGG6T9,1
2,20190106,2019-01-06,2,0,0,138154,646740715393002311,346740715723006848,20,8CUDMGV55,...,-600,al,birmingham,0.15,,1027,1,missing,8PRHGG6T9,1
3,20190106,2019-01-06,4,0,0,138154,246748175931002645,446748176329007771,23,8CUDMGV55,...,-500,in,ft wayne,0.15,,1027,1,missing,8PRHGG6T9,1
4,20190106,2019-01-06,6,0,0,128699,246754980029002839,646754981260002933,22,8CUX8874N,...,-800,ca,mountain view,3.394224,,1027,1,missing,8PRHGG6T9,1


In [64]:
# Columns holding exactly one distinct value carry no information: collect
# their names first, then drop them from the frame in a single call.
list_of_single_value_columns = [
    column for column in dataframe.columns if dataframe[column].nunique() == 1
]
dataframe.drop(list_of_single_value_columns, axis = 1, inplace = True)

print (list_of_single_value_columns)

['cc_id_null_ind', 'serving_key', 'ad_partner_id', 'ad_market_place_id', 'click_click_status', 'kmean_category_name', 'version']


In [65]:
# Columns removed by hand, each listed under the reason for discarding it.
columns_to_discard = [
    # Timestamp and impression system date: results do not depend on the time frame.
    'ts',
    'impression_system_date',

    # The server hour does not matter.
    'hour',

    # The data has only 3 values for the *au* region, all with conversion status 0.
    'enriched_country',

    # Similar to cvr_base_data_table.click_city.
    'impression_city',

    # Ad bid is a cost that is irrespective of conversion on the user side.
    'ad_bid',

    # Ad CPA goal does not affect the conversion status.
    'ad_cpa_goal',

    # Click conf GMT offset does not affect the conversion status.
    'click_conf_gmt_offset',

    # Date-time is too granular to look up, so it does not affect the conversion status.
    'click_user_datetime',

    # 'enriched_publisher_domain' is deliberately NOT listed here: its drop was
    # commented out and the column is still used by later cells.

    # Impression ISP name does not affect the conversion status.
    'impression_isp_name',

    # The columns below are too granular to predict anything.
    'ad_keyword',
    'click_city',
    'raw_sub_publisher_id',
    'category_name',
    'click_id',
    'cc_id',
    'raw_publisher_id',
    'raw_sub_sub_publisher_id',
]

dataframe.drop(columns_to_discard, axis = 1, inplace = True)

In [66]:
# Aggregation spec handed to ``groupby(...).agg``: on the 'conversion_status'
# column, 'conversion' is the sum and 'click' is the count. Callers derive the
# CVR (conversion ratio) from these two.
# NOTE(review): this nested-dict renaming form is rejected by pandas >= 1.0
# ("nested renamer is not supported") -- confirm the pandas version in use.
func = {
    'conversion_status': dict(conversion = 'sum', click = 'count'),
}

In [67]:
# Number of rows currently in the frame.
total = len(dataframe)

# Relabel the values of one feature column that are too sparse to use
# for classification or modelling.
def mark_msic_values(dataframe, feature_name, click_threshold, conversion_threshold, misc_label = None):
    """Overwrite sparse values of ``feature_name`` with one catch-all label.

    Rows are grouped by ``feature_name``. For every distinct value, "click" is
    the count of non-null ``conversion_status`` entries and "conversion" is
    their sum. A value keeps its label when its click count reaches
    ``click_threshold`` OR its conversion sum reaches ``conversion_threshold``;
    all other values are replaced by ``misc_label``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain ``feature_name`` and ``conversion_status``. Modified in place.
    feature_name : str
        Column whose sparse values are relabelled.
    click_threshold, conversion_threshold : number
        Minimum click count / conversion sum for a value to be kept.
    misc_label : object, optional
        Replacement label; defaults to the module-level ``MISCELLANEOUS``.

    Returns
    -------
    pandas.DataFrame
        The same ``dataframe`` object that was passed in.
    """
    if misc_label is None:
        misc_label = MISCELLANEOUS

    # Plain list aggregation + rename works on every pandas version, unlike the
    # nested-dict renaming form, which pandas 1.0 removed.
    stats = (
        dataframe.groupby(feature_name)['conversion_status']
        .agg(['sum', 'count'])
        .rename(columns = {'sum': 'conversion', 'count': 'click'})
    )

    is_sparse = (stats['click'] < click_threshold) & (stats['conversion'] < conversion_threshold)
    sparse_values = stats.index[is_sparse.values]

    # Select rows through the column named by the parameter, not a global.
    rows_to_relabel = dataframe[feature_name].isin(sparse_values)
    if rows_to_relabel.any():
        # A numeric column cannot hold the label in its own dtype; widen it
        # explicitly instead of relying on an implicit upcast.
        if dataframe[feature_name].dtype.kind in 'biuf':
            dataframe[feature_name] = dataframe[feature_name].astype(object)
        dataframe.loc[rows_to_relabel, feature_name] = misc_label
    return dataframe

# Relabel sparse values column by column: the generic feature columns use the
# *_FOR_FEATURE thresholds, the publisher domain uses the *_FOR_DOMAIN ones.
# The last call previously reused whatever `col` held after the loop (the last
# entry of FEATURE_NAMES), so the domain thresholds never reached the domain column.
# NOTE(review): 'enriched_publisher_domain' is assumed to be the intended target
# of the *_FOR_DOMAIN thresholds -- confirm.
misc_thresholds = [
    (feature, CLICK_THRESHOLD_FOR_FEATURE, CONVERSION_THRESHOLD_FOR_FEATURE)
    for feature in FEATURE_NAMES
]
misc_thresholds.append(
    ('enriched_publisher_domain', CLICK_THRESHOLD_FOR_DOMAIN, CONVERSION_THRESHOLD_FOR_DOMAIN)
)

# The loop variable stays named `col` so that the module-level name always
# refers to the column currently being processed.
for col, click_threshold, conversion_threshold in misc_thresholds:
    dataframe = mark_msic_values(dataframe, col, click_threshold, conversion_threshold)

In [68]:
# Next, we remove the outliers for certain feature columns.
# We use a standard technique, Tukey's method, to determine which
# data needs to be removed from the original data, because outliers
# should never be considered in the analysis.

In [69]:
# Find the rows that belong to outlying values of one feature column.
def get_index_of_outliers(dataframe, feature_name, outlier_indexes):
    """Add the row labels of CVR outliers for ``feature_name`` to ``outlier_indexes``.

    Rows are grouped by ``feature_name`` and a conversion rate is computed per
    distinct value: the sum of ``conversion_status`` divided by its non-null
    count. Values whose rate lies more than 1.5 times the interquartile range
    below the 25th or above the 75th percentile are outliers; the index labels
    of all rows carrying such a value are added to ``outlier_indexes``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain ``feature_name`` and ``conversion_status``. Not modified.
    feature_name : str
        Column to inspect.
    outlier_indexes : set
        Accumulator of row labels; updated in place.

    Returns
    -------
    set
        The same ``outlier_indexes`` object.
    """
    # Plain list aggregation + rename works on every pandas version, unlike the
    # nested-dict renaming form, which pandas 1.0 removed.
    stats = (
        dataframe.groupby(feature_name)['conversion_status']
        .agg(['sum', 'count'])
        .rename(columns = {'sum': 'conversion', 'count': 'click'})
    )
    cvr = stats['conversion'] / stats['click']
    if len(cvr) == 0:
        # Nothing to rank: percentiles of an empty sample are undefined.
        return outlier_indexes

    first_quartile = numpy.percentile(cvr, 25)
    third_quartile = numpy.percentile(cvr, 75)
    step = 1.5 * (third_quartile - first_quartile)
    is_outlier = (cvr < first_quartile - step) | (cvr > third_quartile + step)

    outlier_values = cvr.index[is_outlier.values]
    if len(outlier_values) == 0:
        return outlier_indexes

    # Match rows through the column named by the parameter, not a global.
    matching_rows = dataframe.index[dataframe[feature_name].isin(outlier_values).values]
    outlier_indexes.update(matching_rows.tolist())
    return outlier_indexes

In [70]:
# Accumulates the index labels of every row flagged as an outlier.
final_indexes = set()

    
# Collect outlier rows for each feature column; all features are evaluated on
# the same frame before anything is dropped.
# NOTE(review): the loop variable `col` is also visible as a module-level name
# to the functions called here -- keep the name stable unless they are checked.
for col in FEATURE_NAMES:
    final_indexes = get_index_of_outliers(dataframe, col, final_indexes)

# Drop the flagged rows in one go and renumber the remaining rows from 0.
final_indexes = list(final_indexes)
dataframe = dataframe.drop(final_indexes, axis = 0).reset_index(drop = True)

In [None]:
# Largest absolute per-row 'cvr' difference at which two frames still count as equal.
epsilon = float(.01)

def equal_dataframes(df1, df2):
    """Tell whether two frames have the same length and matching 'cvr' values.

    Rows are compared by position, so the index labels of the two frames do not
    need to match. Two rows match when their 'cvr' values differ by at most
    ``epsilon``.

    Returns
    -------
    bool
        Always a plain Python ``bool``; callers compare the result with ``is``.
    """
    if len(df1) != len(df2):
        return False
    if len(df1) == 0:
        # Two empty frames are equal, even when they have no 'cvr' column.
        return True
    differences = abs(df1['cvr'].values - df2['cvr'].values)
    return not bool((differences > epsilon).any())

def calcualte_mean(current_dataframe, idx):
    """Map each key in column ``idx`` to the reciprocal of its mean 'cvr'.

    Despite the name, the returned value is ``1 / mean(cvr)`` taken over all
    rows sharing the key, not the mean itself. A key whose mean is zero maps to
    0.0.

    Parameters
    ----------
    current_dataframe : pandas.DataFrame
        Must contain a 'cvr' column.
    idx : int or int-like
        POSITION of the key column (not its label), so the function also works
        when column labels are duplicated.

    Returns
    -------
    dict
        ``{key: 1 / mean_cvr}``, in order of first appearance of each key.
    """
    idx = int(idx)
    feature_map_sum = defaultdict(float)
    feature_map_count = defaultdict(float)

    # Walk the two columns directly; this avoids building a Series per row and
    # the deprecated integer lookup on a label-indexed row.
    for key, cvr in zip(current_dataframe.iloc[:, idx], current_dataframe['cvr']):
        feature_map_count[key] += 1.0
        feature_map_sum[key] += cvr

    feature_map_mean = {}
    for key, count in feature_map_count.items():
        mean_cvr = feature_map_sum[key] / count
        # Explicit zero check instead of a bare ``except`` that hid every error.
        feature_map_mean[key] = 1.0 / mean_cvr if mean_cvr != 0 else 0.0
    return feature_map_mean
    
def get_normalized_score(original_dataframe, main_feature, bias_feature, max_iterations = 100):
    """Iteratively rescale the CVR of each (main_feature, bias_feature) pair.

    Rows are grouped by both features; 'conversion' is the sum and 'click' the
    non-null count of ``conversion_status``, and 'cvr' is their ratio. On every
    pass each ORIGINAL cvr is multiplied by the value ``calcualte_mean`` returns
    for its ``main_feature`` key, computed from the current cvr values. The loop
    stops when two consecutive passes agree according to ``equal_dataframes``.

    NOTE(review): ``bias_feature`` only takes part in the grouping. An earlier
    version also computed its per-key value but never applied it, so the update
    rule looks unfinished. As written, one pass brings every group mean to 1 and
    the next pass restores the original rates, so the loop appears to alternate
    between two states rather than converge. The iteration cap below turns that
    endless loop into an explicit error -- confirm the intended algorithm.

    Parameters
    ----------
    original_dataframe : pandas.DataFrame
        Must contain ``main_feature``, ``bias_feature`` and ``conversion_status``.
    main_feature, bias_feature : str
        Grouping columns; the result is keyed by ``main_feature``.
    max_iterations : int, optional
        Maximum number of passes before giving up.

    Returns
    -------
    (pandas.DataFrame, dict)
        The grouped frame with the final 'cvr' values (columns: the two
        features, 'conversion', 'click', 'cvr') and the ``calcualte_mean``
        result for its first column.

    Raises
    ------
    RuntimeError
        If the values have not stabilised after ``max_iterations`` passes.
    """
    # Plain list aggregation + rename works on every pandas version, unlike the
    # nested-dict renaming form, which pandas 1.0 removed.
    original_dfg = (
        original_dataframe.groupby([main_feature, bias_feature])['conversion_status']
        .agg(['sum', 'count'])
        .rename(columns = {'sum': 'conversion', 'count': 'click'})
        .reset_index()
    )
    original_dfg['cvr'] = original_dfg['conversion'] / original_dfg['click']

    # Loop-invariant inputs of the update rule.
    main_keys = original_dfg.iloc[:, 0].tolist()
    original_cvr = original_dfg['cvr'].tolist()

    current_dfg = original_dfg.copy()
    last_dfg = pandas.DataFrame()
    iterations = 0
    while equal_dataframes(current_dfg, last_dfg) is not True:
        if iterations >= max_iterations:
            raise RuntimeError(
                'get_normalized_score did not converge within %d iterations' % max_iterations
            )
        iterations += 1
        last_dfg = current_dfg.copy()

        mainF_map_mean = calcualte_mean(current_dfg, 0)
        current_dfg['cvr'] = [
            float(cvr * mainF_map_mean[key]) for key, cvr in zip(main_keys, original_cvr)
        ]
    return current_dfg, calcualte_mean(current_dfg, 0)

In [None]:
# Score every publisher domain, grouping by advertiser as the bias feature.
dfr, map_val = get_normalized_score(dataframe, 'enriched_publisher_domain', 'ad_advertiser_id')

# Print each key with its score, highest score first.
for w, score in sorted(map_val.items(), key = lambda item: item[1], reverse = True):
    print (w, score)

In [None]:
# Per-domain totals: 'conversion' is the sum and 'click' the non-null count of
# conversion_status. Plain list aggregation + rename works on every pandas
# version, unlike the nested-dict renaming form, which pandas 1.0 removed.
final_df = (
    dataframe.groupby('enriched_publisher_domain')['conversion_status']
    .agg(['sum', 'count'])
    .rename(columns = {'sum': 'conversion', 'count': 'click'})
    .reset_index()
)
final_df['cvr'] = final_df['conversion'] / final_df['click']

# Attach the score computed for each domain. A domain missing from map_val
# still raises KeyError, as before.
final_df['Actual_score'] = pandas.Series(
    [float(map_val[domain]) for domain in final_df['enriched_publisher_domain']],
    index = final_df.index,
    dtype = float,
)

final_df.sort_values(by = ['Actual_score'], ascending = False)

In [None]:
# NOTE(review): groupby() is called without a `by` or `level` argument, which
# pandas rejects with a TypeError. This cell looks unfinished -- supply the
# grouping key(s) or remove the cell.
dfg = dataframe.groupby()