In [106]:
import pandas
import numpy
import warnings
from collections import Counter, defaultdict
def ignore_warn(*args, **kwargs):
    pass
warnings.warn = ignore_warn

# Variable declarations
FILE_PATH = "Filtered_Data.csv"
SEPARATOR = ","
CLICK_THRESHOLD = 1000
MAX_ROWS = 500
MISCELLANEOUS = 'misc'
FEATURE_NAMES = ['ad_advertiser_id', 'analyzer_name', 'click_browser', 'enriched_derived_device', 'enriched_derived_os', 'raw_sync_partner_id']

# imported the CSV data
dataframe = pandas.read_csv(FILE_PATH, sep = SEPARATOR)

# Drop the unused column
if 'Unnamed: 0' in dataframe.columns:
    dataframe.drop('Unnamed: 0', axis = 1, inplace = True)

# Remove the same prefix from all columns names
dataframe.columns  = [ column_name.split('.')[1] for column_name in dataframe.columns ]

CLICK_THRESHOLD = max(1000, int(float(dataframe.shape[0]) * .01))
dataframe.head()

Unnamed: 0,ts,impression_system_date,hour,conversion_status,cc_id_null_ind,ad_id,click_id,cc_id,hour_of_day,raw_publisher_id,...,click_conf_gmt_offset,click_state,click_city,ad_bid,ad_cpa_goal,ad_market_place_id,click_click_status,kmean_category_name,partner_id,version
0,20190106,2019-01-06,1,1,0,138154,646739319951006038,546739320882001606,20,8CUDMGV55,...,-500,ga,atlanta,0.15,,1027,1,missing,8PRHGG6T9,1
1,20190106,2019-01-06,1,1,0,138154,646739319951006038,546739320882001606,20,8CUDMGV55,...,-500,ga,atlanta,0.15,,1027,1,missing,8PRHGG6T9,1
2,20190106,2019-01-06,2,0,0,138154,646740715393002311,346740715723006848,20,8CUDMGV55,...,-600,al,birmingham,0.15,,1027,1,missing,8PRHGG6T9,1
3,20190106,2019-01-06,4,0,0,138154,246748175931002645,446748176329007771,23,8CUDMGV55,...,-500,in,ft wayne,0.15,,1027,1,missing,8PRHGG6T9,1
4,20190106,2019-01-06,6,0,0,128699,246754980029002839,646754981260002933,22,8CUX8874N,...,-800,ca,mountain view,3.394224,,1027,1,missing,8PRHGG6T9,1


In [107]:
# removing all data having single values in entire columns
for column in dataframe.columns:
    if(dataframe[column].nunique() == 1):
        dataframe.drop(column, axis = 1, inplace = True)

In [108]:
# Remove the timestamp data and impression system date data - results not dependent on the time frame
dataframe.drop('ts', axis = 1, inplace = True)
dataframe.drop('impression_system_date', axis = 1, inplace = True)

# Server hour do not matters
dataframe.drop('hour', axis = 1, inplace = True)

# Data have only 3 values of *au* region, and conversion status 0 for all of them
dataframe.drop('enriched_country', axis = 1, inplace = True)

# Data is similar to cvr_base_data_table.click_city
dataframe.drop('impression_city', axis = 1, inplace = True)

# Ad-Bid is the cost which is irrespecti   ve of conversion from user side
dataframe.drop('ad_bid', axis = 1, inplace = True)

# Ad CPA goal do not affects the conversion status
dataframe.drop('ad_cpa_goal', axis = 1, inplace = True)

# Click Conf GMT Offset do not affects the conversion status
dataframe.drop('click_conf_gmt_offset', axis = 1, inplace = True)

# Date Time do not affects and is too granuler to looku[, thus do not affects the conversion status
dataframe.drop('click_user_datetime', axis = 1, inplace = True)

# Publisher Domain do not affects the conversion status
# dataframe.drop('enriched_publisher_domain', axis = 1, inplace = True)

# Impression ISP Name do not affects the conversion status
dataframe.drop('impression_isp_name', axis = 1, inplace = True)

# below data is too granular to predict something
dataframe.drop('ad_keyword', axis = 1, inplace = True)
dataframe.drop('click_city', axis = 1, inplace = True)
dataframe.drop('raw_sub_publisher_id', axis = 1, inplace = True)
dataframe.drop('category_name', axis = 1, inplace = True)
dataframe.drop('click_id', axis = 1, inplace = True)
dataframe.drop('cc_id', axis = 1, inplace = True)
dataframe.drop('raw_publisher_id', axis = 1, inplace = True)
dataframe.drop('raw_sub_sub_publisher_id', axis = 1, inplace = True)

In [109]:
dataframe.head()

Unnamed: 0,conversion_status,ad_id,hour_of_day,enriched_derived_device,enriched_derived_os,enriched_publisher_domain,ad_adgroup_id,ad_advertiser_id,ad_campaign_id,impression_state,impression_asn_code,impression_connection_type,analyzer_name,click_browser,raw_sync_partner_id,click_state,partner_id
0,1,138154,20,mobile,android,"www.dg,coupon.com",94663,1518,86975,ga,22394,mobile,keywordanalyserv2,"google,chrome",1,ga,8PRHGG6T9
1,1,138154,20,mobile,android,"www.dg,coupon.com",94663,1518,86975,ga,22394,mobile,keywordanalyserv2,"google,chrome",1,ga,8PRHGG6T9
2,0,138154,20,mobile,android,"www.dg,coupon.com",94663,1518,86975,al,21928,mobile,keywordanalyserv2,"google,chrome",1,al,8PRHGG6T9
3,0,138154,23,mobile,android,"www.dg,coupon.com",94663,1518,86975,in,7922,cable,keywordanalyserv2,"google,chrome",1,in,8PRHGG6T9
4,0,128699,22,mobile,android,results.indiaresults.com,81679,1458,73823,ca,15169,broadband,keywordanalyserv2,"google,chrome",1,ca,8PRHGG6T9


In [110]:
# Following column names have less entries thus can be relabelled
total = dataframe.shape[0]
pandas.set_option('display.max_rows', MAX_ROWS)

# Function for aggregation on basis of click, conversion and CVR(conversion ratio)
func = {'conversion_status' : {
    'conversion' : 'sum',
    'click' : 'count'
}}

# Iterate through above columns list and 'misc' the data which is too sparse to use 
# for classification or modelling
for col in FEATURE_NAMES:
    dfg = dataframe.groupby(col).agg(func).reset_index()
    dfg.columns = dfg.columns.droplevel(0)
    dfg['cvr'] = dfg['conversion'] / dfg['click']
    dfg['click percentage of total data'] = dfg['click'] / total * 100
    declareMisc = set()
    for index, row in dfg.iterrows():
        if(row['click'] >= CLICK_THRESHOLD):
            continue
        declareMisc.add(row[''])
        
    for vals in declareMisc:
        dataframe.loc[dataframe[col] == vals, col] = MISCELLANEOUS

In [111]:
# Now, we need to remove the outliers for certain feature colums
# We will use standardize method for removing the outliers
# We will use Tukeys method for determining the data need to be removed
# from the original data
# As outliers are never considered for analysis

In [112]:
dataframe['ad_advertiser_id'].value_counts()

1387    30042
1458    21280
1637    13763
1608    10848
1619     7491
1640     5422
1581     5397
1461     4666
1375     4468
1518     3869
1454     3199
1352     3192
1492     2985
1521     2000
1517     1890
1641     1500
1630     1478
misc      845
Name: ad_advertiser_id, dtype: int64

In [113]:
final_indexes = set()
for col in FEATURE_NAMES:
    print (col, '{}{}')
    dfg = dataframe.groupby(col).agg(func).reset_index()
    dfg.columns = dfg.columns.droplevel(0)
    dfg['cvr'] = dfg['conversion'] / dfg['click']
    range1 = numpy.percentile(dfg['cvr'], 25)
    range2 = numpy.percentile(dfg['cvr'], 75)
    step = 1.5 * (range2 - range1)
    lower_limit = range1 - step
    upper_limit = range2 + step
    indexes = dfg[(dfg['cvr'] < lower_limit) | (dfg['cvr'] > upper_limit)].index
    
    if(len(indexes) == 0):
        print ('\n\n')
        continue
    
    labels_names = set()
    for index in indexes:
        labels_names.add(dfg.iloc[index][''])
    print (labels_names)
    for col_value in labels_names:
        indexes_value_in_dataframe = dataframe[dataframe[col] == col_value].index.tolist()
        for index_in_dataframe in indexes_value_in_dataframe:
            final_indexes.add(index_in_dataframe)
    print ('\n')

final_indexes = list(final_indexes)
dataframe = dataframe.drop(final_indexes, axis = 0).reset_index(drop = True)
print (final_indexes)
print (dataframe.shape[0], dataframe.shape[1], len(final_indexes) , dataframe.shape[0] - len(final_indexes))

ad_advertiser_id {}{}
{1581}


analyzer_name {}{}



click_browser {}{}
{'misc'}


enriched_derived_device {}{}
{'misc'}


enriched_derived_os {}{}
{'misc'}


raw_sync_partner_id {}{}



[32770, 98313, 32796, 98338, 98342, 32811, 65580, 98347, 98348, 32815, 48, 32822, 65590, 98371, 32843, 98383, 65617, 65628, 32883, 116, 32887, 65668, 135, 140, 141, 98449, 98451, 32935, 65713, 185, 186, 188, 215, 32984, 229, 65766, 65782, 33020, 254, 256, 65798, 283, 33056, 290, 33059, 33060, 33061, 33062, 33064, 98609, 98613, 98614, 98618, 354, 361, 65906, 98681, 98682, 381, 382, 98693, 393, 397, 65936, 406, 414, 98720, 65957, 65958, 65969, 65971, 65972, 33213, 33219, 65991, 66000, 465, 66001, 98768, 66004, 33265, 33269, 33270, 66037, 509, 33290, 523, 33307, 66078, 552, 33326, 66098, 33341, 575, 98881, 66119, 584, 66120, 98891, 33361, 98900, 66136, 601, 604, 33378, 624, 626, 627, 33399, 632, 33423, 657, 658, 33427, 66194, 66199, 98972, 98974, 98975, 66216, 66217, 686, 66223, 66235, 33470, 33472, 705, 

118209 17 6126 112083


In [114]:
# Below here we will add the bucketizing method
# for FEATURE_NAMES respective values

In [196]:
epsilon = float(.01)

def equal_dataframes(df1, df2):
    if(len(df1) != len(df2)):
        return False
    for index, row in df1.iterrows():
        if(abs(df1.iloc[index]['cvr'] - df2.iloc[index]['cvr']) > epsilon):
            return False
    return True

def calcualte_mean(current_dataframe, idx):
    feature_map_sum = defaultdict(float)
    feature_map_count = defaultdict(float)
    feature_map_mean = {}
    idx = int(idx)
    for index, row in current_dataframe.iterrows():
        feature_map_count[row[idx]] = feature_map_count[row[idx]] + 1.0
        feature_map_sum[row[idx]] = feature_map_sum[row[idx]] + row['cvr']
    for key in feature_map_count.keys():
        try:
            feature_map_mean[key] = (1.0 / ((feature_map_sum[key] / feature_map_count[key])))
        except:
            feature_map_mean[key] = 0.0
    return feature_map_mean
    
def get_normalized_score(original_dataframe, main_feature, bias_feature):
    original_dfg = original_dataframe.groupby([main_feature, bias_feature]).agg(func).reset_index()
    original_dfg.columns = original_dfg.columns.droplevel(0)
    original_dfg['cvr'] = original_dfg['conversion'] / original_dfg['click']
#     print ('Inside function')
    current_dfg = original_dfg.copy()
    last_dfg = pandas.DataFrame()
#     print ('Passed the last and current')
#     print (original_dfg.head(), '\n\n')
    while(equal_dataframes(current_dfg, last_dfg) is not True):
        last_dfg = current_dfg.copy()
        print ('Inside while loop')
        mainF_map_sum = defaultdict(float)
        mainF_map_count = defaultdict(float)
        
        biasF_map_sum = defaultdict(float)
        biasF_map_count = defaultdict(float)
        
        mainF_map_mean = calcualte_mean(current_dfg, 0)
        biasF_map_mean = calcualte_mean(current_dfg, 1)
        
        for index, row in current_dfg.iterrows():
            current_dfg.loc[index]['cvr'] = (original_dfg.loc[index]['cvr']) * (mainF_map_mean[original_dfg.loc[index][0]])
    return current_dfg, calcualte_mean(current_dfg, 0)

In [199]:
dfr, map_val = get_normalized_score(dataframe, 'enriched_publisher_domain', 'ad_advertiser_id')

for w in sorted(map_val, key = map_val.get, reverse = True):
    print (w, map_val[w])

Inside while loop
www.rentlingo.com 1024.0
www.everyrent.com 482.0
www.cnn.com 259.0
247wallst.com 246.96109059926158
topsweeps.com 232.5
mediaalert.news 232.0
www.housinglist.com 226.6153846153846
related.homebidz.co 194.1059947871416
m.nasdaq.com 167.61904761904762
gotquestion.com 148.64319596438165
www.moneychimp.com 140.0
scholarshipowl.com 134.5
healthorigins.com 128.7
socialsecurityhop.com 96.0
searchanswers.net 95.49685534591194
money.cnn.com 84.0
hometipper.com 82.4
homebidz.co 81.816091954023
abcnews.go.com 81.5609756097561
simplifyanswer.com 81.55159571420386
www.carcarekiosk.com 76.0
financialexpress.com 75.0
www.moneycontrol.com 74.0
myaffordablehousingguide.com 73.0
www.cnet.com 67.9
cutestat.com 66.0
www.officialhousingauthority.com 66.0
topavailablejobs.com 61.333333333333336
weather.com 60.0
www.fedprimerate.com 60.0
315241195,mopub,app.imnapp 57.0
singlemoms.org 57.0
www.activatecreditcard.com 54.0
themilitarywallet.com 53.20736842105263
generalcontractorlicenseguide.c

www.jobdiagnosis.com 0.0
www.jobsforfelonshub.com 0.0
www.jolietcna.jobs 0.0
www.joystudiodesign.com 0.0
www.kcconfidential.com 0.0
www.kitco.com 0.0
www.klientsolutech.com 0.0
www.kyivpost.com 0.0
www.labradortraininghq.com 0.0
www.ladylake.jobs 0.0
www.last.fm 0.0
www.latest,hairstyles.com 0.0
www.latestcarpreview.com 0.0
www.latimes.com 0.0
www.lawweb.in 0.0
www.leaderviral.com 0.0
www.learnmarket.in 0.0
www.leavedebtbehind.com 0.0
www.leegna.com 0.0
www.legalbites.in 0.0
www.leisurefreak.com 0.0
www.letsstudytogether.co 0.0
www.libertyheadlines.com 0.0
www.lifeoncredit.ca 0.0
www.lightskincure.org 0.0
www.littlehouseliving.com 0.0
www.liveabout.com 0.0
www.longtermlettings.com 0.0
www.losangelesrealtor.com 0.0
www.lotterypost.com 0.0
www.maravipost.com 0.0
www.marketbeat.com 0.0
www.marketsguruji.com 0.0
www.marthastewart.com 0.0
www.mathplayground.com 0.0
www.mathplusfun.com 0.0
www.matrixdisclosure.com 0.0
www.mcdonough.jobs 0.0
www.medbroadcast.com 0.0
www.medhealthdaily.com 0.0