# Selecting the top 100 Merchants

In [1]:
import os
import sys  
from pathlib import Path
# Put the parent of the current working directory on sys.path so the
# `scripts` package imported below can be resolved (presumably the repo root
# when the notebook is run from its own folder — confirm).
curr_path = str(Path(os.getcwd()).parent)
sys.path.append(curr_path)
import seaborn as sns

# Star imports: names used later in this notebook that are not defined here
# (e.g. `pd`, `create_spark`, MERCHANT_ABN and the *_PATH constants) must come
# from one of these modules.
from scripts.constants import *
from scripts.plotting import *
from scripts.misc_changes import *


In [2]:
# Start a Spark session through the project helper.
# NOTE(review): no later cell visible in this notebook references `spark`;
# everything below uses pandas — confirm the session is actually needed.
spark = create_spark()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/10/13 13:32:57 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
23/10/13 13:33:01 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
23/10/13 13:33:01 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.


In [3]:
# Load every input table as a pandas dataframe; PREFIX is prepended to each
# path constant before reading.
PREFIX = "."

(
    average_and_median_dollar_per_tag,
    average_and_median_fraud_prob,
    average_earning,
    number_of_transactions,
    revenue_takerate,
    mapped_industry,
) = (
    pd.read_csv(PREFIX + relative_path)
    for relative_path in (
        CURATED_TOP3_AVERAGE_AND_MEDIAN_DOLLAR_PER_TAG_PATH,
        CURATED_TOP3_AVERAGE_AND_MEDIAN_FRAUD_PROB_PATH,
        CURATED_TOP3_AVERAGE_EARNING_PATH,
        CURATED_TOP3_NUMBER_OF_TRANSACTIONS_PATH,
        TOP3_INDUSTRY_REVENUE_TAKERATE_PATH,
        INDUSTRY_MAPPING_PATH,
    )
)

In [4]:
# Order each table by merchant ABN (ascending, the sort_values default).
(
    average_and_median_dollar_per_tag,
    average_and_median_fraud_prob,
    average_earning,
    number_of_transactions,
    revenue_takerate,
) = [
    frame.sort_values(by=MERCHANT_ABN)
    for frame in (
        average_and_median_dollar_per_tag,
        average_and_median_fraud_prob,
        average_earning,
        number_of_transactions,
        revenue_takerate,
    )
]

Joining all the dataframes on their common column, merchant ABN, and filling the empty values with 0 (empty values are found in the fraud probability columns, since not all merchants have a fraud probability).

In [5]:
# Tables that get merged into the per-merchant feature frame; the first entry
# is the starting point of the merge.
# NOTE(review): `average_earning` is loaded and sorted in earlier cells but is
# not part of this list — confirm the omission is intentional.
dataframes = [average_and_median_dollar_per_tag,
average_and_median_fraud_prob,
number_of_transactions,
revenue_takerate]

In [6]:
# Key shared by every table in `dataframes`.
common_column = MERCHANT_ABN

# Start from the first table and outer-join the remaining ones onto it one
# at a time, so a merchant present in any table is kept.
final_df = dataframes[0]
for df in dataframes[1:]:
    final_df = pd.merge(final_df, df, how='outer', on=common_column)

# Cells left empty by the outer joins are replaced with 0 (this applies to
# every column, not only the numeric ones).
final_df = final_df.fillna(value=0)

In [7]:
# Display the merged table.
final_df

Unnamed: 0.1,name_x,merchant_abn,dollar_value_avg,dollar_value_median,name_y,fraud_probability_avg,fraud_probability_median,name,number_of_transactions,Unnamed: 0,revenue_level,take_rate
0,Fusce Company,10206519221,37.385,25.855,0,0.0,0.0,Fusce Company,7476,178,1,6.34
1,Ut Consulting,10462560289,35.432,23.545,0,0.0,0.0,Ut Consulting,1240,179,3,2.95
2,Sed Et Libero PC,10651113986,726.269,560.410,0,0.0,0.0,Sed Et Libero PC,53,0,2,3.25
3,Fermentum Risus Foundation,10702078694,91.776,81.685,0,0.0,0.0,Fermentum Risus Foundation,324,180,1,5.95
4,Per Inceptos Company,10901349044,105.730,92.220,0,0.0,0.0,Per Inceptos Company,968,181,1,5.96
...,...,...,...,...,...,...,...,...,...,...,...,...
550,Nunc Ac Incorporated,99053443481,911.316,906.400,0,0.0,0.0,Nunc Ac Incorporated,997,177,2,4.81
551,Odio LLC,99100250331,74.207,65.095,0,0.0,0.0,Odio LLC,616,359,3,2.05
552,Non Enim Corp.,99115883676,86.901,87.155,0,0.0,0.0,Non Enim Corp.,348,552,2,3.65
553,Non Ante Inc.,99217762645,70.891,62.620,0,0.0,0.0,Non Ante Inc.,305,553,2,5.05


In [8]:
# Drop the duplicated name columns and the 'Unnamed: 0' column (presumably a
# saved CSV index — confirm); the un-suffixed `name` column is kept.
repetitive_clm = ['name_y', 'name_x', 'Unnamed: 0']
final_df = final_df.drop(labels=repetitive_clm, axis='columns')

In [9]:
# Display the industry mapping table as loaded.
mapped_industry

Unnamed: 0,name,merchant_abn,revenue_level,take_rate,industry_tags,MappedIndustry
0,Felis Limited,10023283211,e,0.18,furniture home furnishings equipment manufactu...,T
1,Arcu Ac Orci Corporation,10142254217,b,4.22,cable satellite pay television radio,K
2,Nunc Sed Company,10165489824,b,4.40,jewelry watch clock silverware,A
3,Ultricies Dignissim Lacus Foundation,10187291046,b,3.29,watch clock jewelry repair,V
4,Enim Condimentum PC,10192359162,a,6.33,music musical instruments pianos sheet music,M
...,...,...,...,...,...,...
4021,Elit Dictum Eu Ltd,99938978285,b,4.50,opticians optical goods eyeglasses,I
4022,Mollis LLP,99974311662,b,3.17,books periodicals newspapers,G
4023,Sociosqu Corp.,99976658299,a,6.57,shoe,B
4024,Commodo Hendrerit LLC,99987905597,a,6.82,motor vehicle new parts,H


In [10]:
# Keep only the merchant name, ABN and mapped-industry columns.
mapped_industry = mapped_industry.loc[:, [NAME, MERCHANT_ABN, MAPPED_INDUSTRIES]]

In [11]:
# Display the mapping table after the column selection.
mapped_industry

Unnamed: 0,name,merchant_abn,MappedIndustry
0,Felis Limited,10023283211,T
1,Arcu Ac Orci Corporation,10142254217,K
2,Nunc Sed Company,10165489824,A
3,Ultricies Dignissim Lacus Foundation,10187291046,V
4,Enim Condimentum PC,10192359162,M
...,...,...,...
4021,Elit Dictum Eu Ltd,99938978285,I
4022,Mollis LLP,99974311662,G
4023,Sociosqu Corp.,99976658299,B
4024,Commodo Hendrerit LLC,99987905597,H


In [12]:
# Attach each merchant's mapped industry, matching on merchant ABN with the
# join type given by LEFT_JOIN.
final_df = pd.merge(final_df, mapped_industry, how=LEFT_JOIN, on=MERCHANT_ABN)

In [13]:
# Display the table after the industry column has been added.
final_df

Unnamed: 0,merchant_abn,dollar_value_avg,dollar_value_median,fraud_probability_avg,fraud_probability_median,name_x,number_of_transactions,revenue_level,take_rate,name_y,MappedIndustry
0,10206519221,37.385,25.855,0.0,0.0,Fusce Company,7476,1,6.34,Fusce Company,P
1,10462560289,35.432,23.545,0.0,0.0,Ut Consulting,1240,3,2.95,Ut Consulting,P
2,10651113986,726.269,560.410,0.0,0.0,Sed Et Libero PC,53,2,3.25,Sed Et Libero PC,X
3,10702078694,91.776,81.685,0.0,0.0,Fermentum Risus Foundation,324,1,5.95,Fermentum Risus Foundation,P
4,10901349044,105.730,92.220,0.0,0.0,Per Inceptos Company,968,1,5.96,Per Inceptos Company,P
...,...,...,...,...,...,...,...,...,...,...,...
550,99053443481,911.316,906.400,0.0,0.0,Nunc Ac Incorporated,997,2,4.81,Nunc Ac Incorporated,X
551,99100250331,74.207,65.095,0.0,0.0,Odio LLC,616,3,2.05,Odio LLC,P
552,99115883676,86.901,87.155,0.0,0.0,Non Enim Corp.,348,2,3.65,Non Enim Corp.,F
553,99217762645,70.891,62.620,0.0,0.0,Non Ante Inc.,305,2,5.05,Non Ante Inc.,F


In [14]:
# The industry merge produced a second name column (`name_y`); remove it.
final_df = final_df.drop(labels='name_y', axis='columns')

In [15]:
# Give the surviving name column its canonical label again.
final_df = final_df.rename({'name_x': NAME}, axis='columns')

In [16]:
# Number of merchants per mapped industry.
final_df['MappedIndustry'].value_counts()

MappedIndustry
F    195
P    182
X    178
Name: count, dtype: int64

In [17]:
# List the columns that remain after the clean-up.
final_df.columns

Index(['merchant_abn', 'dollar_value_avg', 'dollar_value_median',
       'fraud_probability_avg', 'fraud_probability_median', 'name',
       'number_of_transactions', 'revenue_level', 'take_rate',
       'MappedIndustry'],
      dtype='object')

In [18]:
# list of columns that will be used for the ranking of merchants
# NOTE(review): `ranking` adds every column listed here with the same positive
# weight, so a higher fraud probability raises a merchant's score, and the
# columns are summed on their raw scales (dollar values dominate) — confirm
# both are intended.

scoring_column = ['dollar_value_avg', 'dollar_value_median',
       'fraud_probability_avg', 'fraud_probability_median',
       'number_of_transactions', 'revenue_level', 'take_rate']

In [19]:
# Equal weighting: each scoring column receives the same share, and the
# shares add up to 1.
weight = 1.0 / len(scoring_column)

In [20]:
# for each merchant (every row in the final_df dataframe holds information about a unique merchant)
# we calculate the score used for ranking

def ranking(row: pd.Series, columns=None, column_weight=None) -> float:
    """
    Calculating the weighted score for one merchant
    - Parameter:
        - row: a single merchant's record (a pandas Series, as yielded by
          `DataFrame.iterrows()`), holding a numeric value for every
          scored column
        - columns: optional list of column labels to score; when omitted the
          notebook-level `scoring_column` list is used
        - column_weight: optional scalar weight applied to every scored
          column; when omitted the notebook-level `weight` is used
    - Returns:
        - Score for the merchant: the sum of `column_weight * row[column]`
          over the scored columns (0 when there are no columns)
    """
    # Fall back to the notebook globals so existing `ranking(row)` calls
    # behave exactly as before.
    if columns is None:
        columns = scoring_column
    if column_weight is None:
        column_weight = weight

    score = 0
    # Accumulate in list order to keep floating-point results identical to
    # the original implementation.
    for x in columns:
        score += column_weight * row[x]

    return score

In [21]:
# Maps merchant ABN -> (name, score, mapped industry); filled by the scoring loop.
rank = dict()

In [22]:
# Score every merchant row and record (name, score, mapped industry) under
# its ABN in `rank`.
for index, row in final_df.iterrows():
    score = ranking(row)
    merchant_entry = (row[NAME], score, row[MAPPED_INDUSTRY])
    rank[row[MERCHANT_ABN]] = merchant_entry

In [23]:
# Re-order the merchants from highest to lowest score; the score is the
# second element of each value tuple.
ordered_entries = sorted(rank.items(), key=lambda entry: entry[1][1], reverse=True)
sorted_rank = dict(ordered_entries)

In [24]:
# Keep the first 100 entries of the score-ordered dictionary (fewer if it
# holds fewer merchants).
top_100_merchant = dict(list(sorted_rank.items())[:100])


In [25]:
# Tally how many of the selected merchants fall in each industry
# (the industry is the third element of each value tuple).
counts = {}

for value in top_100_merchant.values():
    industry = value[2]
    counts[industry] = counts.get(industry, 0) + 1

# Print one "industry: count" line per industry, in first-seen order.
for item, count in counts.items():
    print(f"{item}: {count}")

X: 28
P: 32
F: 40


Converting the dictionary of the top 100 merchants (already sorted by score) to a pandas dataframe and saving it to a CSV file

In [26]:
# Display the selected merchants: ABN -> (name, score, mapped industry).
top_100_merchant

{64203420245: ('Pede Nonummy Corp.', 28558.399, 'X'),
 49891706470: ('Non Vestibulum Industries', 27117.817, 'X'),
 45629217853: ('Lacus Consulting', 25027.77142857143, 'P'),
 89726005175: ('Est Nunc Consulting', 23631.994000000002, 'X'),
 80324045558: ('Ipsum Dolor Sit Corporation', 21589.514285714286, 'P'),
 72472909171: ('Nullam Consulting', 11197.19157142857, 'F'),
 94493496784: ('Dictum Phasellus In Institute', 10908.444285714284, 'P'),
 79417999332: ('Phasellus At Company', 10892.389142857142, 'P'),
 60956456424: ('Ultricies Dignissim LLP', 9972.707999999999, 'P'),
 32361057556: ('Orci In Consequat Corporation', 9419.490857142855, 'P'),
 91923722701: ('Euismod Urna Institute', 8580.773285714286, 'X'),
 49505931725: ('Suspendisse Ac Associates', 7504.031571428572, 'F'),
 19933438190: ('Dui Nec Corporation', 7439.500142857142, 'X'),
 96152467973: ('Rhoncus Donec Associates', 7438.046285714286, 'X'),
 57900494384: ('Porttitor Tellus Corporation', 7339.215714285713, 'X'),
 1732464599

In [27]:
# Turn the {abn: (name, score, industry)} dictionary into a table: the ABN keys
# start out as the index and are then moved into a regular MERCHANT_ABN column.
top_100_merchant = (
    pd.DataFrame.from_dict(
        top_100_merchant,
        orient='index',
        columns=[NAME, 'Score', MAPPED_INDUSTRY],
    )
    .reset_index()
    .rename(columns={'index': MERCHANT_ABN})
)

# NOTE(review): to_csv also writes the row index here, which shows up as an
# 'Unnamed: 0' column when the file is read back — confirm whether downstream
# readers rely on it before switching to index=False.
top_100_merchant.to_csv(PREFIX + TOP_100_MERCHANTS_PATH)

23/10/13 19:11:36 WARN HeartbeatReceiver: Removing executor driver with no recent heartbeats: 158629 ms exceeds timeout 120000 ms
23/10/13 19:11:36 WARN SparkContext: Killing executors is not supported by current scheduler.
23/10/13 19:12:25 ERROR Inbox: Ignoring error
org.apache.spark.SparkException: Exception thrown in awaitResult: 
	at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:322)
	at org.apache.spark.rpc.RpcTimeout.awaitResult(RpcTimeout.scala:75)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRefByURI(RpcEnv.scala:102)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRef(RpcEnv.scala:110)
	at org.apache.spark.util.RpcUtils$.makeDriverRef(RpcUtils.scala:36)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.driverEndpoint$lzycompute(BlockManagerMasterEndpoint.scala:117)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.org$apache$spark$storage$BlockManagerMasterEndpoint$$driverEndpoint(BlockManagerMasterEndpoint.scala:116)
	at org.apache.spark.storage.B