# Description of algorithm

### Data preparation

Loads the data set and filters out samples that do not have a category_name. It then replaces each NaN in the brand_name column with 'No Brand'.

### Prestige

The algorithm first creates four dictionaries keyed by brand name: brand occurrences (absolute), brand occurrences by category, votes and prestige. The two occurrence dictionaries and the votes dictionary are initialised to 0 for every brand; the prestige dictionary is filled in at the end.
The algorithm retrieves unique categories from category_name columns and loops through them. For each category, it performs the following:
* Loops through all the brands in that category and for every brand it obtains a mean price (across all the items in that category and brand)
* Computes median from the brand mean prices
* For every brand in that category it then performs the following:
    * Adds 1 to the brand votes dictionary if the mean for the given brand is at least threshold*median (the current threshold is 1.5)
    * Adds 1 to the brand occurrences by category dictionary for the given brand

Brand prestige is then computed as (number of votes) / (number of occurrences by category), where these values are taken from the brand votes dictionary and the brand occurrences by category dictionary.

### Prestige reliability

Prestige reliability is the absolute number of occurrences of a brand divided by the maximum number of occurrences of any brand (not including 'No Brand'). For 'No Brand' itself the reliability is set to 1.

# Settings

In [None]:
# Notebook-wide options read by the cells below.
settings = {
    # When True, variables that are no longer needed are deleted to save memory.
    'del': True,
    # Name of the file the augmented data set is written to.
    'filename_str': 'prestige.csv',
}

# Load data set

In [None]:
import numpy as np
import pandas as pd

In [None]:
# Directory holding the input data, relative to this notebook.
PATH = "../../data/"

# The training set is a tab-separated file.
train_file = PATH + 'train.tsv'
data_full = pd.read_csv(train_file, sep='\t')

In [None]:
# Preview the first five rows of the raw data.
data_full.head(n=5)

# Filter out samples that do not have a category_name

In [None]:
# Keep only the rows that have a category_name.
# The explicit copy makes data_reduced an independent frame, so adding or
# overwriting columns on it later behaves the same whether or not data_full
# is deleted below.
data_reduced = data_full.loc[data_full['category_name'].notnull()].copy()

# Optionally drop the unfiltered frame to free memory.
if settings['del']:
    del data_full

In [None]:
# Sanity check: count the rows of the filtered frame that still lack a category_name.
missing_category_count = data_reduced['category_name'].isna().sum()
print("%d items have no category_name" % missing_category_count)

# Replace NaN in brand_name by 'No Brand'

In [None]:
# Items without a brand get the placeholder brand 'No Brand'.
filled_brand_names = data_reduced['brand_name'].fillna(value='No Brand')
data_reduced['brand_name'] = filled_brand_names

In [None]:
# Preview the first five rows after filtering and brand filling.
data_reduced.head(n=5)

In [None]:
# Distinct category names and distinct brand names present in the data.
unique_cns = data_reduced['category_name'].unique()
unique_brands = data_reduced['brand_name'].unique()

# Generate dictionaries with brand occurences, votes and prestige

In [None]:
# Per-brand counters keyed by brand name, all starting at zero.

# Number of categories in which the brand appears (incremented once per category).
brand_occurences_cat = dict.fromkeys(unique_brands, 0)  # {brand_name: occurence}

# Total number of items of the brand, summed over all categories.
brand_occurences = dict.fromkeys(unique_brands, 0)  # {brand_name: occurence}

# Number of categories in which the brand was judged prestigious.
brand_votes = dict.fromkeys(unique_brands, 0)  # {brand_name: vote}

# {brand_name: prestige}, where prestige = votes / number of categories the brand appears in.
brand_prestige = {}

# A brand gets a vote in a category when its mean price is at least this
# multiple of the median of all brand mean prices in that category.
PRESTIGE_THRESHOLD_FACTOR = 1.5


def vote(val, treshold):
    """Return 1 if ``val`` is at least ``treshold``, otherwise 0."""
    return 1 if val >= treshold else 0


from tqdm import tqdm_notebook
for cat_name in tqdm_notebook(unique_cns):  # iterate through all categories

    # Data subset for the given category.
    data_filtered_cn = data_reduced.loc[data_reduced.category_name == cat_name]

    # A single grouped pass yields the mean price and the item count of every
    # brand in this category, instead of re-filtering the category per brand.
    # Assumes brand_name contains no NaN (they were replaced by 'No Brand'),
    # because groupby drops NaN keys.
    prices_by_brand = data_filtered_cn.groupby('brand_name')['price']
    brand_means = prices_by_brand.mean().to_dict()  # {brand_name: mean price}
    brand_counts = prices_by_brand.size().to_dict()  # {brand_name: number of items}

    # Voting threshold: factor * (median of the brand mean prices).
    treshold = PRESTIGE_THRESHOLD_FACTOR * np.median(list(brand_means.values()))

    for brand, mean_price in brand_means.items():
        brand_occurences[brand] += brand_counts[brand]
        brand_votes[brand] += vote(mean_price, treshold)
        brand_occurences_cat[brand] += 1

# Prestige is the share of a brand's categories in which it received a vote.
for brand in unique_brands:
    brand_prestige[brand] = brand_votes[brand] / brand_occurences_cat[brand]
    

In [None]:
# brand_votes

In [None]:
# brand_occurences

In [None]:
# brand_prestige

# Parse the dictionaries into DataFrame

In [None]:
# Collect the votes, occurences and prestige dictionaries into column lists for pandas.

# Scaling denominator: the largest item count among real brands. 'No Brand' is
# excluded by name rather than by assuming it is the most frequent entry, so the
# result stays correct when it is not the largest (or is absent). The default
# only applies when there is no real brand at all and avoids an error there.
max_occurence = max(
    (count for name, count in brand_occurences.items() if name != 'No Brand'),
    default=1,
)

parsed_brands = {
    'brand_name': list(unique_brands),
    'votes': [brand_votes[name] for name in unique_brands],
    'occurences': [brand_occurences[name] for name in unique_brands],
    'prestige': [brand_prestige[name] for name in unique_brands],
    # prestige_reliability is the occurence count scaled by max_occurence
    'prestige_reliability': [brand_occurences[name] / max_occurence for name in unique_brands],
}

In [None]:
# One row per brand; the dictionary keys become the column names.
prestige_df = pd.DataFrame(parsed_brands)

## Set prestige_reliability for 'No Brand' to 1 (otherwise its value would be too large, because the scaling uses the most frequent brand other than 'No Brand')

In [None]:
# Row labels of the 'No Brand' entry (empty if there is no such brand).
no_brand_index = prestige_df.loc[prestige_df['brand_name'] == 'No Brand'].index
# .loc accepts a collection of labels, whereas .at is meant for a single scalar label.
prestige_df.loc[no_brand_index, 'prestige_reliability'] = 1

In [None]:
# Preview the first five rows of the per-brand table.
prestige_df.head(n=5)

# Insert prestige and prestige_reliability columns to the data_reduced DataFrame (parses the prestige_df DataFrame)

In [None]:
# Look up each item's brand in prestige_df and append the two brand-level columns.
# Mapping the whole brand_name column at once replaces a per-row scan of
# prestige_df and yields scalar values for every row.
brand_lookup = prestige_df.set_index('brand_name')  # brand_name values are unique
data_reduced.insert(
    loc=len(data_reduced.columns),
    column='prestige',
    value=data_reduced['brand_name'].map(brand_lookup['prestige']),
)
data_reduced.insert(
    loc=len(data_reduced.columns),
    column='prestige_reliability',
    value=data_reduced['brand_name'].map(brand_lookup['prestige_reliability']),
)

In [None]:
# Write the augmented data set to the file named in the settings and report the name.
output_filename = settings['filename_str']
data_reduced.to_csv(output_filename)
print("Data saved to:", output_filename)