In [1]:
import json

## Create list of synthetic data, where each item is a tuple like (full_title, shortened name)


In [2]:
#### Process data to just take first three characters of every word for a given product item
def first_chars_rule(source):
    """Pair every title in a JSON file with a three-characters-per-word abbreviation.

    Each title is upper-cased and split on whitespace; every word is reduced
    to its first three characters (words of three characters or fewer are
    kept whole) and the pieces are re-joined with single spaces.

    Args:
        source: Path to a JSON file whose top-level value iterates over
            title strings.

    Returns:
        A list of ``(title, abbreviation)`` tuples, one per title, where
        ``title`` is the unmodified value read from the file.
    """
    with open(source) as handle:
        titles = json.load(handle)

    pairs = []
    for title in titles:
        words = title.upper().split()
        # Slicing never over-runs, so short words pass through unchanged.
        abbreviation = ' '.join(word[:3] for word in words)
        pairs.append((title, abbreviation))
    return pairs

In [3]:
#### Process data to take first and last characters of every word for a given product item
def first_last_char_rule(source):
    """Pair every title in a JSON file with a first-and-last-character abbreviation.

    Each title is upper-cased and split on whitespace. Words longer than two
    characters collapse to their first and last character; shorter words are
    kept whole. The pieces are re-joined with single spaces.

    Args:
        source: Path to a JSON file whose top-level value iterates over
            title strings.

    Returns:
        A list of ``(title, abbreviation)`` tuples, one per title, where
        ``title`` is the unmodified value read from the file.
    """
    def squeeze(word):
        # One- and two-character words are already as short as this rule makes them.
        return word if len(word) <= 2 else word[0] + word[-1]

    with open(source) as handle:
        titles = json.load(handle)

    return [
        (title, ' '.join(map(squeeze, title.upper().split())))
        for title in titles
    ]

In [4]:
#### Process data to strip the vowels (A, E, I, O, U) from every word and keep the first three remaining characters for a given product item
def first_consonants(source):
    """Pair every title in a JSON file with a vowel-free, three-character-per-word abbreviation.

    Each title is upper-cased and split on whitespace. The letters A, E, I, O
    and U are deleted from every word, and what remains is cut to its first
    three characters. A word made only of those vowels becomes an empty
    string, which still takes part in the space-separated join.

    Args:
        source: Path to a JSON file whose top-level value iterates over
            title strings.

    Returns:
        A list of ``(title, abbreviation)`` tuples, one per title, where
        ``title`` is the unmodified value read from the file.
    """
    # Mapping a character to None makes str.translate delete it.
    drop_vowels = str.maketrans(dict.fromkeys('AEIOU'))

    with open(source) as handle:
        titles = json.load(handle)

    pairs = []
    for title in titles:
        stripped_words = (word.translate(drop_vowels) for word in title.upper().split())
        pairs.append((title, ' '.join(piece[:3] for piece in stripped_words)))
    return pairs

In [5]:
#### Process data to strip the vowels (A, E, I, O, U) from every word and keep the first two and the last remaining characters for a given product item
def first_last_consonants(source):
    """Pair every title in a JSON file with a vowel-free, first-two-plus-last abbreviation.

    Each title is upper-cased and split on whitespace. The letters A, E, I, O
    and U are deleted from every word. If more than three characters remain,
    the word collapses to its first two characters plus its last one;
    otherwise the remainder is kept as is (possibly empty).

    Args:
        source: Path to a JSON file whose top-level value iterates over
            title strings.

    Returns:
        A list of ``(title, abbreviation)`` tuples, one per title, where
        ``title`` is the unmodified value read from the file.
    """
    # Mapping a character to None makes str.translate delete it.
    drop_vowels = str.maketrans(dict.fromkeys('AEIOU'))

    def condense(word):
        remainder = word.translate(drop_vowels)
        if len(remainder) <= 3:
            return remainder
        return remainder[:2] + remainder[-1]

    with open(source) as handle:
        titles = json.load(handle)

    return [
        (title, ' '.join(condense(word) for word in title.upper().split()))
        for title in titles
    ]


In [6]:
#### Process data to get first letter of each word in brand
def brand_first_letters(source):
    """Abbreviate the brand part of "BRAND, PRODUCT" titles to its initials.

    Each title is upper-cased and split on commas:

    * exactly two parts: the first part is treated as the brand. A brand of
      several space-separated words is replaced by the first letter of each
      word; a single-word brand is kept whole. The second part is appended
      unchanged (including any leading space).
    * any other number of parts: only the first part is used.

    Entries that cannot be processed are skipped rather than aborting the
    run: non-string entries, and multi-word brands containing an empty word
    (leading, trailing or doubled spaces). These are the cases that
    previously raised inside a bare ``except: pass``; they are now explicit
    so that unrelated errors and KeyboardInterrupt are no longer swallowed.

    Args:
        source: Path to a JSON file whose top-level value iterates over
            titles.

    Returns:
        A list of ``(title_without_commas, abbreviation)`` tuples for the
        titles that were processed.
    """
    with open(source) as json_file:
        data = json.load(json_file)

    mapped_data = []
    for orig_title in data:
        if not isinstance(orig_title, str):
            # Only strings can be upper-cased and split; skip anything else.
            continue

        parts = orig_title.upper().split(",")
        if len(parts) == 2:
            brand_words = parts[0].split(" ")
            if len(brand_words) > 1:
                if not all(brand_words):
                    # An empty word has no first letter to take; skip the title.
                    continue
                name = "".join(word[0] for word in brand_words) + parts[1]
            else:
                name = parts[0] + parts[1]
        else:
            name = parts[0]

        # Splitting on "," already removed every comma from the pieces in `name`.
        mapped_data.append((orig_title.replace(",", ""), name))
    return mapped_data

In [7]:
#### Process data to get first letter of each word
#### This is used for brands using brands.json during feature engineering for gbdt
def first_letters(source):
    """Pair every entry in a JSON file with the initials of its words.

    An entry with more than one whitespace-separated word is abbreviated to
    the first character of each word, concatenated without separators and
    without changing case. An entry with zero or one word is used as its own
    abbreviation, exactly as read.

    Args:
        source: Path to a JSON file whose top-level value iterates over
            strings.

    Returns:
        A list of ``(entry, abbreviation)`` tuples, one per entry.
    """
    with open(source) as handle:
        entries = json.load(handle)

    pairs = []
    for entry in entries:
        words = entry.split()
        if len(words) > 1:
            initials = ''.join(word[0] for word in words)
        else:
            # Nothing to abbreviate; keep the entry untouched (whitespace included).
            initials = entry
        pairs.append((entry, initials))
    return pairs

In [8]:
#### Process data to remove all vowels
def remove_vowels(source):
    """Pair every title in a JSON file with a copy that has its vowels removed.

    Each title is upper-cased and split on whitespace; the letters A, E, I, O
    and U are deleted from every word and the words are re-joined with single
    spaces. A word made only of those vowels becomes an empty string, which
    still takes part in the join.

    Args:
        source: Path to a JSON file whose top-level value iterates over
            title strings.

    Returns:
        A list of ``(title, abbreviation)`` tuples, one per title, where
        ``title`` is the unmodified value read from the file.
    """
    # Built once: mapping a character to None makes str.translate delete it.
    vowel_eraser = str.maketrans(dict.fromkeys('AEIOU'))

    with open(source) as json_file:
        data = json.load(json_file)

    ret_list = []
    for title in data:
        words = title.upper().split()
        shortened = " ".join(word.translate(vowel_eraser) for word in words)
        ret_list.append((title, shortened))
    return ret_list

# Generate Finalized Data

In [9]:
# Build the training pairs for the un-branded titles: each rule reads the same
# cleaned title file and its (title, abbreviation) pairs are appended in turn.
# brand_first_letters('../data/cleaned_branded_data.json') is not applied here.
final_data = []
for rule in (
    remove_vowels,
    first_last_consonants,
    first_consonants,
    first_last_char_rule,
    first_chars_rule,
):
    final_data += rule('../data/cleaned_data.json')
    # Running total after each rule.
    print (len(final_data))

105730
211460
317190
422920
528650


In [10]:
# Persist the accumulated (title, abbreviation) pairs as JSON.
with open('../data/final_train_labels.json', 'w') as labels_file:
    json.dump(final_data, labels_file)

In [22]:
# Build the brand abbreviations: each rule reads the same brand file and its
# (brand, abbreviation) pairs are appended in turn.
abbreviated_brands = []
for rule in (
    remove_vowels,
    first_letters,
    first_last_consonants,
    first_consonants,
    first_last_char_rule,
    first_chars_rule,
):
    abbreviated_brands += rule('../data/brands.json')
    # Running total after each rule.
    print (len(abbreviated_brands))

6598
13196
19794
26392
32990
39588


In [23]:
# Persist the accumulated (brand, abbreviation) pairs as JSON.
with open('../data/abbreviated_brands.json', 'w') as brands_file:
    json.dump(abbreviated_brands, brands_file)

In [13]:
# Build the descriptor abbreviations: each rule reads the same descriptor file
# and its (descriptor, abbreviation) pairs are appended in turn.
abbreviated_descriptors = []
for rule in (
    remove_vowels,
    first_last_consonants,
    first_consonants,
    first_last_char_rule,
    first_chars_rule,
):
    abbreviated_descriptors += rule('../data/descriptors.json')
    # Running total after each rule.
    print (len(abbreviated_descriptors))

18
36
54
72
90


In [14]:
# Persist the accumulated (descriptor, abbreviation) pairs as JSON.
with open('../data/abbreviated_descriptors.json', 'w') as descriptors_file:
    json.dump(abbreviated_descriptors, descriptors_file)

In [15]:
# Build the branded training pairs: the rules below read the comma-free
# branded title file and append their (title, abbreviation) pairs to
# final_branded_train_labels, printing the running length after each step.
final_branded_train_labels = []
final_branded_train_labels+=remove_vowels('../data/cleaned_branded_data_no_commas.json')
print (len(final_branded_train_labels))
# NOTE(review): the next two lines extend and report `final_data`, not
# `final_branded_train_labels`. `final_data` was already written to
# '../data/final_train_labels.json' in an earlier cell, so these pairs reach
# neither output file unless that cell is re-run, and re-running this cell
# appends them again. Presumably `final_branded_train_labels` was intended
# (brand_first_letters strips commas from its titles, like the *_no_commas
# input used here) -- TODO confirm before changing, since the fix would alter
# the saved training data.
final_data+=brand_first_letters('../data/cleaned_branded_data.json')
print (len(final_data))
final_branded_train_labels+=first_last_consonants('../data/cleaned_branded_data_no_commas.json')
print (len(final_branded_train_labels))
final_branded_train_labels+=first_consonants('../data/cleaned_branded_data_no_commas.json')
print (len(final_branded_train_labels))
final_branded_train_labels+=first_last_char_rule('../data/cleaned_branded_data_no_commas.json')
print (len(final_branded_train_labels))
final_branded_train_labels+=first_chars_rule('../data/cleaned_branded_data_no_commas.json')
print (len(final_branded_train_labels))

140106
668731
280212
420318
560424
700530


In [16]:
# Persist the accumulated branded (title, abbreviation) pairs as JSON.
with open('../data/final_branded_train_labels.json', 'w') as branded_labels_file:
    json.dump(final_branded_train_labels, branded_labels_file)