In [15]:
import json

In [16]:
with open("stickers.json", 'r') as f:
    stickers_main = json.load(f)
print(stickers_main[0:10])

stickers = list(filter(lambda x: (x.get("matching", False) or (
    'autographed' in x['description'].lower())), stickers_main))

[{'id': 'sticker-1', 'name': 'Sticker | Shooter', 'description': 'This sticker can be applied to any weapon you own and can be scraped to look more worn. You can scrape the same sticker multiple times, making it a bit more worn each time, until it is removed from the weapon.', 'rarity': {'id': 'rarity_default', 'name': 'Default', 'color': '#ded6cc'}, 'crates': [], 'image': 'https://steamcdn-a.akamaihd.net/apps/730/icons/econ/stickers/dreamhack/dh_gologo1_large.9cb84d29f38fe347f001e7057a188696bda6f67b.png'}, {'id': 'sticker-2', 'name': 'Sticker | Shooter (Foil)', 'description': 'This sticker can be applied to any weapon you own and can be scraped to look more worn. You can scrape the same sticker multiple times, making it a bit more worn each time, until it is removed from the weapon.', 'rarity': {'id': 'rarity_default', 'name': 'Default', 'color': '#ded6cc'}, 'crates': [], 'image': 'https://steamcdn-a.akamaihd.net/apps/730/icons/econ/stickers/dreamhack/dh_gologo1_holo_large.b36272bc491

In [18]:
def mutate_sticker(sticker: dict):
    if (sticker.get("matching", None) == None):
        main_part: str = sticker["name"].split(" | ")[1]

        for tag in ["(Foil)", "(Gold)", "(Holo)", "(Holo, Champion)", "(Glitter)", "(Glitter, Champion)", "(Gold, Champion)", "(Champion)", "(Golden)"]:
            main_part = main_part.replace(tag, "")

        main_part = main_part.strip().lower().replace(
            "$", "s").replace("1", "i").replace("3", "e")

        sticker["matching"] = main_part
        del sticker['description']
        del sticker['crates']
        del sticker['rarity']
    return sticker


stickers = [mutate_sticker(sticker) for sticker in stickers]

print(stickers[532])

{'id': 'sticker-895', 'name': 'Sticker | rallen (Gold) | Cluj-Napoca 2015', 'image': 'https://steamcdn-a.akamaihd.net/apps/730/icons/econ/stickers/cluj2015/sig_rallen_gold_large.2f8ae0c7eaa713a0c2737c5d595e13709c561aa1.png', 'matching': 'rallen'}


In [19]:

all_full_words = list(set([sticker["matching"] for sticker in stickers]))

In [20]:
all_full_words[:10]

['n0rber7',
 'maniac',
 'axile',
 'nin9',
 'smooya',
 'aerial',
 'stewie2k',
 'patsi',
 'jackz',
 'fashr']

In [22]:
# Want to Link text back to a list of sticker objects with that text
stickers_by_matched_full_word = {}
for matched_word in all_full_words:
    stickers_with_that_pattern = []
    for sticker in stickers:
        if sticker["matching"] == matched_word:
            stickers_with_that_pattern.append(sticker)
    stickers_by_matched_full_word[matched_word] = stickers_with_that_pattern

In [23]:
# Want to Link text back to a list of sticker objects with that text
sticker_ids_by_matched_full_word = {}
for matched_word in all_full_words:
    stickers_with_that_pattern = []
    for sticker in stickers:
        if sticker["matching"] == matched_word:
            stickers_with_that_pattern.append(sticker["id"])
    sticker_ids_by_matched_full_word[matched_word] = stickers_with_that_pattern

In [24]:
len(stickers_by_matched_full_word["scream"])

18

In [25]:
def group_letters(string):

    result = []

    def backtrack(start, current, result):
        if start == len(string):
            result.append(current)
            return
        for end in range(start + 1, len(string) + 1):
            backtrack(end, current + [string[start:end]], result)

    backtrack(0, [], result)
    return result

In [26]:
class Token:
    start = 'START'
    mid = "MID"
    end = "END"
    total = "TOTAL"

    def __init__(self, text, full, idx, loc=None) -> None:
        """
        text is the part of the sticker work we are using ie the token

        full is the text of the full sticker and is used to determine loc with the idx if loc not given

        idx is the index of the first letter that the text(aka token) uses in the full text string

        loc specifies if the specified token is start mid end or total
        """
        self.text = text
        self.full = full
        if loc == None:
            if idx == 0 and idx == len(full)-1:
                self.loc = Token.total
            elif idx == 0:
                self.loc = Token.start
            elif idx == len(full)-1:
                self.loc = Token.end
            else:
                self.loc = Token.mid
        else:
            self.loc = loc

    def __repr__(self) -> str:

        return f"<Token:{self.loc}:{self.text}>"

    def __eq__(self, __value: object) -> bool:
        return (self.text == __value.text) and (self.loc == __value.loc)

    def __hash__(self) -> int:
        return hash(F"{self.text}&&{self.loc}")


def flatten_concatenation(matrix):
    flat_list = []
    for row in matrix:
        flat_list += row
    return flat_list


def tokenize_all_stickers():

    word_tokens = {}
    for word in all_full_words:

        r = group_letters(word)
        word_tokens[word] = word_tokens.get(word, [])
        word_tokens[word].append(r)

    """
    {
    ...,
    'scream' : [[s,c,r,e,a,m],[s,c,r,e,am],...],
    ...
    }
    """

    # this is just squeezing out  the 1-D first dimension
    word_tokens = {word: lst[0] for word, lst in word_tokens.items()}

    word_tokens = {word: [[Token(text, tokenization, i) for i, text in enumerate(
        tokenization)] for tokenization in lst] for word, lst in word_tokens.items()}

    word_tokens = {word: list(set(flatten_concatenation(lst_2d)))
                   for word, lst_2d in word_tokens.items()}

    return word_tokens


def invert_dictionary(original_dict):
    inverted_dict = {}
    for key, lst in original_dict.items():

        for element in lst:
            if element not in inverted_dict:
                inverted_dict[element] = [key]
            else:
                if key not in inverted_dict[element]:
                    inverted_dict[element].append(key)

    return inverted_dict


def check_viability(prev_token, potential_next_token):
    if potential_next_token.loc == Token.total:
        return (
            (prev_token.loc == Token.start) or
            (prev_token.loc == Token.end) or
            (prev_token.loc == Token.mid) or
            (prev_token.loc == Token.total)
        )
    if potential_next_token.loc == Token.mid:
        return (
            (prev_token.loc == Token.end) or
            (prev_token.loc == Token.total)
        )
    if potential_next_token.loc == Token.start:
        return (
            (prev_token.loc == Token.start) or
            (prev_token.loc == Token.end) or
            (prev_token.loc == Token.mid) or
            (prev_token.loc == Token.total)
        )
    if potential_next_token.loc == Token.end:
        return (
            (prev_token.loc == Token.end) or
            (prev_token.loc == Token.total)
        )

In [27]:
tokens_by_matched_full_word = tokenize_all_stickers()

In [28]:
inverted_dict = invert_dictionary(tokens_by_matched_full_word)

### Testing Inverted Dict


In [29]:
inverted_dict[Token("fn", None, None, Token.start)]

['fns', 'fnatic', 'fnx']

## Saving Inverted Dict and Stickers By Full Matched Word


In [30]:
with open('stickers_by_matched_full_word.json','w') as f:
    json.dump({key:value for key,value in stickers_by_matched_full_word.items()},f)
    
with open('sticker_ids_by_matched_full_word.json','w') as f:
    json.dump({key:value for key,value in sticker_ids_by_matched_full_word.items()},f)

with open('inverted_dict.json','w') as f:
    json.dump({str(key):value for key,value in inverted_dict.items()},f)



## Bulding Tokenizer Map from smaller files


In [31]:
with open('stickers_by_matched_full_word.json','r') as f:
    stickers_by_matched_full_word = json.load(f)
with open('sticker_ids_by_matched_full_word.json','r') as f:
    sticker_ids_by_matched_full_word = json.load(f)
with open('inverted_dict.json','r') as f:
    inverted_dict = json.load(f)

In [32]:
from time import perf_counter
 

In [33]:
t1_start = perf_counter() 
tokenizer_map = {token: [sticker_ids_by_matched_full_word[i] for i in formatted_names]
             for token, formatted_names in inverted_dict.items()}



tokenizer_map = {key: flatten_concatenation(
    value) for key, value in tokenizer_map.items()}
t1_stop = perf_counter()
print("Elapsed time during the whole program in seconds:",
                                        t1_stop-t1_start)

Elapsed time during the whole program in seconds: 0.006518399997730739


In [34]:
tokenizer_map.keys()

dict_keys(['<Token:END:7>', '<Token:TOTAL:n0rber7>', '<Token:START:n0>', '<Token:MID:0>', '<Token:START:n>', '<Token:END:er7>', '<Token:START:n0rber>', '<Token:MID:0r>', '<Token:END:rber7>', '<Token:MID:0rb>', '<Token:START:n0rbe>', '<Token:MID:0rbe>', '<Token:MID:rbe>', '<Token:MID:ber>', '<Token:START:n0rb>', '<Token:MID:er>', '<Token:MID:rber>', '<Token:END:r7>', '<Token:MID:b>', '<Token:END:ber7>', '<Token:MID:be>', '<Token:MID:r>', '<Token:MID:0rber>', '<Token:END:0rber7>', '<Token:MID:rb>', '<Token:START:n0r>', '<Token:MID:e>', '<Token:END:niac>', '<Token:MID:an>', '<Token:START:m>', '<Token:END:aniac>', '<Token:MID:i>', '<Token:TOTAL:maniac>', '<Token:MID:a>', '<Token:MID:ani>', '<Token:MID:ia>', '<Token:START:mani>', '<Token:MID:ania>', '<Token:MID:ni>', '<Token:START:ma>', '<Token:END:iac>', '<Token:MID:n>', '<Token:START:man>', '<Token:MID:nia>', '<Token:START:mania>', '<Token:END:c>', '<Token:END:ac>', '<Token:END:ile>', '<Token:MID:xil>', '<Token:START:axil>', '<Token:TOTAL

In [35]:
from typing import List
from itertools import product
import numpy as np
from pprint import pprint


def with_other(parse_tokens: List[str]):

    def aplit(parse: str):
        token_possibilities = []
        for token_loc in (Token.start, Token.end, Token.mid, Token.total):
            token_possibilities.append(Token(parse, None, None, token_loc))
        return token_possibilities
    all_possible_for_each_tok = []
    for token in parse_tokens:
        all_possible_for_each_tok.append(aplit(token))
    return all_possible_for_each_tok


def stickerfy_word(word: str):

    groupings = group_letters(word)
    groupings = [combo for combo in groupings if len(combo) <= 5]

    results = []
    for group in groupings:
        groupingWithLoc = with_other(group)

        # Create all possible combinations of one element from each sublist
        allGroupingPermutations = list(product(*groupingWithLoc))

        idx_to_keep = []
        for combo_idx, combo in enumerate(allGroupingPermutations):
            status = True
            if len(combo) > 5:
                continue
            for lst, t in enumerate(combo):
                if lst-1 >= 0:
                    current_token = t
                    last_token = combo[lst-1]

                    if not check_viability(last_token, current_token):
                        status = False
                        break
            if status:
                idx_to_keep.append(combo_idx)

        # idx_to_keep = list(set(idx_to_keep))

        allGroupingPermutations = np.array(
            allGroupingPermutations, dtype=object)
        filtered_all_combinations = allGroupingPermutations[idx_to_keep]

        aggregated_list = []
        combo_set = []
        for combo in filtered_all_combinations:

            stickers_matching_tokens = []
            status = True
            for positionalToken in combo:
                # check if token exists
                list_of_players_match = tokenizer_map.get(positionalToken, None)

                if list_of_players_match != None:
                    stickers_matching_tokens.append(list_of_players_match)

                else:
                    status = False

            if status:
                stickers_needed = len(combo)

                if (combo[0].loc is Token.end) or (combo[0].loc is Token.mid):
                    stickers_needed += 1
                if (combo[-1].loc is Token.start) or (combo[-1].loc is Token.mid):
                    stickers_needed += 1

                if stickers_needed > 5:
                    continue
                combo_set.append(combo)
                aggregated_list.append(stickers_matching_tokens)

        if (len(combo_set) > 0):

            for k, comb in enumerate(combo_set):

                possible_text_split = []
                for lst, j in zip(aggregated_list[k], comb):

                    matched_part = j.text
                    matched_loc = j.loc

                    matching_stickers = lst

                    sticker_objects_to_append = []

                    sticker_token_to_add = {
                        "matchedPart": matched_part,
                        "matchedLoc": matched_loc,
                        "stickers": matching_stickers
                    }

                    possible_text_split.append(sticker_token_to_add)

                results.append(possible_text_split)
    return results

In [19]:
tokenizer_map[Token("fn", None, None, Token.start)].__len__()

60

# Spell Out `word`


In [20]:
word = "apple"
res = stickerfy_word(word)

### Make small sticker_id --> sticker map


In [41]:
stickers_by_id = {}
for sticker in stickers_main:
    sticker_copy = dict(sticker)

    sticker_copy.pop('rarity',None)

    sticker_copy.pop('description',None)
    sticker_copy.pop('crates',None)

    stickers_by_id[sticker_copy['id']] = sticker_copy

In [43]:
with open('stickers_by_id.json','w') as f:
    json.dump(stickers_by_id,f)