![NVIDIA Logo](images/nvidia.png)

## Imports

In [1]:
import json
import ast
import random
import string 
random.seed(10)

import pandas as pd

from tqdm.notebook import tqdm

from llm_utils.models import Models
from llm_utils.nemo_service_models import NemoServiceBaseModel
from llm_utils.helpers import edit_list
from llm_utils.llm_functions import generate_list_43B as generate_list

## Address Use Case 

In [2]:
with open('data/100_seed_topics.json', 'r') as f:
    seed_topics_static = json.load(f)

seed_topics_static[:10]

['satellites',
 'hopes',
 'laptops',
 'submarines',
 'mushrooms',
 'waves',
 'dogs',
 'forks',
 'frogs',
 'wishes']

In [3]:
seed_topics_create = []

for seed in seed_topics_static:
    seed_topic_prompt = f'singular nouns with diverse theme of {seed}'
    
    seed_topics_create.append(seed_topic_prompt)

In [4]:
seed_topics_create[:10]

['singular nouns with diverse theme of satellites',
 'singular nouns with diverse theme of hopes',
 'singular nouns with diverse theme of laptops',
 'singular nouns with diverse theme of submarines',
 'singular nouns with diverse theme of mushrooms',
 'singular nouns with diverse theme of waves',
 'singular nouns with diverse theme of dogs',
 'singular nouns with diverse theme of forks',
 'singular nouns with diverse theme of frogs',
 'singular nouns with diverse theme of wishes']

In [5]:
seed_topic_variations = ['things not in nature', 
                         'technical objects', 
                         'everyday things', 
                         'non-tangible objects', 
                         'things in nature',
                         'words that start with the letter a',
                         'words that start with the letter b',
                         'words that start with the letter c']

seed_topics_create_b = []

for seed in seed_topic_variations:
    seed_topic_prompt = f'singular nouns with diverse theme of {seed}'
    
    seed_topics_create_b.append(seed_topic_prompt)

In [6]:
seed_topics_create_b

['singular nouns with diverse theme of things not in nature',
 'singular nouns with diverse theme of technical objects',
 'singular nouns with diverse theme of everyday things',
 'singular nouns with diverse theme of non-tangible objects',
 'singular nouns with diverse theme of things in nature',
 'singular nouns with diverse theme of words that start with the letter a',
 'singular nouns with diverse theme of words that start with the letter b',
 'singular nouns with diverse theme of words that start with the letter c']

### Use Seed Topics Created

In [7]:
seed_topics = []

progress_bar = tqdm(total=len(seed_topics))
while len(seed_topics) < 150:
    for s in seed_topics_create:
    
        new_seed_topics = generate_list(3, s, top_k=16, temperature=0.9, top_p=0.8)
    
        seed_topics.extend(new_seed_topics)
        seed_topics = list(set(seed_topics))
        progress_bar.update(len(seed_topics))


progress_bar.close()


0it [00:00, ?it/s]

In [8]:
print(len(list(set(seed_topics))))

print(seed_topics[:5])

273
['reefs', 'Toe', 'brolly', 'modem', 'Canon']


In [9]:
seed_topics_b = []

progress_bar = tqdm(total=len(seed_topics_b))
while len(seed_topics_b) < 150:
    for s in seed_topics_create_b:
    
        new_seed_topics = generate_list(3, s, top_k=16, temperature=0.9, top_p=0.8)
    
        seed_topics_b.extend(new_seed_topics)
        seed_topics_b = list(set(seed_topics_b))
        progress_bar.update(len(seed_topics_b))


progress_bar.close()

0it [00:00, ?it/s]

In [10]:
print(len(list(set(seed_topics_b))))

print(seed_topics_b[:5])

153
['cupcake', 'principle', 'toolbox', 'wheel', 'electric screwdriver']


### Combine Topics

In [11]:
seed_topics

['reefs',
 'Toe',
 'brolly',
 'modem',
 'Canon',
 'Maldives',
 'helicopter',
 'automobile',
 'prairie',
 'timer',
 'The Last of Us',
 'cable',
 'rucksack',
 'granite',
 'wheel',
 'a wish for peace',
 'telescope',
 'assumption',
 'locomotive',
 'lobster',
 'forks',
 'Spatial Array',
 'orchid',
 'limestone',
 'switch',
 'paintbrush',
 'sandwich',
 'haze',
 'salad bowl',
 'leaves',
 'snake',
 'field',
 'teapot',
 'apartment',
 'flower',
 'princess',
 'boat',
 'health',
 'game',
 'belief',
 'Microscope',
 'ice floe',
 'Asus ZenBook 14',
 'taste',
 'aircraft',
 'program',
 'cereal bowl',
 'tuna',
 'motorbike',
 'rice',
 'underwear',
 'array',
 'Newton',
 'cardigan',
 'socks',
 'clock',
 'townhouse',
 'spiders',
 'fur coat',
 'glacier',
 'train',
 'metal',
 'Dell XPS 13',
 'crown',
 'spotted owl',
 'router',
 'fish',
 'glass bottle',
 'chicken',
 'chair',
 'carp',
 'grass',
 'Frog',
 'education',
 'sweater',
 'bicycle',
 'Science',
 'singular noun',
 'thermal',
 'king cobra',
 'swimsuit',
 '

In [12]:
seed_topics_b

['cupcake',
 'principle',
 'toolbox',
 'wheel',
 'electric screwdriver',
 'burdens',
 'tablet',
 'bread',
 'broom',
 'owl',
 'door',
 'bug',
 'arm',
 'anchor',
 'flower',
 'chalk',
 'boat',
 'rock',
 'arrow',
 'belief',
 'ball',
 'hammer',
 'phone',
 'toilet paper',
 'joy',
 'aircraft',
 'cheese',
 'paper',
 'bald',
 'toothbrush',
 'angry',
 'pen',
 'electric car',
 'crab',
 'banana',
 'spotted owl',
 'grass',
 'router',
 'snail',
 'chicken',
 'chair',
 'bicycle',
 'couch',
 'concept',
 'burdock',
 'bat',
 'bus',
 'emotion',
 'ambulance',
 'chimpanzee',
 'crocodile',
 'barge',
 'data',
 'mind',
 'straw',
 'furniture',
 'watch',
 'sea',
 'cookie',
 'motorcycle',
 'mountain',
 'love',
 'satellite',
 'tulip',
 'beacon',
 'burdensomely',
 'computer',
 'window',
 'chrysanthemum',
 'machine',
 'bar',
 'experience',
 'hair dryer',
 'car',
 'money',
 'spirit',
 'cello',
 'cloud',
 'bird',
 'burdensome',
 'chilli',
 'smartphone',
 'acrobat',
 'bear',
 'book',
 'hat',
 'branch',
 'memory',
 'bea

In [13]:
def generateAddy(word):
    
    num = random.randrange(1,15000)
    
    streetList = ['parkway',
        'avenue',
        'boulevard',
        'lane',
        'highway',
        'road',
        'street',
        'terrace',
        'place',
        'center',
        'square',
        'way']
    
    st = random.choice(streetList)
    
    return f'{num} {word} {st}'    

In [14]:
generateAddy('berry')

'9362 berry parkway'

In [15]:
l = 'lobster'
percentage = 3 # a third 
    
addy = generateAddy(l)

if random.choice([True, False]): 
    # Create misspelling
    streetName = addy.split(' ')[1]
    positions = round(len(streetName)/percentage)
    
    #print(positions)

    # randomly select postions 
    misspell = list(l)
    if random.choice([True, False]):
        # remove character
        for i in range(positions):
            #print(misspell)
            misspell.pop(random.randrange(1, len(misspell)))

        badAddy = f"{addy.split(' ')[0]} {''.join(misspell)} {addy.split(' ')[2]}"
        print(addy, badAddy, 1)
    else:
        for i in range(positions):
            #print(misspell)
            misspell.insert(random.randrange(1, len(misspell)), random.choice(string.ascii_letters.lower()))
        
        badAddy = f"{addy.split(' ')[0]} {''.join(misspell)} {addy.split(' ')[2]}"
        print(addy, badAddy, 1)
    
else:
    # create similar/but differnt address
    streetName = addy.split(' ')[1]
    
    _prompt = f'Word that sounds like, looks like or could be confused with {streetName}'
    
    _diffStreet = generate_list(1, _prompt, top_k=16, temperature=0.9, top_p=0.8)

    print(_diffStreet[0])
    
    differntAddy = generateAddy( _diffStreet[0])
 
    print(addy, differntAddy, 0)

7027 lobster terrace 7027 lobsr terrace 1


In [16]:
## implement as function interate through seed list 

def syntheticTrainingdata(streetName, percentage=3):

    addy = generateAddy(streetName)
    
    if random.choice([True, False]): 
        # Create misspelling
        streetName = addy.split(' ')[1]
        positions = round(len(streetName)/percentage)
        
        #print(positions)
    
        # randomly select postions 
        misspell = list(streetName)
        if random.choice([True, False]):
            # remove character
            for i in range(positions):
                #print(misspell)
                misspell.pop(random.randrange(1, len(misspell)))
    
            badAddy = f"{addy.split(' ')[0]} {''.join(misspell)} {addy.split(' ')[2]}"
            return(addy, badAddy, 1)
        
        else:
            for i in range(positions):
                #print(misspell)
                misspell.insert(random.randrange(1, len(misspell)), random.choice(string.ascii_letters.lower()))
            
            badAddy = f"{addy.split(' ')[0]} {''.join(misspell)} {addy.split(' ')[2]}"
            return (addy, badAddy, 1)
        
    else:
        # create similar/but differnt address
        streetName = addy.split(' ')[1]
        
        _prompt = f'Word that sounds like, looks like or could be confused with {streetName}'

        try:
        
            _diffStreet = generate_list(1, _prompt, top_k=16, temperature=0.9, top_p=0.8)
        
            #print(_diffStreet[0])
            
            differntAddy = generateAddy( _diffStreet[0])

            return (addy, differntAddy, 0)
        
        except:

            differntAddy = generateAddy('main')

            return (addy, differntAddy, 0)
       

In [17]:
stNames = seed_topics_b + seed_topics_b
print(len(stNames))

stNames[:10]

306


['cupcake',
 'principle',
 'toolbox',
 'wheel',
 'electric screwdriver',
 'burdens',
 'tablet',
 'bread',
 'broom',
 'owl']

In [45]:
# 0 ~ different addresses that look similar but should not be matched
# 1 ~ same addresses with some minimal typo that occured with same house number, street that should be fuzzy matched

for st in stNames[:10]:

    row =  syntheticTrainingdata(st , percentage=3)

    print(row)

('12892 abort lane', '2991 abort avenue', 0)
('9691 machine lane', '3404 main avenue', 0)
('11213 leaf highway', '14143 main way', 0)
('12312 coral reef way', '7334 for square', 0)
('12181 tree place', '12181 trexe place', 1)
('1900 hope square', '1900 hoe square', 1)
('9991 fruit street', '14276 fruit square', 0)
('3273 house avenue', '13061 horse avenue', 0)
('13499 knife parkway', '4173 main road', 0)
('7486 breeze street', '7486 bregesze street', 1)


In [33]:
import pandas as pd
# put into dataframe
df = pd.DataFrame()

for i, st in enumerate(stNames):
    res = {}
    row =  syntheticTrainingdata(st , percentage=3)
    
    res['good address'] = row[0]
    res['compare address'] = row[1]
    res['label'] = row[2]

    dfr = pd.DataFrame(res, index=[i])

    df = pd.concat([df, dfr])

In [37]:
df[:15]

Unnamed: 0,good address,compare address,label
0,14361 cupcake terrace,14361 cupofcake terrace,1
1,13823 principle terrace,2689 principal highway,0
2,908 toolbox street,1209 main highway,0
3,6785 wheel avenue,1809 main way,0
4,3944 electric screwdriver road,3944 elecc screwdriver,1
5,1791 burdens square,3228 burden place,0
6,10569 tablet highway,10569 tmablevt highway,1
7,5840 bread highway,3370 bread highway,0
8,753 broom street,753 blnroom street,1
9,12423 owl road,8294 main square,0


In [35]:
df.to_csv('/workspace/dli/3-Synthetic-Data-Generation/data.csv')

In [36]:
df.shape

(306, 3)