# Tokenization


In [67]:
# create functions for generating and storing tokens
import random
from typing import Optional

class TokenDict:
    """In-memory store that maps randomly generated tokens to real values.

    Every distinct real value is assigned exactly one numeric token. Two
    dictionaries are kept in sync (token -> value and value -> token) so that
    lookups in both directions take constant time; real values must therefore
    be hashable (plain strings are the intended input).

        Typical usage example:
            token_dict = TokenDict()
            token = token_dict.tokenize_value('John')
            token_dict.retrieve_value(token)    # -> 'John'
            token_dict.retrieve_token('John')   # -> the same token

    """
    # Number of characters in a token and the alphabet they are drawn from.
    TOKEN_LENGTH = 16
    TOKEN_ALPHABET = "0123456789"

    def __init__(self):
        # Private (name-mangled) attributes.
        self.__token_dict = {}       # token -> real value
        self.__value_to_token = {}   # real value -> token (reverse index)

    def generate_token(self) -> str:
        """Return a new random token of TOKEN_LENGTH digits.

        The token is not checked for uniqueness here; `tokenize_value` does
        that before storing it.

        NOTE(review): `random` is not a cryptographically secure generator.
        Consider the `secrets` module if tokens must be unpredictable.
        """
        return "".join(
            random.choice(self.TOKEN_ALPHABET) for _ in range(self.TOKEN_LENGTH)
        )

    def tokenize_value(self, real_value: str) -> str:
        """Return the token for `real_value`, creating one if necessary.

        Args:
            real_value: The value to tokenize. Must be hashable.

        Returns:
            The existing token when `real_value` was tokenized before,
            otherwise a newly generated token that is now stored.
        """
        existing_token = self.__value_to_token.get(real_value)
        if existing_token is not None:
            # Already tokenized: reuse the token so the mapping stays one-to-one.
            return existing_token

        token = self.generate_token()
        # Regenerate on a collision with a stored token, or if the token
        # happens to be identical to the value it is supposed to hide.
        while token in self.__token_dict or token == real_value:
            token = self.generate_token()

        self.__token_dict[token] = real_value
        self.__value_to_token[real_value] = token
        return token

    def retrieve_value(self, token: str) -> str:
        """Return the real value stored for `token`.

        Raises:
            KeyError: If `token` is not in the dictionary.
        """
        return self.__token_dict[token]

    def retrieve_token(self, real_value: str) -> Optional[str]:
        """Return the token stored for `real_value`, or None if there is none."""
        return self.__value_to_token.get(real_value)



### Initialise token dictionary class and enter values into the dictionary

In [5]:
# Create a fresh token store and tokenize two sample names.
token_dict_1 = TokenDict()
token_1 = token_dict_1.tokenize_value('John')
token_2 = token_dict_1.tokenize_value('Jake')

# Show both tokens, one per line, in the order the values were tokenized.
print(
    'First Token: {}'.format(token_1),
    'Second Token: {}'.format(token_2),
    sep='\n',
)


First Token: 5860634447576034
Second Token: 2219624076008974


- An example of what the token dictionary holds after running the cell above (tokens are generated randomly, so your values will differ):

| Value       | Token           |
| ----------- | ----------------|
| John        | 0538130538135341|
| Jake        | 3489397003014856|

### Retrieve token from value and vice versa

In [8]:
# Token -> value lookup for the second entry.
second_value = token_dict_1.retrieve_value(token_2)
print('Retrieve second value from second token: {}'.format(second_value))

# Value -> token lookup for the first entry.
first_token = token_dict_1.retrieve_token('John')
print('Retrieve first token from first value: {}'.format(first_token))

Retrieve second value from second token: Jake
Retrieve first token from first value: 5860634447576034


## Exercise

#### Get data from data.gov.sg

Note: to perform the download, you need to raise Jupyter Notebook's IOPub data rate limit by restarting the notebook server with the command:

`jupyter notebook --NotebookApp.iopub_data_rate_limit=1.0e10`

In [36]:
import requests
import io
import pandas as pd

# Seconds to wait for data.gov.sg to connect/respond before giving up, so a
# stalled request cannot hang the kernel indefinitely.
REQUEST_TIMEOUT_SECONDS = 60
PACKAGE_URL = 'https://data.gov.sg/api/action/package_show?id=acra-information-on-corporate-entities'
RESOURCE_NAME = "ACRA Information on Corporate Entities ('X')"

# Fetch the package metadata; fail on an HTTP error status instead of trying
# to parse an error page as JSON.
package_response = requests.get(PACKAGE_URL, timeout=REQUEST_TIMEOUT_SECONDS)
package_response.raise_for_status()
api_response = package_response.json()

resources = api_response['result']['resources']

data_set_url = ''

# Find the download URL of the resource with the expected name.
for resource in resources:
    if resource['name'] == RESOURCE_NAME:
        data_set_url = resource['url']
        print('Data set URL:\n{}'.format(data_set_url))

if not data_set_url:
    # Without this check the download below would be attempted with an empty
    # URL and fail with an unrelated error.
    raise LookupError(
        'No resource named {!r} in the package metadata'.format(RESOURCE_NAME)
    )

# Download the CSV and load every column as a string, so values such as
# postal codes are kept exactly as they appear in the file.
csv_response = requests.get(data_set_url, timeout=REQUEST_TIMEOUT_SECONDS)
csv_response.raise_for_status()
acra_csv_data = io.StringIO(csv_response.content.decode('utf-8'))

acra_csv_df = pd.read_csv(acra_csv_data, dtype=str)

acra_csv_df.head()

Data set URL:
https://storage.data.gov.sg/acra-information-on-corporate-entities/resources/acra-information-on-corporate-entities-x-2021-12-13T02-37-28Z.csv


Unnamed: 0,business_constitution_description,primary_ssic_description,primary_user_described_activity,street_name,entity_status_description,annual_return_date,postal_code,paid_up_capital10_preference,entity_name,paid_up_capital2_others,...,paid_up_capital10_ordinary,paid_up_capital10_others,uen_of_audit_firm4,paid_up_capital7_others,uen_of_audit_firm5,entity_type_description,paid_up_capital3_ordinary,paid_up_capital5_ordinary,paid_up_capital9_preference,paid_up_capital1_ordinary
0,Sole Proprietor,PUBS,na,CIRCULAR ROAD,Cancelled,na,49392,na,X,na,...,na,na,na,na,na,Business,na,na,na,na
1,Partnership,WHOLESALE TRADE OF A VARIETY OF GOODS WITHOUT ...,na,SENJA ROAD,na,na,670616,na,X & B TRADING,na,...,na,na,na,na,na,Business,na,na,na,na
2,na,CAFES AND COFFEE HOUSES,SNACKS AND BEVERAGE KIOSK OR SHOP,ROBINSON ROAD,Struck Off,2013-09-25T18:54:24,48545,na,X & E PASSION PTE. LTD.,na,...,na,na,na,na,na,Local Company,na,na,na,100000
3,Sole Proprietor,RENTAL AND LEASING OF CARS WITH DRIVER (EXCLUD...,na,CHOA CHU KANG CRESCENT,Cancelled (Non-Renewal),na,682691,na,X & F SERVICES,na,...,na,na,na,na,na,Business,na,na,na,na
4,na,OTHER HOLDING COMPANIES,INVESTMENT,CECIL STREET,Struck Off,na,49705,na,X & H INTERNATIONAL PTE. LTD.,na,...,na,na,na,na,na,Local Company,na,na,na,1


#### Using the dataframe retrieved from data.gov.sg

```
1. Create an additional column, tokenized_postal_code, in the dataframe by applying the tokenize_value
method to the postal code data, as shown below
```

Example output:

| postal_code | tokenized_postal_code |
| ----------- | --------------------- |
| 049392      | 0538130538135341|
| 670616      | 3489397003014856|
| 048545      | 6452782013415833|

In [68]:
######################
####YOUR CODE HERE####
######################

# Fresh token store for the exercise; work on a copy so the downloaded frame
# stays untouched.
token_dict = TokenDict()
tokenised_df = acra_csv_df.copy()

# Tokenize each postal code element-wise; repeated postal codes receive the
# same token because tokenize_value reuses existing entries.
postal_codes = tokenised_df['postal_code']
tokenised_df['tokenized_postal_code'] = postal_codes.map(token_dict.tokenize_value)

# Preview the original and tokenized columns side by side.
tokenised_df.loc[:, ['postal_code', 'tokenized_postal_code']].head()

{'9057260758655994': '049392', '3525674300230380': '670616', '9935156876299481': '048545', '9850262299488759': '682691', '1658853590117755': '049705', '4512846446343969': '039594', '4291492366331456': '760757', '4732301866859955': '470149', '7869578734298180': '573969', '4335643244666753': '228149', '3031506454886644': '609601', '2784289584370208': '823268', '2180752210116410': '168732', '9508866917536873': '238841', '3510475665119744': '409051', '6637440383227885': '608586', '4854566428341602': '079903'}


ValueError: value is already in token dictionary

```
2. Print out the entity_name, postal_code and tokenized_postal_code columns for the rows whose postal_code is 048545
```

In [63]:
# Boolean mask marking the rows with the postal code of interest.
condition = tokenised_df['postal_code'].eq('048545')

# Select the matching rows and the three requested columns in one step.
tokenised_df.loc[condition, ['entity_name', 'postal_code', 'tokenized_postal_code']]

Unnamed: 0,entity_name,postal_code,tokenized_postal_code
2,X & E PASSION PTE. LTD.,48545,8272564930999218
485,X-LOGISTICS,48545,8272564930999218
599,X-RITE ASIA PACIFIC PTE LTD,48545,8272564930999218
1106,XAT SOLUTIONS PTE. LTD.,48545,8272564930999218
1580,XCLUSIVEMUM,48545,8272564930999218
1783,XELLERZ & XALES,48545,8272564930999218
3158,XIANG YUN LAMPS TRADING,48545,8272564930999218
3584,XIAOYUN LLP,48545,8272564930999218
5326,XIN YAO GOLDEN SANDS ASSET MANAGEMENT PTE. LTD.,48545,8272564930999218
5517,XIN YUAN INTERNATIONAL INVESTMENT PTE. LTD.,48545,8272564930999218


## End of Exercise