In [1]:
!pip install names

Defaulting to user installation because normal site-packages is not writeable
Collecting names
  Downloading names-0.3.0.tar.gz (789 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m789.1/789.1 KB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m MB/s[0m eta [36m0:00:01[0m:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
[?25hBuilding wheels for collected packages: names
  Building wheel for names (setup.py) ... [?25ldone
[?25h  Created wheel for names: filename=names-0.3.0-py3-none-any.whl size=803682 sha256=adbe714772b3f50c39e3fb2a3ec9f1491ade046e10077eb4aa7cf542125161b3
  Stored in directory: /home/pori/.cache/pip/wheels/fc/9a/6f/78f4282bbcaa2d8c678b73c54c0bb1b7a04009f0d7cec79fce
Successfully built names
Installing collected packages: names
Successfully installed names-0.3.0


In [2]:
import pandas as pd
import names as nm
import random

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


## Synthetic dataset

In [3]:
def get_names(is_first_name, amount):
  """Collect `amount` distinct names, in the order they were first drawn.

  First names are drawn when `is_first_name` is truthy, last names otherwise.
  Drawing continues until enough distinct values have been seen, so repeated
  draws are simply discarded.
  """
  unique = {}  # dict keys serve as an insertion-ordered set
  while len(unique) < amount:
    candidate = nm.get_first_name() if is_first_name else nm.get_last_name()
    unique.setdefault(candidate)  # no-op when the name was already collected
  return list(unique)
   
# Seed Python's global RNG before the first random draw so that the random.choice /
# random.randint calls further down produce the same synthetic dataset on every
# Restart & Run All.
# NOTE(review): the `names` package is assumed to draw from this same global RNG — confirm.
random.seed(42)

# Pools of distinct names that people are later assembled from.
first_names = get_names(True, 100)
last_names = get_names(False, 400)
first_names[:5], last_names[:5]

(['Mitchell', 'Irene', 'June', 'Robert', 'Lois'],
 ['Vincent', 'Jentzsch', 'Diggs', 'Straley', 'Oliver'])

In [4]:
# Pool of countries that random.choice samples from. Each country must appear exactly
# once, otherwise it is over-represented in the generated data ('Ireland' was
# previously listed twice).
all_jurisdictions = [
  'Australia', 'Belgium', 'Brazil', 'Bulgaria', 'Cyprus', 'Czech Republic',
  'Guatemala', 'Mexico', 'Honduras', 'Costa Rica', 'Colombia', 'Greece', 'Hungary',
  'India', 'Indonesia', 'Ireland', 'Italy', 'Syria', 'Japan', 'Latvia', 'Lithuania',
  'Luxembourg', 'Malta', 'Jamaica', 'Turkey', 'United Kingdom',
  'United States',
]

In [5]:
def generate_people_dataset(num):
  """Build a DataFrame with `num` randomly assembled people.

  Each row independently picks a first name, a last name and a country
  (in that order) from the module-level pools `first_names`, `last_names`
  and `all_jurisdictions`.
  """
  def draw_person():
    # Tuple elements are evaluated left to right, which fixes the RNG call order.
    return (
      random.choice(first_names),
      random.choice(last_names),
      random.choice(all_jurisdictions),
    )

  people = [draw_person() for _ in range(num)]
  return pd.DataFrame(people, columns=['first_name', 'last_name', 'country'])

# Generate 500 synthetic people; the bare last expression renders the frame as the cell output.
df_people = generate_people_dataset(500)
df_people

Unnamed: 0,first_name,last_name,country
0,Andy,Jones,Czech Republic
1,Herman,Barnes,Syria
2,Randy,Carruthers,Indonesia
3,Brooke,Worthington,Luxembourg
4,Marion,Lewis,Brazil
...,...,...,...
495,Shannon,Harris,Ireland
496,Cory,Stevenson,Japan
497,Bonnie,Wirkkala,Costa Rica
498,Gregory,Lundy,Lithuania


## Creating features for KYC synthetic dataset

In [6]:
# Watch-list of surnames used to flag people who might be relatives of known money
# launderers (laundering can be routed through family members).
# For this synthetic dataset the list is simply the first 30 generated last names.
famous_money_laundering_lastnames = last_names[:30]
famous_money_laundering_lastnames

['Vincent',
 'Jentzsch',
 'Diggs',
 'Straley',
 'Oliver',
 'Glass',
 'Eberts',
 'Brown',
 'Cox',
 'Farmer',
 'Bono',
 'Fields',
 'Mathewson',
 'Hill',
 'Johnson',
 'Smith',
 'Enders',
 'Tate',
 'Kropf',
 'Holland',
 'Davis',
 'Kang',
 'Vega',
 'Pino',
 'Ashby',
 'Dickinson',
 'Steele',
 'Rader',
 'Thompson',
 'Ragsdale']

In [7]:
# Example list of jurisdictions that are prohibited for a specific event
# (an arbitrary choice for this demo).
prohibited_jurisdictions = ['United States', 'Syria']

In [8]:
def random_bool_with(percentage):
  """Return True with a probability of `percentage` percent.

  `percentage` is expected to lie in 0..100: 0 never yields True, 100 always does.
  """
  # randint(1, 100) is uniform over 100 values, so `<=` lets exactly `percentage` of
  # them succeed. A strict `<` would accept only `percentage - 1` values, so 100 could
  # still fail and 1 could never succeed.
  return random.randint(1, 100) <= percentage

def generate_feature_dataset():
  """Derive KYC features and a simulated `pass_kyc` label for every row of `df_people`.

  Returns a DataFrame with one row per person and the columns
  `possible_relative_for_money_laundering`, `is_prohibited_jurisdiction`,
  `with_vpn`, `sending_time_sec`, `photo_quality` and `pass_kyc`.

  Each suspicious signal multiplies in a random "survives this check" roll via `&=`.
  The roll is always drawn, even when the person has already failed, so the sequence
  of random calls does not depend on earlier outcomes.
  """
  records = []
  for _, person in df_people.iterrows():
    approved = True

    # Sharing a surname with a known money launderer hints at a possible relative.
    shares_surname = person['last_name'] in famous_money_laundering_lastnames
    if shares_surname:
      approved &= random_bool_with(percentage=60)

    # Coming from a jurisdiction prohibited for this event is an unconditional failure.
    banned_country = person['country'] in prohibited_jurisdictions
    if banned_country:
      approved = False

    # Hiding the true jurisdiction behind a VPN is suspicious.
    uses_vpn = random_bool_with(percentage=20)
    if uses_vpn:
      approved &= random_bool_with(percentage=85)

    # Very slow submissions are suspicious; near-instant ones look like a bot/script.
    seconds_taken = random.randint(0, 200)
    if seconds_taken > 160:
      approved &= random_bool_with(percentage=70)
    if seconds_taken < 5:
      approved &= random_bool_with(percentage=10)

    # A perfect-quality document photo is likely a pre-prepared image fed
    # through a virtual camera.
    quality = random.randint(0, 10)
    if quality == 10:
      approved &= random_bool_with(percentage=10)

    records.append({
      'possible_relative_for_money_laundering': shares_surname,
      'is_prohibited_jurisdiction': banned_country,
      'with_vpn': uses_vpn,
      'sending_time_sec': seconds_taken,
      'photo_quality': quality,
      'pass_kyc': approved,
    })
  return pd.DataFrame(records)

# Build the feature table (features plus the simulated pass_kyc label) and display it.
df = generate_feature_dataset()
df

Unnamed: 0,possible_relative_for_money_laundering,is_prohibited_jurisdiction,with_vpn,sending_time_sec,photo_quality,pass_kyc
0,False,False,False,132,10,False
1,False,True,False,100,3,False
2,False,False,False,171,0,False
3,False,False,False,162,0,True
4,False,False,False,111,3,True
...,...,...,...,...,...,...
495,False,False,True,16,1,True
496,False,False,False,196,7,True
497,False,False,False,117,0,True
498,False,False,False,172,2,True


## Train a model (DecisionTreeClassifier) on features from our synthetic dataset

In [9]:
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

# Label to predict: whether the person passed KYC.
y = df['pass_kyc']
# Every remaining column is a model input.
X = df.drop(columns=['pass_kyc'])
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
len(x_train), len(x_test)

(400, 100)

In [10]:
# Fix the seed: scikit-learn randomly permutes the candidate features at every split,
# so without random_state two fits on the same data may yield different trees (and
# therefore different generated Leo code). 42 matches the seed used for the split.
clf = tree.DecisionTreeClassifier(random_state=42)
clf = clf.fit(x_train, y_train)

In [11]:
# Score the fitted tree on the held-out rows.
y_pred = clf.predict(x_test)
print("f1-score:", f1_score(y_test, y_pred))

f1-score: 0.8671328671328671


## Converting the DecisionTree model to the Leo programming language

In [12]:
import numpy as np
import math
from sklearn import tree

def dt_to_leo_code(clf: tree.DecisionTreeClassifier, program_name: str):
  """Render a fitted decision tree as the source text of a Leo program.

  The generated program declares one `u32` parameter per input feature
  (`p1` .. `pN`, numbered from 1) and a single `main` transition made of nested
  `if`/`else` statements mirroring the tree. Each leaf returns the index of the
  class with the largest entry in that node's value array.

  Args:
    clf: fitted classifier; its `tree_` arrays and `n_features_in_` are read.
    program_name: name placed after the `program` keyword (e.g. "dt.aleo").

  Returns:
    The Leo source code as a single string.

  NOTE(review): thresholds are rounded up with math.ceil and emitted as u32
  literals, which presumably relies on every feature being a non-negative
  integer — confirm before using this with other feature types.
  """
  fitted = clf.tree_
  left_child = fitted.children_left
  right_child = fitted.children_right
  split_feature = fitted.feature
  split_threshold = fitted.threshold
  predicted_class = [np.argmax(distribution[0]) for distribution in fitted.value]

  def emit(node, depth):
    """Return the Leo statements for the subtree rooted at `node`, indented `depth` tabs."""
    pad = "\t" * depth
    # A node whose two child ids are equal has no split, i.e. it is a leaf.
    if left_child[node] == right_child[node]:
      return pad + f"return {predicted_class[node]}u32;\n"

    cut = split_threshold[node]
    bound = math.ceil(cut)
    # Fractional threshold: compare strictly against the rounded-up bound;
    # whole-number threshold: keep an inclusive comparison.
    operator = "<=" if int(cut) == cut else "<"
    pieces = [
      pad + f"if (p{(split_feature[node] + 1)} {operator} {bound}u32) {{\n",
      emit(left_child[node], depth + 1),
      pad + "} else {\n",
      emit(right_child[node], depth + 1),
      pad + "}\n",
    ]
    return "".join(pieces)

  feature_count = clf.n_features_in_
  signature = ", ".join(f"p{k}: u32" for k in range(1, feature_count + 1))
  if feature_count >= 1:
    # The closing of the parameter list is attached to the last parameter.
    signature += ") -> public u32 {\n"

  header = (
    f"program {program_name} {{\n"
    + "\t" + "// Code auto generated from DecisionTreeClassifier using dt_to_leo_code.py \n"
    + "\t" + "transition main("
  )
  # The tree body sits two levels deep: inside `program { ... }` and `transition main(...) { ... }`.
  body = emit(0, 2)
  return header + signature + body + "\t}\n}"

# Print the generated Leo source for the fitted tree under the program name "dt.aleo".
print(dt_to_leo_code(clf, "dt.aleo"))

program dt.aleo {
	// Code auto generated from DecisionTreeClassifier using dt_to_leo_code.py 
	transition main(p1: u32, p2: u32, p3: u32, p4: u32, p5: u32) -> public u32 {
		if (p5 < 10u32) {
			if (p2 < 1u32) {
				if (p4 < 174u32) {
					if (p4 < 5u32) {
						return 0u32;
					} else {
						if (p1 < 1u32) {
							if (p3 < 1u32) {
								if (p4 < 170u32) {
									return 1u32;
								} else {
									if (p5 <= 1u32) {
										return 0u32;
									} else {
										return 1u32;
									}
								}
							} else {
								if (p4 <= 160u32) {
									if (p4 < 61u32) {
										if (p4 < 58u32) {
											if (p4 < 24u32) {
												return 1u32;
											} else {
												if (p4 < 47u32) {
													if (p5 <= 8u32) {
														return 0u32;
													} else {
														return 1u32;
													}
												} else {
													return 1u32;
												}
											}
										} else {
											return 0u32;
										}
									} else {
										if