In [1]:
# Standard Python Modules
import pandas as pd
import numpy as np
import math
# Custom Python Modules
from everyone_anonymouse import something_nothing as sn

# Anonymized Data Generation Tutorial
The section below outlines the **something_nothing** class from the **everyone_anonymouse** Python module. This class is used to create and test FAKE PII datasets (these are NOT real people).

In [2]:
# Build the first-party FAKE PII dataset.

# `data` is the empty column framework handed to the generator to populate.
data = {column: [] for column in ('First_Name', 'Last_Name', 'Email')}

# Set up the generator with the framework and a size argument of 100000.
finn_corp = sn(data, 100000)
# Turn the generated FAKE PII records into a DataFrame.
df = pd.DataFrame(data=finn_corp.generate_data())
# Preview the first 5 rows of the DataFrame.
df.head()

Unnamed: 0,First_Name,Last_Name,Email
0,Palmer,Avian,PalmerAvian@gmail.com
1,Liam,Arielle,LiamArielle@gmail.com
2,Landen,Sammy,LandenSammy@gmail.com
3,Bristol,Alicia,BristolAlicia@gmail.com
4,Addyson,Shane,AddysonShane@gmail.com


In [3]:
# Hash the emails of the FAKE PII dataset.
# NOTE(review): the displayed values look like 64-bit signed integers; if they
# come from Python's built-in hash(), they change between kernel sessions — confirm.
finn_corp_hash = finn_corp.hash()

# Preview the first 5 hashed emails.
finn_corp_hash[:5]

[-3531285944219343872,
 -6768250205393769143,
 -4528607031030122088,
 -6101865616948794025,
 8888192416867525128]

In [4]:
# Match hashed email. The exercise below tests the something_nothing class's ability to match hashed values.

# Look up the email that corresponds to the first hash in finn_corp_hash.
matched_email = finn_corp.match(finn_corp_hash[0])
print('Email that is matched: '+str(matched_email)) # Print the results.

# Start from None so that a failed lookup is reported below instead of raising
# NameError (or silently reusing a stale `email_` left in the kernel by an earlier run).
email_ = None
# NOTE(review): generate_data() is called a second time here; if it regenerates
# records rather than returning the stored ones, this loop checks a different dataset — confirm.
for candidate in finn_corp.generate_data()['Email']: # Loop through the emails in the FAKE PII dataset.
    if candidate == matched_email: # Is the matched email present in the dataset?
        email_ = candidate # Store the email found in the dataset.
        break # One hit is enough; no need to scan the remaining rows.
print('Email match check: '+str(email_)) # Print the results (None when nothing was found).

# Does the hash **match** function work?
# Require an actual hit: without the None guard, a failed match() combined with
# an empty search result would compare None == None and wrongly report success.
if email_ is not None and matched_email == email_:
    result = 'yes!'
else:
    result = 'No. Get to work!'
print('')
print('Is everything working? The answer is '+str(result))
# The **Email that is matched** should be the same as the **Email match check**. If they don't match or
# None is returned, something is likely broken.



Email that is matched: LiamArielle@gmail.com
Email match check: LiamArielle@gmail.com

Is everything working? The answer is yes!


# Thought Experiment
When thinking of ways to anonymously identify users on the internet, one of the most important questions to answer is how scale will be impacted. The goal of this thought experiment is to model the hypothetical reach and frequency of targeting an anonymized audience.

## Tools
The **something_nothing** class (tutorial above) is used to generate the FAKE PII dataset and match hashed emails.

## General Scenario
A dataset containing PII from a large company called Finn Corp is hashed on the email address. The DSP uses these hashed emails to identify users who _log in_ to a publisher's website.
### Supply Side Business Questions
- What percent of users log in to a publisher website?
- Of those logged-in users, what percent of them are in our dataset?

### Buy Side Business Questions
- What is the frequency per hashed email over the lifetime of a 30 day digital campaign?
- How much is spent to reach the matched emails with a 4 dollar CPM?

In [5]:
# Supply-side business questions

# Inputs: the number of hashed emails plus the two assumed rates.
hashed_size = len(finn_corp_hash)
log_in_per = .50     # assumed share of users who log in to a publisher website
match_usr_per = .20  # assumed share of logged-in users found in our dataset

# What percent of users login to a publisher website?
print('Currently {}% of users login.'.format(log_in_per*100))

# Of those logged in users what percent of them are in our dataset?
print('Of those logged in users {}% of them are matched to our {} person dataset.'.format(match_usr_per*100, hashed_size))

# Matched users = login rate x match rate x dataset size, rounded down to a whole user.
num_usr = math.floor((log_in_per*match_usr_per)*hashed_size)
print('Number of matched users: {}'.format(num_usr))

# Treat the first num_usr hashed emails as the matched audience.
match_usr = finn_corp_hash[:num_usr]

Currently 50.0% of users log in.
Of those logged in users 20.0% of them are matched to our 100000 person dataset.
Number of matched users: 10000


In [6]:
# Buy-side business questions

# What is the frequency per hashed email over the lifetime of a 30 day digital campaign?
life_freq = 3  # assumed impressions served per matched user
impressions = num_usr*life_freq
print('Campaign achieves a lifetime frequency of {} and delivers {} impressions.'.format(life_freq, impressions))

# How much is spent to reach the matched emails with a 4 dollar CPM?
cpm = 4  # cost per 1000 impressions, in dollars
spend = cpm*(impressions/1000)  # true division, so spend is a float
print('Campaign spends ${} at a ${} CPM.'.format(spend, cpm))

Campaign achieves a lifetime frequency of 3 and delivers 30000 impressions.
Campaign spends $120.0 at a $4 CPM.
