# Overlap Thought Experiment
A 100,000-user dataset with PII from a company called Finn Corp is compared to a 500,000-user dataset with PII that also includes demographic data. The goal of this overlap analysis is, first, to measure the percent overlap between the two datasets and, second, to describe the demographics of the matched users.

## Business Questions
- What is the percent overlap between the 2 datasets?
- What is the demographic information of the matched dataset?

In [1]:
# Import Modules
import pandas as pd
import numpy as np
from everyone_anonymous import something_nothing as sn
from everyone_anonymous import add_something as asome

# Set parameters
# NOTE(review): the introduction describes 100,000 and 500,000 users; the sizes
# below are far smaller -- presumably to keep the demo quick. Confirm.
finn_corp_size = 10   # number of Finn Corp users to generate
demo_data_size = 20   # total number of users in the demographic dataset
overlap_per = .50     # fraction of Finn Corp users that also appear in the demo dataset

if not 0 <= overlap_per <= 1:
    raise ValueError(f"overlap_per must be between 0 and 1, got {overlap_per}")

# round() instead of int(): int() truncates, so a product that lands just below
# a whole number because of float error (0.29 * 100 == 28.999999999999996)
# would silently drop one user.
overlap = round(overlap_per * finn_corp_size)

if overlap > demo_data_size:
    raise ValueError(
        f"overlap ({overlap}) cannot exceed demo_data_size ({demo_data_size})"
    )

# Users that exist only in the demographic dataset.
non_overlap = demo_data_size - overlap

In [2]:
# Create Finn Corp Datasets
# Empty column template: one independent list per PII field.
finn_corp_obj = {field: [] for field in ('First_Name', 'Last_Name', 'Email')}

# NOTE(review): `sn` comes from everyone_anonymous and is not shown here;
# presumably generate_data() yields `finn_corp_size` synthetic rows -- confirm.
finn_corp = sn(nrows=finn_corp_size, data=finn_corp_obj).generate_data()
finn_corp_df = pd.DataFrame(data=finn_corp)
finn_corp_df.head()

Unnamed: 0,First_Name,Last_Name,Email
0,Leilani,Jamaria,LeilaniJamaria@gmail.com
1,Avion,Kalaya,AvionKalaya@gmail.com
2,Chloe,Kayla,ChloeKayla@gmail.com
3,Cadyn,Shea,CadynShea@gmail.com
4,Liliana,Hope,LilianaHope@gmail.com


In [3]:
# Create Overlapped Demo Dataset
# Seed the demo data with the first `overlap` Finn Corp users so both datasets
# share those records. The values are read from `finn_corp` (what the generator
# returned) rather than from the `finn_corp_obj` template, so this cell does not
# rely on generate_data() having filled the template dict in place.
# list(...) gives every column its own plain list, independent of `finn_corp`.
overlap_demo_obj = {
    column: list(finn_corp[column][:overlap])
    for column in ('First_Name', 'Last_Name', 'Email')
}

# NOTE(review): `asome` is not visible here; judging by how the result is used
# later, add_demo_data() adds Gender, Age and HHI columns -- confirm.
demo_data_overlap = asome(data=overlap_demo_obj, nrows=overlap).add_demo_data()
demo_data_df = pd.DataFrame(demo_data_overlap)
demo_data_df.head()

Unnamed: 0,First_Name,Last_Name,Email,Gender,Age,HHI
0,Leilani,Jamaria,LeilaniJamaria@gmail.com,Female,74,"\$200,001 - $500,000"
1,Avion,Kalaya,AvionKalaya@gmail.com,Male,69,"\$200,001 - $500,000"
2,Chloe,Kayla,ChloeKayla@gmail.com,Male,54,"\$50,001 - $100,000"
3,Cadyn,Shea,CadynShea@gmail.com,Female,64,"More than $500,000"
4,Liliana,Hope,LilianaHope@gmail.com,Female,60,"\$200,001 - $500,000"


In [4]:
# Create Non-overlapped Demo Dataset
# Column template for users that exist only in the demo data: the PII fields
# plus the demographic fields, each starting as its own empty list.
nonoverlap_demo_obj = {
    field: []
    for field in ('First_Name', 'Last_Name', 'Email', 'Age', 'Gender', 'HHI')
}

# NOTE(review): presumably generates `non_overlap` brand-new synthetic users;
# the helper is defined outside this notebook -- confirm.
demo_data_nooverlap = sn(nrows=non_overlap, data=nonoverlap_demo_obj).generate_data()
demo_data_nooverlap_df = pd.DataFrame(data=demo_data_nooverlap)

Unnamed: 0,First_Name,Last_Name,Email,Age,Gender,HHI
0,Clark,Ximena,ClarkXimena@gmail.com,57,Female,"\$100,001 - $200,000"
1,Kathleen,Braeden,KathleenBraeden@gmail.com,19,Female,"\$200,001 - $500,000"
2,Jakai,Kristian,JakaiKristian@gmail.com,61,Female,"\$100,001 - $200,000"
3,Phillip,Theodore,PhillipTheodore@gmail.com,43,Male,"Less than $50,000"
4,Shea,Milana,SheaMilana@gmail.com,56,Male,"\$200,001 - $500,000"
5,Taylor,Avamarie,TaylorAvamarie@gmail.com,65,Female,"More than $500,000"
6,Kooper,Kendahl,KooperKendahl@gmail.com,32,Male,"More than $500,000"
7,Alexavier,Mylee,AlexavierMylee@gmail.com,47,Male,"More than $500,000"
8,America,Grayson,AmericaGrayson@gmail.com,76,Male,"Less than $50,000"
9,Isabella,Kendall,IsabellaKendall@gmail.com,38,Male,"More than $500,000"


In [5]:
# Create full demo data set.
# Build a NEW column dict (overlap rows first, then non-overlap rows) instead of
# appending into `demo_data_overlap`. Appending in place made this cell
# non-idempotent: every re-run added the non-overlap rows again, and the
# overlap-only data was lost because `demo_data` was just an alias of it.
# Iterating over `demo_data_overlap` keeps its column order in the result.
demo_data = {
    column: list(demo_data_overlap[column]) + list(demo_data_nooverlap[column])
    for column in demo_data_overlap
}

demo_data_df = pd.DataFrame(demo_data)
demo_data_df.head(20)

Unnamed: 0,First_Name,Last_Name,Email,Gender,Age,HHI
0,Leilani,Jamaria,LeilaniJamaria@gmail.com,Female,74,"\$200,001 - $500,000"
1,Avion,Kalaya,AvionKalaya@gmail.com,Male,69,"\$200,001 - $500,000"
2,Chloe,Kayla,ChloeKayla@gmail.com,Male,54,"\$50,001 - $100,000"
3,Cadyn,Shea,CadynShea@gmail.com,Female,64,"More than $500,000"
4,Liliana,Hope,LilianaHope@gmail.com,Female,60,"\$200,001 - $500,000"
5,Clark,Ximena,ClarkXimena@gmail.com,Female,57,"\$100,001 - $200,000"
6,Kathleen,Braeden,KathleenBraeden@gmail.com,Female,19,"\$200,001 - $500,000"
7,Jakai,Kristian,JakaiKristian@gmail.com,Female,61,"\$100,001 - $200,000"
8,Phillip,Theodore,PhillipTheodore@gmail.com,Male,43,"Less than $50,000"
9,Shea,Milana,SheaMilana@gmail.com,Male,56,"\$200,001 - $500,000"
