### Imports:

In [9]:
from flask import Flask, send_from_directory, url_for, jsonify
import os
import numpy as np
import pandas as pd
from flask import Flask, flash, request, redirect, render_template
from werkzeug.utils import secure_filename
# from anonymizer import Anonymizer
# from demographics_anon import *


import spacy
spacy.prefer_gpu()
nlp = spacy.load("en_core_web_sm")


from faker import Faker
from anonymization import Anonymization, AnonymizerChain, PhoneNumberAnonymizer, msisdnAnonymizer, NamedEntitiesAnonymizer,FilePathAnonymizer, EmailAnonymizer, UriAnonymizer,MacAddressAnonymizer,Ipv4Anonymizer,Ipv6Anonymizer

import string
import random
from faker import Factory
from random import randint
import datetime

### Anonymizer 
    Structured - Getting Fake Names, Cities, Countries, Adresses & URIs
    Unstructured - get_anonymize_text - Queries AnonymizerChain 

In [20]:
class Anonymizer:
    def __init__(self):
        self.faker = Faker()


    def fake_name_generator(self, n):
        fake_names = []
        fake_first_names = []
        fake_second_names = []
        for i in range(0, n):
            name = self.faker.name()
            fake_names.append(name)
            fake_first_names.append(name.split()[0])
            if len(name.split())>1:
                fake_second_names.append(name.split()[1])
            else:
                fake_second_names.append('')
        return fake_names,fake_first_names,fake_second_names


    def get_fake_cities(self,n):
        fake_cities = []
        for i in range(0, n):
            fake_cities.append(self.faker.city())
        return fake_cities


    def get_fake_countries(self,n):
        fake_countries = []
        for i in range(0,n):
             fake_countries.append(self.faker.country())
        return fake_countries


    def get_fake_addresses(self,n):
        fake_addresses = []
        for i in range(0,n):
            fake_addresses.append(self.faker.address().split('\n')[0])
        return fake_addresses


    def get_fake_uris(self,n):
        fake_uris = []
        for i in range(0,n):
            fake_uris.append(self.faker.uri())
        return fake_uris

    def get_anonymize_text(self,query):
        spacy.load('en_core_web_sm')
        anon = AnonymizerChain(Anonymization('en'))
        anon.add_anonymizers(FilePathAnonymizer,\
                            EmailAnonymizer, UriAnonymizer,MacAddressAnonymizer,Ipv4Anonymizer, Ipv6Anonymizer,\
                            NamedEntitiesAnonymizer('en'))
        anonymizedText = anon.anonymize(query)
        return anonymizedText


### Fake details:
    Names, Middle initials, city, ethnicity, sate zip, preferred communication, marital status, Gender Suffix (Mr. Mrs etc)
    ID, Address, Driving License, SSN, Deceased flag (based on DOB) -gives DOD & DOB, Get Phone #
    generate_records - Generates fake records for non-Indian race with Indian names
    create_call_record - Create a random call record for the same id
    is_excluded_location - check if original location need to be excluded from anonymizing
    get_anon_data - Replace the original rows with randomly generated data using generate_records function.

In [3]:
fake = Factory.create()

def get_indian_males():
    gender_names = []
    with open('indian_male.txt') as f:
        names = f.readlines()
    for name in names:
        gender_names.append(name.replace('\n', ''))
    return gender_names

def get_indian_females():
    gender_names = []
    with open('indian_female.txt') as f:
        names = f.readlines()
    for name in names:
        gender_names.append(name.replace('\n', ''))
    return gender_names

indian_males = get_indian_males()
indian_females = get_indian_females()

def get_indian_name(gender, indian_males, indian_females):
    if gender.lower() == 'm':
        return random.choice(indian_males)
    else:
        return random.choice(indian_females)

def get_middle_initials():
    return random.choice(string.ascii_letters).upper()

def get_city(address):
    if '\n' in address and ',' in address:
        return address.split('\n')[1].split(',')[0]
    return ''

def get_ethnicity():
    ethnicities = ['Asian', 'Latino', 'Hispanic', 'Indian', 'African', 'American', 'White', 'Pacific', 'Islander', 'Alaska', 'Native']
    return random.choice(ethnicities)

def get_state_zip(address):
    if '\n' in address and ',' in address and 'APO' not in address:
        if len(address.split('\n')[1].split())>2:
           return address.split('\n')[1].split(',')[1].strip().split()[0], address.split('\n')[1].split(',')[1].strip().split()[1]
        return '', ''
    return '', ''

def get_preferred_communication():
    comm = ['Cell Phone', 'Land Line', 'Email', 'Mail']
    return random.choice(comm)

def get_maritalstatus():
    maritalstatus = ['Married', 'Single', 'Divorced', 'Widowed']
    return random.choice(maritalstatus)

def get_suffix(gender):
    male_suffix = ['Dr.', 'Mr.']
    female_suffix = ['Dr.', 'Mrs.', 'Miss']
    if gender == 'M':
        return random.choice(male_suffix)
    elif gender == 'F':
        return random.choice(female_suffix)
    else:
        return ''

def get_id():
    return str(randint(100000, 999999))

def get_address_line1(address):
    return address.split('\n')[0]

def get_driverlicense():
    return random.choice(string.ascii_letters).upper() + str(randint(1000000, 9999999))

def get_ssn():
    return str(randint(100, 999)) + "-" + str(randint(10, 99)) + "-" + str(randint(1000, 9999))

def get_deceased_flag(dob):
    deceasedyear = [2018, 2017, 2016]
    if dob.day<28:
        death_date = datetime.date(year = random.choice(deceasedyear), month = dob.month, day = dob.day)
    else:
        death_date = datetime.date(year=random.choice(deceasedyear), month=dob.month, day=28)
    deceasedstatus = ['Y', 'N', 'N', 'N', 'N', 'N']
    deceased_flag = random.choice(deceasedstatus)
    if deceased_flag == 'Y':
        if dob.day < 28:
            dob = datetime.date(dob.year - 5, dob.month, dob.day)
        else:
            dob = datetime.date(dob.year - 5, dob.month, 25)
        return deceased_flag, death_date, dob
    return deceased_flag, '', dob

def gen_phone():
    first = str(random.randint(100,999))
    second = str(random.randint(1,888)).zfill(3)
    last = (str(random.randint(1,9998)).zfill(4))
    while last in ['1111','2222','3333','4444','5555','6666','7777','8888']:
        last = (str(random.randint(1,9998)).zfill(4))
    return '{}-{}-{}'.format(first,second, last)

def generate_records(gender, race_type = 'non-indian'):
    person = fake.simple_profile(sex=gender)
    record = ''
    id = get_id()
    city = ''
    state = ''
    zip = ''
    if ' AE ' not in person['address']:
        email = person['mail']
        middle_name = get_middle_initials()
        if race_type == 'indian':
            name = get_indian_name(gender, indian_males, indian_females)
            print("name:", name)
            first_name = name.split()[0]
            last_name = name.split()[1]
        else:
            name = person['name']
            first_name = person['name'].split()[0]
            last_name = ""
        sex = person['sex']
        city = get_city(person['address'])
        state, zip = get_state_zip(person['address'])
        country = "USA"
        if len(name.split())==2:
            name = ' '.join([name.split()[0], middle_name, name.split()[1]])
        address_line1 = get_address_line1(person['address'])
        dob = person['birthdate']
        driverlicense = get_driverlicense()
        martial_status = get_maritalstatus()
        suffix = get_suffix(sex)
        phone_cell = gen_phone()
        phone_fax = gen_phone()
        phone_home = gen_phone()
        ssn = get_ssn()
        preferred_communication = get_preferred_communication()
        ethnicity = get_ethnicity()
        if len(person['name'].split()) > 1 and race_type != 'indian':
            last_name = person['name'].split()[len(person['name'].split()) - 1]

        deceased_flag, death_date, dob = get_deceased_flag(dob)
        if state != '':
            record = id + "|" + name + "|" + first_name + "|" + middle_name + "|" + last_name + "|" + suffix + "|" + martial_status + "|" + preferred_communication + "|" + ethnicity + "|" + email+ "|" + sex + "|"+ address_line1 + "|" + city + "|" + state +"|" + zip + "|" + country + "|" + driverlicense + "|" + str(dob) + "|" + deceased_flag + "|" + str(death_date) + "|" + phone_cell + "|" + phone_fax + "|" + phone_home + "|" + ssn
    return record, city, state, zip

def create_call_record(id):
    attempted_datetime_random = random.choice(attempted_datetime)
    attempted_datetime_random = attempted_datetime_random.replace('2019', random.choice(['2018', '2019']))
    attempted_datetime_day = attempted_datetime_random[attempted_datetime_random.rfind('-'):attempted_datetime_random.index(' ')]
    attempted_datetime_random = attempted_datetime_random.replace(attempted_datetime_day, random.choice(['-01', '-02', '-03', '-04', '-05', '-06', '-07', '-08', '-09', '-10', '-11', '-12']))
    attempted_datetime_sec = attempted_datetime_random[attempted_datetime_random.rfind(':'):]
    attempted_datetime_random = attempted_datetime_random.replace(attempted_datetime_sec, ":" + str(random.choice(range(10, 59))))
    phone_source_random = random.choice(phone_source)
    call_result_random = random.choice(call_result)
    phone_type_random = random.choice(phone_type)
    record = str(id) + "|" + phone_type_random + "|" + phone_source_random + "|" + call_result_random + "|" + str(attempted_datetime_random)
    return record

def is_excluded_location(locations, city, state, zip):
    if city.lower() in locations or state.lower() in locations or zip.lower() in locations:
        return True
    else:
        return False

def get_anon_data(rows, male_per = 0.5, exclude_locations = [], race_type = 'non-indian'):
    header = "id|name|first_name|middle_name|last_name|suffix|martial_status|preferred_communication|ethnicity|email|sex|address_line1|city|state|zip|country|driverlicense|dob|deceased_flag|death_date|phone_cell|phone_fax|phone_home|ssn"
    ids_generated = []
    male_rows = (int)(male_per * rows)
    female_rows = rows - male_rows
    records = {}
    count = 0
    data = []
    while (count != male_rows):
        male_record, city, state, zip = generate_records('M', race_type)
        if male_record != '' and not is_excluded_location(exclude_locations, city, state, zip):
            data.append(male_record)
            count = count + 1
    records["male_n"]  = count
    count = 0
    while (count != female_rows):
        female_record, city, state, zip = generate_records('F', race_type)
        if female_record != '' and not is_excluded_location(exclude_locations, city, state, zip):
            data.append(female_record)
            count = count + 1
    records["female_n"] = count
    records["data"] = data
    records["delimiter"] = "|"
    records["exclusion"] = exclude_locations
    return records

### Application:
    On the form - 
        Upload data -allowed extensions
        Anonymize - Fake all the records
    
    

In [29]:
UPLOAD_FOLDER = os.getcwd()

app = Flask(__name__, static_folder='static', static_url_path='/static')
app.secret_key = "secret key"
app.config['UPLOAD_FOLDER'] = UPLOAD_FOLDER
app.config['TEMPLATES_AUTO_RELOAD'] = True

filepath = ""

ALLOWED_EXTENSIONS = set(['csv'])

def allowed_file(filename):
    return '.' in filename and filename.rsplit('.', 1)[1].lower() in ALLOWED_EXTENSIONS


@app.route('/index_full')
def index_full():
    return render_template('index_full.html')

@app.route('/get_data')
def get_data():
    rows = request.args.get('n')
    male_percentage = request.args.get('m')
    exclude_locations = request.args.get('exclude_loc')
    print(exclude_locations)
    records = get_anon_data(int(rows), float(male_percentage), exclude_locations, 'non-indian')
    return records

@app.route('/get_indian_data')
def get_indian_data():
    rows = request.args.get('n')
    male_percentage = request.args.get('m')
    exclude_locations = request.args.get('exclude_loc')
    print(exclude_locations)
    records = get_anon_data(int(rows), float(male_percentage), exclude_locations, 'indian')
    return records

@app.route('/get_modified_data', methods=['POST'])
def get_modified_data():
    if request.method == "POST":
        rows = request.form['rows']
        male_percentage = request.form['percent']
        exclude_loc = request.form['exclude_loc']
        records = get_anon_data(int(rows), float(male_percentage), exclude_loc, 'non-indian')
        df = pd.DataFrame.from_dict(records['data'])
        df.columns = ['id|name|first_name|middle_name|last_name|suffix|martial_status|preferred_communication|ethnicity|email|sex|address_line1|city|state|zip|country|driverlicense|dob|deceased_flag|death_date|phone_cell|phone_fax|phone_home|ssn']
        df.to_csv('static/generated_data_records.csv', index = None, header=True)
        link = "<a href='static/generated_data_records.csv' class='link_dwn' download='generated_data_records.csv'>Save File</a>"
        return jsonify({"status":"success","response":link})

    else:
        return "Kindly do post request"



@app.route('/')
def index():
    return render_template('index.html')


@app.route('/', methods=['POST'])
def main():
    if request.method == 'POST':
        if request.form['submit'] == 'Upload':
            return upload_file()
        elif request.form['submit'] == 'Anonymize':
            return anonymize()
        elif request.form['submit'] == 'Generate Data':
            return generate_data()
    else:
        return "main"


def upload_file():
    if 'file' not in request.files:
        flash('No file part')
        return redirect(request.url)
    file = request.files['file']
    if file.filename == '':
        flash('No file selected for uploading')
        return redirect(request.url)
    if file and allowed_file(file.filename):
        filename = secure_filename(file.filename)
        file.save(os.path.join(app.config['UPLOAD_FOLDER'], filename))
        global filepath
        filepath = os.path.join(app.config['UPLOAD_FOLDER'], filename)
        print(filepath)
        flash('File successfully uploaded:' + filepath)
        data = pd.read_csv(filepath, sep='|')
        headers = list(data)
        return render_template('index.html',dropdown = headers)
    else:
        flash('Allowed file type is csv')
        return redirect(request.url)


def anonymize():
    try:
        global filepath
        data = pd.read_csv(filepath,sep='|')
        rows, c = data.shape
        selected_name = request.form.get('options_name')
        selected_city = request.form.get('options_city')
        selected_country = request.form.get('options_country')
        selected_address = request.form.get('options_address')
        selected_url = request.form.get('options_uri')

        anonymize_data = Anonymizer()

        if selected_name != 'None':
            fake_names,fake_first_names,fake_second_names = anonymize_data.fake_name_generator(rows)
            data[str(selected_name)] = fake_names
        if selected_city != 'None':
            data[str(selected_city)] = anonymize_data.get_fake_cities(rows)
        if selected_country != 'None':
            data[str(selected_country)] = anonymize_data.get_fake_countries(rows)
        if selected_address != 'None':
            data[str(selected_address)] = anonymize_data.get_fake_addresses(rows)
        if selected_url != 'None':
            data[str(selected_url)] = anonymize_data.get_fake_uris(rows)

        data.to_csv ('static/export_data.csv', sep='|', index = None, header=True)
        flash('File successfully anonymized')
        return render_template('index.html', path_anonymize = 'static/export_data.csv')
    except:
        return render_template('index.html')


def generate_data():
    try:
        generate_data = pd.DataFrame()
        rows = int(request.form.get('num_rows'))
        anonymize_data = Anonymizer()
        fake_names,fake_first_names,fake_second_names = anonymize_data.fake_name_generator(rows)
        generate_data['name'] = fake_names
        generate_data['first_name'] = fake_first_names
        generate_data['last_name'] = fake_second_names
        generate_data['city'] = anonymize_data.get_fake_cities(rows)
        generate_data['country'] = anonymize_data.get_fake_countries(rows)
        generate_data['street_address'] = anonymize_data.get_fake_addresses(rows)
        generate_data['url'] = anonymize_data.get_fake_uris(rows)
        generate_data.to_csv ('static/generated_data.csv', sep='|', index = None, header=True)
        flash('File successfully generated')
        return render_template('index.html', path_generated_data = 'static/generated_data.csv')

    except:
        return render_template('index.html')


@app.route('/unstructured', methods=['POST'])
def unstructured():
    if request.method == "POST":
        if request.form['inputText']:
            return get_structured_data()
    else:
        return "unstructured"

def get_structured_data():
    inputText = request.form['inputText']
    print(inputText)
    anonymize_data = Anonymizer()
    anonymizedText = anonymize_data.get_anonymize_text(inputText)
    return jsonify({"status":"success","response":anonymizedText})

    
if __name__ == "__main__":
    app.run(host='0.0.0.0', port=8080)

 * Serving Flask app "__main__" (lazy loading)
 * Environment: production
   Use a production WSGI server instead.
 * Debug mode: off


 * Running on http://0.0.0.0:8080/ (Press CTRL+C to quit)


### Test

In [21]:
# Unstructured Data:

inputText = "The great thing about this setup (unlike the pricing models of web hosting companies such as Bluehost and Godaddy) is that you only pay for the storage and bandwidth that you use."
print(inputText)
anonymize_data = Anonymizer()
anonymizedText = anonymize_data.get_anonymize_text(inputText)
return jsonify({"status":"success","response":anonymizedText})

The great thing about this setup (unlike the pricing models of web hosting companies such as Bluehost and Godaddy) is that you only pay for the storage and bandwidth that you use.


OSError: [E050] Can't find model 'en'. It doesn't seem to be a shortcut link, a Python package or a valid path to a data directory.

In [26]:
filepath = "./static/generated_data.csv"
data = pd.read_csv(filepath,sep='|')
rows, c = data.shape
selected_name = "options_name"
selected_city = "options_city"
selected_country ="options_country"
selected_address = "options_address"
selected_url = "options_uri"

anonymize_data = Anonymizer()

if selected_name != 'None':
    fake_names,fake_first_names,fake_second_names = anonymize_data.fake_name_generator(rows)
    data[str(selected_name)] = fake_names
if selected_city != 'None':
    data[str(selected_city)] = anonymize_data.get_fake_cities(rows)
if selected_country != 'None':
    data[str(selected_country)] = anonymize_data.get_fake_countries(rows)
if selected_address != 'None':
    data[str(selected_address)] = anonymize_data.get_fake_addresses(rows)
if selected_url != 'None':
    data[str(selected_url)] = anonymize_data.get_fake_uris(rows)

data.to_csv ('static/export_data.csv', sep='|', index = None, header=True)
print('File successfully anonymized')

File successfully anonymized
