# Chapter 9: Data Science

(w_pacb45.pdf, page 277)

##  Dealing with data

(w_pacb45.pdf, page 281)

Typically, when you deal with data, this is the path...

In [1]:
# Some debug options

debug = False
if debug:
    import sys
    print(sys.version)
    print()
    print(sys.path)

### Setting up the notebook

(w_pacb45.pdf, page 282)

First things first, we need to set up the notebook.
This means imports and a bit of configuration.

In [2]:
import json
import calendar
import random
from datetime import date, timedelta

import faker
import numpy as np
from pandas import DataFrame
from delorean import parse
import pandas as pd

# make the graphs nicer
#
# d:\data\github\gmacario\learning-python\.lpenv\lib\site-packages\IPython\core\interactiveshell.py:2910: FutureWarning: 
# mpl_style had been deprecated and will be removed in a future version.
# Use `matplotlib.pyplot.style.use` instead.
#
#  exec(code_obj, self.user_global_ns, self.user_ns)
#
# pd.set_option('display.mpl_style', 'default')
# matplotlib.pyplot.style.use('default')

### Preparing the data

We want to achieve the following data structure...

In [3]:
fake = faker.Faker()
# print(fake)
# print(type(fake))

In [4]:
usernames = set()
usernames_no = 1000
# Populate the set with 1000 unique usernames
while len(usernames) < usernames_no:
        usernames.add(fake.user_name())
        
# print(type(usernames))  # <class 'set'>
# print(len(usernames))
# print(usernames)

# type(usernames)           # <class 'set'>
# list(usernames)
# len(usernames)            # 1000

In [5]:
def get_random_name_and_gender():
    male = random.random() > 0.6     # 60% of users will be female
    if male:
        return fake.name_male(), 'M'
    else:
        return fake.name_female(), 'F'

def get_users(username):
    users = []
    for username in usernames:
        name, gender = get_random_name_and_gender()
        user = {
            'username': username,
            'name': name,
            'gender': gender,
            'email': fake.email(),
            'age': fake.random_int(min=18, max=90),
            'address': fake.address(),
        }
        users.append(json.dumps(user))
    return users
    
# get_random_name_and_gender()
users = get_users(usernames)
# users[:3]

In [6]:
# campaign name format:
# InternalType_StartDate_EndDate_TargetAge_TargetGender_Currency

def get_type():
    # Just some gibberish internal codes
    types = ['AKX', 'BYU', 'GRZ', 'KTR']
    return random.choice(types)

# get_type()

def get_start_end_dates():
    duration = random.randint(1, 2 * 365)
    offset = random.randint(-365, 365)
    start = date.today() - timedelta(days=offset)
    end = start + timedelta(days=duration)
    
    def _format_date(date_):
        return date_.strftime("%Y%m%d")
    
    return _format_date(start), _format_date(end)

# get_start_end_dates()

def get_age():
    age = random.randint(20, 45)
    age -= age % 5
    diff = random.randint(5, 25)
    diff -= diff % 5
    return '{}-{}'.format(age, age + diff)

# get_age()

def get_gender():
    return random.choice(('M', 'F', 'B'))

# get_gender()

def get_currency():
    return random.choice(('GBP', 'EUR', 'USD'))

# [get_type(), get_start_end_dates(), get_gender(), get_currency()]

def get_campaign_name():
    separator = '_'
    type_ = get_type()
    start_end = separator.join(get_start_end_dates())
    age = get_age()
    gender = get_gender()
    currency = get_currency()
    return separator.join(
        (type_, start_end, age, gender, currency))

# get_campaign_name()
#
# Example output:
#
# 'KTR_20170526_20190322_35-40_F_GBP'

In [7]:
def get_campaign_data():
    name = get_campaign_name()
    budget = random.randint(10**3, 10**6)
    spent = random.randint(10**2, budget)
    clicks = int(random.triangular(10**2, 10**5, 0.2 * 10**5))
    impressions = int(random.gauss(0.5 * 10**6, 2))
    return {
        'cmp_name': name,
        'cmp_bgt': budget,
        'cmp_spent': spent,
        'cmp_clicks': clicks,
        'cmp_impr': impressions,
    }

# get_campaign_data()
#
# Example output:
#
# {'cmp_bgt': 733642,
#  'cmp_clicks': 25152,
#  'cmp_impr': 500000,
#  'cmp_name': 'AKX_20180917_20200915_35-40_F_USD',
#  'cmp_spent': 690411}

Now that we have the data, it's time to put it all together:

In [8]:
def get_data(users):
    data = []
    for user in users:
        campaigns = [get_campaign_data()
                    for _ in range(random.randint(2, 8))]
        data.append({
            'user': user,
            'campaigns': campaigns,
        })
    return data

# users[:2]
# print(get_data(users))

### Cleaning the data

(w_pacb45.pdf, page 287)

Let's start cleaning the data

In [9]:
rough_data = get_data(users)
rough_data[:2]   # Let's take a peek

[{'campaigns': [{'cmp_bgt': 275259,
    'cmp_clicks': 36243,
    'cmp_impr': 500003,
    'cmp_name': 'GRZ_20161210_20171226_25-30_B_GBP',
    'cmp_spent': 227958},
   {'cmp_bgt': 819336,
    'cmp_clicks': 66083,
    'cmp_impr': 500000,
    'cmp_name': 'AKX_20170303_20180810_20-25_B_GBP',
    'cmp_spent': 411878},
   {'cmp_bgt': 769975,
    'cmp_clicks': 19145,
    'cmp_impr': 500001,
    'cmp_name': 'AKX_20180330_20190215_45-55_M_EUR',
    'cmp_spent': 640261},
   {'cmp_bgt': 916268,
    'cmp_clicks': 23969,
    'cmp_impr': 499997,
    'cmp_name': 'KTR_20161209_20170827_25-35_M_USD',
    'cmp_spent': 278581},
   {'cmp_bgt': 358064,
    'cmp_clicks': 40327,
    'cmp_impr': 499998,
    'cmp_name': 'KTR_20180101_20180829_35-40_B_USD',
    'cmp_spent': 299765},
   {'cmp_bgt': 712666,
    'cmp_clicks': 42941,
    'cmp_impr': 499999,
    'cmp_name': 'GRZ_20181020_20200609_20-40_B_EUR',
    'cmp_spent': 623051},
   {'cmp_bgt': 514400,
    'cmp_clicks': 55587,
    'cmp_impr': 499999,
    'cmp_

So, we now start working with it

In [10]:
data = []
for datum in rough_data:
    for campaign in datum['campaigns']:
        campaign.update({'user': datum['user']})
        data.append(campaign)

data[:2]               # let's take another peek

[{'cmp_bgt': 275259,
  'cmp_clicks': 36243,
  'cmp_impr': 500003,
  'cmp_name': 'GRZ_20161210_20171226_25-30_B_GBP',
  'cmp_spent': 227958,
  'user': '{"username": "scottbeth", "name": "Beth Miller", "gender": "F", "email": "kara71@fisher.com", "age": 85, "address": "0535 Whitney River\\nEast Timothy, AK 10346"}'},
 {'cmp_bgt': 819336,
  'cmp_clicks': 66083,
  'cmp_impr': 500000,
  'cmp_name': 'AKX_20170303_20180810_20-25_B_GBP',
  'cmp_spent': 411878,
  'user': '{"username": "scottbeth", "name": "Beth Miller", "gender": "F", "email": "kara71@fisher.com", "age": 85, "address": "0535 Whitney River\\nEast Timothy, AK 10346"}'}]

### Creating the DataFrame

(w_pacb45.pdf, page 289)

Now it's time to create the `DataFrame`:

In [11]:
df = DataFrame(data)
df.head()

#print(df.head())
#df
#print(df)

Unnamed: 0,cmp_bgt,cmp_clicks,cmp_impr,cmp_name,cmp_spent,user
0,275259,36243,500003,GRZ_20161210_20171226_25-30_B_GBP,227958,"{""username"": ""scottbeth"", ""name"": ""Beth Miller..."
1,819336,66083,500000,AKX_20170303_20180810_20-25_B_GBP,411878,"{""username"": ""scottbeth"", ""name"": ""Beth Miller..."
2,769975,19145,500001,AKX_20180330_20190215_45-55_M_EUR,640261,"{""username"": ""scottbeth"", ""name"": ""Beth Miller..."
3,916268,23969,499997,KTR_20161209_20170827_25-35_M_USD,278581,"{""username"": ""scottbeth"", ""name"": ""Beth Miller..."
4,358064,40327,499998,KTR_20180101_20180829_35-40_B_USD,299765,"{""username"": ""scottbeth"", ""name"": ""Beth Miller..."


In [12]:
df.count()

cmp_bgt       5003
cmp_clicks    5003
cmp_impr      5003
cmp_name      5003
cmp_spent     5003
user          5003
dtype: int64

In [13]:
df.describe()

Unnamed: 0,cmp_bgt,cmp_clicks,cmp_impr,cmp_spent
count,5003.0,5003.0,5003.0,5003.0
mean,502976.064961,40629.229462,499999.503698,248422.953428
std,289015.152557,21667.504049,1.995431,219516.864053
min,1378.0,144.0,499991.0,139.0
25%,252781.0,23346.5,499998.0,66580.0
50%,508065.0,37189.0,500000.0,186167.0
75%,750385.0,56236.5,500001.0,377536.0
max,999767.0,99028.0,500007.0,967956.0


Let's see which are the three campaigns with the highest and lowest budgets.

In [14]:
df.sort_values(by=['cmp_bgt'], ascending=False).head(3)

Unnamed: 0,cmp_bgt,cmp_clicks,cmp_impr,cmp_name,cmp_spent,user
1453,999767,12528,500000,KTR_20161115_20171206_30-55_B_EUR,476486,"{""username"": ""smithchris"", ""name"": ""Elizabeth ..."
3021,999490,36998,500002,AKX_20170520_20181123_20-40_F_GBP,608685,"{""username"": ""xschmidt"", ""name"": ""Tommy Walker..."
4042,998892,81356,499999,GRZ_20180306_20191127_35-55_F_USD,707698,"{""username"": ""michelescott"", ""name"": ""Christop..."


In [15]:
df.sort_values(by=['cmp_bgt'], ascending=False).tail(3)

Unnamed: 0,cmp_bgt,cmp_clicks,cmp_impr,cmp_name,cmp_spent,user
3623,1540,52165,499996,AKX_20170120_20180905_20-40_M_GBP,705,"{""username"": ""lucerosandra"", ""name"": ""Gary Boy..."
4522,1399,25188,499998,BYU_20161212_20181012_40-50_B_USD,884,"{""username"": ""mullenshannon"", ""name"": ""Jane Li..."
2162,1378,46018,500000,AKX_20161229_20170407_30-45_B_USD,767,"{""username"": ""prestonshelia"", ""name"": ""April H..."


#### Unpacking the campaign name

(w_pacb45.pdf, page 291)

Now it's time to increase the complexity up a bit.

In [16]:
def unpack_campaign_name(name):
    """Very optimistic method, assumes data in campaign name
    is always in good state"""
    type_, start, end, age, gender, currency = name.split('_')
    start = parse(start).date
    end = parse(end).date
    return type_, start, end, age, gender, currency

campaign_data = df['cmp_name'].apply(unpack_campaign_name)
campaign_cols = [
    'Type', 'Start', 'End', 'Age', 'Gender', 'Currency'
]
campaign_df = DataFrame(
    campaign_data.tolist(), columns=campaign_cols, index=df.index
)
campaign_df.head(3)

Unnamed: 0,Type,Start,End,Age,Gender,Currency
0,GRZ,2016-10-12,2017-12-26,25-30,B,GBP
1,AKX,2017-03-03,2018-10-08,20-25,B,GBP
2,AKX,2018-03-30,2019-02-15,45-55,M,EUR


In [17]:
# raise ValueError if doing `df = df.join(campaign_df)`

df2 = df
df2 = df2.join(campaign_df)
df2.head(4)

Unnamed: 0,cmp_bgt,cmp_clicks,cmp_impr,cmp_name,cmp_spent,user,Type,Start,End,Age,Gender,Currency
0,275259,36243,500003,GRZ_20161210_20171226_25-30_B_GBP,227958,"{""username"": ""scottbeth"", ""name"": ""Beth Miller...",GRZ,2016-10-12,2017-12-26,25-30,B,GBP
1,819336,66083,500000,AKX_20170303_20180810_20-25_B_GBP,411878,"{""username"": ""scottbeth"", ""name"": ""Beth Miller...",AKX,2017-03-03,2018-10-08,20-25,B,GBP
2,769975,19145,500001,AKX_20180330_20190215_45-55_M_EUR,640261,"{""username"": ""scottbeth"", ""name"": ""Beth Miller...",AKX,2018-03-30,2019-02-15,45-55,M,EUR
3,916268,23969,499997,KTR_20161209_20170827_25-35_M_USD,278581,"{""username"": ""scottbeth"", ""name"": ""Beth Miller...",KTR,2016-09-12,2017-08-27,25-35,M,USD


And after the join, we take a peek, hoping to seee matching data:

In [18]:
df2[['cmp_name'] + campaign_cols].head(3)

Unnamed: 0,cmp_name,Type,Start,End,Age,Gender,Currency
0,GRZ_20161210_20171226_25-30_B_GBP,GRZ,2016-10-12,2017-12-26,25-30,B,GBP
1,AKX_20170303_20180810_20-25_B_GBP,AKX,2017-03-03,2018-10-08,20-25,B,GBP
2,AKX_20180330_20190215_45-55_M_EUR,AKX,2018-03-30,2019-02-15,45-55,M,EUR


#### Unpacking the user data

(w_pacb45.pdf, page 293)

In [19]:
def unpack_user_json(user):
    """Very optimistic as well, expects user objects
    to have all attributes"""
    user = json.loads(user.strip())
    return [
        user['username'],
        user['email'],
        user['name'],
        user['gender'],
        user['age'],
        user['address'],
    ]

#df2['user']

user_data = df2['user'].apply(unpack_user_json)
# user_data

user_cols = ['username', 'email', 'name', 'gender', 'age', 'address']
# user_cols

user_df = DataFrame(
    user_data.tolist(), columns=user_cols, index=df2.index
)
user_df

Unnamed: 0,username,email,name,gender,age,address
0,scottbeth,kara71@fisher.com,Beth Miller,F,85,"0535 Whitney River\nEast Timothy, AK 10346"
1,scottbeth,kara71@fisher.com,Beth Miller,F,85,"0535 Whitney River\nEast Timothy, AK 10346"
2,scottbeth,kara71@fisher.com,Beth Miller,F,85,"0535 Whitney River\nEast Timothy, AK 10346"
3,scottbeth,kara71@fisher.com,Beth Miller,F,85,"0535 Whitney River\nEast Timothy, AK 10346"
4,scottbeth,kara71@fisher.com,Beth Miller,F,85,"0535 Whitney River\nEast Timothy, AK 10346"
5,scottbeth,kara71@fisher.com,Beth Miller,F,85,"0535 Whitney River\nEast Timothy, AK 10346"
6,scottbeth,kara71@fisher.com,Beth Miller,F,85,"0535 Whitney River\nEast Timothy, AK 10346"
7,scottbeth,kara71@fisher.com,Beth Miller,F,85,"0535 Whitney River\nEast Timothy, AK 10346"
8,murphydaniel,pbrown@bradley.com,John Yu PhD,M,53,"830 Wood Avenue Apt. 925\nCarolyntown, VI 8891..."
9,murphydaniel,pbrown@bradley.com,John Yu PhD,M,53,"830 Wood Avenue Apt. 925\nCarolyntown, VI 8891..."


In [20]:
df3 = df2
df3 = df3.join(user_df)

print('df.count()=', df.count())
print('df2.count()=', df2.count())
print('df3.count()=', df3.count())

df3[['user'] + user_cols].head(2)


df.count()= cmp_bgt       5003
cmp_clicks    5003
cmp_impr      5003
cmp_name      5003
cmp_spent     5003
user          5003
dtype: int64
df2.count()= cmp_bgt       5003
cmp_clicks    5003
cmp_impr      5003
cmp_name      5003
cmp_spent     5003
user          5003
Type          5003
Start         5003
End           5003
Age           5003
Gender        5003
Currency      5003
dtype: int64
df3.count()= cmp_bgt       5003
cmp_clicks    5003
cmp_impr      5003
cmp_name      5003
cmp_spent     5003
user          5003
Type          5003
Start         5003
End           5003
Age           5003
Gender        5003
Currency      5003
username      5003
email         5003
name          5003
gender        5003
age           5003
address       5003
dtype: int64


Unnamed: 0,user,username,email,name,gender,age,address
0,"{""username"": ""scottbeth"", ""name"": ""Beth Miller...",scottbeth,kara71@fisher.com,Beth Miller,F,85,"0535 Whitney River\nEast Timothy, AK 10346"
1,"{""username"": ""scottbeth"", ""name"": ""Beth Miller...",scottbeth,kara71@fisher.com,Beth Miller,F,85,"0535 Whitney River\nEast Timothy, AK 10346"


In [21]:
better_columns = [
    'Budget', 'Clicks', 'Impressions',
    'cmp_name', 'Spent', 'user',
    'Type', 'Start', 'End',
    'Target Age', 'Target Gender', 'Currency',
    'Username', 'Email', 'Name',
    'Gender', 'Age', 'Address'
]
df3.columns = better_columns
# df3

Completing the `datasetNext` step will be to add some extra columns.

In [22]:
def calculate_extra_columns(df):
    # Click Through Rate
    df['CTR'] = df['Clicks'] / df ['Impressions']
    # Cost Per Click
    df['CPC'] = df['Spent'] / df ['Clicks']
    # Cost Per Impression
    df['CPI'] = df['Spent'] / df ['Impressions']

calculate_extra_columns(df3)
# df3

We can take a look at the results by filtering on the relevant columns and calling `head`.

In [23]:
df3[['Spent', 'Clicks', 'Impressions',
     'CTR', 'CPC', 'CPI']].head(3)

Unnamed: 0,Spent,Clicks,Impressions,CTR,CPC,CPI
0,227958,36243,500003,0.072486,6.289711,0.455913
1,411878,66083,500000,0.132166,6.232738,0.823756
2,640261,19145,500001,0.03829,33.442727,1.280519


Now, I want to verify the accuracy of the results manually for the first row:

In [24]:
clicks = df3['Clicks'][0]
impressions = df3['Impressions'][0]
spent = df3['Spent'][0]
CTR = df3['CTR'][0]
CPC = df3['CPC'][0]
CPI = df3['CPI'][0]

print('CTR:', CTR, clicks / impressions)
print('CPC:', CPC, spent / clicks)
print('CPI:', CPI, spent / impressions)

CTR: 0.0724855650866 0.0724855650866
CPC: 6.28971111663 6.28971111663
CPI: 0.45591326452 0.45591326452


(w_pacb45.pdf, page 296)

We are almost done with out `DataFrame`.
All we are missing now is a column that tells us the duration of the campaign and a column that tells us which day of the week corresponds to the start date of each campaign.
This allows me to expand on how to play with date objects.

In [25]:
def get_day_of_the_week(day):
    number_to_day = dict(enumerate(calendar.day_name, 1))
    return number_to_day[day.isoweekday()]

def get_duration(row):
    return (row['End'] - row['Start']).days

#get_day_of_the_week(date.today())
df4 = df3
df4['Day of Week'] = df4['Start'].apply(get_day_of_the_week)
df4['Duration'] = df4.apply(get_duration, axis=1)

df4

Unnamed: 0,Budget,Clicks,Impressions,cmp_name,Spent,user,Type,Start,End,Target Age,...,Email,Name,Gender,Age,Address,CTR,CPC,CPI,Day of Week,Duration
0,275259,36243,500003,GRZ_20161210_20171226_25-30_B_GBP,227958,"{""username"": ""scottbeth"", ""name"": ""Beth Miller...",GRZ,2016-10-12,2017-12-26,25-30,...,kara71@fisher.com,Beth Miller,F,85,"0535 Whitney River\nEast Timothy, AK 10346",0.072486,6.289711,0.455913,Wednesday,440
1,819336,66083,500000,AKX_20170303_20180810_20-25_B_GBP,411878,"{""username"": ""scottbeth"", ""name"": ""Beth Miller...",AKX,2017-03-03,2018-10-08,20-25,...,kara71@fisher.com,Beth Miller,F,85,"0535 Whitney River\nEast Timothy, AK 10346",0.132166,6.232738,0.823756,Friday,584
2,769975,19145,500001,AKX_20180330_20190215_45-55_M_EUR,640261,"{""username"": ""scottbeth"", ""name"": ""Beth Miller...",AKX,2018-03-30,2019-02-15,45-55,...,kara71@fisher.com,Beth Miller,F,85,"0535 Whitney River\nEast Timothy, AK 10346",0.038290,33.442727,1.280519,Friday,322
3,916268,23969,499997,KTR_20161209_20170827_25-35_M_USD,278581,"{""username"": ""scottbeth"", ""name"": ""Beth Miller...",KTR,2016-09-12,2017-08-27,25-35,...,kara71@fisher.com,Beth Miller,F,85,"0535 Whitney River\nEast Timothy, AK 10346",0.047938,11.622554,0.557165,Monday,349
4,358064,40327,499998,KTR_20180101_20180829_35-40_B_USD,299765,"{""username"": ""scottbeth"", ""name"": ""Beth Miller...",KTR,2018-01-01,2018-08-29,35-40,...,kara71@fisher.com,Beth Miller,F,85,"0535 Whitney River\nEast Timothy, AK 10346",0.080654,7.433357,0.599532,Monday,240
5,712666,42941,499999,GRZ_20181020_20200609_20-40_B_EUR,623051,"{""username"": ""scottbeth"", ""name"": ""Beth Miller...",GRZ,2018-10-20,2020-09-06,20-40,...,kara71@fisher.com,Beth Miller,F,85,"0535 Whitney River\nEast Timothy, AK 10346",0.085882,14.509466,1.246104,Saturday,687
6,514400,55587,499999,GRZ_20170427_20171010_20-40_B_EUR,351446,"{""username"": ""scottbeth"", ""name"": ""Beth Miller...",GRZ,2017-04-27,2017-10-10,20-40,...,kara71@fisher.com,Beth Miller,F,85,"0535 Whitney River\nEast Timothy, AK 10346",0.111174,6.322449,0.702893,Thursday,166
7,607171,28524,500001,KTR_20171127_20191103_40-45_F_GBP,569335,"{""username"": ""scottbeth"", ""name"": ""Beth Miller...",KTR,2017-11-27,2019-03-11,40-45,...,kara71@fisher.com,Beth Miller,F,85,"0535 Whitney River\nEast Timothy, AK 10346",0.057048,19.959858,1.138668,Monday,469
8,891496,70421,499999,BYU_20161215_20171126_35-45_M_GBP,81648,"{""username"": ""murphydaniel"", ""name"": ""John Yu ...",BYU,2016-12-15,2017-11-26,35-45,...,pbrown@bradley.com,John Yu PhD,M,53,"830 Wood Avenue Apt. 925\nCarolyntown, VI 8891...",0.140842,1.159427,0.163296,Thursday,346
9,697601,5557,499999,BYU_20170910_20190220_20-30_B_EUR,340443,"{""username"": ""murphydaniel"", ""name"": ""John Yu ...",BYU,2017-10-09,2019-02-20,20-30,...,pbrown@bradley.com,John Yu PhD,M,53,"830 Wood Avenue Apt. 925\nCarolyntown, VI 8891...",0.011114,61.263811,0.680887,Monday,499


#### Cleaning everything up

(w_pacb45.pdf, page 297)

In [26]:
final_columns = [
    'Type', 'Start', 'End', 'Duration', 'Day of Week', 'Budget',
    'Currency', 'Clicks', 'Impressions', 'Spent', 'CTR', 'CPC',
    'CPI', 'Target Age', 'Target Gender', 'Username', 'Email',
    'Name', 'Gender', 'Age'
]
df5 = df4[final_columns]

df5

Unnamed: 0,Type,Start,End,Duration,Day of Week,Budget,Currency,Clicks,Impressions,Spent,CTR,CPC,CPI,Target Age,Target Gender,Username,Email,Name,Gender,Age
0,GRZ,2016-10-12,2017-12-26,440,Wednesday,275259,GBP,36243,500003,227958,0.072486,6.289711,0.455913,25-30,B,scottbeth,kara71@fisher.com,Beth Miller,F,85
1,AKX,2017-03-03,2018-10-08,584,Friday,819336,GBP,66083,500000,411878,0.132166,6.232738,0.823756,20-25,B,scottbeth,kara71@fisher.com,Beth Miller,F,85
2,AKX,2018-03-30,2019-02-15,322,Friday,769975,EUR,19145,500001,640261,0.038290,33.442727,1.280519,45-55,M,scottbeth,kara71@fisher.com,Beth Miller,F,85
3,KTR,2016-09-12,2017-08-27,349,Monday,916268,USD,23969,499997,278581,0.047938,11.622554,0.557165,25-35,M,scottbeth,kara71@fisher.com,Beth Miller,F,85
4,KTR,2018-01-01,2018-08-29,240,Monday,358064,USD,40327,499998,299765,0.080654,7.433357,0.599532,35-40,B,scottbeth,kara71@fisher.com,Beth Miller,F,85
5,GRZ,2018-10-20,2020-09-06,687,Saturday,712666,EUR,42941,499999,623051,0.085882,14.509466,1.246104,20-40,B,scottbeth,kara71@fisher.com,Beth Miller,F,85
6,GRZ,2017-04-27,2017-10-10,166,Thursday,514400,EUR,55587,499999,351446,0.111174,6.322449,0.702893,20-40,B,scottbeth,kara71@fisher.com,Beth Miller,F,85
7,KTR,2017-11-27,2019-03-11,469,Monday,607171,GBP,28524,500001,569335,0.057048,19.959858,1.138668,40-45,F,scottbeth,kara71@fisher.com,Beth Miller,F,85
8,BYU,2016-12-15,2017-11-26,346,Thursday,891496,GBP,70421,499999,81648,0.140842,1.159427,0.163296,35-45,M,murphydaniel,pbrown@bradley.com,John Yu PhD,M,53
9,BYU,2017-10-09,2019-02-20,499,Monday,697601,EUR,5557,499999,340443,0.011114,61.263811,0.680887,20-30,B,murphydaniel,pbrown@bradley.com,John Yu PhD,M,53


Now our DataFrame is clean and ready for us to inspect.

### Saving the DataFrame to  file

TODO

### Visualizing the results

TODO

## Where do we go from here?

TODO

## Summary

TODO

# TODO

In [27]:
#%lsmagic
#%pwd
#%ls
#%pdb
#?range
#?%lsmagic
#%env
#?print