In [259]:
%reload_ext ishbook
import pandas as pd
import numpy as np
import datetime as dt
import time
import iql
import plus
from IPython.core.display import display, HTML
## inject a CSS rule that forces the page's .container element to 100% width
display(HTML("<style>.container { width:100% !important; }</style>"))

In [260]:
## pull all contact web forms (CWFs) submitted in August 2016
## group by advertiser_id and activity_id (fed into the next query), activity_type (to confirm it is a CWF), and unixtime

%%ish

cwfs = from advertiserevent 2016-08-01 2016-09-01  WHERE activity_type=CONTACT_WEB_FORM GROUP BY advertiser_id, activity_id, activity_type, unixtime/*CWF Submitted*/
output = None

In [261]:
## keep only the four columns needed downstream, which discards the trailing fifth
## column returned by the query and makes the later merge cleaner.
## the columns are selected by name (rather than dropped in place by position) so the
## cell can be re-run safely: a positional in-place drop removes something different,
## or fails, once the extra column is already gone.

cwfs = cwfs[["advertiser_id", "activity_id", "activity_type", "CWF Submitted"]]
cwfs.tail()

Unnamed: 0,advertiser_id,activity_id,activity_type,CWF Submitted
20556,2998129,916149024,CONTACT_WEB_FORM,1472708817
20557,8551842,916163148,CONTACT_WEB_FORM,1472708929
20558,5988505,916188920,CONTACT_WEB_FORM,1472709141
20559,7228804,916206445,CONTACT_WEB_FORM,1472709314
20560,8740393,916225206,CONTACT_WEB_FORM,1472709480


In [262]:
## build one comma-separated string holding every activity_id returned by the CWF query;
## it is interpolated into the IN (...) filter of the adcresolvedevents query to find
## which of these CWFs were resolved

activity_id = ','.join(map(str, cwfs["activity_id"]))

In [263]:
## pull the resolved events for the same time range (August 2016) whose activity_id matches a submitted CWF, to get the time each one was resolved

%%ish

resolved_cwfs = from adcresolvedevents 2016-08-01 2016-09-01 where activity_id IN ({activity_id}) group by advertiser_id ,activity_id, activity_type, unixtime/*Resolved Time*/
output = None

In [264]:
## same cleanup as for the submitted CWFs: keep only the three merge keys plus the
## resolution time, discarding the trailing fifth column returned by the query.
## selecting by name (instead of an in-place positional drop) keeps the cell re-runnable.

resolved_cwfs = resolved_cwfs[["advertiser_id", "activity_id", "activity_type", "Resolved Time"]]

In [265]:
## overall summary statistics: total CWFs submitted, how many of them were resolved,
## and the resolution rate as a percentage

submitted_count = len(cwfs)
resolved_count = len(resolved_cwfs)
print(submitted_count)
print(resolved_count)
print(float(resolved_count) / float(submitted_count) * 100)

20561
16099
78.2987208793


In [282]:
## combine submitted and resolved CWFs into one frame so each submission sits next to its
## resolution time. how="left" keeps every submitted CWF, resolved or not.
## activity_id is the real unique key; activity_type and advertiser_id are expected to agree
## anyway and are included as keys only so they do not come back as duplicated columns.

overall_cwfs_df = pd.merge(
    cwfs,
    resolved_cwfs,
    how="left",
    on=["activity_id", "activity_type", "advertiser_id"],
)

In [365]:
## fill any NaN resolved times, so that CWFs that were submitted but did NOT get resolved are marked as "UNRESOLVED"
## I don't want to do this step until the very end - some of the steps below need the resolved time field to stay null
#overall_cwfs_df["Resolved Time"] = overall_cwfs_df["Resolved Time"].fillna("UNRESOLVED")

In [368]:
## count how many distinct advertisers submitted a CWF that month

overall_cwfs_df["advertiser_id"].nunique()

18361

In [269]:
## build one comma-separated string of advertiser ids to interpolate into the crmcalls query.
## the frame holds one row per CWF, so an advertiser with several CWFs appears several times
## (the outputs in this notebook show 20561 CWF rows but only 18361 distinct advertisers).
## de-duplicating shortens the IN (...) list without changing which calls it matches.

advids = ','.join(str(x) for x in overall_cwfs_df["advertiser_id"].unique())

In [271]:
## grab all INBOUND calls in August 2016 for these advertisers, to see when each advertiser called in that month

%%ish

calls = from crmcalls 2016-08-01 2016-09-01 where client_id IN ({advids}) direction = "Inbound" group by client_id/*advertiser_id*/, certainty, direction, talk_time_seconds/*Talk Time (s)*/, rep_username, unixtime/*Call Time*/
output = None

In [272]:
## keep only the six call columns needed downstream, discarding the trailing seventh
## column returned by the query so the merge is cleaner.
## selecting by name (instead of an in-place positional drop) keeps the cell re-runnable.

calls = calls[["advertiser_id", "certainty", "direction", "Talk Time (s)", "rep_username", "Call Time"]]
calls.head()

Unnamed: 0,advertiser_id,certainty,direction,Talk Time (s),rep_username,Call Time
0,8151762,certain (only one contact),Inbound,29,jsyc,1470039845
1,8222402,certain (only one contact),Inbound,114,claireh,1470040753
2,1575597,"uncertain (multiple contacts, no note in 30 mi...",Inbound,282,cathal,1470041696
3,7773670,certain (only one contact),Inbound,259,,1470055807
4,143585,certain (only one contact),Inbound,906,adriana,1470055851


In [273]:
## number of rows in the calls frame - relatively small

print(len(calls))

1227


In [283]:
## attach the call records to the CWF frame, matching on advertiser_id only.
## how="left" keeps every CWF row; an advertiser with several calls produces one row per call.
## known gap, left for a v2 if needed: calls made BEFORE the CWF was submitted are not
## excluded here, although advertisers who called before submitting should not be counted.

allcwfs_calls = pd.merge(overall_cwfs_df, calls, how="left", on="advertiser_id")

In [369]:
## preview the first rows of the merged CWF + calls frame
allcwfs_calls.head()

Unnamed: 0,advertiser_id,activity_id,activity_type,CWF Submitted,Resolved Time,certainty,direction,Talk Time (s),rep_username,Call Time
0,371022,855039664,CONTACT_WEB_FORM,1470031607,,,,,,
1,8256738,855041318,CONTACT_WEB_FORM,1470032031,1470125000.0,,,,,
2,8262996,855066260,CONTACT_WEB_FORM,1470032417,1470095000.0,,,,,
3,8249625,855076524,CONTACT_WEB_FORM,1470033378,1470097000.0,,,,,
4,7927214,855077270,CONTACT_WEB_FORM,1470033605,1470092000.0,,,,,


In [342]:
## take an independent copy of the master frame (all August 2016 CWFs with call data attached)
## restricted to UNRESOLVED CWFs, i.e. the rows whose "Resolved Time" is null

is_unresolved = allcwfs_calls["Resolved Time"].isnull()
unresolved = allcwfs_calls.loc[is_unresolved].copy()

In [343]:
## label every null "Resolved Time" as "UNRESOLVED"; any non-null value is left untouched

resolved_time = unresolved["Resolved Time"]
unresolved["Resolved Time"] = resolved_time.where(resolved_time.notnull(), "UNRESOLVED")

In [344]:
## keep only the unresolved CWFs that have a phone call attached - this shortens the frame a lot.
## the NaN check is restricted to "Call Time": a bare dropna() also throws away rows that are
## missing any OTHER field, e.g. a call whose rep_username is blank (one such call is visible
## in the calls preview; presumably it is stored as NaN), even though a call was logged.

unresolved = unresolved.dropna(subset=["Call Time"])

In [345]:
## list the column names as plain strings - a convenience check only, nothing below uses it

column_names = list(map(str, unresolved.columns))
print(column_names)

['advertiser_id', 'activity_id', 'activity_type', 'CWF Submitted', 'Resolved Time', 'certainty', 'direction', 'Talk Time (s)', 'rep_username', 'Call Time']


In [346]:
## report how many unresolved CWF/call rows remain and show the last 5 of them.
## still to do: calls that do not belong with the CWF (the merge cell's v2 note names calls
## made before the CWF was submitted) are not filtered out yet, so these counts include them.

print(len(unresolved))
unresolved.tail()

439


Unnamed: 0,advertiser_id,activity_id,activity_type,CWF Submitted,Resolved Time,certainty,direction,Talk Time (s),rep_username,Call Time
20759,8442209,915565241,CONTACT_WEB_FORM,1472675784,UNRESOLVED,certain (only one contact),Inbound,293.0,laurenb,1471279000.0
20760,8442209,915565241,CONTACT_WEB_FORM,1472675784,UNRESOLVED,certain (only one contact),Inbound,537.0,laurenb,1471378000.0
20764,2075408,915567571,CONTACT_WEB_FORM,1472675942,UNRESOLVED,"uncertain (multiple contacts, no note in 30 mi...",Inbound,363.0,chanon,1470751000.0
20847,1555831,915715595,CONTACT_WEB_FORM,1472686268,UNRESOLVED,certain (only one contact),Inbound,678.0,sinclair,1472495000.0
20848,1555831,915715595,CONTACT_WEB_FORM,1472686268,UNRESOLVED,certain (only one contact),Inbound,932.0,joshs,1472659000.0


In [347]:
## first delta: call time minus CWF submission time. both are unixtimes, so the result is in
## seconds; a positive value means the call came after the CWF, a negative one before it.

unresolved["Call Time Comparison"] = unresolved["Call Time"].sub(unresolved["CWF Submitted"])

In [348]:
## turn the unixtime fields into 'YYYY-MM-DD HH:MM:SS' strings to make the data more readable.
## dt.datetime.fromtimestamp is called without a timezone, so it returns local time.
## "Call Time" is overwritten; the submission time goes into a new "CWF Submitted Timestamp" column.

unresolved["Call Time"] = unresolved["Call Time"].map(
    lambda epoch: dt.datetime.fromtimestamp(epoch).strftime('%Y-%m-%d %H:%M:%S'))
unresolved["CWF Submitted Timestamp"] = unresolved["CWF Submitted"].map(
    lambda epoch: dt.datetime.fromtimestamp(epoch).strftime('%Y-%m-%d %H:%M:%S'))

In [349]:
## narrow the frame to the columns of interest (the rep and certainty fields are left out)
## and put them in a more readable order

unresolved_clean = unresolved.loc[:, [
    "advertiser_id", "activity_id", "activity_type",
    "CWF Submitted Timestamp", "Resolved Time", "Call Time",
    "Call Time Comparison", "direction", "Talk Time (s)",
]]

In [370]:
## preview the first rows of the cleaned unresolved frame
unresolved_clean.head()

Unnamed: 0,advertiser_id,activity_id,activity_type,CWF Submitted Timestamp,Resolved Time,Call Time,Call Time Comparison,direction,Talk Time (s)
197,8266788,856072578,CONTACT_WEB_FORM,2016-08-01 09:15:13,UNRESOLVED,2016-08-01 09:32:02,1009.0,Inbound,148.0
202,8000825,856101518,CONTACT_WEB_FORM,2016-08-01 09:20:28,UNRESOLVED,2016-08-01 11:27:44,7636.0,Inbound,672.0
209,1084212,856131028,CONTACT_WEB_FORM,2016-08-01 09:25:01,UNRESOLVED,2016-08-04 09:15:10,258609.0,Inbound,128.0
217,7200761,856156009,CONTACT_WEB_FORM,2016-08-01 09:36:31,UNRESOLVED,2016-08-01 09:31:44,-287.0,Inbound,554.0
240,304040,856179607,CONTACT_WEB_FORM,2016-08-01 09:57:20,UNRESOLVED,2016-08-23 10:41:42,1903462.0,Inbound,846.0


In [360]:
## the key step: for every (advertiser_id, activity_id) pair keep only the call row(s) with the
## LARGEST "Call Time Comparison", i.e. the call furthest after the CWF submission.
## steps: take the per-pair maximum, label it "SHORTEST", left-merge it back onto the cleaned
## frame, then keep the rows whose own delta equals that per-pair value.
## NOTE(review): the column is named "SHORTEST" but holds the maximum delta - confirm which
## one is intended.
## NOTE(review): negative deltas (calls made before the CWF was submitted) are not filtered
## out, so a pair whose calls all predate the CWF still contributes a row.

pair_keys = ["advertiser_id", "activity_id"]
unresolved_grouped = (
    unresolved_clean.groupby(pair_keys)["Call Time Comparison"]
    .max()
    .reset_index(name="SHORTEST")
)
unresolved_final = unresolved_clean.merge(unresolved_grouped, how="left", on=pair_keys)
is_selected = unresolved_final["Call Time Comparison"] == unresolved_final["SHORTEST"]
unresolved_final = unresolved_final[is_selected]

In [362]:
## copy the final unresolved frame to the system clipboard
unresolved_final.to_clipboard()

In [308]:
## same procedure as for the unresolved frame, this time for RESOLVED CWFs:
## copy the rows that do have a "Resolved Time", then print the row count and show
## the last 5 rows as a sanity check

has_resolution = allcwfs_calls["Resolved Time"].notnull()
resolved = allcwfs_calls[has_resolution].copy()
print(len(resolved))
resolved.tail()

16359


Unnamed: 0,advertiser_id,activity_id,activity_type,CWF Submitted,Resolved Time,certainty,direction,Talk Time (s),rep_username,Call Time
20908,8739728,915823883,CONTACT_WEB_FORM,1472699706,1472701000.0,,,,,
20909,8739681,915824635,CONTACT_WEB_FORM,1472699864,1472707000.0,,,,,
20911,8590995,915828891,CONTACT_WEB_FORM,1472700818,1472709000.0,,,,,
20916,616321,915833942,CONTACT_WEB_FORM,1472702289,1472703000.0,,,,,
20942,8739662,915898601,CONTACT_WEB_FORM,1472706600,1472708000.0,,,,,


In [309]:
## cast "Resolved Time" from float to int (the merged frame shows it as float, e.g. 1470125000.0).
## the author notes this step is not strictly required: the subtraction of the CWF and resolved
## time fields works even if one is a float and the other an int.

resolved["Resolved Time"] = resolved["Resolved Time"].astype(int)

In [310]:
## how long each CWF took to resolve: both fields are unixtimes, so their difference is in seconds.
## a minutes or hours version could be added as a separate column (or replace this one).

resolved["Time to Resolve CWF (s)"] = resolved["Resolved Time"].sub(resolved["CWF Submitted"])

In [312]:
## preview the first rows of the resolved frame, including the new time-to-resolve column
resolved.head()

Unnamed: 0,advertiser_id,activity_id,activity_type,CWF Submitted,Resolved Time,certainty,direction,Talk Time (s),rep_username,Call Time,Time to Resolve CWF (s)
1,8256738,855041318,CONTACT_WEB_FORM,1470032031,1470124983,,,,,,92952
2,8262996,855066260,CONTACT_WEB_FORM,1470032417,1470094573,,,,,,62156
3,8249625,855076524,CONTACT_WEB_FORM,1470033378,1470096553,,,,,,63175
4,7927214,855077270,CONTACT_WEB_FORM,1470033605,1470092446,,,,,,58841
5,7562678,855078444,CONTACT_WEB_FORM,1470033881,1470125342,,,,,,91461


In [313]:
## second delta: call time minus resolved time, in seconds. a negative value means the call
## happened before the CWF was resolved; rows without a call get NaN.
## this may change once calls made BEFORE the CWF was submitted are filtered out.

resolved["Call Time Comparison"] = resolved["Call Time"].sub(resolved["Resolved Time"])

In [314]:
## take a copy of the frame that keeps only the ROWS with a call log attached.
## the NaN check is restricted to "Call Time": a bare dropna() also discards rows that are
## missing any other field, e.g. a call whose rep_username is blank (one such call is visible
## in the calls preview; presumably it is stored as NaN), even though a call was logged.

final_resolved = resolved.dropna(subset=["Call Time"]).copy()

In [316]:
## replace the three unixtime columns with readable 'YYYY-MM-DD HH:MM:SS' strings.
## dt.datetime.fromtimestamp is called without a timezone, so it returns local time.

for stamp_col in ("Call Time", "CWF Submitted", "Resolved Time"):
    final_resolved[stamp_col] = final_resolved[stamp_col].map(
        lambda epoch: dt.datetime.fromtimestamp(epoch).strftime('%Y-%m-%d %H:%M:%S'))

In [371]:
## preview the first rows of the resolved frame after the timestamp formatting
final_resolved.head()

Unnamed: 0,advertiser_id,activity_id,activity_type,CWF Submitted,Resolved Time,certainty,direction,Talk Time (s),rep_username,Call Time,Time to Resolve CWF (s),Call Time Comparison
19,8151762,855122038,CONTACT_WEB_FORM,2016-08-01 04:01:49,2016-08-01 05:18:12,certain (only one contact),Inbound,29.0,jsyc,2016-08-01 04:24:05,4583,-3247.0
53,8264163,855177368,CONTACT_WEB_FORM,2016-08-01 05:19:10,2016-08-02 03:49:48,certain (only one contact),Inbound,117.0,moritah,2016-08-02 22:55:10,81038,68722.0
128,5735070,855466541,CONTACT_WEB_FORM,2016-08-01 07:42:48,2016-08-01 08:44:21,certain (only one contact),Inbound,80.0,timothy,2016-08-12 15:54:44,3693,976223.0
161,8161725,855833384,CONTACT_WEB_FORM,2016-08-01 08:36:51,2016-08-01 09:06:17,certain (only one contact),Inbound,15.0,elliez,2016-08-01 12:05:31,1766,10754.0
196,4749042,856065864,CONTACT_WEB_FORM,2016-08-01 09:14:06,2016-08-01 09:24:15,certain (only one contact),Inbound,133.0,ajacobs,2016-08-17 13:22:44,609,1396709.0


In [320]:
## narrow the resolved frame to the columns of interest, in a readable order

clean_final = final_resolved.loc[:, [
    "advertiser_id", "activity_id", "activity_type",
    "CWF Submitted", "Resolved Time", "Call Time",
    "Time to Resolve CWF (s)", "Call Time Comparison",
    "direction", "Talk Time (s)",
]]

In [326]:
## preview the first rows of the narrowed resolved frame
clean_final.head()

Unnamed: 0,advertiser_id,activity_id,activity_type,CWF Submitted,Resolved Time,Call Time,Time to Resolve CWF (s),Call Time Comparison,direction,Talk Time (s)
19,8151762,855122038,CONTACT_WEB_FORM,2016-08-01 04:01:49,2016-08-01 05:18:12,2016-08-01 04:24:05,4583,-3247.0,Inbound,29.0
53,8264163,855177368,CONTACT_WEB_FORM,2016-08-01 05:19:10,2016-08-02 03:49:48,2016-08-02 22:55:10,81038,68722.0,Inbound,117.0
128,5735070,855466541,CONTACT_WEB_FORM,2016-08-01 07:42:48,2016-08-01 08:44:21,2016-08-12 15:54:44,3693,976223.0,Inbound,80.0
161,8161725,855833384,CONTACT_WEB_FORM,2016-08-01 08:36:51,2016-08-01 09:06:17,2016-08-01 12:05:31,1766,10754.0,Inbound,15.0
196,4749042,856065864,CONTACT_WEB_FORM,2016-08-01 09:14:06,2016-08-01 09:24:15,2016-08-17 13:22:44,609,1396709.0,Inbound,133.0


In [324]:
## for every (advertiser_id, activity_id) pair keep only the call row(s) with the SMALLEST
## "Call Time Comparison" (call time minus resolved time): take the per-pair minimum, label it
## "SHORTEST", left-merge it back onto clean_final and keep the rows that equal it.
## NOTE(review): the minimum is taken on the signed delta, so a call long BEFORE the resolution
## (a large negative value) wins over a call right after it - confirm this is the intended
## meaning of "shortest".
pair_keys = ["advertiser_id", "activity_id"]
shortest = (
    clean_final.groupby(pair_keys)["Call Time Comparison"]
    .min()
    .reset_index(name="SHORTEST")
)
df = clean_final.merge(shortest, how="left", on=pair_keys)
df = df[df["Call Time Comparison"] == df["SHORTEST"]]

Unnamed: 0_level_0,Unnamed: 1_level_0,Call Time Comparison
advertiser_id,activity_id,Unnamed: 2_level_1
10342,898918222,-6783.0
22759,857261359,-7153.0
24126,856404752,5398.0
25351,888083784,-63984.0
25921,875926414,-16982.0
42189,915097151,-3931.0
66799,862813890,-8634.0
66799,884133501,-1027624.0
69832,915081196,3211.0
91725,876572023,-870048.0


In [327]:
## NOTE(review): this cell repeats, statement for statement, the In [324] cell above; it rebuilds
## the same `shortest` and `df` from clean_final.
## per (advertiser_id, activity_id) pair: minimum "Call Time Comparison", renamed to "SHORTEST",
## merged back onto clean_final, keeping only the rows whose delta equals that minimum.
shortest = clean_final.groupby(["advertiser_id", "activity_id"]).agg({"Call Time Comparison" : min}).reset_index()
shortest = shortest.rename(columns={"Call Time Comparison" : "SHORTEST"})
df = pd.merge(clean_final, shortest, how = "left", on = ["advertiser_id", "activity_id"])
df = df[df["Call Time Comparison"] == df["SHORTEST"]]

In [335]:
## copy the final resolved frame to the system clipboard
df.to_clipboard()