In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sys
import os
import seaborn as sns

In [2]:
df=pd.read_csv('../data/us_CID_activity_2014_2017.csv') # Read the data

In [3]:
#df.head() # Take a quick look

In [4]:
df.shape # Check out the size

(887709, 52)

In [5]:
#df.columns # Check out the columns

In [6]:
df.Country.unique() # Given that this is the US data, the NaNs are USA as well

array(['USA', nan], dtype=object)

In [7]:
df.Currency.unique() # Given that this is the US data, the NaNs are USD as well

array(['USD', nan], dtype=object)

In [8]:
df.Channel.unique() # Need to figure out what to do with NaNs

array(['Partner', 'DirectLocal', 'National', 'Deals', nan], dtype=object)

In [9]:
df.drop(columns=['Country', 'Currency', 'Advertiser_URL'], axis=1, inplace=True) # Useless data in this context
df.reset_index(inplace=True,drop=True)

In [10]:
#df.dtypes # Check the data types

In [11]:
df.TargetType.unique() # What's this column?

array(['Radius', 'Country', 'Metro', 'City', 'PostCode'], dtype=object)

In [12]:
#df.describe().T # Let's get a feel for the data --> There are campaign budgets equal to zero!

In [13]:
# Drop the "unnecessary" columns (for now)

df.drop(columns=['BC_ID','BusinessCategory', 'Primary_BSC_ID', 'Primary_BusinessSubCategory',
                'Secondary_BSC_Count','Secondary_BSC_IDs','Seconardy_BSCs','BusinessSpecialtyID',
                'BusinessSpecialty'], axis=1, inplace=True)
df.reset_index(inplace=True,drop=True)

## Let's look at the fishy stuff.

In [14]:
# CPL an CTL have exactly the same number of missing values. Check and see if this is because of the 
# product type.

# Check to see if they're are missing at the exact same locations:

print( (((np.isnan(df.CTL)) & (np.isnan(df.CPL))).sum()) == df.CTL.isna().sum() )

temp=df[['idadvertiser_master', 'idOffer', 'Offer_Name','CPL', 'CTL']]

temp[np.isnan(temp.CTL)]['idadvertiser_master'].nunique()


True


18545

In [15]:
df.idadvertiser_master.nunique()

38042

In [16]:
# Quite a lot of the data is missing from CPL and CLT. For about half of the 
# advertisers, this data is missing. Check what they are. For the time 
# being let's drop them. Same thing with CPC and CTR

In [17]:
df.drop(columns=['CTL', 'CPL','CPC', 'CTR'], axis=1, inplace=True)
df.reset_index(inplace=True,drop=True)

In [18]:
#df.isna().sum()

In [19]:
# Dealing with Offer, Finance Product, idOffer (There are only a few of them?)

missing_offer_ids=df[df.Offer_Name.isna()]['idadvertiser_master'].unique()

# Check to see if their are missing at the exact same locations:

df=df[~df.idadvertiser_master.isin(missing_offer_ids)]
df.reset_index(inplace=True,drop=True)

In [20]:
#df.isna().sum()

In [21]:
# Create a duration column. For missing values, one option is to replace with the average 
# campaign duration of a given advertiser--> Local!
# However we're not sure why the cycle is missing an end date. Is that churn?
df['Cycle_Started']=pd.to_datetime(df.Cycle_Started, format='%Y-%m-%d',  errors='ignore')
df['Cycle_Ended']=pd.to_datetime(df.Cycle_Ended, format='%Y-%m-%d',  errors='ignore')

my_index=df[df['Cycle_Ended'] < df['Cycle_Started']].index

temp=df.loc[my_index, 'Cycle_Ended']

df.loc[my_index, 'Cycle_Ended'] = df.loc[my_index, 'Cycle_Started']

df.loc[my_index, 'Cycle_Started']=temp


df.reset_index(inplace=True,drop=True)
df['cycle_duration']=pd.to_timedelta(df['Cycle_Ended']-df['Cycle_Started']).astype('timedelta64[D]')

In [22]:
#df[np.isnan(df.cycle_duration)][['idadvertiser', 'campaign_budget','Cycle_Started', 'Cycle_Ended','cycle_duration']]

In [23]:
df.reset_index(inplace=True,drop=True)

There are campaigns for which duration is NaN but it's because the campaign started on 2017-12-31 and that's the cut off for the dataset. Let's drop those.

In [24]:
# Get rid of the campaigns with zero budget and those starting at the end of the year.
df=df[~(df.campaign_budget==0)]
df=df[~(df.campaign_budget < 1)]

df=df[~(df.Cycle_Started==pd.to_datetime('2017-12-31 00:00:00'))]

df.reset_index(inplace=True,drop=True)

In [25]:
df[pd.isnull(df.Cycle_Started)]['idadvertiser_master'].nunique() # Two master_adv_ids don't have a start date. Drop.

2

In [26]:
ids=df[pd.isnull(df.Cycle_Started)]['idadvertiser_master'].unique()

df=df[~(df.idadvertiser_master.isin(ids))]

df.reset_index(inplace=True,drop=True)

In [27]:
df[pd.isnull(df.Cycle_Ended)]['idadvertiser_master'].nunique()/df.idadvertiser_master.nunique()*100

# 15% of the master_adv_ids have campaigns in them. Too much of the data. We need to impute this.

15.265992167101828

In [28]:
duration_mode=df.cycle_duration.mode() # Mode is 30. Let's impute the duration with this

In [29]:
#my_index=df[pd.isnull(df.cycle_duration)].index
#df.loc[my_index, 'cycle_duration']=duration_mode

df['cycle_duration'].fillna(duration_mode[0], inplace=True)

In [30]:
# Now reconstruct the end dates
my_index=df[pd.isnull(df.Cycle_Ended)].index
df.loc[my_index, 'Cycle_Ended']=df.loc[my_index, 'Cycle_Started']+pd.to_timedelta(str(duration_mode[0])+'D')


In [31]:
# Sort my master_adv_id and cycle start date then reset the main index.
# It's possible to do this with adv_id
#df.sort_values(by=['idadvertiser_master','Cycle_Started'])
df.reset_index(drop=True, inplace=True)
# Trim down the dataframe
temp=df[['idadvertiser_master','idadvertiser','Cycle_Started','Cycle_Ended','cycle_duration']]
# Find unique ids 
adv_ids=temp.idadvertiser_master.unique()

# Create additional columns
temp['delta']=0.0
temp['summation']=0.0

import scipy.stats as sts

seasonal=[]
period=[]
# Loop over ids
for idd in adv_ids:
    subset = temp[temp.idadvertiser_master==idd]
    subset.sort_values(by=['idadvertiser_master','Cycle_Started'],inplace=True)
    subset.reset_index(drop=True, inplace=True)
    
    if subset.shape[0]<2:
        seasonal.append(0)
        period.append(0)
    else:
    
        # Separating start and end
        start = subset.Cycle_Started
        end = subset.Cycle_Ended

        end_shift = end
        end_shift[1:]=end[0:-1]

        delta = start-end_shift
        subset['delta']=0.0
        for i in range(0,subset.shape[0]):
            subset.loc[i,'delta']=delta.iloc[i].days

        subset.loc[0,'delta']=0

        #subset['delta']=delta
        subset['summation']=subset['cycle_duration']+subset['delta']

        # Create the time series

        time_series= np.ones((1,int(subset['summation'].sum())))
        pivot = 0
        for i in range(0,subset.shape[0]):
            if subset.loc[i,'delta'] > 0:
                start_index=int(subset.loc[0:i,'summation'].sum() )
                end_index=start_index+int( subset.loc[i,'delta'] )
                time_series[0][start_index:end_index]=0
                pivot=i

        # Next largest power of 2        
        nfft=1<<(time_series.shape[1]-1).bit_length()

        y=np.fft.fft(time_series, n=nfft)
        y=abs(y**2)
        y=(y-y.min())
        y=y/y.max()

        s, p  = sts.kstest(y, 'uniform')

        if (p < 0.05) and (np.argmax(y)/nfft != 0): # the second argument is the frequency. 
            seasonal.append(1)
            period.append(1/np.argmax(y)/nfft)
        else:
            seasonal.append(0)
            period.append(0.0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # This is added back by InteractiveShellApp.init_path()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if sys.path[0] == '':
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice

In [32]:
with open('../data/seasonal.txt', 'w') as f:
    for item in seasonal:
        f.write("%s\n" % item)

In [33]:
with open('../data/periods.txt', 'w') as f:
    for item in period:
        f.write("%s\n" % item)