## Lookalike Modeling Test

Author: James Fung  
Target: Cat Food Purchasers

In [1]:
#Load libraries.

from IPython.display import clear_output
import timeit

import pandas as pd
import seaborn as sb
import numpy as np

#Figure size.
from matplotlib import rcParams
# figure size in inches
rcParams['figure.figsize'] = 7,5

#Reduction techniques.
from sklearn.decomposition import PCA

import gc

In [2]:
data = pd.read_csv("/Users/james.fung/Desktop/Test Scripts/Cat Food Model/catdata.csv")

### Data Pre-Processing

In [3]:
#Drop useless columns.

data2 = data.drop(['household_idl','latitude','longitude','censustract','censusblock','fipscounty','dma','cbsa','update_identifier'],axis=1)

In [4]:
data2.head()

Unnamed: 0,target,attribute_name,attribute_value,idl_id,fipsstate,age_years,gender,marital_status,presence_child,num_adults,num_children,hh_size,income_level,head_hh_educ,own_rent,home_lor
0,0,8441_8,1,XY1324-aSCj2ZDHYbkRaErWMfEQAXdbZ4rw4B7qsyd2RxvtsQ,8.0,44.0,F,S,Y,2.0,3.0,5.0,9.0,3.0,O,3.0
1,0,8561_9,1,XY1324-aSCj2ZDHYbkRaErWMfEQAXdbZ4rw4B7qsyd2RxvtsQ,8.0,44.0,F,S,Y,2.0,3.0,5.0,9.0,3.0,O,3.0
2,0,8663_9,1,XY1324-aSCj2ZDHYbkRaErWMfEQAXdbZ4rw4B7qsyd2RxvtsQ,8.0,44.0,F,S,Y,2.0,3.0,5.0,9.0,3.0,O,3.0
3,0,7616_44,1,XY1324-aSCj2ZDHYbkRaErWMfEQAXdbZ4rw4B7qsyd2RxvtsQ,8.0,44.0,F,S,Y,2.0,3.0,5.0,9.0,3.0,O,3.0
4,0,7628_2,1,XY1324-aSCj2ZDHYbkRaErWMfEQAXdbZ4rw4B7qsyd2RxvtsQ,8.0,44.0,F,S,Y,2.0,3.0,5.0,9.0,3.0,O,3.0


In [5]:
#Drop duplicates?

data3 = data2.drop_duplicates(subset=['attribute_name','idl_id'])

In [6]:
#Spread df.

attpivot = data3.pivot(index='idl_id',columns='attribute_name',values='attribute_value')
attpivot = attpivot.fillna(value=0)

In [7]:
#Combine back to original and drop attribute_name and value. Remove duplicates.

data3 = data3.drop_duplicates()
data3 = data3.drop(['attribute_name','attribute_value'],axis=1)
data4 = data3.merge(attpivot,on='idl_id')

#Delete old dataframes to reduce memory useage???

del [[data,data2,data3]]
gc.collect()

28

In [None]:
#How many columns and rows are there?
print('Columns:' + str(len(list(data4))))
print('Observations:' + str(len(data4)))

In [None]:
data4.head()

In [None]:
data4['target'] = data4['target'].astype('category')

### Exploratory Data Analysis

Let's check out some of the demographic data.

In [None]:
#Is there a difference in distribution in age?

sb.boxplot(x='target',y='age_years',data=data4)

In [None]:
#Gender?

sb.countplot(x='target',hue='gender',data=data4)

In [None]:
list(data4)

### Dimensionality Reduction - Attributes

Can the 400+ attributes be represented in a smaller dimension? I assume many of these are interrelated.

In [None]:
# set up a PCA learner
pca = PCA(n_components = 10)
eigen =  pca.fit(data4.iloc[:,14:])
cat2d = eigen.transform(data4.iloc[:,14:])

In [None]:
# let's also look at how much of the total variance we were able to cover with 10 dimensions.
print('percentage of variance explained:', sum(pca.explained_variance_ratio_))

In [None]:
sb.scatterplot(cat2d[:,0],cat2d[:,1], hue = data4['target']).set_title('2d Acxiom Attributes')

It's clear from above that the Acxiom attributes cannot be represented in 2-d, as only 23% of the variation is explained through 10 components.

Let's pick a number, say k components where k = number of Acxioma attributes and then examine a screeplot.

In [None]:
pca = PCA(n_components = len(list(data4.iloc[:,14:])))
eigen =  pca.fit(data4.iloc[:,14:])

In [None]:
import matplotlib.pyplot as plt

plt.plot(np.cumsum(pca.explained_variance_ratio_))
plt.xlabel('number of components')
plt.ylabel('cumulative explained variance');

In [None]:
sum(pca.explained_variance_ratio_[0:600])

600 components can explain 90% of the variation in the data. This is much better than feeding all 2500 acxiom attributes into the data.

### Demographic Data Conversion

In [None]:
data4.iloc[0:10,0:13]

In [None]:
#Drop ID.
data4 = data4.drop(['idl_id'],axis=1)

#Encode categorical variables.
one_hot = pd.get_dummies(data4,columns=['gender','marital_status','presence_child','own_rent'])

In [None]:
list(data4)

### Trash Section

In [None]:
#Tune the n_component parameter.

n = []
nvar = []
maxcomp=50

start = timeit.default_timer()

for i in range(2,maxcomp):
    
    clear_output(wait=True)
    
    pca = PCA(n_components = i)
    eigen =  pca.fit(data4.iloc[:,14:])
    totalvar = sum(pca.explained_variance_ratio_)
    n.append(i)
    nvar.append(totalvar)
    
    stop = timeit.default_timer()
    
    if(i/maxcomp*100) <5:
        expected_time = "Calculating..."
    
    else:
        time_perc = timeit.default_timer()
        expected_time = np.round(((time_perc-start)/(i/maxcomp))/60,2)
    
    print("Current progress:", np.round(i/maxcomp*100,2),"%")
    print("Current run time:", np.round((stop-start)/60,2),"minutes")