In [1]:
%load_ext watermark
%watermark -v -m -p pandas,numpy,scipy,faker -g

CPython 3.5.3
IPython 5.1.0

pandas 0.19.2
numpy 1.11.3
scipy 0.19.0
faker n

compiler   : GCC 4.4.7 20120313 (Red Hat 4.4.7-1)
system     : Linux
release    : 4.4.0-79-generic
machine    : x86_64
processor  : x86_64
CPU cores  : 4
interpreter: 64bit
Git hash   : e4d423ecd94684bb81768e5b7d1b692cd6a43a7b


In [2]:
# Size of the synthetic data set: number of fake people to generate and number
# of viewing actions (one row per show watched) to simulate for them.
number_people = 5000
number_actions = 500000

In [3]:
# Catalogue of fictional shows. The four lists are parallel: position i of each
# list describes the same show, and each key becomes a column of show_df.
shows = {'title':['Velocity','HD','Punky Heels','Jack Wild','Duendes','Cant Buy Time'],
        'genre':['action','drama','comedy','reality','fantasy','sci-fi'],
        'description':['an action packed thriller about a pregnant woman stuck on a subway car that cannot slow down',
                      '35 year old Mike, struggles to reconnect with his birth family after being diagnosed with a genetic disorder',
                      'a romantic comedy about a young woman trying to find love in the city',
                      'a recent college grad trying to find himself learns the meaning of life while stuck in an alaskan blizzard',
                      'an unknown creature terrorizes the town of Salamanca, will they survive?',
                      'a group of friends gets stuck in a suburban shopping mall as the manifold of time and space collapses around them'
                      ],
        'show_id':[1,2,3,4,5,6]
        }

In [4]:
import pandas as pd

# One row per show, with columns taken from the keys of `shows`.
show_df = pd.DataFrame(shows)

In [5]:
show_df

Unnamed: 0,description,genre,show_id,title
0,an action packed thriller about a pregnant wom...,action,1,Velocity
1,"35 year old Mike, struggles to reconnect with ...",drama,2,HD
2,a romantic comedy about a young woman trying t...,comedy,3,Punky Heels
3,a recent college grad trying to find himself l...,reality,4,Jack Wild
4,an unknown creature terrorizes the town of Sal...,fantasy,5,Duendes
5,a group of friends gets stuck in a suburban sh...,sci-fi,6,Cant Buy Time


In [6]:
from faker import Faker
fake = Faker()

# Draw all profiles up front (one Faker call per person, in id order), then
# pivot them into the column lists the DataFrame constructor expects.
profiles = [fake.simple_profile(sex=None) for _ in range(number_people)]

people = {'name': [entry['name'] for entry in profiles],
          'sex': [entry['sex'] for entry in profiles],
          'bday': [entry['birthdate'] for entry in profiles],
          # p_id is simply the position of the person in generation order.
          'p_id': list(range(number_people))
         }

people_df = pd.DataFrame(people)

In [7]:
from datetime import datetime, date, timedelta

def calculate_age(bday):
    """Return the age in whole years, as of today, for a birthday.

    Parameters
    ----------
    bday : str or datetime.date or datetime.datetime
        Either a ``'%Y-%m-%d'`` string or an object exposing ``year``,
        ``month`` and ``day`` (a ``date``/``datetime``); strings are parsed,
        anything else is used as-is.

    Returns
    -------
    int
        ``today.year - bday.year``, minus one if this year's birthday has not
        happened yet.

    Raises
    ------
    ValueError
        If a string is given that does not match ``'%Y-%m-%d'``.
    """
    # Only parse when we were actually handed text; date-like inputs already
    # carry the year/month/day attributes used below.
    if isinstance(bday, str):
        bday = datetime.strptime(bday, '%Y-%m-%d')

    today = date.today()
    # Tuple comparison orders by month first, then by day.
    birthday_still_ahead = (today.month, today.day) < (bday.month, bday.day)
    return today.year - bday.year - birthday_still_ahead

# Derive each person's age in whole years from their birthday.
people_df['age'] = people_df['bday'].apply(calculate_age)

In [8]:
import numpy as np

def generate_alpha_dict(x, y, num_out=1):
    """Draw random parameters for the integer keys ``0 .. x`` (inclusive).

    Parameters
    ----------
    x : int
        Largest key; the result has ``x + 1`` entries.
    y : number or list/tuple
        If a list or tuple, its first two items are the lower and upper bound
        of the uniform draw. Otherwise the draw is between 1 and ``y``.
    num_out : int, optional
        Number of values drawn per key (default 1).

    Returns
    -------
    dict
        Maps each key to the result of ``np.random.uniform(low, high,
        size=num_out)``, i.e. an array of ``num_out`` values.
    """
    # Accept tuples as well as lists for an explicit (low, high) pair.
    if isinstance(y, (list, tuple)):
        low_y, high_y = y[0], y[1]
    else:
        low_y, high_y = 1, y

    # Keys are visited in ascending order, so draws are consumed in key order.
    return {i: np.random.uniform(low_y, high_y, size=num_out)
            for i in range(x + 1)}

In [9]:
import numpy as np

def per_row_draws(customer_df, draw_func, parameter_func, alpha):
    """Produce one draw per row of ``customer_df``.

    For every ``(index, row)`` pair yielded by ``customer_df.iterrows()``,
    ``parameter_func(pair, alpha)`` is called to obtain a sequence of
    positional arguments, which is then unpacked into ``draw_func``.

    Parameters
    ----------
    customer_df : pandas.DataFrame
        Frame to iterate over; anything else fails the assertion.
    draw_func : callable
        Called as ``draw_func(*args)`` once per row.
    parameter_func : callable
        Called as ``parameter_func((index, row), alpha)``; must return the
        positional arguments for ``draw_func``.
    alpha
        Passed through unchanged to ``parameter_func``.

    Returns
    -------
    list
        The ``draw_func`` results, in row order.
    """
    assert isinstance(customer_df, pd.DataFrame)

    return [draw_func(*parameter_func(indexed_row, alpha))
            for indexed_row in customer_df.iterrows()]

In [10]:
def filler_func(row, alpha):
    """Parameter function that supplies no arguments.

    Both inputs are ignored; the empty list means the draw function passed to
    ``per_row_draws`` is called with no positional arguments.
    """
    no_parameters = []
    return no_parameters

# people_df['affinity_4_tv'] = per_row_draws(people_df, np.random.normal, filler_func, [])

In [11]:
def find_age_group(row, alpha):
    """Return ``[a, 5]`` draw parameters for a person's age decade.

    The decade is ``int(age / 10)``. If that decade is not a key of ``alpha``
    the next lower decade is tried, down to 0.

    Parameters
    ----------
    row : tuple
        An ``(index, row)`` pair as yielded by ``DataFrame.iterrows()``;
        ``row[1]['age']`` is read. Ages are assumed to be non-negative.
    alpha : dict
        Maps decade (int) to the first draw parameter.

    Returns
    -------
    list
        ``[a, 5]`` where ``a`` is the matched ``alpha`` entry floored at 0.001.

    Raises
    ------
    ValueError
        If no decade from the person's own down to 0 is present in ``alpha``.
    """
    assert isinstance(alpha, dict)

    decade = int(row[1]['age'] / 10)
    for bucket in range(decade, -1, -1):
        if bucket in alpha:
            # a_out = alpha[bucket] + row[1]['affinity_4_tv']
            # Floor the parameter at 0.001 and actually return the floored
            # value, so a zero or negative entry never reaches the draw.
            a_out = max(0.001, alpha[bucket])
            return [a_out, 5]

    # Raise rather than return a message string: the caller unpacks the
    # return value as positional arguments.
    raise ValueError('alpha_dict broken!')


# One random parameter per age decade (keys 0..10), each drawn between 1 and 10.
age_tv_watching_alphas = generate_alpha_dict(10,10)
# Per-person watch probability: a beta draw parameterised by the person's age decade.
people_df['watch_prob'] = per_row_draws(people_df, np.random.beta, find_age_group, age_tv_watching_alphas)

In [12]:
def find_show_probability(row, alpha_dict):
    """Return the per-show parameter vector for a person's sex and age decade.

    Parameters
    ----------
    row : tuple
        An ``(index, row)`` pair as yielded by ``DataFrame.iterrows()``;
        ``row[1]['sex']`` and ``row[1]['age']`` are read. Ages are assumed to
        be non-negative.
    alpha_dict : sequence of two dicts
        ``alpha_dict[0]`` is used for ``'M'`` and ``alpha_dict[1]`` for
        ``'F'``; each maps age decade (int) to a parameter vector.

    Returns
    -------
    list
        A one-element list holding the parameter vector of the person's
        decade, or of the nearest lower decade present in the dict.

    Raises
    ------
    ValueError
        If ``sex`` is neither ``'M'`` nor ``'F'``, or if no decade from the
        person's own down to 0 is present in the selected dict.
    """
    by_sex = {'M': alpha_dict[0], 'F': alpha_dict[1]}
    gender = row[1]['sex']
    if gender not in by_sex:
        # Raise rather than return a message string: the caller unpacks the
        # return value as positional arguments.
        raise ValueError('i am outdated and only do binary gender')
    alpha = by_sex[gender]

    decade = int(row[1]['age'] / 10)
    # Fall back to the closest lower decade that has an entry.
    for bucket in range(decade, -1, -1):
        if bucket in alpha:
            return [alpha[bucket]]

    raise ValueError('alpha_dict broken!')

In [13]:
# For each sex, one random parameter vector per age decade (keys 0..10) with
# one entry per show in show_df.
female_alphas = generate_alpha_dict(10,10, num_out=show_df.shape[0])
male_alphas = generate_alpha_dict(10,10, num_out=show_df.shape[0])

# Order matters: find_show_probability reads index 0 as male and index 1 as female.
alpha = [male_alphas, female_alphas]

# Per-person probability vector over shows, drawn from a Dirichlet distribution.
people_df['show_prob'] = per_row_draws(people_df, np.random.dirichlet, find_show_probability, alpha)

In [14]:
def create_use_df(customer_df, show_df, size=1000, person_id = 'p_id'):
    """Simulate ``size`` viewing actions.

    Each iteration picks a random person id, decides with a Bernoulli draw on
    that person's ``watch_prob`` whether they watch anything, and if so picks
    one show with a single multinomial draw on their ``show_prob``. Iterations
    where nothing is watched produce no record, so the loop runs until
    ``size`` records exist.

    Parameters
    ----------
    customer_df : pandas.DataFrame
        Needs the ``person_id`` column plus ``watch_prob`` and ``show_prob``.
        ``show_prob`` entries are probability vectors in the row order of
        ``show_df``.
    show_df : pandas.DataFrame
        Needs a ``show_id`` column.
    size : int, optional
        Number of actions to generate (default 1000).
    person_id : str, optional
        Name of the id column in ``customer_df`` (default ``'p_id'``).

    Returns
    -------
    dict
        ``{'show_id': [...], 'p_id': [...]}`` with ``size`` entries each. The
        output key is always ``'p_id'`` whatever ``person_id`` is.
    """
    behavior_dict = {'show_id':[], 'p_id':[]}
    shows = show_df['show_id'].values

    # Pull the columns out once instead of re-filtering the frame for every
    # simulated action.
    ids = customer_df[person_id].values
    watch_probs = customer_df['watch_prob'].values
    show_probs = customer_df['show_prob'].values

    # Position of the first row carrying each id, i.e. the row that filtering
    # the frame on that id and taking element 0 would select.
    first_row = {}
    for position, pid in enumerate(ids):
        first_row.setdefault(pid, position)

    while len(behavior_dict['show_id']) < size:
        position = first_row[np.random.choice(ids)]

        if np.random.binomial(1, watch_probs[position]):
            one_hot = np.random.multinomial(1, show_probs[position], size=1)[0]
            show_watched = shows[np.where(one_hot)[0][0]]
            # Read the id through `person_id` so a custom id column works.
            behavior_dict['p_id'].append(ids[position])
            behavior_dict['show_id'].append(show_watched)

    return behavior_dict

In [15]:
# Simulated viewing log: one row per action, with columns p_id and show_id.
behavioral_df = pd.DataFrame(create_use_df(people_df, show_df, size=number_actions))

In [16]:
def create_person_alpha_dict(behavioral_df, show_df):
    """Compute one score per person from their simulated viewing behaviour.

    For each ``p_id`` present in ``behavioral_df`` the score is::

        (sum_over_shows(share_of_views * random_weight) + relative_volume) * 5

    where ``share_of_views`` is the person's per-show fraction of their own
    actions, ``random_weight`` is a fresh uniform [0, 1) number per show, and
    ``relative_volume`` is the person's action count divided by the largest
    action count of any person.

    Parameters
    ----------
    behavioral_df : pandas.DataFrame
        Needs ``p_id`` and ``show_id`` columns.
    show_df : pandas.DataFrame
        Needs ``show_id`` and ``title`` columns.

    Returns
    -------
    dict
        Maps ``p_id`` to the score. People with no rows in ``behavioral_df``
        have no entry.
    """
    output = {}
    # One uniform [0, 1) weight per row of show_df; redrawn on every call.
    genre_proclivity = np.random.rand(show_df.shape[0])
    shows = show_df['show_id'].values    
        
    # Count actions per (p_id, show_id), then pivot so each person is a row and
    # each show_id a column; combinations that never occurred become 0.
    grouped_df = pd.merge(behavioral_df, show_df, on='show_id')\
                    .groupby(['p_id','show_id'])['title'].count()\
                    .unstack().reset_index().fillna(0)
    # NOTE(review): selecting columns by `shows` presumably fails if some
    # show_id of show_df never occurs in behavioral_df - confirm.
    grouped_df['total'] = grouped_df[shows].sum(axis=1)
    # Scale the per-person action count by the largest count.
    grouped_df['total'] = grouped_df['total']/grouped_df['total'].max()
    # Turn each person's per-show counts into fractions of their own total.
    grouped_df[shows] = grouped_df[shows].apply(lambda x: x/sum(x), axis=1)
    for row in grouped_df.iterrows():
        # Built-in sum with `total` as the start value: the weighted shares are
        # added on top of the relative volume. Shares and weights are paired
        # positionally, both in show_df row order.
        output[row[1]['p_id']] = sum(row[1][shows].values*genre_proclivity, row[1]['total'])*5
    return output

In [None]:
def signup_probability(row, alpha_dict):
    """Return ``[alpha, 5]`` draw parameters for a person.

    ``row`` is an ``(index, row)`` pair from ``DataFrame.iterrows()``. The
    person's ``p_id`` is looked up in ``alpha_dict``; people without an entry
    get the fallback alpha ``0.001``.

    NOTE: a later cell of this notebook defines another function with this
    same name, which replaces this one once that cell has run.
    """
    person = row[1]['p_id']
    known = person in alpha_dict
    first_parameter = alpha_dict[person] if known else 0.001
    return [first_parameter, 5]

# Per-person score derived from viewing behaviour (rebinds the name `alpha`).
alpha = create_person_alpha_dict(behavioral_df, show_df)
# Subscription probability: a beta draw using each person's score as first parameter.
people_df['subscriber_prob'] = per_row_draws(people_df, np.random.beta, signup_probability, alpha)

In [None]:
def create_sub_alpha_dict(people_df):
    """Map each person's ``p_id`` to their ``subscriber_prob``.

    Rows are read with ``iterrows()``; if a ``p_id`` occurs more than once the
    last occurrence wins.
    """
    return {record['p_id']: record['subscriber_prob']
            for _, record in people_df.iterrows()}

def signup_probability(row, alpha_dict):
    """Return ``[1, p]`` draw parameters for a person.

    ``row`` is an ``(index, row)`` pair from ``DataFrame.iterrows()`` and
    ``p`` is the ``alpha_dict`` entry for the person's ``p_id``, which must
    exist.

    NOTE: this reuses the name of a function defined in an earlier cell of
    this notebook and replaces it.
    """
    person = row[1]['p_id']
    assert person in alpha_dict
    probability = alpha_dict[person]
    return [1, probability]

# p_id -> subscriber_prob lookup (rebinds the name `alpha` once more).
alpha = create_sub_alpha_dict(people_df)
# Final 0/1 subscriber label: one binomial trial per person with that probability.
people_df['subscriber'] = per_row_draws(people_df, np.random.binomial, signup_probability, alpha)

In [None]:
people_df.head()

Check that the `subscriber` label is imbalanced: the mean computed below is the fraction of people flagged as subscribers.

In [None]:
np.mean(people_df.subscriber)

In [None]:
# Keep only the columns that get exported. `.copy()` makes the result an
# independent frame, so the later `people_df['bday'] = ...` assignment does not
# write into a slice of the old frame (which triggers SettingWithCopyWarning).
people_df = people_df[['bday','name','p_id','sex','subscriber']].copy()
people_df.head()

In [None]:
def change_column(column, change_func, change_rate=0.001):
    """Randomly alter some values of ``column``.

    Each value is passed through ``change_func`` when a fresh
    ``np.random.rand()`` draw is less than or equal to ``change_rate``, and is
    kept as-is otherwise. The number of altered values is printed.

    Parameters
    ----------
    column : iterable
        Values to process, in order.
    change_func : callable
        Called with a single value; its return value replaces that value.
    change_rate : float, optional
        Threshold compared against the uniform draw (default 0.001).

    Returns
    -------
    list
        The values in their original order, with the selected ones replaced.
    """
    altered = []

    def _maybe_change(value):
        # One uniform draw per element, taken before change_func runs.
        if np.random.rand() <= change_rate:
            altered.append(True)
            return change_func(value)
        return value

    result = [_maybe_change(value) for value in column]

    print(len(altered))
    return result

def change_bday(bday):
    """Shift a ``'%Y-%m-%d'`` birthday string back by ``365*500`` days.

    Returns the shifted date formatted with the same ``'%Y-%m-%d'`` pattern.
    """
    parsed = datetime.strptime(bday, '%Y-%m-%d')
    shifted = parsed - timedelta(days=365*500)
    return shifted.strftime('%Y-%m-%d')

def change_key(key):
    """Return ``key`` plus a random integer offset.

    The offset comes from ``np.random.randint(10000,100000)``.
    """
    offset = np.random.randint(10000,100000)
    return key + offset

# Randomly alter a small share of values before export - presumably to give the
# exported data some dirty records; confirm intent.
# Birthdays: values hit (default change_rate 0.001) are shifted back by 365*500 days.
people_df['bday'] = change_column(people_df['bday'],change_bday)
# Ids in the action log: values hit (change_rate 0.0002) get a random offset added.
behavioral_df['p_id'] = change_column(behavioral_df['p_id'],change_key,change_rate=0.0002)
behavioral_df['show_id'] = change_column(behavioral_df['show_id'],change_key,change_rate=0.0002)

In [None]:
# Export the people table (bday, name, p_id, sex, subscriber) without the index.
people_df.to_csv('subscribers.csv', index=False)

In [None]:
# Export the simulated viewing log without the index.
behavioral_df.to_csv('actions.csv', index=False)

In [None]:
# Export the show catalogue without the index.
show_df.to_csv('shows.csv', index=False)