In [1]:
import numpy as np
import pandas as pd
import random
from scipy.stats import dirichlet, expon, uniform, norm
from faker import Faker
fake = Faker()

In [2]:
# Define some global parameters
SEED = 42            # fixed seed so a fresh Restart & Run All regenerates the same numeric data
N_DAYS = 365 * 2     # total number of simulated days
# Length in days of the first simulation period. This was `365./p`, which raised a
# NameError (`p` is never defined) and would have produced a float; the value is used
# as a range() bound further down, so it has to be a plain int.
N_DAYS_PERIOD_0 = 365
N_AUTHORS = 50       # number of synthetic authors
N_USERS = 1000       # number of synthetic users

# scipy.stats `.rvs()` and `np.random.choice` draw from NumPy's global RNG by default,
# and `random.sample` draws from the stdlib global RNG, so both are seeded here.
# NOTE(review): Faker keeps its own RNG; names / IPs / user agents stay unseeded --
# consider `fake.seed_instance(SEED)` if those must be stable too.
np.random.seed(SEED)
random.seed(SEED)

# Possible topics with their baseline weights (all made up for now; should eventually
# be derived from data). Position i in `topics` matches column i of `topics_probs`.
# The weights sum to 0.92 rather than 1; the per-day mix is renormalised after event
# contributions are added, so only the relative sizes matter.
_TOPIC_BASELINES = (
    ('Opinion', 0.1),
    ('Politics', 0.1),
    ('World Events', 0.1),
    ('Business', 0.1),
    ('Technology', 0.08),
    ('Arts & Culture', 0.08),
    ('Sports', 0.08),
    ('Health', 0.08),
    ('Home', 0.08),
    ('Travel', 0.07),
    ('Fashion', 0.07),
    ('Food', 0.06),
)
topics = [topic_name for topic_name, _weight in _TOPIC_BASELINES]
# Kept 2-D (shape (1, n_topics)) so it adds directly to the (1, n_topics) event influences.
topics_probs = np.array([[weight for _topic_name, weight in _TOPIC_BASELINES]])

# Synthetic author roster: a fake display name and a quality score drawn
# uniformly between 0 and 10 for each of the N_AUTHORS authors.
authors_names = [fake.name() for _ in range(N_AUTHORS)]
authors_quality = [10 * uniform.rvs() for _ in range(N_AUTHORS)]

# Flat Dirichlet concentration (all ones) over the topics -- placeholder until tuned.
alphas = np.ones(len(topics))
# One Dirichlet draw per author, stacked into an (N_AUTHORS, n_topics) array of topic affinities.
authors_topicsprobs = np.vstack([dirichlet.rvs(alphas) for _ in authors_names])
# A single Dirichlet draw (concentration 10 per author) giving each author's popularity share.
authors_popularity = dirichlet.rvs(np.full(N_AUTHORS, 10.0))[0]

In [4]:
# Day-by-day simulation: new events may appear, every still-running event shifts the
# topic mix, and that mix decides which topics (and then which authors) get articles.
events = []
articles = []

for day in range(N_DAYS):

    # --- Spawn today's new events (count TBD; a draw that truncates to <= 0 means none) ---
    n_new_events = int(norm.rvs(loc=1, scale=2))
    for _ in range(n_new_events):
        influence = dirichlet.rvs(alphas)  # concentration parameters TBD; shape (1, n_topics)
        # Duration is drawn in "months" and rounded up to whole days;
        # some events should be long-lived.
        duration_days = np.ceil(30 * expon.rvs(loc=0.01, scale=0.1))
        # hmm, event duration and intensity are generally correlated (not modelled yet)
        intensity = expon.rvs(scale=0.1, loc=0.1)
        new_event = {
            'id': len(events),
            'start': day,
            'influence': influence,
            'duration': duration_days,
            'end': day + duration_days,  # stored so later filtering is a simple comparison
            'intensity': intensity,
        }
        events.append(new_event)

    # --- Topic mix for the day: baseline plus intensity-weighted influence of live events ---
    topic_mix = topics_probs.copy()
    for live_event in events:
        if live_event['end'] < day:
            continue  # event already finished
        topic_mix += live_event['intensity'] * live_event['influence']

    # Flatten to 1-D and normalise so the weights sum to 1.
    topic_mix = topic_mix[0] / topic_mix[0].sum()

    # --- Publish today's articles ---
    n_articles = round(norm.rvs(loc=N_AUTHORS / 3, scale=N_AUTHORS / 10))
    for _ in range(n_articles):
        chosen_topic = np.random.choice(topics, p=topic_mix)
        # Authors are picked in proportion to their affinity for the chosen topic.
        affinity = authors_topicsprobs[:, topics.index(chosen_topic)]
        chosen_author = np.random.choice(authors_names, p=affinity / affinity.sum())
        articles.append({
            'id': len(articles),
            'day': day,
            'topic': chosen_topic,
            'author': chosen_author,
        })

events_df = pd.DataFrame(events)
articles_df = pd.DataFrame(articles)

events_df.head()

Unnamed: 0,id,start,influence,duration,end,intensity
0,0,1,"[[0.12234376832733072, 0.001964583679954783, 0...",6.0,7.0,0.292969
1,1,1,"[[0.0030623739706689477, 0.0754415811362824, 0...",12.0,13.0,0.127128
2,2,1,"[[0.01879711794708143, 0.00650361914890975, 0....",1.0,2.0,0.175085
3,3,3,"[[0.12171037086685811, 0.05734951520829281, 0....",6.0,9.0,0.13306
4,4,3,"[[0.08286288947559928, 0.15962724905753833, 0....",5.0,8.0,0.291854


In [79]:
# Summary statistics for the numeric article columns.
articles_df.describe()

Unnamed: 0,id,day
count,12104.0,12104.0
mean,6051.5,366.768011
std,3494.268164,211.82045
min,0.0,0.0
25%,3025.75,183.0
50%,6051.5,366.0
75%,9077.25,553.0
max,12103.0,729.0


In [80]:
# Same summary statistics, broken down per topic.
articles_df.groupby('topic').describe()

Unnamed: 0_level_0,id,id,id,id,id,id,id,id,day,day,day,day,day,day,day,day
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
topic,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
Arts & Culture,1017.0,6044.251721,3548.507617,3.0,2952.0,5959.0,9121.0,12097.0,1017.0,366.26942,215.038447,0.0,178.0,360.0,555.0,728.0
Business,1137.0,6248.978892,3498.184728,2.0,3310.0,6368.0,9315.0,12095.0,1137.0,378.62533,212.033814,0.0,197.0,384.0,567.0,728.0
Fashion,943.0,6094.290562,3447.411392,5.0,3176.0,6208.0,8972.0,12093.0,943.0,369.406151,209.060479,0.0,189.5,375.0,546.5,728.0
Food,870.0,6101.108046,3464.369153,4.0,3051.5,6045.0,9127.5,12063.0,870.0,369.836782,210.146091,0.0,184.25,365.0,556.0,727.0
Health,1025.0,6140.932683,3466.674931,43.0,3338.0,6193.0,8994.0,12103.0,1025.0,372.102439,210.169124,2.0,199.0,374.0,548.0,729.0
Home,1000.0,5821.739,3487.331316,9.0,2907.0,5632.0,8755.5,12049.0,1000.0,352.774,211.43876,0.0,175.75,341.0,532.0,726.0
Opinion,1105.0,6154.928507,3487.509211,12.0,3201.0,6237.0,9250.0,12101.0,1105.0,373.030769,211.488965,0.0,191.0,377.0,563.0,729.0
Politics,1102.0,6030.38294,3516.276324,11.0,2972.75,6094.5,9194.75,12094.0,1102.0,365.480944,213.174094,0.0,180.0,368.5,559.75,728.0
Sports,918.0,5956.438998,3488.724652,16.0,2803.25,6026.5,8989.0,12102.0,918.0,361.017429,211.446877,0.0,170.25,364.5,547.25,729.0
Technology,997.0,5948.811434,3563.887505,1.0,2803.0,5731.0,9090.0,12099.0,997.0,360.53661,215.887349,0.0,170.0,346.0,553.0,729.0


In [81]:
# Generate users
# Each user is a plain dict; which fields are really needed will depend on how the
# pageview calculations end up being done.
users = []

for user in range(N_USERS):
    # Values are drawn in the order the keys are listed below.
    profile = {
        'ip': fake.ipv4(),            # not sure if ip / agent are necessary
        'agent': fake.user_agent(),
        'prefs': dirichlet.rvs(alphas),                   # topic preferences, shape (1, n_topics)
        'freq': max(int(norm.rvs(loc=5, scale=5)), 0),    # clamped at 0; should be bimodal, will investigate
        'first_day': int(uniform.rvs() * N_DAYS),
        'lifetime': int(expon.rvs(loc=20, scale=200)),
        'ad_sensitivity': norm.rvs(loc=3, scale=1),
    }

    # Favourite authors: a random sample whose size is ceil(4 * U(0, 1)).
    n_favorites = int(np.ceil(uniform.rvs() * 4))
    profile['favorites'] = random.sample(authors_names, n_favorites)

    users.append(profile)

In [82]:
# Collect the generated user dicts into a DataFrame (one row per user) and display it.
users_df = pd.DataFrame(users)
users_df

Unnamed: 0,ip,agent,prefs,freq,first_day,lifetime,ad_sensitivity,favorites
0,197.253.32.69,Mozilla/5.0 (compatible; MSIE 9.0; Windows NT ...,"[[0.110634978829198, 0.05504285014569291, 0.04...",14,15,96,2.787714,"[Alexis Francis, Tammy Shepard, David Garcia, ..."
1,34.27.174.160,Opera/9.61.(Windows NT 5.2; kw-GB) Presto/2.9....,"[[0.1289354939824443, 0.11585562099465954, 0.0...",1,176,108,3.212139,"[Elizabeth Montoya, Alicia Ross, Jesus Williams]"
2,128.226.172.153,Mozilla/5.0 (Macintosh; U; PPC Mac OS X 10_10_...,"[[0.20241240036925345, 0.024927698906796372, 0...",5,576,319,3.244989,"[Michael Martinez, John Murray, Krystal Vega]"
3,38.18.93.98,Mozilla/5.0 (compatible; MSIE 8.0; Windows NT ...,"[[0.002105115060266282, 0.03945108400596769, 0...",2,616,195,3.113359,"[Tracey Hobbs, Alicia Ross, Michael Martinez, ..."
4,73.25.5.34,Mozilla/5.0 (compatible; MSIE 8.0; Windows NT ...,"[[0.049384831882688766, 0.06746755307935609, 0...",9,28,144,0.581322,"[Melanie Mendoza, Rebecca Osborn]"
...,...,...,...,...,...,...,...,...
995,210.97.61.195,Mozilla/5.0 (Windows NT 4.0; tl-PH; rv:1.9.2.2...,"[[0.002175085542831685, 0.019960706449079074, ...",8,380,93,3.258268,"[Sara Mayo, Rebecca Osborn, Jeremy Lewis]"
996,43.96.142.160,Mozilla/5.0 (Linux; Android 2.3.4) AppleWebKit...,"[[0.30934513304060796, 0.0796760637223416, 0.0...",8,612,85,1.968254,[Monica King]
997,68.143.29.228,Mozilla/5.0 (Linux; Android 5.0) AppleWebKit/5...,"[[0.029079724279230404, 0.3404752938397516, 0....",17,456,478,2.001091,"[Joshua Hays, Linda Smith, David Garcia, Cathe..."
998,110.232.230.123,Mozilla/5.0 (compatible; MSIE 7.0; Windows NT ...,"[[0.035386557969520026, 0.17286989120455212, 0...",11,96,504,2.625468,"[Kevin Wilson, Alicia Ross, Paula Mccormick, G..."


In [83]:
# Summary statistics for the numeric user columns.
users_df.describe()

Unnamed: 0,freq,first_day,lifetime,ad_sensitivity
count,1000.0,1000.0,1000.0,1000.0
mean,5.286,364.806,231.183,2.965594
std,4.340627,215.249071,205.060448,1.00177
min,0.0,0.0,20.0,-0.498781
25%,1.0,180.0,81.0,2.336426
50%,5.0,367.0,172.5,2.974082
75%,8.0,560.0,317.25,3.601715
max,23.0,729.0,1618.0,6.34694


In [84]:
class Strategy:
    """Monetisation settings applied to a user's pageviews.

    The defaults reproduce the previously hard-coded values, so ``Strategy()``
    behaves exactly as before; keyword arguments allow other strategies to be
    tried without editing the class.

    Args:
        ads: number of ads (presumably per pageview -- TODO confirm).
        free_articles: presumably the number of articles a user may read before
            being asked to pay -- TODO confirm.
        cost: presumably the subscription price -- TODO confirm units/period.
    """

    def __init__(self, ads=3, free_articles=10, cost=8.99):
        self.ads = ads
        self.free_articles = free_articles
        self.cost = cost

class Session:
    """One user's visit on one simulated day.

    Here a user will accumulate one or more pageviews. Each time a pageview is
    accumulated, the user and the pageview will be passed to a strategy object,
    which will determine if the user will be asked to pay, and if not, how many
    ads the user will be shown. (Not implemented yet: ``get_pageviews`` returns a
    single dummy record showing the structure of a pageview.)

    Args:
        user_id: identifier of the user (the driver loop passes the DataFrame index).
        row: the user's record; stored as ``self.user`` and left unmodified.
        day: simulated day of the session.
        events: events live on that day (stored; not read yet).
        articles: candidate articles for that day (stored; not read yet).
    """

    def __init__(self, user_id, row, day, events, articles):
        # The id is kept on the session itself instead of being set as an ad-hoc
        # attribute on the caller's row object.
        self.user_id = user_id
        self.user = row
        # Remember the session's own day so get_pageviews() does not depend on a
        # module-level loop variable that happens to be called `day`.
        self.day = day
        self.events = events
        self.articles = articles

    def pageview(self, article):
        """Placeholder for recording a single pageview of ``article``; does nothing yet."""
        pass

    def get_pageviews(self):
        """Return this session's pageviews as a list of dicts.

        Currently a dummy single-element list with keys ``article_id``,
        ``user_id``, ``day`` and ``duration`` (duration is in seconds).
        """
        return [{'article_id': 0, 'user_id': self.user_id, 'day': self.day, 'duration': 90}]

def session_factory(user_id, user, day, events, articles):
    """Build the Session for one user on one day.

    Replaces the statement `lambda session_factory = (...): Session(...)`, which
    created and discarded an anonymous function (so `session_factory` was never
    bound) and whose body called Session with four arguments instead of five.
    """
    return Session(user_id, user, day, events, articles)


pvs = []
# int() keeps range() valid even if the period length was computed as a float.
for day in range(int(N_DAYS_PERIOD_0)):
    # what events are live today?
    events_today = events_df[(events_df.start <= day) & (events_df.end >= day)]
    # what users are eligible to visit today?
    # NOTE(review): only `first_day` is checked; `lifetime` is not used to retire
    # users -- confirm whether that is intended.
    users_today = users_df[users_df.first_day <= day]
    # what articles might they see? Everything published since the oldest
    # still-live event started (or just today's articles when no event is live).
    longtail = events_today.start.min() if len(events_today) else day
    articles_today = articles_df[(articles_df.day >= longtail) & (articles_df.day <= day)]
    for idx, user in users_today.iterrows():
        s = session_factory(idx, user, day, events_today, articles_today)
        pvs.extend(s.get_pageviews())

pvs_df = pd.DataFrame(pvs)
pvs_df.describe()

Unnamed: 0,article_id,user_id,day,duration
count,93273.0,93273.0,93273.0,93273.0
mean,0.0,506.586954,238.215722,90.0
std,0.0,285.540668,88.838502,0.0
min,0.0,0.0,0.0,90.0
25%,0.0,256.0,174.0,90.0
50%,0.0,513.0,253.0,90.0
75%,0.0,755.0,314.0,90.0
max,0.0,998.0,364.0,90.0
