In [1]:
import numpy as np
import pandas as pd
import random
from scipy.stats import dirichlet, expon, uniform, norm
from faker import Faker
fake = Faker()

In [2]:
# Define some global parameters
N_DAYS = 365*2
N_AUTHORS = 50
N_USERS = 1000

# Define possible topics and baseline probabilities // right now all made up, but should be based on data eventually
topics = ['Opinion', 'Politics', 'World Events', 'Business', 'Technology', 'Arts & Culture', 'Sports', 'Health', 'Home', 'Travel', 'Fashion', 'Food']
topics_probs = np.array([[0.1, 0.1, 0.1, 0.1, 0.08, 0.08, 0.08, 0.08, 0.08, 0.07, 0.07, 0.06]])

# Generate fake authors and probability vectors
authors_names = [fake.name() for i in range(N_AUTHORS)]
authors_quality = [uniform.rvs()*10 for i in range(N_AUTHORS)]

alphas = np.ones(len(topics)) # Concentration parameters TBD, right now uniform
authors_topicsprobs = np.array([dirichlet.rvs(alphas)[0] for auth in authors_names])
authors_popularity = dirichlet.rvs(np.ones(N_AUTHORS)*10)[0]

In [3]:
events = []
articles = []

# Iterate over days to generate all articles
for day in range(N_DAYS):

    # Generate new event with some probability (TBD)
    for _ in range(0, int(norm.rvs(loc=1, scale=2))):
        event_influence = dirichlet.rvs(alphas) # Concentration parameters TBD
        event_duration = expon.rvs(loc=0.01, scale=0.1) # some events should be long-lived
        duration_days = np.ceil(event_duration * 30)
        # hmm, event duration and intensity is generally correlated
        event_intensity = expon.rvs(scale=0.1, loc=0.1)
        events.append({'id': len(events), 'start': day, 'influence': event_influence, 'duration': duration_days, 'intensity': event_intensity})

    # Loop through events and sum topic probabilities of all active events
    day_topicsprobs = topics_probs.copy()
    for event in events:
        if event['start'] + event['duration'] >= day:
            day_topicsprobs += event['intensity']*event['influence']

    # Normalize probabilities
    day_topicsprobs = day_topicsprobs[0] / day_topicsprobs[0].sum()

    # Generate articles for day
    n_articles = round(norm.rvs(loc=N_AUTHORS / 3, scale=N_AUTHORS / 10))
    for i in range(n_articles):
        article_topic = np.random.choice(topics, p=day_topicsprobs)
        # Normalize probabilities
        author_probs = authors_topicsprobs[:, topics.index(article_topic)]
        author_probs = author_probs / author_probs.sum()
        article_author = np.random.choice(authors_names, p=author_probs)
        articles.append({'id': len(articles), 'day': day, 'topic': article_topic, 'author': article_author})

events_df = pd.DataFrame(events)
articles_df = pd.DataFrame(articles)

events_df.describe()

Unnamed: 0,id,start,duration,intensity
count,812.0,812.0,812.0,812.0
mean,405.5,351.857143,3.935961,0.201609
std,234.548502,207.255257,3.093422,0.102752
min,0.0,0.0,1.0,0.100036
25%,202.75,170.75,2.0,0.130018
50%,405.5,340.0,3.0,0.169464
75%,608.25,517.0,5.0,0.236209
max,811.0,728.0,23.0,0.78504


In [4]:
articles_df.describe()

Unnamed: 0,id,day
count,12269.0,12269.0
mean,6134.0,365.227076
std,3541.899561,211.060908
min,0.0,0.0
25%,3067.0,182.0
50%,6134.0,366.0
75%,9201.0,546.0
max,12268.0,729.0


In [5]:
articles_df.groupby('topic').describe()

Unnamed: 0_level_0,id,id,id,id,id,id,id,id,day,day,day,day,day,day,day,day
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
topic,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
Arts & Culture,983.0,6499.62767,3470.808903,29.0,3533.5,6788.0,9444.5,12259.0,983.0,387.045778,206.716624,1.0,209.5,407.0,562.0,728.0
Business,1202.0,6092.211314,3479.471702,8.0,3181.25,5989.0,9112.75,12251.0,1202.0,362.764559,207.39142,0.0,189.0,356.0,540.75,728.0
Fashion,939.0,5978.13312,3614.679955,6.0,2740.0,5770.0,9287.0,12234.0,939.0,355.972311,215.38805,0.0,164.0,344.0,553.0,726.0
Food,853.0,6153.900352,3521.053384,4.0,3254.0,5947.0,9180.0,12235.0,853.0,366.508792,209.748718,0.0,193.0,354.0,545.0,726.0
Health,1008.0,6316.484127,3435.044801,24.0,3388.25,6571.5,9139.75,12245.0,1008.0,375.952381,204.757516,0.0,201.75,392.0,542.25,727.0
Home,1007.0,5964.553128,3510.350562,28.0,2959.5,5880.0,8813.5,12258.0,1007.0,355.103277,209.238716,1.0,176.5,351.0,521.0,728.0
Opinion,1183.0,6110.291631,3589.496256,0.0,3081.5,6050.0,9279.5,12261.0,1183.0,363.76585,213.920384,0.0,183.0,361.0,552.5,729.0
Politics,1129.0,6157.721878,3674.148749,7.0,2918.0,5984.0,9543.0,12254.0,1129.0,366.720106,218.968552,0.0,174.0,356.0,567.0,728.0
Sports,964.0,6169.717842,3487.510434,3.0,3144.25,6238.5,9148.0,12255.0,964.0,367.330913,207.761869,0.0,186.0,371.0,542.25,728.0
Technology,935.0,6114.02246,3610.447911,1.0,2907.0,6062.0,9318.5,12242.0,935.0,364.055615,215.155322,0.0,173.5,361.0,555.0,727.0


In [12]:
# Generate users
users = []
# will depend on how calculations are done

for user in range(N_USERS):
    ip = fake.ipv4() # not sure if these first two are necessary
    agent = fake.user_agent()
    prefs = dirichlet.rvs(alphas)
    freq = int(norm.rvs(loc=5, scale=5)) # should be bimodal, will investigate
    first_day = int(uniform.rvs()*N_DAYS)
    lifetime = int(expon.rvs(loc=20, scale=200))
    ad_sensitivity = norm.rvs(loc=2, scale=0.2)

    num_favorite = np.ceil(uniform.rvs()*4)
    favorites = random.sample(authors_names, int(num_favorite))
    users.append({'ip': ip, 'agent': agent, 'prefs': prefs, 'freq': max(freq, 0),
                  'first_day': first_day, 'lifetime': lifetime, 'ad_sensitivity': ad_sensitivity,
                  'favorites': favorites})

In [13]:
users_df = pd.DataFrame(users)
users_df

Unnamed: 0,ip,agent,prefs,freq,first_day,lifetime,ad_sensitivity,favorites
0,56.150.117.68,Opera/8.89.(Windows NT 5.01; mi-NZ) Presto/2.9...,"[[0.007360383115440668, 0.24262506824112237, 0...",9,406,336,1.890214,"[Amber Conrad, Anna Paul, Summer Cooper, Jeffr..."
1,200.138.183.159,Mozilla/5.0 (iPad; CPU iPad OS 5_1_1 like Mac ...,"[[0.0003219440359596212, 0.07805357963153772, ...",10,553,114,1.952494,"[Timothy Barber, John Leonard]"
2,191.141.230.178,Mozilla/5.0 (iPad; CPU iPad OS 9_3_5 like Mac ...,"[[0.3451409965398571, 0.07128343202032611, 0.0...",0,581,23,2.080753,"[Stephen Martin, Angela Quinn, Anna Paul]"
3,161.193.76.105,Mozilla/5.0 (iPod; U; CPU iPhone OS 3_3 like M...,"[[0.044746905311257336, 0.1292045243856901, 0....",0,356,622,2.077663,"[Steven Wilson, Michael Marks]"
4,213.25.71.98,Mozilla/5.0 (iPad; CPU iPad OS 14_2_1 like Mac...,"[[0.02838256503038247, 0.029219214813447275, 0...",6,260,104,2.009152,"[James Carter, Daniel Williams, Marcus Gilbert..."
...,...,...,...,...,...,...,...,...
995,159.87.121.16,Opera/9.41.(X11; Linux x86_64; aa-DJ) Presto/2...,"[[0.06443744160040796, 0.1409057062783758, 0.2...",2,286,198,1.841452,"[Donald Roberts, Cody Rowland]"
996,195.143.140.125,Mozilla/5.0 (X11; Linux i686; rv:1.9.6.20) Gec...,"[[0.10570553390363821, 0.01903586253419786, 0....",7,692,303,2.087367,"[Amber Conrad, Benjamin Becker]"
997,123.68.126.166,Opera/9.23.(X11; Linux i686; it-CH) Presto/2.9...,"[[0.14751608219080675, 0.16493720139565118, 0....",6,297,214,2.190526,[Sarah Marshall]
998,158.195.169.220,Mozilla/5.0 (Windows CE; ht-HT; rv:1.9.1.20) G...,"[[0.05847297968383385, 0.22396140008110005, 0....",2,462,301,1.827772,"[Katherine Murphy, Summer Cooper, Cynthia Step..."
