In [54]:
import numpy as np
import pandas as pd
import random
from scipy.stats import dirichlet, expon, uniform, norm
from faker import Faker
fake = Faker()

In [55]:
# Define some global parameters
N_DAYS = 365*2
N_AUTHORS = 50
N_USERS = 1000
SEED = 42

# Seed every random source used in this notebook so that a fresh
# "Restart Kernel -> Run All" regenerates exactly the same synthetic data.
# scipy.stats `.rvs()` draws from NumPy's global RandomState when no
# `random_state` is passed, so seeding NumPy covers those calls as well.
np.random.seed(SEED)
random.seed(SEED)
fake.seed_instance(SEED)

# Define possible topics and baseline probabilities // right now all made up, but should be based on data eventually
topics = ['Opinion', 'Politics', 'World Events', 'Business', 'Technology', 'Arts & Culture', 'Sports', 'Health', 'Home', 'Travel', 'Fashion', 'Food']
# Kept as a (1, n_topics) row so it has the same shape as a single dirichlet.rvs() draw
topics_probs = np.array([[0.1, 0.1, 0.1, 0.1, 0.08, 0.08, 0.08, 0.08, 0.08, 0.07, 0.07, 0.06]])
assert topics_probs.shape == (1, len(topics)), "need exactly one baseline probability per topic"
assert np.isclose(topics_probs.sum(), 1.0), "baseline topic probabilities must sum to 1"

# Generate fake authors and probability vectors
authors_names = [fake.name() for _ in range(N_AUTHORS)]
# One quality score per author, uniform on [0, 10)
authors_quality = [uniform.rvs()*10 for _ in range(N_AUTHORS)]

alphas = np.ones(len(topics)) # Concentration parameters TBD, right now uniform
# One topic-probability vector per author -> array of shape (N_AUTHORS, n_topics)
authors_topicsprobs = dirichlet.rvs(alphas, size=N_AUTHORS)
# Popularity shares across authors (a single Dirichlet draw, sums to 1)
authors_popularity = dirichlet.rvs(np.ones(N_AUTHORS)*10)[0]

In [56]:
events = []
articles = []

# Iterate over days to generate all articles
for day in range(N_DAYS):

    # Generate new events for the day (distribution TBD). The normal draw is
    # truncated toward zero by int(); zero or negative counts create no events.
    n_new_events = int(norm.rvs(loc=1, scale=2))
    for _ in range(n_new_events):
        # (1, n_topics) row describing how the event shifts topic probabilities
        event_influence = dirichlet.rvs(alphas) # Concentration parameters TBD
        event_duration = expon.rvs(loc=0.01, scale=0.1) # some events should be long-lived
        # Scale the draw to days and round up, so every event lasts at least one day
        duration_days = np.ceil(event_duration * 30)
        # hmm, event duration and intensity is generally correlated
        event_intensity = expon.rvs(scale=0.1, loc=0.1)
        events.append({'id': len(events), 'start': day, 'influence': event_influence, 'duration': duration_days, 'intensity': event_intensity})

    # Loop through events and sum topic probabilities of all active events
    day_topicsprobs = topics_probs.copy()
    for event in events:
        # An event is active on days start, start+1, ..., start+duration-1,
        # i.e. for exactly `duration` days. (Comparing with `>=` against the
        # end day would keep it alive for one extra day.)
        if event['start'] <= day < event['start'] + event['duration']:
            day_topicsprobs += event['intensity']*event['influence']

    # Normalize probabilities (and flatten the (1, n_topics) row to 1-D)
    day_topicsprobs = day_topicsprobs[0] / day_topicsprobs[0].sum()

    # Generate articles for day; force a non-negative built-in int for range()
    n_articles = max(0, int(round(norm.rvs(loc=N_AUTHORS / 3, scale=N_AUTHORS / 10))))
    for _ in range(n_articles):
        # Sample the topic by index so the author column can be looked up directly
        topic_idx = np.random.choice(len(topics), p=day_topicsprobs)
        article_topic = topics[topic_idx]
        # Normalize probabilities: each author's affinity for this topic, rescaled to sum to 1
        author_probs = authors_topicsprobs[:, topic_idx]
        author_probs = author_probs / author_probs.sum()
        article_author = authors_names[np.random.choice(len(authors_names), p=author_probs)]
        articles.append({'id': len(articles), 'day': day, 'topic': article_topic, 'author': article_author})

events_df = pd.DataFrame(events)
articles_df = pd.DataFrame(articles)

events_df.describe()

Unnamed: 0,id,start,duration,intensity
count,733.0,733.0,733.0,733.0
mean,366.0,356.978172,3.976808,0.20001
std,211.743162,210.343148,3.126348,0.104661
min,0.0,0.0,1.0,0.100377
25%,183.0,165.0,2.0,0.126996
50%,366.0,365.0,3.0,0.166656
75%,549.0,535.0,5.0,0.239944
max,732.0,728.0,26.0,1.089871


In [57]:
# Summary statistics for the numeric columns of the generated articles
articles_df.describe()

Unnamed: 0,id,day
count,12203.0,12203.0
mean,6101.0,360.604933
std,3522.847002,211.167557
min,0.0,0.0
25%,3050.5,176.0
50%,6101.0,359.0
75%,9151.5,544.0
max,12202.0,729.0


In [58]:
# Summary statistics of the numeric article columns, broken down by topic
articles_df.groupby('topic').describe()

Unnamed: 0_level_0,id,id,id,id,id,id,id,id,day,day,day,day,day,day,day,day
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
topic,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
Arts & Culture,1015.0,6220.692611,3508.743175,3.0,3340.5,6246.0,9305.0,12201.0,1015.0,367.75665,210.392319,0.0,193.5,367.0,552.5,729.0
Business,1085.0,6077.266359,3510.322391,47.0,2942.0,6055.0,8993.0,12196.0,1085.0,359.129954,210.458756,2.0,170.0,356.0,535.0,729.0
Fashion,936.0,6241.264957,3437.677275,19.0,3467.0,6504.0,9054.5,12146.0,936.0,368.981838,206.06162,1.0,200.75,384.5,538.25,726.0
Food,919.0,6129.760609,3456.862721,17.0,3235.0,6035.0,9093.0,12198.0,919.0,362.258977,207.51647,1.0,188.0,355.0,540.5,729.0
Health,1013.0,6079.922014,3566.026711,2.0,2870.0,5958.0,9294.0,12199.0,1013.0,359.343534,213.785739,0.0,167.0,349.0,552.0,729.0
Home,1020.0,5945.716667,3521.391908,7.0,2964.0,5802.0,9056.0,12193.0,1020.0,351.271569,211.01681,1.0,171.0,340.0,538.5,729.0
Opinion,1148.0,6067.351916,3538.256259,8.0,3019.5,6040.0,9303.25,12185.0,1148.0,358.594948,211.95427,1.0,173.5,355.5,552.25,729.0
Politics,1100.0,5984.511818,3591.478142,6.0,2928.25,6047.5,8980.25,12202.0,1100.0,353.802727,215.112587,1.0,169.0,356.0,534.0,729.0
Sports,998.0,6016.43487,3492.46711,0.0,3027.75,5929.0,9061.75,12200.0,998.0,355.451904,209.361184,0.0,174.25,348.5,538.75,729.0
Technology,960.0,6014.670833,3561.104339,15.0,2821.25,5994.0,9104.75,12192.0,960.0,355.478125,213.306922,1.0,163.75,352.5,541.25,729.0


In [65]:
# Generate users
users = []
# TODO: time-of-day usage profiles; the final form will depend on how calculations are done
# users_tod = ['const day', 'const night', 'morning peak', 'midday peak', 'evening peak']

for _ in range(N_USERS):
    # Dict values are evaluated top to bottom, so the random draws below
    # happen in a fixed, well-defined order for every user.
    users.append({
        'ip': fake.ipv4(),
        'agent': fake.user_agent(),
        # Topic preference draw; dirichlet.rvs returns a (1, n_topics) array
        'prefs': dirichlet.rvs(alphas),
        # Normal draw clamped at zero
        'freq': max(norm.rvs(loc=5, scale=5), 0),
        # Day index in [0, N_DAYS) on which the user first appears
        'first_day': int(uniform.rvs() * N_DAYS),
        'lifetime': expon.rvs(loc=20, scale=200),
        # Between 1 and 4 distinct favorite authors, sampled without replacement
        'favorites': random.sample(authors_names, int(np.ceil(uniform.rvs() * 4))),
    })

In [66]:
# Collect the per-user records into a DataFrame (one row per user) and display it
users_df = pd.DataFrame(users)
users_df

Unnamed: 0,ip,agent,prefs,freq,first_day,lifetime,favorites
0,209.191.63.10,Mozilla/5.0 (compatible; MSIE 6.0; Windows CE;...,"[[0.11620120198506441, 0.033543403690835624, 0...",9.932736,394,154.977672,"[Matthew Taylor, Andrew Mccann, Molly Oneal, D..."
1,126.213.197.252,Mozilla/5.0 (iPod; U; CPU iPhone OS 4_2 like M...,"[[0.16940690812818207, 0.1676874991123693, 0.1...",6.978770,519,756.732084,"[Jordan Holland, Shelby Williams, Roberto Rowe]"
2,60.57.41.175,Opera/8.86.(Windows NT 10.0; ta-IN) Presto/2.9...,"[[0.12732412409790334, 0.15844066780961727, 0....",3.981760,274,54.153007,[Danielle Gonzales]
3,10.236.20.159,Mozilla/5.0 (Linux; Android 4.2) AppleWebKit/5...,"[[0.027263718637641984, 0.12398082128068787, 0...",0.000000,261,145.772394,"[Joshua Ortega, Sara Manning, Daniel Green, Mo..."
4,56.200.32.82,Mozilla/5.0 (iPad; CPU iPad OS 10_3_4 like Mac...,"[[0.04652380242319909, 0.05269459557114592, 0....",7.510013,436,107.400039,"[Kristin Morris, Benjamin Robinson, James Swan..."
...,...,...,...,...,...,...,...
995,102.201.13.53,Mozilla/5.0 (compatible; MSIE 7.0; Windows NT ...,"[[0.05012243117880853, 0.08536504601852563, 0....",0.000000,612,29.985426,[Danielle Gonzales]
996,44.94.89.166,Mozilla/5.0 (Android 3.0; Mobile; rv:41.0) Gec...,"[[0.23870948159681782, 0.048817144234657855, 0...",0.000000,117,57.049679,"[James Swanson, Zachary Smith]"
997,126.115.0.21,Mozilla/5.0 (compatible; MSIE 6.0; Windows CE;...,"[[0.053368804756425, 0.06307578231453158, 0.14...",5.643541,359,115.072514,[Donald Hall]
998,126.162.173.92,Mozilla/5.0 (compatible; MSIE 6.0; Windows NT ...,"[[0.10662590672507669, 0.08031885545944624, 0....",7.296640,278,455.416150,[Mike Davis]
