In [44]:
import numpy as np
import pandas as pd
from scipy.stats import dirichlet, expon, uniform, norm
from faker import Faker
# Shared Faker instance; used below to generate the synthetic author names.
fake = Faker()

In [45]:
# ---------------------------------------------------------------------------
# Synthetic newsroom simulation
#
# Every day a random number of "events" start. While active, an event shifts
# the baseline topic distribution towards the topics it influences. A random
# number of articles is then written each day: the topic is drawn from that
# day's distribution, and the author is drawn in proportion to each author's
# affinity for the chosen topic.
#
# Produces: events / events_df and articles / articles_df.
# ---------------------------------------------------------------------------

# Define some global parameters
N_DAYS = 365*2
N_AUTHORS = 50
SEED = 42  # fixed so that "Restart & Run All" regenerates the same synthetic data

# scipy.stats `rvs` and `np.random.choice` both draw from NumPy's global
# RandomState when no explicit generator is passed, so one seed covers both.
# Faker keeps its own generator and is seeded separately.
np.random.seed(SEED)
fake.seed_instance(SEED)

# Define possible topics and baseline probabilities // right now all made up, but should be based on data eventually
topics = ['Opinion', 'Politics', 'World Events', 'Business', 'Technology', 'Arts & Culture', 'Sports', 'Health', 'Home', 'Travel', 'Fashion', 'Food']
# Kept 2-D, shape (1, n_topics), so it adds directly to the (1, n_topics)
# arrays returned by dirichlet.rvs below.
topics_probs = np.array([[0.1, 0.1, 0.1, 0.1, 0.08, 0.08, 0.08, 0.08, 0.08, 0.07, 0.07, 0.06]])

# Generate fake authors and probability vectors
authors_names = [fake.name() for _ in range(N_AUTHORS)]

alphas = np.ones(len(topics)) # Concentration parameters TBD, right now uniform
# One Dirichlet draw per author -> shape (N_AUTHORS, n_topics).
authors_topicsprobs = dirichlet.rvs(alphas, size=N_AUTHORS)

# Author distribution for each topic: normalise every topic column over the
# authors. This does not change during the simulation, so compute it once
# instead of once per article.
authors_given_topic = authors_topicsprobs / authors_topicsprobs.sum(axis=0, keepdims=True)

events = []          # every event ever generated (becomes events_df)
articles = []        # every article ever generated (becomes articles_df)
active_events = []   # subset of `events` whose window has not elapsed yet

# Iterate over days to generate all articles
for day in range(N_DAYS):

    # Generate new events (rate TBD). The normal draw is truncated to an int;
    # non-positive draws simply mean that no event starts today.
    n_new_events = max(0, int(norm.rvs(loc=1, scale=2)))
    for _ in range(n_new_events):
        event_influence = dirichlet.rvs(alphas) # Concentration parameters TBD; shape (1, n_topics)
        event_duration = expon.rvs(loc=0.01, scale=0.1) # some events should be long-lived
        duration_days = int(np.ceil(event_duration * 30))  # always >= 1 because loc > 0
        # hmm, event duration and intensity is generally correlated
        event_intensity = expon.rvs(scale=0.1, loc=0.1)
        event = {'id': len(events), 'start': day, 'influence': event_influence, 'duration': duration_days, 'intensity': event_intensity}
        events.append(event)
        active_events.append(event)

    # An event is active on days start, ..., start + duration - 1, i.e. for
    # exactly `duration` days. Expired events are dropped here so the loop
    # below does not rescan the full history every day.
    active_events = [ev for ev in active_events if day < ev['start'] + ev['duration']]

    # Sum the topic influence of all active events on top of the baseline
    day_topicsprobs = topics_probs.copy()
    for event in active_events:
        day_topicsprobs += event['intensity']*event['influence']

    # Normalize probabilities (and flatten to 1-D for np.random.choice)
    day_topicsprobs = day_topicsprobs[0] / day_topicsprobs[0].sum()

    # Generate articles for day; clamp so an unlucky negative draw means "no articles"
    n_articles = max(0, int(round(norm.rvs(loc=N_AUTHORS / 3, scale=N_AUTHORS / 10))))
    for _ in range(n_articles):
        # Sample indices rather than labels: avoids a list search per article
        # and stores plain Python strings in the records.
        topic_idx = np.random.choice(len(topics), p=day_topicsprobs)
        author_idx = np.random.choice(N_AUTHORS, p=authors_given_topic[:, topic_idx])
        articles.append({'id': len(articles), 'day': day, 'topic': topics[topic_idx], 'author': authors_names[author_idx]})

events_df = pd.DataFrame(events)
articles_df = pd.DataFrame(articles)

events_df.describe()

Unnamed: 0,id,start,duration,intensity
count,815.0,815.0,815.0,815.0
mean,407.0,370.957055,3.777914,0.201271
std,235.414528,202.680112,3.03558,0.101339
min,0.0,2.0,1.0,0.100012
25%,203.5,203.5,2.0,0.128322
50%,407.0,368.0,3.0,0.170034
75%,610.5,550.0,5.0,0.245088
max,814.0,728.0,20.0,0.828157


In [46]:
# Summary statistics for the numeric article columns (id, day).
articles_df.describe()

Unnamed: 0,id,day
count,12211.0,12211.0
mean,6105.0,362.827205
std,3525.156403,209.643456
min,0.0,0.0
25%,3052.5,181.0
50%,6105.0,361.0
75%,9157.5,542.0
max,12210.0,729.0


In [47]:
# Per-topic summary statistics of the numeric article columns.
articles_df.groupby('topic').describe()

Unnamed: 0_level_0,id,id,id,id,id,id,id,id,day,day,day,day,day,day,day,day
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
topic,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
Arts & Culture,987.0,6289.624113,3577.151613,6.0,3072.5,6380.0,9461.5,12168.0,987.0,373.93921,212.874279,0.0,182.5,381.0,561.0,727.0
Business,1119.0,6116.484361,3482.022229,9.0,3076.5,6126.0,9090.5,12187.0,1119.0,363.475424,207.033444,0.0,182.5,362.0,537.0,728.0
Fashion,1023.0,6412.439883,3473.023459,4.0,3565.5,6557.0,9248.5,12207.0,1023.0,381.13783,206.712965,0.0,212.0,391.0,547.5,729.0
Food,850.0,6107.778824,3471.910774,1.0,3222.5,6071.5,9056.5,12208.0,850.0,362.98,206.552066,0.0,189.5,358.0,535.0,729.0
Health,951.0,6034.329127,3522.441204,8.0,3011.0,6090.0,9061.5,12195.0,951.0,358.599369,209.434689,0.0,179.0,360.0,535.0,729.0
Home,978.0,6027.558282,3617.203052,0.0,2762.25,6089.5,9328.5,12194.0,978.0,358.329243,215.127112,0.0,165.0,360.0,552.75,729.0
Opinion,1165.0,6022.834335,3567.824582,35.0,2928.0,5874.0,9236.0,12210.0,1165.0,358.091845,212.27815,1.0,175.0,347.0,547.0,729.0
Politics,1108.0,5911.452166,3522.025141,3.0,2844.25,5727.0,8999.25,12206.0,1108.0,351.195848,209.288636,0.0,169.75,338.0,532.25,729.0
Sports,1026.0,6195.890838,3440.064698,39.0,3288.0,6316.5,8994.0,12180.0,1026.0,368.159844,204.558842,2.0,193.25,375.5,532.0,728.0
Technology,1012.0,5921.337945,3535.29279,47.0,2790.75,5877.5,8977.25,12201.0,1012.0,351.848814,210.015618,2.0,166.0,347.0,531.5,729.0
