In [1]:
from manrs.models import Base, Report, Result
import config
from sqlalchemy.orm import sessionmaker
from sqlalchemy import or_, func
import numpy as np
from dateutil.relativedelta import relativedelta
from datetime import datetime
from itertools import repeat
from random import random, sample, choice

In [2]:
# Step of one calendar month, shared by the date and report generators.
month = relativedelta(months=1)

# Session factory bound to the configured engine, and the one session
# instance the rest of the notebook works with.
Session = sessionmaker(bind=config.DB_ENGINE)
session = Session()

In [3]:
def get_asns(session):
    """Return the distinct values of `Result.asn` as a flat list."""
    rows = session.query(Result.asn).distinct().all()
    # Each row is a one-element tuple; unwrap it.
    return [asn for (asn,) in rows]

In [4]:
def get_reports(session):
    """Fetch the `(id, period_start)` pair of every report."""
    report_columns = session.query(Report.id, Report.period_start)
    return report_columns.all()


In [5]:
def get_last_report(session):
    """Return the largest `period_start` found among the reports."""
    newest_start = func.max(Report.period_start)
    # The aggregate query yields exactly one single-column row.
    (value,) = session.query(newest_start).one()
    return value

In [6]:
def gen_dates(start):
    """Yield dates one month apart, the first being one month after `start`.

    A new date is produced as long as the previous one (initially `start`)
    lies before "one year from now"; the last date yielded may therefore
    fall on or after that horizon.
    """
    current = start
    while True:
        horizon = datetime.now() + relativedelta(years=1)
        if not current < horizon:
            return
        current = current + month
        yield current

In [7]:
def sampler(mu, sigma, num):
    """Draw `num` values from a normal distribution and floor them at 0.

    `mu` is the mean and `sigma` the standard deviation of the
    distribution; any negative draw is replaced by 0.
    """
    draws = np.random.normal(loc=mu, scale=sigma, size=num)
    return np.maximum(draws, 0)

In [8]:
def get_old_reports(session):
    """Load every `Report` as a full ORM object."""
    all_reports = session.query(Report)
    return all_reports.all()

In [9]:
def generate_reports(new_dates):
    """Build one fantasy "auto" report for each start date in `new_dates`.

    Every report covers one month from its start date and is marked as
    finished at the end of that period. The objects are only constructed
    here; adding them to a session is left to the caller.
    """
    def build(period_start):
        period_end = period_start + month
        return Report(period_start=period_start, period_end=period_end,
                      date_finished=period_end, type="auto")

    return [build(start) for start in new_dates]

In [10]:
def _true_ratio(values):
    """Return the share of `True` among the genuine booleans in `values`.

    Entries that are not `bool` (for example `None`) are ignored in both the
    numerator and the denominator. Returns 0 when there is no boolean at all.
    """
    flags = [value for value in values if isinstance(value, bool)]
    if not flags:
        return 0
    return sum(flags) / len(flags)


def get_stats(session, asn):
    """Build up a statistical model of a specific asn.

    Parameters:
        session: SQLAlchemy session used to query `Result`.
        asn: the ASN whose results are summarised.

    Returns:
        A tuple `(mu, sigma, m6_ratio, m8_ratio)` where `mu` and `sigma` are
        length-6 arrays holding the per-column mean and standard deviation of
        m1, m1c, m2, m2c, m3 and m7irr (in that order), and the two ratios
        are the fraction of `True` values among the boolean m6 / m8 entries
        (0 when there are none).

    Raises:
        ValueError: if there is no result row for `asn`.
    """
    R = Result  # simple shortcut alias
    float_rows = session.query(R.m1, R.m1c, R.m2, R.m2c, R.m3, R.m7irr).filter(R.asn == asn).all()
    if not float_rows:
        # Fail with a clear message instead of an obscure unpacking error.
        raise ValueError("no results found for asn {!r}".format(asn))

    array = np.array(float_rows, dtype=float)
    # A None metric becomes NaN in the float array; the nan-aware reductions
    # skip it so a single missing value does not turn the whole column's
    # statistics into NaN.
    mu = np.nanmean(array, axis=0)
    sigma = np.nanstd(array, axis=0)

    bool_rows = session.query(R.m6, R.m8).filter(R.asn == asn).all()
    m6_ratio = _true_ratio([row[0] for row in bool_rows])
    m8_ratio = _true_ratio([row[1] for row in bool_rows])

    return mu, sigma, m6_ratio, m8_ratio

In [11]:
def generate_stats(reports, asn, mu, sigma, m6_ratio, m8_ratio):
    """Simulate one `Result` per report for `asn`.

    The numeric metrics are drawn from clipped normal distributions
    described by `mu` / `sigma`; the boolean metrics m6 and m8 come out
    `True` with probability `m6_ratio` / `m8_ratio` respectively.
    """
    count = len(reports)

    # Indices 0..5 of mu/sigma correspond to m1, m1c, m2, m2c, m3, m7irr.
    m1_draws, m1c_draws, m2_draws, m2c_draws, m3_draws, m7irr_draws = (
        sampler(mu[column], sigma[column], count) for column in range(6)
    )

    # Lazy coin flips: each flag is rolled only when zip() asks for it.
    m6_flags = (random() < m6_ratio for _ in range(count))
    m8_flags = (random() < m8_ratio for _ in range(count))

    rows = zip(reports, m1_draws, m1c_draws, m2_draws, m2c_draws, m3_draws,
               m6_flags, m7irr_draws, m8_flags)

    return [
        Result(asn=asn, report=report, m1=m1, m1c=m1c, m2=m2,
               m2c=m2c, m3=m3, m6=m6, m7irr=m7irr, m8=m8)
        for report, m1, m1c, m2, m2c, m3, m6, m7irr, m8 in rows
    ]
        

In [12]:
def generate_new_reports(session, new_dates, asns):
    """Create reports for `new_dates` and simulate results for every ASN.

    The new reports are added to `session` here; the simulated results are
    only returned, so the caller decides whether to add them.
    """
    new_reports = generate_reports(new_dates)
    session.add_all(new_reports)

    simulated = []
    for asn in asns:
        mean, spread, ratio_m6, ratio_m8 = get_stats(session, asn)
        simulated.extend(
            generate_stats(new_reports, asn, mean, spread, ratio_m6, ratio_m8)
        )
    return simulated

In [13]:
def get_all_stats(session, asns):
    """Return the `get_stats` result of each ASN, in the order of `asns`."""
    return [get_stats(session, asn) for asn in asns]

In [14]:
def generate_asn_data(real_asns, reports, min_asn=0, max_asn=65000, num_asn=60000, db_session=None):
    """Invent new ASNs and simulate results for them on the given reports.

    Each invented ASN borrows the statistical model of a randomly chosen
    real ASN.

    Parameters:
        real_asns: ASNs that already exist; they are never picked as new ASNs
            and their statistics seed the simulation.
        reports: reports for which each new ASN gets one simulated result.
        min_asn, max_asn: range of candidate ASN numbers (`max_asn` exclusive).
        num_asn: how many new ASNs to invent.
        db_session: session used to read the statistics; when omitted, the
            notebook-level `session` is used.

    Returns:
        A flat list of simulated `Result` objects (not added to any session).

    Raises:
        ValueError: if `num_asn` exceeds the number of unused ASNs in range.
        IndexError: if new ASNs are requested but `real_asns` is empty.
    """
    if db_session is None:
        # Fall back to the session created in the setup cell.
        db_session = session

    all_stats = get_all_stats(db_session, real_asns)

    # generate new ASNs, making sure we don't have duplicates.
    # random.sample() needs a sequence: passing a set raises TypeError on
    # Python 3.11+, so sample from a sorted list instead.
    candidates = sorted(set(range(min_asn, max_asn)) - set(real_asns))
    new_asns = sample(candidates, num_asn)

    results = []
    for asn in new_asns:
        mu, sigma, m6_ratio, m8_ratio = choice(all_stats)
        results.extend(generate_stats(reports, asn, mu, sigma, m6_ratio, m8_ratio))
    return results

In [15]:
# Invent 10 extra ASNs and simulate their results for the existing reports.
asns = get_asns(session)
old_reports = get_old_reports(session)
results = generate_asn_data(real_asns=asns, reports=old_reports, num_asn=10)
session.add_all(results)

In [16]:
# Now fantasize new reports: monthly periods following the latest existing
# report, with simulated results for every known ASN.
last_report = get_last_report(session)
asns = get_asns(session)
new_dates = list(gen_dates(last_report))
results = generate_new_reports(session=session, new_dates=new_dates, asns=asns)
session.add_all(results)

In [17]:
# Intentionally disabled; uncomment to commit the generated rows.
#session.commit()