In [1]:
import pandas as pd, numpy as np
import random
import json

In [2]:
# Value pools and sizing for the synthetic user table.
genders = ["Male", "Female", "Unknown"]
ages = [3,4,5,6,7,8]
n_users = 100
# Time window used when drawing random score timestamps.
start = pd.to_datetime('2020-12-01')
end = pd.to_datetime('2020-12-30')
# Seed BOTH generators this notebook draws from: the stdlib one backs
# random.choices / random.randint, NumPy's global one backs
# np.random.randint. Seeding only one leaves part of the data irreproducible.
random.seed(1234)
np.random.seed(1234)

In [3]:
# Start the user table with n_users consecutive ids beginning at 1000.
users = pd.DataFrame({"user_id" : np.arange(1000, 1000+n_users)})

In [4]:
# Draw one gender per user (with replacement) from the stdlib generator.
users["gender"] = random.choices(genders, k=n_users)

In [5]:
# Draw one age per user (with replacement) from the stdlib generator.
users["age"] = random.choices(ages, k=n_users)

In [6]:
# Write the user table to users.csv in the working directory. The row index
# is written as well (to_csv default), as an unnamed first column.
users.to_csv('users.csv')

In [31]:
def get_lessons(subject):
    """Load a course definition and list its exam lessons with progress values.

    The course is read from
    ``../../assets/courses/<subject>/<subject>/res/course.json``. Chapters
    whose ``type`` is ``'exam'`` are skipped; inside the remaining chapters
    only lessons whose ``type`` is ``'exam'`` are kept.

    Args:
        subject: Course directory name (e.g. ``'en'``).

    Returns:
        A list of ``(chapter_id, lesson_id, course_completion,
        chapter_completion)`` tuples in course order. ``chapter_completion``
        is the lesson's 1-based position among the kept lessons of its
        chapter divided by that chapter's kept-lesson count.
        ``course_completion`` is a running sum in which every chapter
        contributes an equal share, split evenly across its kept lessons.
        The number of chapters used for that share is the position of the
        last chapter that contains a kept lesson. Returns ``[]`` when no
        lesson qualifies.

    Raises:
        OSError: If the course file cannot be opened.
        KeyError: If a required key (``chapters``, ``lessons``, ``id``) is
            missing from the JSON.
    """
    course_path = '../../assets/courses/'+subject+'/'+subject+'/res/course.json'
    # JSON is UTF-8 by specification; do not depend on the platform locale.
    with open(course_path, encoding='utf-8') as f:
        course = json.load(f)

    # Each entry: (chapter id, lesson id, chapter position, lesson position),
    # both positions 1-based and counted over kept items only.
    exam_lessons = []
    exams_per_chapter = {}
    chapter_no = 0
    for chapter in course['chapters']:
        if chapter.get('type') == 'exam':
            continue
        chapter_no += 1
        position = 0
        for lesson in chapter['lessons']:
            if lesson.get('type') == 'exam':
                position += 1
                exam_lessons.append((chapter['id'], lesson['id'], chapter_no, position))
        exams_per_chapter[chapter_no] = position

    if not exam_lessons:
        return []

    n_chapters = exam_lessons[-1][2]
    ret_lessons = []
    completion = 0
    for chapter_id, lesson_id, chapter_pos, lesson_pos in exam_lessons:
        in_chapter = exams_per_chapter[chapter_pos]
        completion += 1 / n_chapters * 1 / in_chapter
        ret_lessons.append((chapter_id, lesson_id, completion, lesson_pos / in_chapter))
    return ret_lessons

In [8]:
def random_dates(start, end, n=10):
    """Return ``n`` random timestamps between ``start`` and ``end``, ascending.

    Both bounds are read through their ``.value`` attribute and floored from
    nanoseconds to whole seconds, so they are expected to behave like
    ``pandas.Timestamp``. Seconds are drawn from NumPy's global generator;
    the upper bound is exclusive.

    Args:
        start: Lower bound of the window (inclusive, at second resolution).
        end: Upper bound of the window (exclusive, at second resolution).
        n: How many timestamps to draw.

    Returns:
        A sorted ``DatetimeIndex`` with ``n`` entries.
    """
    ns_per_second = 10**9
    lower, upper = (bound.value // ns_per_second for bound in (start, end))
    epoch_seconds = np.random.randint(lower, upper, n)
    stamps = pd.to_datetime(epoch_seconds, unit='s')
    return stamps.sort_values()

In [38]:
def random_scores(user_id, subject, lessons):
    """Build a synthetic score table for one user in one subject.

    The user is treated as having worked through the first ``n_scores``
    entries of ``lessons`` in order, where ``n_scores`` is drawn uniformly
    between 50 and ``len(lessons)``. Courses with fewer than 50 lessons use
    their full length as the lower bound instead of raising ``ValueError``.

    Timestamps come from ``random_dates`` over the module-level ``start`` /
    ``end`` window.

    Args:
        user_id: Identifier repeated on every row.
        subject: Subject name repeated on every row.
        lessons: Sequence of ``(chapter_id, lesson_id, course_completion,
            chapter_completion)`` tuples, as returned by ``get_lessons``.

    Returns:
        A DataFrame with columns ``user_id``, ``subject``, ``chapter``,
        ``course_completion_percentage``, ``lesson``,
        ``chapter_completion_percentage``, ``score`` (integers in 70..99) and
        ``datetime`` (ascending), one row per attempted lesson.
    """
    # Clamp the lower bound so randint never receives low > high.
    n_scores = random.randint(min(50, len(lessons)), len(lessons))
    attempted = lessons[:n_scores]
    scores = pd.DataFrame({
        "user_id": [user_id] * n_scores,
        "subject": [subject] * n_scores,
        "chapter": [entry[0] for entry in attempted],
        "course_completion_percentage": [entry[2] for entry in attempted],
        "lesson": [entry[1] for entry in attempted],
        "chapter_completion_percentage": [entry[3] for entry in attempted],
        "score": random.choices(np.arange(70, 100, 1), k=n_scores),
    })
    # Assigned separately so the sorted DatetimeIndex is placed positionally.
    scores["datetime"] = random_dates(start, end, n=n_scores)
    return scores

In [39]:
# Lesson tables keyed by subject name, loaded once per course.
subjects = {name: get_lessons(name) for name in ('en', 'maths')}

In [33]:
# Display the full per-subject lesson tables for inspection.
subjects

{'en': [('en00', 'ambulance', 0.016129032258064516, 0.5),
  ('en00', 'anchor', 0.03225806451612903, 1.0),
  ('en01', 'auto', 0.04838709677419355, 0.5),
  ('en01', 'pram', 0.06451612903225806, 1.0),
  ('en02', 'bag', 0.08064516129032258, 0.5),
  ('en02', 'ball', 0.0967741935483871, 1.0),
  ('en03', 'balloon', 0.11290322580645161, 0.5),
  ('en03', 'bell', 0.12903225806451613, 1.0),
  ('en04', 'balloon1', 0.13978494623655913, 0.3333333333333333),
  ('en04', 'balloon2', 0.15053763440860213, 0.6666666666666666),
  ('en04', 'balloon3', 0.16129032258064513, 1.0),
  ('en06', 'balloon4', 0.17741935483870963, 0.5),
  ('en06', 'balloon5', 0.19354838709677413, 1.0),
  ('en05', 'balloon6', 0.22580645161290316, 1.0),
  ('en07', 'balloon7', 0.2419354838709677, 0.5),
  ('en07', 'balloon8', 0.2580645161290322, 1.0),
  ('en08', 'balloon9', 0.2741935483870967, 0.5),
  ('en08', 'balloon10', 0.2903225806451612, 1.0),
  ('en09', 'balloon11', 0.3010752688172042, 0.3333333333333333),
  ('en09', 'balloon12', 0

In [40]:
# Generate one score table per (user, subject) pair, in user-then-subject
# order, and stack them. Frames are collected first and concatenated once,
# because concatenating inside the loop re-copies every earlier row on each
# iteration. Each frame's own 0-based row labels are kept.
score_frames = [
    random_scores(row.user_id, subject, lessons)
    for row in users.itertuples()
    for subject, lessons in subjects.items()
]
# No users or no subjects leaves df as None.
df = pd.concat(score_frames) if score_frames else None

In [41]:
# Preview the first rows of the combined score table.
df.head()

Unnamed: 0,user_id,subject,chapter,course_completion_percentage,lesson,chapter_completion_percentage,score,datetime
0,1000,en,en00,0.016129,ambulance,0.5,87,2020-12-01 11:01:21
1,1000,en,en00,0.032258,anchor,1.0,85,2020-12-01 11:43:56
2,1000,en,en01,0.048387,auto,0.5,90,2020-12-01 17:01:05
3,1000,en,en01,0.064516,pram,1.0,78,2020-12-02 06:38:38
4,1000,en,en02,0.080645,bag,0.5,77,2020-12-02 06:57:37


In [42]:
# Write every generated score to scores.csv in the working directory. Row
# labels are written too (to_csv default) as an unnamed first column; they
# restart at 0 for each concatenated per-user/per-subject frame.
df.to_csv('scores.csv')