In [1]:
import requests, os, re, threading, jsonlines, json, io, math
from dotenv import load_dotenv
from collections import defaultdict
import boto3, time

# AWS session bound to the local credentials profile named "stryvia"; the S3
# client built from it is the one getFromS3 uses to download speech marks.
session = boto3.Session(profile_name="stryvia")
s3Client = session.client("s3")

# Load variables from a .env file into the process environment so that the
# os.environ.get(...) lookups throughout the notebook can resolve them.
load_dotenv()
# Operator tokens recognised by processExpression; they are tried in list
# order, so the first one present in an expression decides the split.
splitSymbols = ["=", "+", "&", "->"]

In [2]:
# Shared, mutable notebook state filled in by worker() / master().
renderIds = {}    # str(sentence index) -> value returned by render()
threads = []      # every Thread started by master()
sampleProps = []  # one {"expression", "props"} record per processed sentence

In [3]:
def getAssetUrl(query):
    """Return the link of a stock video for ``query`` from the Pexels video search API.

    The search asks for a single small, landscape result and returns the
    ``link`` of the first file of the first video.

    Args:
        query: Free-text search phrase. It is sent as a request parameter so
            that characters such as ``&``, ``#`` or ``?`` are URL-encoded
            instead of corrupting the query string.

    Returns:
        The video link, or the value of the ``blackVideoUrl`` environment
        variable when the request fails, times out, or yields no result.
    """
    # Debug shortcut: uncomment to skip the network call entirely.
    # return os.environ.get("blackVideoUrl")
    endpoint = "https://api.pexels.com/videos/search"
    params = {
        "query": query,
        "per_page": 1,
        "size": "small",
        "orientation": "landscape",
    }
    headers = {"Authorization": os.environ.get("pexelsKey")}
    try:
        # A timeout keeps a stalled connection from blocking the calling
        # thread forever; requests has no default timeout.
        response = requests.get(
            url=endpoint, params=params, headers=headers, timeout=30
        )
        data = response.json()
        return data["videos"][0]["video_files"][0]["link"]
    except Exception as e:
        # Deliberate best-effort: any failure degrades to the placeholder clip.
        print(e)
        return os.environ.get("blackVideoUrl")

In [4]:
def getGoogleImage(query):
    """Ask the image-scraper service for an image matching ``query``.

    The service URL is read from the ``imageScraper`` environment variable
    and receives a JSON body ``{"query": query}``.

    Args:
        query: Free-text search phrase.

    Returns:
        The decoded JSON body of the scraper's response, or the value of the
        ``blackImageUrl`` environment variable when the request fails, times
        out, answers with an HTTP error status, or returns invalid JSON.
    """
    # Debug shortcut: uncomment to skip the network call entirely.
    # return os.environ.get("blackImageUrl")
    try:
        # A timeout keeps a stalled connection from blocking the calling
        # thread forever; requests has no default timeout.
        response = requests.post(
            os.environ.get("imageScraper"), json=dict(query=query), timeout=60
        )
        # Without this, a JSON error payload from a 4xx/5xx answer would be
        # returned as if it were the image result.
        response.raise_for_status()
        return response.json()
    except Exception as e:
        # Deliberate best-effort: any failure degrades to the placeholder image.
        print(e)
        return os.environ.get("blackImageUrl")


In [5]:
def cleanExpression(expression):
    """Return ``expression`` with every full stop (".") removed."""
    fragments = expression.split(".")
    return "".join(fragments)

In [6]:
def processExpression(i, expression):
    """Parse one expression string into a nested node dict.

    The rules are applied in this order:

    1. The first token of ``splitSymbols`` found in the text splits it;
       the result is ``{"operation": <symbol>, "content": [<child nodes>]}``.
    2. A quoted verb, delimited by curly double quotes (U+201C ... U+201D)
       or, failing that, by straight single quotes, yields
       ``{"operation": "verb", "content": [<child nodes>], "verb": <text>}``.
       The text around every quoted span becomes the children; the first
       quoted span is the verb.
    3. Blank text yields ``{"operation": "void"}``.
    4. Anything else is a leaf:
       ``{"operation": "asset", "expression", "assetUrl", "googleImage"}``.

    A quote character without a matching partner (for example the apostrophe
    in "Worker's desk") is not a verb; the text falls through to the later
    rules instead of failing.

    Args:
        i: Index of the sentence being processed; only forwarded to the
            recursive calls.
        expression: The text to parse.

    Returns:
        The node dict described above, or
        ``{"operation": "oops", "expression": expression}`` if anything
        raises while parsing.
    """
    # Quote styles that may delimit a verb, tried in this order. Each has
    # exactly one capture group (the verb text).
    verbPatterns = (r"\u201c(.*?)\u201d", r"'(.*?)'")
    try:
        for symbol in splitSymbols:
            if symbol in expression:
                return {
                    "operation": symbol,
                    "content": [
                        processExpression(i, part)
                        for part in expression.split(symbol)
                        if part
                    ],
                }

        for pattern in verbPatterns:
            # With one capture group re.split alternates
            # [text, verb, text, verb, ..., text]; a single item means the
            # pattern did not match at all.
            pieces = re.split(pattern, expression)
            if len(pieces) == 1:
                continue
            return {
                "operation": "verb",
                "content": [
                    processExpression(i, part) for part in pieces[0::2] if part
                ],
                "verb": pieces[1],
            }

        query = expression.strip()
        if not query:
            return {"operation": "void"}
        return {
            "operation": "asset",
            "expression": query,
            "assetUrl": getAssetUrl(query),
            "googleImage": getGoogleImage(query),
        }
    except Exception as e:
        # Best-effort: report the problem and hand back a marker node so one
        # bad expression does not abort the whole run.
        print(e)
        return {"operation": "oops", "expression": expression}

In [7]:
def render(i, props, retry=0):
    """Submit ``props`` to the renderer service and return its render id.

    The service URL is read from the ``rendererUrl`` environment variable and
    receives a JSON body ``{"props": props}``. A failed attempt (network
    error, timeout, invalid JSON, missing ``renderId``) is retried after a
    pause; the pause is skipped once no attempts remain.

    Args:
        i: Index of the sentence being rendered; used only in log output.
        props: JSON-serialisable payload for the renderer.
        retry: Number of attempts already used. Attempts are made while this
            counter is at most 5, so the default allows six tries in total.

    Returns:
        The ``renderId`` from the service response, or ``None`` when every
        attempt failed (or ``retry`` was already above 5).
    """
    # Debug shortcut: uncomment to skip the network call entirely.
    # return i
    maxRetry = 5
    retryDelaySeconds = 10
    attempt = retry
    while attempt <= maxRetry:
        try:
            # Generous timeout so a stalled connection cannot block the
            # calling thread forever; requests has no default timeout.
            response = requests.post(
                os.environ.get("rendererUrl"), json=dict(props=props), timeout=120
            )
            res = response.json()
            print(i, res)
            return res["renderId"]
        except Exception as e:
            print(e)
        attempt += 1
        if attempt <= maxRetry:
            # Only wait when another attempt will actually follow.
            time.sleep(retryDelaySeconds)
            print(f"Retrying for {i}")
    return None

In [8]:
def worker(i, expression, speechMarks, speechMarksKey):
    """Build the render props for one sentence and submit them.

    The cleaned expression is parsed with processExpression, combined with
    the sentence's speech marks and audio key, logged, recorded in the
    global ``sampleProps`` list, and sent to render(); the returned id is
    stored in the global ``renderIds`` under the stringified index.
    """
    print(f"starting worker {i}")
    print(expression)

    parsed = processExpression(i, cleanExpression(expression))
    props = {
        "speechMarks": speechMarks,
        "audioKey": speechMarksKey,
        "format": parsed,
    }
    print(props)

    record = {"expression": expression, "props": props}
    sampleProps.append(record)

    renderId = render(i, props)
    renderIds[f"{i}"] = renderId

In [9]:
def getFromS3(speechMarksKey, bucket="ai-tutor-s3"):
    """Download a speech-marks JSON Lines object from S3 and parse it.

    The object is read from ``speech_marks/<speechMarksKey>.jsonl`` using the
    module-level ``s3Client``.

    Args:
        speechMarksKey: Base name of the object, without folder or extension.
        bucket: Bucket to read from; defaults to the bucket this notebook
            has always used.

    Returns:
        A list with one dict per line, keeping only the ``time``, ``type``
        and ``value`` fields of each record.

    Raises:
        KeyError: if a record lacks one of those three fields.
        Errors from the S3 client or the JSON Lines reader propagate as-is.
    """
    response = s3Client.get_object(
        Bucket=bucket, Key=f"speech_marks/{speechMarksKey}.jsonl"
    )
    # The whole body is read into memory and parsed from an in-memory buffer.
    with io.BytesIO(response["Body"].read()) as file:
        with jsonlines.Reader(file) as reader:
            return [
                dict(time=line["time"], type=line["type"], value=line["value"])
                for line in reader
            ]

In [10]:
def getSentenceSpeechMarks(speechMarks):
    """Group a flat list of speech marks into one entry per sentence.

    The first mark is dropped (presumably the opening sentence mark — TODO
    confirm against the speech-mark producer) and the first sentence is taken
    to start at 0 ms. Every later mark whose ``type`` is ``"sentence"``
    closes the current sentence and opens the next one; every other mark is
    stored with its time made relative to the start of its sentence.

    Args:
        speechMarks: List of dicts with ``time`` (milliseconds), ``type`` and
            ``value`` keys, in chronological order.

    Returns:
        A list of dicts, one per sentence, each with:
          * ``speechMarks``: list of ``{"time": <ms since sentence start>,
            "value": ...}``
          * ``duration``: sentence length in seconds. For the last sentence
            there is no following mark, so it is the offset of its last word
            plus one second of padding.
          * ``start``: sentence start in seconds.
        An empty input yields an empty list.
    """
    # Padding added after the last word of the final sentence, in ms.
    tailPaddingMs = 1000

    if not speechMarks:
        return []

    speechMarksBySentence = []
    sentenceStart = 0
    wordMarks = []
    for obj in speechMarks[1:]:
        if obj["type"] == "sentence":
            speechMarksBySentence.append(
                dict(
                    speechMarks=wordMarks,
                    duration=(obj["time"] - sentenceStart) / 1000,
                    start=sentenceStart / 1000,
                )
            )
            sentenceStart = obj["time"]
            wordMarks = []
        else:
            wordMarks.append(
                dict(time=obj["time"] - sentenceStart, value=obj["value"])
            )

    # Word times are already relative to the sentence start, so the start
    # must not be subtracted again here. A sentence without words gets just
    # the padding.
    lastWordOffset = wordMarks[-1]["time"] if wordMarks else 0
    speechMarksBySentence.append(
        dict(
            speechMarks=wordMarks,
            duration=(lastWordOffset + tailPaddingMs) / 1000,
            start=sentenceStart / 1000,
        )
    )
    return speechMarksBySentence

In [11]:
def master(sentences, expressions, speechMarksKey):
    """Render every sentence in parallel and return the render ids in order.

    Speech marks are fetched once with getFromS3 and split per sentence; one
    thread per sentence then runs worker(). Each thread is recorded in the
    global ``threads`` list, and all threads in that list are joined before
    the ids are collected from the global ``renderIds`` dict.
    """
    allMarks = getFromS3(speechMarksKey)
    marksPerSentence = getSentenceSpeechMarks(allMarks)

    for index, _ in enumerate(sentences):
        jobArgs = (
            index,
            expressions[index],
            marksPerSentence[index],
            speechMarksKey,
        )
        job = threading.Thread(target=worker, args=jobArgs)
        threads.append(job)
        job.start()

    for job in threads:
        job.join()

    output = [renderIds[f"{position}"] for position in range(len(sentences))]
    print(output)
    return output

In [12]:
def getSentences():
    """Return the text of every sentence record in the sample JSON Lines file.

    The file is ``../sample_data/<testKey>.jsonl``, where ``testKey`` comes
    from the environment; only records whose ``type`` is ``"sentence"``
    contribute their ``value``.
    """
    samplePath = f'../sample_data/{os.environ.get("testKey")}.jsonl'
    with jsonlines.open(samplePath) as data:
        return [record['value'] for record in data if record['type'] == 'sentence']

In [13]:
# Driver cell: load the inputs, render every sentence, then persist results.
# Sentences come from the sample speech-marks file; expressions come from a
# JSON file and are paired with the sentences by position inside master().
sentences = getSentences()
with open(f"../sample_data/{os.environ.get('expressionsKey')}.json", "r") as f:
    expressions = json.load(f)

# master() must run before the dumps below: sampleProps is only filled in as
# a side effect of the worker threads it starts.
output=master(sentences, expressions, os.environ.get("testKey"))
# Props sent to the renderer, one record per sentence (in completion order).
with open(f"../sample_data/{os.environ.get('propsKey')}.json", "w") as fp:
    json.dump(sampleProps, fp)
# Render ids in sentence order, as returned by master().
with open(f"../sample_data/{os.environ.get('rendersKey')}.json", "w") as fp:
    json.dump(output, fp)

starting worker 0
Computer=Office.
starting worker 1
 CPU = office workers.
starting worker 2
 Workers “carry out” instructions.
starting worker 3
Cpu “processes” information.
starting worker 4
 Memory (RAM) = Worker’s desk space.
starting worker 5
 CPU “places” data.
starting worker 6
 Space “allows” task handling.
starting worker 7
 Storage = filing cabinets.
starting worker 8
 Storage 'stores' data.
starting worker 9
 Input devices= ”workers” receiving instructions.
starting worker 10
 Output devices = workers results board.
starting worker 11
 Operating System = Office Manager.
starting worker 12
 Project manager “coordinates” resources.
starting worker 13
Operating system “manages” ram & “prioritizes” tasks.
starting worker 14
 Programs & applications = Worker’s projects.
starting worker 15
 Program “consists of” instructions.
starting worker 16
Programs “need” resources.
starting worker 17
 Bus system = office communication network.
starting worker 18
Bus system “exchanges” data.