In [2]:
# import dependencies
import os
import numpy as np
import pandas as pd
import torch
import openai
from tqdm import tqdm
import re
import json

from sentence_transformers import SentenceTransformer
from youtube_transcript_api import YouTubeTranscriptApi

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
openai.api_key = "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"
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

In [4]:
def get_text(video_id):
    transcript = YouTubeTranscriptApi.get_transcript(video_id)
    text = []
    for item in transcript:
        text.append(item["text"])
    joined_text = " ".join(text).replace("\n", " ")
    return joined_text

In [5]:
def get_steps(text, additional_prompt):
    response = openai.Completion.create(
        model="text-davinci-003",
        prompt=f"Identify and extract the steps from this instructional video transcript. {additional_prompt}\n\nTranscript: {text}\n\nSteps:",
        temperature=0.6,
        max_tokens=3000,
        top_p=1,
        frequency_penalty=1,
        presence_penalty=1
    )
    return response

In [6]:
additional_prompts = [
    "",
    "Try to break down the steps into simple components if possible.",
    "Try to be as concise as possible.",
    "Try to be concise.",
    "Try to explain in the simplest steps possible."
]

In [20]:
def extract_steps(input, additional_prompt="", split=True, transcript=False):
    if transcript == False:
        text = get_text(input)
    else:
        text = input
    steps = get_steps(text, additional_prompt)
    if split:
        return steps["choices"][0]["text"].split("\n")
    else:
        return steps["choices"][0]["text"]

In [82]:
transcripts_available = []
video_names = []
directory = '/shared/medhini/WikiHow/how_to_steps'

for filename in os.listdir(directory):
    f = os.path.join(directory, filename)
    file = open(f)
    # checking if it is a file
    data = json.load(file)
    url = data["video_url"]
    try:
        id = re.findall(r".*\/embed\/(.+)\?.*", url)[0]
        YouTubeTranscriptApi.get_transcript(id)
        transcripts_available.append(id)
    except:
        pass

In [20]:
video_names = list(pd.read_csv("video_names.csv").iloc[:, 1])
video_ids = list(pd.read_csv("available_captions.csv").iloc[:, 1])
videos = list(zip(video_ids, video_names))

In [23]:
test = videos[0]
test

('7M8OA9RuFaM',
 '/shared/medhini/WikiHow/how_to_steps/Clean-a-Cast-Iron-Skillet.json')

In [8]:
def read_text(filepath):
    file = open(filepath)
    data = json.load(file)
    text = []
    for item in data.values():
        try:
            text.append(item["text"])
        except:
            pass
    return text

In [21]:
def generate_variants(video_id, additional_prompts, transcript=False):
    results = []
    for prompt in additional_prompts:
        results.append(extract_steps(video_id, prompt, split=False, transcript=transcript))
    return results

In [30]:
def format_results(results):
    formatted_result = []
    for result in results:
        stripped = re.split("(\\n\d+\.\s)", result)[1:]
        formatted_result.append(stripped)
    return formatted_result

In [36]:
steps_true = read_text(test[1])
steps_true

['Rinse the skillet out with hot water after cooking.',
 'Bring the water to boil.',
 'With a wide spatula, lightly scrape the bottom and sides of the pan to dislodge any remaining food scraps.',
 'Dump the dirty water in the sink.',
 'Wet a couple paper towels and quickly wipe the surface of the skillet.',
 'Apply a thin coating of fat, such as vegetable oil or shortening, to the surface of the skillet.',
 'Store in a cool, dry place.',
 'Cut a raw potato in half or lengthwise, depending on the size of your skillet.',
 'Apply a thin layer of baking soda to the bottom of your potato.',
 'Scrub the skillet with the potato and baking soda, taking extra care to clean the problem areas.',
 'Season your pan after cleaning',
 'Avoid using soaps and detergents.',
 'Never put the skillet in the dishwasher.',
 'Try to avoid using steel wool to clean your cast iron cookery unless in dire need of cleaning.']

In [68]:
steps_pred = generate_variants(test[0], additional_prompts=additional_prompts)

In [69]:
formatted_steps_pred = format_results(steps_pred)

[['Wipe down cast iron with hot water and a cloth; scrub off food stuck on it with salt if necessary. ',
  'Dry cast iron completely to prevent rusting. ',
  'Scrub off any rust with white vinegar and a sponge if necessary. ',
  'Re-season cast iron after each use by coating it in vegetable oil to prevent food from sticking next time you use it. ',
  'Rinse out skillet with hot water, pour more water into the skillet and bring to a boil for several minutes, lightly scrape bottom and sides of skillet with spatula to dislodge remaining food scraps, turn off burner on stove and carefully pour dirty water into sink while holding skillet with oven mitts or cloth; wipe surface of skillet quickly with paper towels and apply thin coating of vegetable oil to the skillet'],
 ['Wipe down cast iron with hot water and cloth',
  'Scrub off any food stuck on it with salt ',
  'Dry cast iron completely to prevent rusting ',
  'If rust forms, scrub it off with white vinegar and a sponge ',
  'Re-season

In [95]:
def encode_steps(steps, multiple=False):
    encoded = []
    if multiple == False:
        for step in steps:
            encoded.append(model.encode(step))
    else:
        for step_list in steps:
            for step_item in step_list:
                encoded.append(model.encode(step_item))
    return encoded

In [96]:
steps_true_enc = encode_steps(steps_true)
steps_preds_enc = encode_steps(formatted_steps_pred)

In [107]:
steps_true

['Rinse the skillet out with hot water after cooking.',
 'Bring the water to boil.',
 'With a wide spatula, lightly scrape the bottom and sides of the pan to dislodge any remaining food scraps.',
 'Dump the dirty water in the sink.',
 'Wet a couple paper towels and quickly wipe the surface of the skillet.',
 'Apply a thin coating of fat, such as vegetable oil or shortening, to the surface of the skillet.',
 'Store in a cool, dry place.',
 'Cut a raw potato in half or lengthwise, depending on the size of your skillet.',
 'Apply a thin layer of baking soda to the bottom of your potato.',
 'Scrub the skillet with the potato and baking soda, taking extra care to clean the problem areas.',
 'Season your pan after cleaning',
 'Avoid using soaps and detergents.',
 'Never put the skillet in the dishwasher.',
 'Try to avoid using steel wool to clean your cast iron cookery unless in dire need of cleaning.']

In [126]:
combined_steps_true = "".join(steps_true)
combined_true_enc = model.encode(combined_steps_true)

In [129]:
combined_preds_enc = []
for variant in formatted_steps_pred:
    variant_steps_preds = "".join(variant)
    combined_preds_enc.append(model.encode(variant_steps_preds))

In [130]:
similarities = []
for variant_enc in combined_preds_enc:
    similarities.append(variant_enc @ combined_true_enc)
similarities

[13.484863, 16.015417, 18.408833, 16.416538, 14.222578]

# Test On Generated Output Steps

In [11]:
outputs_dir = "/shared/sanjayss/howto100m/"
output_steps = []
for filename in os.listdir(outputs_dir):
    if "aligned_subset_steps" in filename:
        output_steps.append(filename)
output_steps

['aligned_subset_steps_babbage_dtw15.jsonl',
 'aligned_subset_steps_babbage_dtw15_32000.jsonl',
 'aligned_subset_steps_babbage_52000.jsonl',
 'aligned_subset_steps_babbage_end.jsonl',
 'aligned_subset_steps_babbage_dtw15_64000.jsonl',
 'aligned_subset_steps_babbage.jsonl',
 'aligned_subset_steps_babbage_dtw15_corrected.jsonl',
 'aligned_subset_steps_babbage_dtw15_48000.jsonl',
 'aligned_subset_steps_babbage_dtw15_end.jsonl']

In [14]:
aligned_steps_f = os.path.join(outputs_dir, "aligned_subset_steps_babbage_dtw15.jsonl")
file = open(aligned_steps_f, "r", encoding="utf-8")
aligned_steps_output = []
i = 0
while i < 2:
    for line in file:
        aligned_steps_output.append(json.loads(line.rstrip('\n|\r')))
        i += 1

In [35]:
aligned_steps_output[3]

{'video_id': '-COPJISGW-I',
 'steps': ["Check the battery's health by examining it physically.",
  'If the battery is in a weak or failing condition, it should be discarded.',
  'If you rely on a hot spot battery, keep it in a cool place and check it regularly.'],
 'segments': {'3': [60, 27, 107, 6],
  '2': [43, 18, 60, 25],
  '1': [0, 0, 43, 16]}}

In [36]:
def compare_steps(i):
    true_aligned_steps = aligned_steps_output[i]["steps"]
    steps_pred = generate_variants(aligned_steps_output[i]["transcript"], additional_prompts=additional_prompts, transcript=True)
    formatted_steps_pred = format_results(steps_pred)
    return true_aligned_steps, steps_pred, formatted_steps_pred

In [None]:
test1 = compare_steps(1)

In [40]:
test3 = compare_steps(3)

In [39]:
test4 = compare_steps(4)