# Miscbook -- For Experimenting w Functions


In [1]:
import os
from qe import *

In [19]:
# Get Statistics from a Transcript (n people + n lines per person for a single file/transcript str)
def transcript_stats(script: str=None):
    # Case When script is path on disk
    if os.path.exists(script):
        file = open(script)
        data = file.read()
        data = data.split("\n")
    # Else if not a path, assume passed string *is* the script
    else:
        data = script.split("\n") #assume string for transcript follows sample data format
    
    # Remove LITERAL duplicate strs + get number of speakers and info
    data = set(data)
    if "" in data:
        data.remove("") #remove the empty line chars
    stats = {}

    for seq in data:
        speaker = seq.split(":")[0] #assuming that real data follows sample conventions
        if speaker not in stats:
            stats[speaker] = 0
        else: 
            stats[speaker] += 1

    stats["n_people"] = len(stats) #number of folks in transcription str
    return data, stats


# Get Unique Lines in Transcript
def transcript_lines(script: str):
    # Case When script is path on disk
    if os.path.exists(script):
        file = open(script)
        data = file.read()
        data = data.split("\n")
    # Else if not a path, assume passed string *is* the script
    else:
        data = script.split("\n") #assume string for transcript follows sample data format
    
    # Remove LITERAL duplicate strs + get number of speakers and info
    data = [i for i in data if i != ""]
    return data

# Get lines said by clients
def parse_client_lines(transcript_lines: list):
    speakers = {}
    for line in transcript_lines:
        speaker = line.split(":", 1)[0]
        if ("client" in speaker.lower()) and (":" in line):
            context = line.split(":", 1)[1]
            if speaker not in speakers:
                speakers[speaker] = []
                speakers[speaker].append(context)
            else: 
                speakers[speaker].append(context)

    return speakers


# TODO: CKG left off here, not fun
def naive_question_parser(text: str)-> bool:
    qwords = set(['what', 'where', 'when', 'how', 'why', 'did', 'do', 'does', 'have', 'has', 'am', 'is', 'are', 'can', 'could', 'may', 'would', 'will','should', "didn't", "doesn't", "haven't", "isn't", "aren't", "can't", "couldn't", "wouldn't", "won't", "shouldn't", '?'])
    text = set(text.split())
    overlap = qwords.intersection(text)
    return False if len(overlap) == 0 else True

naive_question_parser("what a lovely day today")

True

In [21]:
out = transcript_lines("transcripts/63a22ff0a39c8c8e9c303fde.txt")
out = parse_client_lines(out)

def parse_client_questions(client_trancsripts: dict):
    qcs = {client: [] for client in client_trancsripts}
    for client in client_trancsripts:
        for line in client_trancsripts[client]:
            result = get_questions_gpt(line)
            if result != "NO_QUESTIONS": 
                result = result.replace("NO_QUESTIONS", "")
                qcs[client].append(result)
    return qcs
# parse_client_lines(out)

In [5]:
# Run Transcription function on whole dir of files
for dirpath, dirnames, filenames in os.walk("transcripts"):
    file_paths = [os.path.join(dirpath, f) for f in filenames]
    res = []
    for script in file_paths:
        data, stats = transcript_stats(script)
        res.append(stats)
        print(f"{script} -- {stats}")


transcripts/637cf23dcc7845331578f507.txt -- {'Host 1': 66, 'Client 2|Client 2': 95, 'n_people': 2}
transcripts/6398e851a39c8c8e9c806e12.txt -- {'Client 2': 87, 'Host 1': 75, 'n_people': 2}
transcripts/6388be8ba39c8c8e9c3d5469.txt -- {'Client 2': 69, 'Host 1': 59, 'n_people': 2}
transcripts/63a074f3a39c8c8e9c0ebe2d.txt -- {'Tom Cruise': 3, 'Client 2': 2, 'n_people': 2}
transcripts/638a5796a39c8c8e9c5e73fd.txt -- {'Host 1': 125, 'Client 2': 103, 'n_people': 2}
transcripts/638783a0a39c8c8e9c23f060.txt -- {'Host 1': 70, 'Client 2': 67, 'n_people': 2}
transcripts/639a1d78a39c8c8e9c979c55.txt -- {'Client 3': 74, 'Host 1': 89, 'Host 4': 5, 'Host 2': 1, 'n_people': 4}
transcripts/63893628a39c8c8e9c4711d1.txt -- {'Client 2': 8, 'Host 1': 34, 'Client 3': 37, 'n_people': 3}
transcripts/63a09820a39c8c8e9c1155aa.txt -- {'Client 3': 61, 'Client 2': 55, 'Host 1': 26, 'n_people': 3}
transcripts/63b8a77da39c8c8e9cd89379.txt -- {'Client 2': 50, 'Host 1': 48, 'n_people': 2}
transcripts/639ba9b6a39c8c8e9c

In [2]:
# Test full sequence on a Script
lines = transcript_lines("transcripts/63b8a77da39c8c8e9cd89379.txt") #extract the lines from the transcript (expects a filepath or string)
client_lines = parse_client_lines(lines) #gets lines from script wherein a client spoke
relevant_questions = parse_client_questions(client_lines) #calls gpt-3 and retrieves relevant questions from the sequence of client text

relevant_questions

{'Client 2': ['Do you offer staff augmentation?\nDo you have a time to hire for a given search?\nHow long have you been focusing on staff augmentation?',
  'Do you help smaller companies with searches?',
  "Who's the ideal customer for you and the services you provide?",
  'Do you think there is opportunity for smaller boutique market consulting oriented services? \nWhat changes have you seen in the last six months? \n',
  'Do you have open vacancies for four months? \nWhat are you doing to get engineers in front of leaders to build trust? \nWhat is the current market like?',
  'Do you usually work on an engaged format with your clients? \nWhat type of fee structure do you work with? \nWhat is your superpower in the business? \nWhat are the metrics driven and stacking residential that you mentioned?',
  'Do you think the VCs have particular departments that I should be talking to?',
  'Do you have a strategy for revenue generation and acceleration? \nWhat block of sign are you trying t

In [19]:
a = ["t", "1", "tat", 'asdfadf', 'asdfasdfasdsafdsf', "testing"]

total_tokens = len(a)
n_steps = int(total_tokens/2)

for i in range(0, n_steps):
    low = i*2
    high = i+2
    print(a[low:high])
    print(low, high)
    



['t', '1']
0 2
['tat']
2 3
[]
4 4


In [24]:
my_list = ["t", "1", "tat", 'asdfadf', 'asdfasdfasdsafdsf', "testing", "lets hope"]
chunk_size = 6
o = list((my_list[i:i + chunk_size] for i in range(0, len(my_list), chunk_size)))
o



[['t', '1', 'tat', 'asdfadf', 'asdfasdfasdsafdsf', 'testing'], ['lets hope']]