diff --git a/apps/privategpt/ingest.py b/apps/privategpt/ingest.py index d8b539a326..e1265363fa 100644 --- a/apps/privategpt/ingest.py +++ b/apps/privategpt/ingest.py @@ -23,25 +23,23 @@ def load_data(source_folder_path: str): cursor = evadb.connect(path).cursor() # Drop function if it already exists - cursor.drop_function("embedding").execute() - + cursor.query("DROP FUNCTION IF EXISTS embedding;").execute() # Create function from Python file # This function is a sentence feature extractor - embedding_udf = cursor.create_function( - udf_name="embedding", - if_not_exists=True, - impl_path=f"{path}/udfs/sentence_feature_extractor.py", - ) - embedding_udf.execute() + text_feat_function_query = f"""CREATE FUNCTION IF NOT EXISTS embedding + IMPL '{path}/functions/sentence_feature_extractor.py'; + """ + print(text_feat_function_query) + cursor.query(text_feat_function_query).execute() print("๐Ÿงน Dropping existing tables in EvaDB") - cursor.drop_table("data_table").execute() - cursor.drop_table("embedding_table").execute() + cursor.query("DROP TABLE IF EXISTS data_table;").execute() + cursor.query("DROP TABLE IF EXISTS embedding_table;").execute() print("๐Ÿ“„ Loading PDFs into EvaDB") - cursor.load( - file_regex=f"{source_folder_path}/*.pdf", format="PDF", table_name="data_table" - ).execute() + text_load_query = f"""LOAD PDF '{source_folder_path}/*.pdf' INTO data_table;""" + print(text_load_query) + cursor.query(text_load_query).execute() print("๐Ÿค– Extracting Feature Embeddings. 
This may take some time ...") cursor.query( @@ -49,12 +47,13 @@ def load_data(source_folder_path: str): ).execute() print("๐Ÿ” Building FAISS Index ...") - cursor.create_vector_index( - index_name="embedding_index", - table_name="embedding_table", - expr="features", - using="FAISS", - ) + cursor.query( + """ + CREATE INDEX embedding_index + ON embedding_table (features) + USING FAISS; + """ + ).execute() def main(): diff --git a/apps/privategpt/privateGPT.py b/apps/privategpt/privateGPT.py index 8b1086a786..0ee9986140 100644 --- a/apps/privategpt/privateGPT.py +++ b/apps/privategpt/privateGPT.py @@ -23,13 +23,15 @@ def query(question): - context_docs = ( - cursor.table("embedding_table") - .order(f"Similarity(embedding('{question}'), features)") - .limit(3) - .select("data") - .df() - ) + context_docs = cursor.query( + f""" + SELECT data + FROM embedding_table + ORDER BY Similarity(embedding('{question}'), features) + ASC LIMIT 3; + """ + ).df() + # Merge all context information. context = "; \n".join(context_docs["embedding_table.data"]) @@ -51,8 +53,10 @@ def query(question): print("\n>> Context: ") print(context) + print( - "๐Ÿ”ฎ Welcome to EvaDB! Don't forget to run `python ingest.py` before running this file." + "๐Ÿ”ฎ Welcome to EvaDB! Don't forget to run `python ingest.py` before" + " running this file." ) ## Take input of queries from user in a loop diff --git a/apps/story_qa/evadb_qa.py b/apps/story_qa/evadb_qa.py index 3a5370afc5..083696244f 100644 --- a/apps/story_qa/evadb_qa.py +++ b/apps/story_qa/evadb_qa.py @@ -12,19 +12,21 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
+import os from time import perf_counter from gpt4all import GPT4All from unidecode import unidecode -from util import download_story, read_text_line, try_execute +from util import download_story, read_text_line import evadb -def ask_question(path): +def ask_question(story_path: str): # Initialize early to exclude download time. llm = GPT4All("ggml-gpt4all-j-v1.3-groovy") + path = os.path.dirname(evadb.__file__) cursor = evadb.connect().cursor() story_table = "TablePPText" @@ -35,17 +37,17 @@ def ask_question(path): t_i = 0 timestamps[t_i] = perf_counter() - print("Setup UDF") + print("Setup Function") - Text_feat_udf_query = """CREATE UDF IF NOT EXISTS SentenceFeatureExtractor - IMPL 'evadb/udfs/sentence_feature_extractor.py'; + Text_feat_function_query = f"""CREATE FUNCTION IF NOT EXISTS SentenceFeatureExtractor + IMPL '{path}/functions/sentence_feature_extractor.py'; """ - cursor.query("DROP UDF IF EXISTS SentenceFeatureExtractor;").execute() - cursor.query(Text_feat_udf_query).execute() + cursor.query("DROP FUNCTION IF EXISTS SentenceFeatureExtractor;").execute() + cursor.query(Text_feat_function_query).execute() - try_execute(cursor, f"DROP TABLE IF EXISTS {story_table};") - try_execute(cursor, f"DROP TABLE IF EXISTS {story_feat_table};") + cursor.query(f"DROP TABLE IF EXISTS {story_table};").execute() + cursor.query(f"DROP TABLE IF EXISTS {story_feat_table};").execute() t_i = t_i + 1 timestamps[t_i] = perf_counter() @@ -56,7 +58,7 @@ def ask_question(path): cursor.query(f"CREATE TABLE {story_table} (id INTEGER, data TEXT(1000));").execute() # Insert text chunk by chunk. - for i, text in enumerate(read_text_line(path)): + for i, text in enumerate(read_text_line(story_path)): print("text: --" + text + "--") ascii_text = unidecode(text) cursor.query( @@ -84,7 +86,7 @@ def ask_question(path): # Create search index on extracted features. 
cursor.query( - f"CREATE INDEX {index_table} ON {story_feat_table} (features) USING FAISS;" + f"CREATE INDEX {index_table} ON {story_feat_table} (features) USING" " FAISS;" ).execute() t_i = t_i + 1 @@ -139,9 +141,9 @@ def ask_question(path): def main(): - path = download_story() + story_path = download_story() - ask_question(path) + ask_question(story_path) if __name__ == "__main__": diff --git a/apps/youtube_channel_qa/README.md b/apps/youtube_channel_qa/README.md index a2fa535050..70c96a7898 100644 --- a/apps/youtube_channel_qa/README.md +++ b/apps/youtube_channel_qa/README.md @@ -28,6 +28,6 @@ pip install -r requirements.txt ## Usage Run script: ```bat -python multi_youtube_video_qa.py +python youtube_channel_qa.py ``` diff --git a/apps/youtube_channel_qa/youtube_channel_qa.py b/apps/youtube_channel_qa/youtube_channel_qa.py index b457f22142..297cb93804 100644 --- a/apps/youtube_channel_qa/youtube_channel_qa.py +++ b/apps/youtube_channel_qa/youtube_channel_qa.py @@ -24,8 +24,10 @@ import evadb MAX_CHUNK_SIZE = 10000 -CHATGPT_UDF_PATH = "../../evadb/udfs/chatgpt.py" -SENTENCE_FEATURE_EXTRACTOR_UDF_PATH = "../../evadb/udfs/sentence_feature_extractor.py" +CHATGPT_FUNCTION_PATH = "../../evadb/functions/chatgpt.py" +SENTENCE_FEATURE_EXTRACTOR_FUNCTION_PATH = ( + "../../evadb/functions/sentence_feature_extractor.py" +) QUESTIONS_PATH = "./questions.txt" YT_VIDEO_IDS_PATH = "./yt_video_ids.txt" @@ -105,7 +107,7 @@ def download_youtube_video_transcript(video_link: str): time_taken = time.time() - start total_transcription_time += time_taken - print(f"โœ… Video transcript downloaded successfully in {time_taken} seconds \n") + print("โœ… Video transcript downloaded successfully in" f" {time_taken} seconds \n") return transcript @@ -143,29 +145,30 @@ def generate_online_video_transcript(cursor) -> str: print("Analyzing videos. 
This may take a while...") start = time.time() - # bootstrap speech analyzer udf and chatgpt udf for analysis - args = {"task": "automatic-speech-recognition", "model": "openai/whisper-base"} - speech_analyzer_udf_rel = cursor.create_function( - "SpeechRecognizer", type="HuggingFace", **args - ) - speech_analyzer_udf_rel.execute() + # bootstrap speech analyzer function and chatgpt function for analysis + speech_analyzer_function_query = """ + CREATE FUNCTION SpeechRecognizer + TYPE HuggingFace + TASK 'automatic-speech-recognition' + MODEL 'openai/whisper-base'; + """ + cursor.query(speech_analyzer_function_query).execute() # load youtube video into an evadb table - cursor.drop_table("youtube_video", if_exists=True).execute() - cursor.load("*.mp4", "youtube_video", "video").execute() + cursor.query("DROP TABLE IF EXISTS youtube_video;").execute() + cursor.query("LOAD VIDEO '*.mp4' INTO youtube_video;").execute() # extract speech texts from videos cursor.query( - "CREATE TABLE IF NOT EXISTS youtube_video_text AS SELECT SpeechRecognizer(audio) FROM youtube_video;" + "CREATE TABLE IF NOT EXISTS youtube_video_text AS SELECT" + " SpeechRecognizer(audio) FROM youtube_video;" ).execute() print(f"Video transcript generated in {time.time() - start} seconds.") total_transcription_time += time.time() - start - raw_transcript_string = ( - cursor.table("youtube_video_text") - .select("text") - .df()["youtube_video_text.text"][0] - ) + raw_transcript_string = cursor.query("SELECT text FROM youtube_video_text;").df()[ + "youtube_video_text.text" + ][0] return raw_transcript_string @@ -182,7 +185,7 @@ def generate_response(cursor: evadb.EvaDBCursor, question: str) -> str: # instead of passing all the documents to the LLM, we first do a # semantic search over the embeddings and get the most relevant rows. 
- cursor.drop_table("EMBED_TEXT", if_exists=True).execute() + cursor.query("DROP TABLE IF EXISTS EMBED_TEXT;").execute() text_summarization_query = f""" CREATE TABLE EMBED_TEXT AS SELECT text FROM embedding_table @@ -193,9 +196,12 @@ def generate_response(cursor: evadb.EvaDBCursor, question: str) -> str: cursor.query(text_summarization_query).execute() start = time.time() - prompt = "Answer the questions based on context alone. Do no generate responses on your own." - generate_chatgpt_response_rel = cursor.table("EMBED_TEXT").select( - f"ChatGPT('{question}', text, '{prompt}')" + prompt = ( + "Answer the questions based on context alone. Do not generate responses" + " on your own." + ) + generate_chatgpt_response_rel = cursor.query( + f"SELECT ChatGPT('{question}', text, '{prompt}') FROM EMBED_TEXT;" ) responses = generate_chatgpt_response_rel.df()["chatgpt.response"] print(f"Answer (generated in {time.time() - start} seconds):") @@ -212,14 +218,16 @@ def cleanup(): if __name__ == "__main__": print( - "๐Ÿ”ฎ Welcome to EvaDB! This app lets you ask questions on any YouTube channel.\n\n" + "๐Ÿ”ฎ Welcome to EvaDB! This app lets you ask questions on any YouTube" + " channel.\n\n" ) yt_video_ids = [] # get Youtube video url channel_name = str( input( - "๐Ÿ“บ Enter the Channel Name (press Enter to use our default Youtube Channel) : " + "๐Ÿ“บ Enter the Channel Name (press Enter to use our default Youtube" + " Channel) : " ) ) @@ -227,7 +235,8 @@ def cleanup(): channel_name = DEFAULT_CHANNEL_NAME limit = input( - "Enter the number of videos to download (press Enter to download one video) : " + "Enter the number of videos to download (press Enter to download one" + " video) : " ) if limit == "": @@ -237,7 +246,9 @@ def cleanup(): sort_by = str( input( - "Enter the order in which to retrieve the videos (Either 'newest' / 'oldest' / 'popular'). 
Press Enter to go with 'popular' option : " + "Enter the order in which to retrieve the videos (Either 'newest'" + " / 'oldest' / 'popular'). Press Enter to go with 'popular'" + " option : " ) ).lower() @@ -276,7 +287,8 @@ def cleanup(): except Exception as e: print(e) print( - "โ—๏ธ Failed to download video transcript. Will try downloading video and generating transcript later... \n\n" + "โ—๏ธ Failed to download video transcript. Will try downloading" + " video and generating transcript later... \n\n" ) failed_download_links.append(yt_url) continue @@ -313,7 +325,8 @@ def cleanup(): mp4_files = [file for file in files if file.endswith(".mp4")] if not mp4_files: print( - "No mp4 files found in current directory. Not generating video transcripts ..." + "No mp4 files found in current directory. Not generating video" + " transcripts ..." ) else: raw_transcript_string = generate_online_video_transcript(cursor) @@ -329,25 +342,25 @@ def cleanup(): load_start_time = time.time() # load chunked transcript into table - cursor.drop_table("Transcript", if_exists=True).execute() + cursor.query("DROP TABLE IF EXISTS Transcript;").execute() cursor.query( """CREATE TABLE IF NOT EXISTS Transcript (text TEXT(100));""" ).execute() - cursor.load("transcript.csv", "Transcript", "csv").execute() + cursor.query("LOAD CSV 'transcript.csv' INTO Transcript;").execute() print( - f"Loading transcripts into DB took {time.time() - load_start_time} seconds" + "Loading transcripts into DB took" + f" {time.time() - load_start_time} seconds" ) print("Creating embeddings and Vector Index") - cursor.drop_function("embedding", if_exists=True).execute() - cursor.create_function( - "embedding", - if_not_exists=True, - impl_path=SENTENCE_FEATURE_EXTRACTOR_UDF_PATH, + cursor.query("DROP FUNCTION IF EXISTS embedding;").execute() + cursor.query( + "CREATE FUNCTION IF NOT EXISTS embedding IMPL" + f" '{SENTENCE_FEATURE_EXTRACTOR_FUNCTION_PATH}';" ).execute() - cursor.drop_table("embedding_table", 
if_exists=True).execute() + cursor.query("DROP TABLE IF EXISTS embedding_table;").execute() est = time.time() cursor.query( """CREATE TABLE embedding_table AS @@ -358,13 +371,13 @@ def cleanup(): print(f"Creating embeddings took {eft - est} seconds") # Create search index on extracted features. - cursor.create_vector_index( - index_name="faiss_index", - table_name="embedding_table", - expr="features", - using="FAISS", - ).df() - + cursor.query( + """ + CREATE INDEX faiss_index + ON embedding_table (features) + USING FAISS; + """ + ).execute() vet = time.time() print(f"Creating index took {vet - eft} seconds") @@ -376,7 +389,8 @@ def cleanup(): print(question) generate_response(cursor, question) print( - "Total time taken in answering all questions = ", str(time.time() - st) + "Total time taken in answering all questions = ", + str(time.time() - st), ) else: # Enter a QA Loop. ready = True @@ -385,7 +399,7 @@ def cleanup(): if question.lower() == "exit": ready = False else: - # Generate response with chatgpt udf + # Generate response with chatgpt function print("โณ Generating response (may take a while)...") generate_response(cursor, question) cleanup() diff --git a/apps/youtube_qa/youtube_qa.py b/apps/youtube_qa/youtube_qa.py index a26d100b2f..5a56bbe29d 100644 --- a/apps/youtube_qa/youtube_qa.py +++ b/apps/youtube_qa/youtube_qa.py @@ -55,11 +55,14 @@ def receive_user_input() -> Dict: user_input (dict): global configurations """ print( - "๐Ÿ”ฎ Welcome to EvaDB! This app lets you ask questions on any local or YouTube online video.\nYou will only need to supply a Youtube URL and an OpenAI API key.\n" + "๐Ÿ”ฎ Welcome to EvaDB! This app lets you ask questions on any local or" + " YouTube online video.\nYou will only need to supply a Youtube URL" + " and an OpenAI API key.\n" ) from_youtube = str( input( - "๐Ÿ“น Are you querying an online Youtube video or a local video? 
('yes' for online/ 'no' for local): " + "๐Ÿ“น Are you querying an online Youtube video or a local video?" + " ('yes' for online/ 'no' for local): " ) ).lower() in ["y", "yes"] user_input = {"from_youtube": from_youtube} @@ -68,7 +71,8 @@ def receive_user_input() -> Dict: # get Youtube video url video_link = str( input( - "๐ŸŒ Enter the URL of the YouTube video (press Enter to use our default Youtube video URL): " + "๐ŸŒ Enter the URL of the YouTube video (press Enter to use our" + " default Youtube video URL): " ) ) @@ -78,7 +82,8 @@ def receive_user_input() -> Dict: else: video_local_path = str( input( - "๐Ÿ’ฝ Enter the local path to your video (press Enter to use our demo video): " + "๐Ÿ’ฝ Enter the local path to your video (press Enter to use our" + " demo video): " ) ) @@ -211,21 +216,19 @@ def generate_online_video_transcript(cursor: evadb.EvaDBCursor) -> str: print("\nโณ Analyzing YouTube video. This may take a while...") # load youtube video into an evadb table - cursor.drop_table("youtube_video", if_exists=True).execute() - cursor.load(ONLINE_VIDEO_PATH, "youtube_video", "video").execute() - + cursor.query("DROP TABLE IF EXISTS youtube_video;").execute() + cursor.query(f"LOAD VIDEO '{ONLINE_VIDEO_PATH}' INTO youtube_video;").execute() # extract speech texts from videos - cursor.drop_table("youtube_video_text", if_exists=True).execute() + cursor.query("DROP TABLE IF EXISTS youtube_video_text;").execute() cursor.query( - "CREATE TABLE IF NOT EXISTS youtube_video_text AS SELECT SpeechRecognizer(audio) FROM youtube_video;" + "CREATE TABLE IF NOT EXISTS youtube_video_text AS SELECT" + " SpeechRecognizer(audio) FROM youtube_video;" ).execute() print("โœ… Video analysis completed.") - raw_transcript_string = ( - cursor.table("youtube_video_text") - .select("text") - .df()["youtube_video_text.text"][0] - ) + raw_transcript_string = cursor.query("SELECT text FROM youtube_video_text;").df()[ + "youtube_video_text.text" + ][0] return raw_transcript_string @@ 
-239,23 +242,24 @@ def generate_local_video_transcript(cursor: evadb.EvaDBCursor, video_path: str) Returns: str: video transcript text. """ - print(f"\nโณ Analyzing local video from {video_path}. This may take a while...") + print(f"\nโณ Analyzing local video from {video_path}. This may take a" " while...") # load youtube video into an evadb table - cursor.drop_table("local_video", if_exists=True).execute() - cursor.load(video_path, "local_video", "video").execute() + cursor.query("DROP TABLE IF EXISTS local_video;").execute() + cursor.query(f"LOAD VIDEO '{video_path}' INTO local_video;").execute() # extract speech texts from videos - cursor.drop_table("local_video_text", if_exists=True).execute() + cursor.query("DROP TABLE IF EXISTS local_video_text;").execute() cursor.query( - "CREATE TABLE IF NOT EXISTS local_video_text AS SELECT SpeechRecognizer(audio) FROM local_video;" + "CREATE TABLE IF NOT EXISTS local_video_text AS SELECT" + " SpeechRecognizer(audio) FROM local_video;" ).execute() print("โœ… Video analysis completed.") # retrieve generated transcript - raw_transcript_string = ( - cursor.table("local_video_text").select("text").df()["local_video_text.text"][0] - ) + raw_transcript_string = cursor.query("SELECT text FROM local_video_text;").df()[ + "local_video_text.text" + ][0] return raw_transcript_string @@ -265,24 +269,25 @@ def generate_summary(cursor: evadb.EvaDBCursor): Args: cursor (EVADBCursor): evadb api cursor. 
""" - transcript_list = cursor.table("Transcript").select("text").df()["transcript.text"] + transcript_list = cursor.query("SELECT text FROM Transcript;").df()[ + "transcript.text" + ] if len(transcript_list) == 1: summary = transcript_list[0] df = pd.DataFrame([{"summary": summary}]) df.to_csv(SUMMARY_PATH) - cursor.drop_table("Summary", if_exists=True).execute() + cursor.query("DROP TABLE IF EXISTS Summary;").execute() cursor.query( """CREATE TABLE IF NOT EXISTS Summary (summary TEXT(100));""" ).execute() - cursor.load(SUMMARY_PATH, "Summary", "csv").execute() + cursor.query(f"LOAD CSV '{SUMMARY_PATH}' INTO Summary;").execute() return - generate_summary_rel = cursor.table("Transcript").select( - "ChatGPT('summarize the video in detail', text)" + generate_summary_text_query = ( + "SELECT ChatGPT('summarize the video in detail', text) FROM" " Transcript;" ) - responses = generate_summary_rel.df()["chatgpt.response"] - + responses = cursor.query(generate_summary_text_query).df()["chatgpt.response"] summary = "" for r in responses: summary += f"{r} \n" @@ -296,16 +301,16 @@ def generate_summary(cursor: evadb.EvaDBCursor): df = pd.DataFrame([{"summary": partitioned_summary}]) df.to_csv(SUMMARY_PATH) - cursor.drop_table("Summary", if_exists=True).execute() + cursor.query("DROP TABLE IF EXISTS Summary;").execute() cursor.query( """CREATE TABLE IF NOT EXISTS Summary (summary TEXT(100));""" ).execute() - cursor.load(SUMMARY_PATH, "Summary", "csv").execute() + cursor.query(f"LOAD CSV '{SUMMARY_PATH}' INTO Summary;").execute() - generate_summary_rel = cursor.table("Summary").select( - "ChatGPT('summarize in detail', summary)" + generate_summary_text_query = ( + "SELECT ChatGPT('summarize in detail', summary) FROM Summary;" ) - responses = generate_summary_rel.df()["chatgpt.response"] + responses = cursor.query(generate_summary_text_query).df()["chatgpt.response"] summary = " ".join(responses) # no further summarization is needed if the summary is short enough @@ -313,11 
+318,11 @@ def generate_summary(cursor: evadb.EvaDBCursor): need_to_summarize = False # load final summary to table - cursor.drop_table("Summary", if_exists=True).execute() + cursor.query("DROP TABLE IF EXISTS Summary;").execute() cursor.query( """CREATE TABLE IF NOT EXISTS Summary (summary TEXT(100));""" ).execute() - cursor.load(SUMMARY_PATH, "Summary", "csv").execute() + cursor.query(f"LOAD CSV '{SUMMARY_PATH}' INTO Summary;").execute() def generate_response(cursor: evadb.EvaDBCursor, question: str) -> str: @@ -331,21 +336,17 @@ def generate_response(cursor: evadb.EvaDBCursor, question: str) -> str: str: response from llm. """ # generate summary - if len(cursor.table("Transcript").select("text").df()["transcript.text"]) == 1: - return ( - cursor.table("Transcript") - .select(f"ChatGPT('{question}', text)") - .df()["chatgpt.response"][0] - ) + if len(cursor.query("SELECT text FROM Transcript;").df()["transcript.text"]) == 1: + return cursor.query( + f"SELECT ChatGPT('{question}', text) FROM Transcript;" + ).df()["chatgpt.response"][0] else: if not os.path.exists(SUMMARY_PATH): generate_summary(cursor) - return ( - cursor.table("Summary") - .select(f"ChatGPT('{question}', summary)") - .df()["chatgpt.response"][0] - ) + return cursor.query( + f"SELECT ChatGPT('{question}', summary) FROM Summary;" + ).df()["chatgpt.response"][0] def generate_blog_sections(cursor: evadb.EvaDBCursor) -> List: @@ -358,12 +359,12 @@ def generate_blog_sections(cursor: evadb.EvaDBCursor) -> List: List: list of blog sections """ sections_query = ( - "list 7 logical sections of a blog post from the transcript as a python list" + "list 7 logical sections of a blog post from the transcript as a" " python list" ) sections_string = str( - cursor.table("Summary") - .select(f"ChatGPT('{sections_query}', summary)") - .df()["chatgpt.response"][0] + cursor.query(f"SELECT ChatGPT('{sections_query}', summary) FROM Summary;").df()[ + "chatgpt.response" + ][0] ) begin = sections_string.find("[") end = 
sections_string.find("]") @@ -389,7 +390,9 @@ def generate_blog_post(cursor: evadb.EvaDBCursor): """ to_generate = str( - input("\nWould you like to generate a blog post based on the video? (yes/no): ") + input( + "\nWould you like to generate a blog post based on the video?" " (yes/no): " + ) ) if to_generate.lower() == "yes" or to_generate.lower() == "y": print("โณ Generating blog post (may take a while)...") @@ -401,28 +404,36 @@ def generate_blog_post(cursor: evadb.EvaDBCursor): sections = generate_blog_sections(cursor) title_query = "generate a creative title of a blog post from the transcript" - generate_title_rel = cursor.table("Summary").select( - f"ChatGPT('{title_query}', summary)" + generate_title_rel = cursor.query( + f"SELECT ChatGPT('{title_query}', summary) FROM Summary;" ) blog = "# " + generate_title_rel.df()["chatgpt.response"][0].replace('"', "") i = 1 for section in sections: - print(f"--โณ Generating body ({i}/{len(sections)})...") + print(f"--โณ Generating body ({i}/{len(sections)}) titled" f" {section}...") if "introduction" in section.lower(): section_query = f"write a section about {section} from transcript" - section_prompt = "generate response in markdown format and highlight important technical terms with hyperlinks" + section_prompt = ( + "generate response in markdown format and highlight" + " important technical terms with hyperlinks" + ) elif "conclusion" in section.lower(): section_query = "write a creative conclusion from transcript" section_prompt = "generate response in markdown format" else: section_query = ( - f"write a single detailed section about {section} from transcript" + "write a single detailed section about" + f" {section} from transcript" + ) + section_prompt = ( + "generate response in markdown format with information" + " from the internet" ) - section_prompt = "generate response in markdown format with information from the internet" - generate_section_rel = cursor.table("Summary").select( - 
f"ChatGPT('{section_query}', summary, '{section_prompt}')" + generate_section_rel = cursor.query( + f"SELECT ChatGPT('{section_query}', summary," + f" '{section_prompt}') FROM Summary;" ) generated_section = generate_section_rel.df()["chatgpt.response"][0] @@ -431,12 +442,13 @@ def generate_blog_post(cursor: evadb.EvaDBCursor): i += 1 source_query = ( - "generate a short list of keywords for the transcript with hyperlinks" + "generate a short list of keywords for the transcript with" " hyperlinks" ) source_prompt = "generate response in markdown format" print("--โณ Wrapping up...") - generate_source_rel = cursor.table("Summary").select( - f"ChatGPT('{source_query}', summary, '{source_prompt}')" + generate_source_rel = cursor.query( + f"SELECT ChatGPT('{source_query}', summary," + f" '{source_prompt}') FROM Summary;" ) blog += "\n## Sources\n" + generate_source_rel.df()["chatgpt.response"][0] print(blog) @@ -468,7 +480,8 @@ def cleanup(): except Exception as e: print(e) print( - "Failed to download video transcript. Downloading video and generate transcript from video instead..." + "Failed to download video transcript. Downloading video and" + " generate transcript from video instead..." 
) try: @@ -479,15 +492,15 @@ def cleanup(): if transcript is not None: raw_transcript_string = group_transcript(transcript) else: - # create speech recognizer UDF from HuggingFace - args = { - "task": "automatic-speech-recognition", - "model": "openai/whisper-base", - } - speech_analyzer_udf_rel = cursor.create_function( - "SpeechRecognizer", type="HuggingFace", **args - ) - speech_analyzer_udf_rel.execute() + # create speech recognizer function from HuggingFace + + speech_analyzer_function_query = """ + CREATE FUNCTION SpeechRecognizer + TYPE HuggingFace + TASK 'automatic-speech-recognition' + MODEL 'openai/whisper-base'; + """ + cursor.query(speech_analyzer_function_query).execute() if user_input["from_youtube"]: # download youtube video online if the video disabled transcript @@ -508,11 +521,11 @@ def cleanup(): df.to_csv(TRANSCRIPT_PATH) # load chunked transcript into table - cursor.drop_table("Transcript", if_exists=True).execute() + cursor.query("DROP TABLE IF EXISTS Transcript;").execute() cursor.query( """CREATE TABLE IF NOT EXISTS Transcript (text TEXT(50));""" ).execute() - cursor.load(TRANSCRIPT_PATH, "Transcript", "csv").execute() + cursor.query(f"LOAD CSV '{TRANSCRIPT_PATH}' INTO Transcript;").execute() print("===========================================") print("๐Ÿช„ Ask anything about the video!") diff --git a/script/formatting/formatter.py b/script/formatting/formatter.py index a7b5e36ad0..885d0fb0f4 100755 --- a/script/formatting/formatter.py +++ b/script/formatting/formatter.py @@ -65,6 +65,13 @@ def wrapped(*args, **kwargs): DEFAULT_DIRS.append(EvaDB_APPS_DIR) IGNORE_FILES = ["version.py"] +IGNORE_PRINT_FILES = [ + "apps/privategpt/privateGPT.py", + "apps/privategpt/ingest.py", + "apps/story_qa/evadb_qa.py", + "apps/youtube_qa/youtube_qa.py", + "apps/youtube_channel_qa/youtube_channel_qa.py", +] FLAKE8_VERSION_REQUIRED = "3.9.1" BLACK_VERSION_REQUIRED = "22.6.0" @@ -228,7 +235,7 @@ def format_file(file_path, add_header, strip_header, format_code): 
# CHECK FOR INVALID WORDS (like print) with open(file_path, 'r') as file: for line_num, line in enumerate(file, start=1): - if ' print(' in line: + if file_path not in IGNORE_PRINT_FILES and ' print(' in line: LOG.warning(f"print() found in {file_path}, line {line_num}: {line.strip()}") sys.exit(1)