In [61]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [62]:
root_directory = '/content/drive/MyDrive'
import os


In [63]:
loc_map_dict = dict()

## Fetch All the Files

In [64]:
# Walk through all directories and subdirectories
from collections import defaultdict

all_files = defaultdict(list)

for dirpath, dirnames, filenames in os.walk(root_directory):
    print(f"Current directory: {dirpath}")

    # List the files in the current directory
    for filename in filenames:
        file_path = os.path.join(dirpath, filename)
        # print(f"File: {file_path}")
        file_type = file_path.split('.')[-1]
        all_files[file_type].append(file_path)


Current directory: /content/drive/MyDrive
Current directory: /content/drive/MyDrive/Dwarka (1)
Current directory: /content/drive/MyDrive/Colab Notebooks
Current directory: /content/drive/MyDrive/Dwarka
Current directory: /content/drive/MyDrive/transcriptions
Current directory: /content/drive/MyDrive/log_dir
Current directory: /content/drive/MyDrive/log_dir/audio_csv


In [65]:
all_files.keys()

dict_keys(['gdoc', 'docx', 'pdf', 'gslides', 'pptx', 'ipynb', 'png', '/content/drive/MyDrive/Untitled', 'gsheet', 'csv', 'xlsx', 'xls', 'mp3', 'jpg', 'mp4', 'jpeg', 'webp', '/content/drive/MyDrive/Dwarka (1)/Audio from Sai Krishna', '/content/drive/MyDrive/Dwarka/Audio from Sai Krishna', 'txt', 'wav'])

In [66]:
for file_ext in ['pdf', 'mp3', 'docx']:
    print(f"file type: {file_ext} has {len(all_files[file_ext])} files")

file type: pdf has 7 files
file type: mp3 has 2 files
file type: docx has 11 files


### Extract the text from mp3 file

#### Gather the requirements

In [67]:
mp3_train = False
print("Do you want to fetch and train mp3 files too?")
print("It will take a lot of time. You can skip it for now")
if 'y' in input("Press 'y' to extract audio files else press 'n'").lower():
  mp3_train = True

Do you want to fetch and train mp3 files too?
It will take a lot of time. You can skip it for now
Press 'y' to extract audio files else press 'n'n


In [68]:
mp3_train


False

In [69]:
if mp3_train:
  !git clone https://github.com/ggerganov/whisper.cpp.git
  %cd /content/whisper.cpp/models
  !bash download-ggml-model.sh base.en
  %cd ..
  !make
  !pip install pydub
  !apt install ffmpeg
else:
  print("You need to install all the dependencies later")

You need to install all the dependencies later


```python  

!git clone https://github.com/ggerganov/whisper.cpp.git  
%cd /content/whisper.cpp/models  
!bash download-ggml-model.sh base.en  
%cd ..  
!make   
```

```python  
!pip install pydub  
!apt install ffmpeg  
```

#### UTILS functions

> Imports

In [70]:
import os
from pydub import AudioSegment
import csv
import subprocess

> mp3 -> wav

In [71]:
def convert_to_wav(file_path, audio_log_dir):
    # Split the file path into base and extension
    base, ext = os.path.splitext(file_path)
    # Only process files that aren't already in wav format
    if ext.lower() != ".wav":
        # Load the audio file
        audio = AudioSegment.from_file(file_path)
        # Set frame rate to 16 kHz
        audio = audio.set_frame_rate(16000)
        # Save as wav
        wav_file_path = audio_log_dir + base.split('/')[-1] + ".wav"
        audio.export(wav_file_path, format="wav")
        print(f"Saved as {wav_file_path}")
    else:
        print(f"File is already in wav format: {file_path}")
    return wav_file_path

> wav -> text (using whisper)

In [72]:
def transcript_wav_totext_using_whisper(wav_file_path):
    base, ext = os.path.splitext(wav_file_path)
    command = f"./main -f '{wav_file_path}' >  {base}.txt"
    print(command)
    process = subprocess.run(command, shell=True, check=True)
    text_file_path = f'{base}.txt'
    return text_file_path

> text -> csv

In [73]:
def generate_transcription_csv(text_file_path):
    csv_file_path = text_file_path.replace('.txt', '.csv')
    with open(text_file_path, 'r') as txt_file, open(csv_file_path, 'w') as csv_file:
        csv_writer = csv.writer(csv_file)
        csv_writer.writerow(['time_stamp', 'transcription'])  # Writing header
        for line in txt_file:
            line = line.strip()
            # Skip empty lines
            if line == '':
                continue
            # Split the line into timestamp and transcription
            time_stamp, transcription = line.split(']', 1)
            # Remove the leading '[' from the timestamp
            time_stamp = time_stamp[1:]
            # Remove leading and trailing spaces from the transcription
            transcription = transcription.strip()
            # Skip lines where the transcription is enclosed in square brackets
            if not (transcription.startswith('[') and transcription.endswith(']')):
                # Write the data to the CSV file
                csv_writer.writerow([time_stamp, transcription])
    print(f"CSV file has been saved at: {csv_file_path}")
    return csv_file_path

#### **Final** call: mp3 -> *csv*

In [74]:
def begin(file_path, audio_log_dir):
    wav_file_path = convert_to_wav(file_path, audio_log_dir)
    #################### Map The original path of the mp3 file to csv file ##############
    text_file_path = transcript_wav_totext_using_whisper(wav_file_path)
    csv_file_path = generate_transcription_csv(text_file_path)
    loc_map_dict[csv_file_path] = file_path
    print(f'file_path {file_path}')
    print(f'wav_file_path {wav_file_path}')
    print(f'text_file_path {text_file_path}\n')
    print(f"File {file_path} converted to csv {text_file_path}")

In [75]:

if mp3_train:
  print(root_directory)
  os.chdir(root_directory)
  !pwd
  audio_log_dir = root_directory+"/log_dir/audio_csv/"
  if not os.path.exists(audio_log_dir):
      print("Create audio log dir")
      os.makedirs('log_dir/audio_csv')

  else:
      print("path already exist",audio_log_dir)

  os.chdir('/content/whisper.cpp/')
  for mp3_file in all_files['mp3']:
      begin(mp3_file, audio_log_dir)
      break



In [76]:
loc_map_dict

{}

## Train Neural DB

> imports


```python
!pip3 install thirdai --upgrade
!pip3 install thirdai[neural_db]
!pip3 install langchain --upgrade
!pip3 install openai --upgrade
!pip3 install paper-qa --upgrade
```

In [77]:
!pip3 install thirdai --upgrade
!pip3 install thirdai[neural_db]
!pip3 install langchain --upgrade
!pip3 install openai --upgrade
!pip3 install paper-qa --upgrade



In [78]:
import pandas as pd
from thirdai import licensing, neural_db as ndb
licensing.deactivate()
licensing.activate("1FB7DD-CAC3EC-832A67-84208D-C4E39E-V3")

In [79]:
db = ndb.NeuralDB(user_id="siddhesh_saiKrishna")


In [80]:
os.chdir("/content")
os.getcwd()

'/content'

In [81]:
# Set up a cache directory
import os
if not os.path.isdir("bazaar_cache"):
    os.mkdir("bazaar_cache")
from pathlib import Path
from thirdai.neural_db import Bazaar
bazaar = Bazaar(cache_dir=Path("bazaar_cache"))

In [82]:
bazaar.fetch() # Optional arg filter="model name" to filter by model name.

In [83]:
print(bazaar.list_model_names())

['Contract Review', 'Finance QnA', 'General QnA']


In [84]:
db = bazaar.get_model("General QnA")

## Insert files to N-DB

In [85]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

#### PDFS

In [86]:
insertable_docs = []
pdf_files = all_files['pdf']

for file in pdf_files:
    try:
        pdf_doc = ndb.PDF(file)
        insertable_docs.append(pdf_doc)
        print(f'{file} done')
    except:
        pass

/content/drive/MyDrive/Personal Statement .pdf done
/content/drive/MyDrive/SOP.pdf done
/content/drive/MyDrive/Introduction to Algorithms-Cormen.pdf done
/content/drive/MyDrive/Software Engineering Rajib Mall.pdf done
/content/drive/MyDrive/UnderstandingDeepLearning_03_05_23_C.pdf done
/content/drive/MyDrive/Resume_2023.pdf done
/content/drive/MyDrive/DOC-20230804-WA0021.pdf done


#### docx

In [87]:
doc_files = all_files['docx']

for file in doc_files:
    try:
        doc = ndb.DOCX(file)
        insertable_docs.append(doc)
        print(f'{file} done')
    except:
        pass

/content/drive/MyDrive/22210036_FP602 (1).docx done
/content/drive/MyDrive/22210036_FP602.docx done
/content/drive/MyDrive/22210036_Assignment2.docx done
/content/drive/MyDrive/Abstract_22210036 (1).docx done
/content/drive/MyDrive/Abstract_22210036.docx done
/content/drive/MyDrive/sai krishna resume.docx done
/content/drive/MyDrive/LOR-Yashwanth(Chalapathi Rao) (1).docx done
/content/drive/MyDrive/LOR-Yashwanth(Chalapathi Rao).docx done
/content/drive/MyDrive/A.Y. 2021-2022.docx done
/content/drive/MyDrive/Doc1.docx done
/content/drive/MyDrive/Resume_2023.docx done


In [88]:
loc_map_dict.keys()
audio_log_dir


'/content/drive/MyDrive/log_dir/audio_csv/'

#### music

In [89]:
audio_files = [audio_log_dir+file for file in os.listdir(audio_log_dir) if file.endswith('.csv')]
audio_files

['/content/drive/MyDrive/log_dir/audio_csv/roadtoOz_24_baum_64kb.csv']

In [90]:
import pandas as pd
# Read the CSV file
for file in audio_files:
    df = pd.read_csv(file)
    # Add the new column with values from 0 to n-1
    df['DOC_ID'] = range(len(df))
    # Save the updated DataFrame back to the CSV file
    df.to_csv(file, index=False)


In [91]:
if mp3_train:
  for file in audio_files:
      try:
        csv_doc = ndb.CSV(
            path=file,
            id_column="DOC_ID",
            strong_columns=["transcription"],
            weak_columns=["time_stamp"],
            reference_columns=["time_stamp"])
        insertable_docs.append(csv_doc)
      except:
        pass

### Insert into database

In [92]:
source_ids = db.insert(insertable_docs, train=False)


# Search

In [96]:
query = input("Enter prompt: ")
search_results = db.search(
    # query="what is the termination period",
    query=query,
    top_k=3,
    on_error=lambda error_msg: print(f"Error! {error_msg}"))

for result in search_results:
    print(result.text)
    # print(result.context(radius=1))
    if loc_map_dict.get(result.source, None) == None:
        print(result.source)
    else:
        print(loc_map_dict.get(result.source, None))
    # print(result.metadata)
    print('************')

Enter prompt: sai krishna avula
Best, Sai Krishna
/content/drive/MyDrive/22210036_FP602 (1).docx
************
Best, Sai Krishna
/content/drive/MyDrive/22210036_FP602.docx
************
(C) MIT Press. B.4 Special types of matrix 445 to the origin when the matrix is applied. Determinants of matrix expressions obey the following rules: |AT | = |A| |AB| = |A||B| |A-1| = 1/|A|. (B.14) The trace of a square matrix is the sum of the diagonal values (the matrix itself need not be diagonal) or the sum of the eigenvalues.
/content/drive/MyDrive/UnderstandingDeepLearning_03_05_23_C.pdf
************
