In [98]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [46]:
root_directory = '/content/drive/MyDrive'
import os


In [47]:
loc_map_dict = dict()

In [None]:
# Walk through all directories and subdirectories
from collections import defaultdict

all_files = defaultdict(list)

for dirpath, dirnames, filenames in os.walk(root_directory):
    print(f"Current directory: {dirpath}")

    # List the files in the current directory
    for filename in filenames:
        file_path = os.path.join(dirpath, filename)
        # print(f"File: {file_path}")
        file_type = file_path.split('.')[-1]
        all_files[file_type].append(file_path)


In [49]:
all_files.keys()

dict_keys(['pdf', 'docx', 'zip', 'gdoc', 'jpeg', 'ipynb', 'jpg', 'm', 'gslides', 'webp', 'xlsx', 'py', 'pptx', 'csv', 'gsheet', 'fig', 'pkl', 'mp3'])

In [50]:
for file_ext in ['pdf', 'mp3', 'docx']:
    print(f"file type: {file_ext} has {len(all_files[file_ext])} files")

file type: pdf has 80 files
file type: mp3 has 24 files
file type: docx has 14 files


### Extract the text from mp3 file

#### Gather the requirements

```python  

!git clone https://github.com/ggerganov/whisper.cpp.git  
%cd /content/whisper.cpp/models  
!bash download-ggml-model.sh base.en  
%cd ..  
!make   
```

```python  
!pip install pydub  
!apt install ffmpeg  
```

#### UTILS functions

> Imports

In [53]:
import os
from pydub import AudioSegment
import csv
import subprocess

> mp3 -> wav

In [54]:
def convert_to_wav(file_path, audio_log_dir):
    # Split the file path into base and extension
    base, ext = os.path.splitext(file_path)
    # Only process files that aren't already in wav format
    if ext.lower() != ".wav":
        # Load the audio file
        audio = AudioSegment.from_file(file_path)
        # Set frame rate to 16 kHz
        audio = audio.set_frame_rate(16000)
        # Save as wav
        wav_file_path = audio_log_dir + base.split('/')[-1] + ".wav"
        audio.export(wav_file_path, format="wav")
        print(f"Saved as {wav_file_path}")
    else:
        print(f"File is already in wav format: {file_path}")
    return wav_file_path

> wav -> text (using whisper)

In [63]:
def transcript_wav_totext_using_whisper(wav_file_path):
    base, ext = os.path.splitext(wav_file_path)
    command = f"./main -f '{wav_file_path}' >  {base}.txt"
    print(command)
    process = subprocess.run(command, shell=True, check=True)
    text_file_path = f'{base}.txt'
    return text_file_path

> text -> csv

In [64]:
def generate_transcription_csv(text_file_path):
    csv_file_path = text_file_path.replace('.txt', '.csv')
    with open(text_file_path, 'r') as txt_file, open(csv_file_path, 'w') as csv_file:
        csv_writer = csv.writer(csv_file)
        csv_writer.writerow(['time_stamp', 'transcription'])  # Writing header
        for line in txt_file:
            line = line.strip()
            # Skip empty lines
            if line == '':
                continue
            # Split the line into timestamp and transcription
            time_stamp, transcription = line.split(']', 1)
            # Remove the leading '[' from the timestamp
            time_stamp = time_stamp[1:]
            # Remove leading and trailing spaces from the transcription
            transcription = transcription.strip()
            # Skip lines where the transcription is enclosed in square brackets
            if not (transcription.startswith('[') and transcription.endswith(']')):
                # Write the data to the CSV file
                csv_writer.writerow([time_stamp, transcription])
    print(f"CSV file has been saved at: {csv_file_path}")

#### **Final** call: mp3 -> *csv*

In [65]:
def begin(file_path, audio_log_dir):
    wav_file_path = convert_to_wav(file_path, audio_log_dir)
    #################### Map The original path of the mp3 file to csv file ##############
    text_file_path = transcript_wav_totext_using_whisper(wav_file_path)
    generate_transcription_csv(text_file_path)
    loc_map_dict[text_file_path] = file_path
    print(f'file_path {file_path}')
    print(f'wav_file_path {wav_file_path}')
    print(f'text_file_path {text_file_path}\n')
    print(f"File {file_path} converted to csv {text_file_path}")

print(root_directory)
os.chdir(root_directory)
!pwd
audio_log_dir = root_directory+"/log_dir/audio_csv/"
if not os.path.exists(audio_log_dir):
    print("Create audio log dir")
    os.makedirs('log_dir/audio_csv')

else:
    print("path already exist",audio_log_dir)

os.chdir('/content/whisper.cpp/')
for mp3_file in all_files['mp3']:
    begin(mp3_file, audio_log_dir)
    break



/content/drive/MyDrive
/content/drive/MyDrive
path already exist /content/drive/MyDrive/log_dir/audio_csv/
Saved as /content/drive/MyDrive/log_dir/audio_csv/roadtoOz_18_baum_64kb.wav
./main -f '/content/drive/MyDrive/log_dir/audio_csv/roadtoOz_18_baum_64kb.wav' >  /content/drive/MyDrive/log_dir/audio_csv/roadtoOz_18_baum_64kb.txt
CSV file has been saved at: /content/drive/MyDrive/log_dir/audio_csv/roadtoOz_18_baum_64kb.csv
file_path /content/drive/MyDrive/thirdai_hack/road_to_oz_64kb_mp3/roadtoOz_18_baum_64kb.mp3
wav_file_path /content/drive/MyDrive/log_dir/audio_csv/roadtoOz_18_baum_64kb.wav
text_file_path /content/drive/MyDrive/log_dir/audio_csv/roadtoOz_18_baum_64kb.txt

File /content/drive/MyDrive/thirdai_hack/road_to_oz_64kb_mp3/roadtoOz_18_baum_64kb.mp3 converted to csv /content/drive/MyDrive/log_dir/audio_csv/roadtoOz_18_baum_64kb.txt


In [75]:
loc_map_dict

{'/content/drive/MyDrive/log_dir/audio_csvroadtoOz_18_baum_64kb.wav': '/content/drive/MyDrive/thirdai_hack/road_to_oz_64kb_mp3/roadtoOz_18_baum_64kb.mp3',
 '/content/drive/MyDrive/log_dir/audio_csv/roadtoOz_18_baum_64kb.wav': '/content/drive/MyDrive/thirdai_hack/road_to_oz_64kb_mp3/roadtoOz_18_baum_64kb.mp3'}

## Train Neural DB

> imports


```python
!pip3 install thirdai --upgrade
!pip3 install thirdai[neural_db]
!pip3 install langchain --upgrade
!pip3 install openai --upgrade
!pip3 install paper-qa --upgrade
```

In [67]:
import pandas as pd
from thirdai import licensing, neural_db as ndb
licensing.deactivate()
licensing.activate("1FB7DD-CAC3EC-832A67-84208D-C4E39E-V3")

In [None]:
db = ndb.NeuralDB(user_id="siddhesh_saiKrishna")


In [70]:
os.chdir("/content")
os.getcwd()

'/content'

In [71]:
# Set up a cache directory
import os
if not os.path.isdir("bazaar_cache"):
    os.mkdir("bazaar_cache")
from pathlib import Path
from thirdai.neural_db import Bazaar
bazaar = Bazaar(cache_dir=Path("bazaar_cache"))

In [72]:
bazaar.fetch() # Optional arg filter="model name" to filter by model name.

In [73]:
print(bazaar.list_model_names())

['Contract Review', 'Finance QnA', 'General QnA']


In [74]:
db = bazaar.get_model("General QnA")

## Insert files to N-DB

In [77]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

#### PDFS

In [92]:
insertable_docs = []
pdf_files = all_files['pdf']

for file in pdf_files:
    try:
        pdf_doc = ndb.PDF(file)
        insertable_docs.append(pdf_doc)
        print(f'{file} done')
    except:
        pass

/content/drive/MyDrive/22210045_A1.pdf done
/content/drive/MyDrive/22210045_AI.pdf done
/content/drive/MyDrive/22210045_expert.pdf done
/content/drive/MyDrive/22210045_general.pdf done
/content/drive/MyDrive/Edited - exam_1_siddhesh_22210045.pdf done
/content/drive/MyDrive/CCL Tennis Teams List - Sheet2.pdf done
/content/drive/MyDrive/PARAM Ananta User Guide 1.1.pdf done
/content/drive/MyDrive/Tut_1_SiddhiRahpurohit_.pdf done
/content/drive/MyDrive/Classroom/HS 201 World Civilisations Journal Writing/22210045_journal3.pdf done
/content/drive/MyDrive/Classroom/HS 201 World Civilisations Journal Writing/22210045_journal_4.pdf done
/content/drive/MyDrive/Classroom/Computational Neuroscience 2023/exam_1_siddhesh_22210045.pdf done
/content/drive/MyDrive/Classroom/Computational Neuroscience 2023/22210045_project_comp_neuro__Copy_ (2).pdf done
/content/drive/MyDrive/Classroom/Computational Neuroscience 2023/Edited - exam_1_siddhesh_22210045.pdf done
/content/drive/MyDrive/Classroom/Computatio

#### docx

In [93]:
doc_files = all_files['docx']

for file in doc_files:
    try:
        doc = ndb.DOCX(file)
        insertable_docs.append(doc)
        print(f'{file} done')
    except:
        pass

/content/drive/MyDrive/22210045_AI.docx done
/content/drive/MyDrive/22210045_Assignment1.docx done
/content/drive/MyDrive/22210045_A1.docx done
/content/drive/MyDrive/22210045_A2.docx done
/content/drive/MyDrive/22210045_Assignment_3 (1).docx done
/content/drive/MyDrive/22210045_Assignment_3.docx done
/content/drive/MyDrive/HSS_WCC_2 (1).docx done
/content/drive/MyDrive/HSS_WCC_2.docx done
/content/drive/MyDrive/Do's and Don't AC in hostels.docx done
/content/drive/MyDrive/Classroom/HS 201 World Civilisations Journal Writing/HSS_history.docx done
/content/drive/MyDrive/Classroom/HS 201 World Civilisations Journal Writing/HSS_WCC_2.docx done
/content/drive/MyDrive/Classroom/Plag sid 1/HSS_WCC_2.docx done
/content/drive/MyDrive/Classroom/check-suraj/HSS_WCC_2 (1).docx done
/content/drive/MyDrive/Classroom/check-suraj/HSS_WCC_2.docx done


#### music

In [94]:
audio_files = loc_map_dict.keys()

for file in audio_files:
    try:
        doc = ndb.DOCX(file)
        insertable_docs.append(doc)
        print(f'{file} done')
    except:
        print(f"could not insert {file}")

could not insert /content/drive/MyDrive/log_dir/audio_csvroadtoOz_18_baum_64kb.wav
could not insert /content/drive/MyDrive/log_dir/audio_csv/roadtoOz_18_baum_64kb.wav


#### urls

In [95]:
# valid_url_data = ndb.parsing_utils.recursive_url_scrape(base_url="https://www.thirdai.com/pocketllm/", max_crawl_depth=0)
# insertable_docs = []

# for url, response in valid_url_data:
#     try:
#         insertable_docs.append(ndb.URL(url, response))
#     except:
#         continue

In [96]:
source_ids = db.insert(insertable_docs, train=False)


## Search

In [97]:
query = input("Enter prompt: ")
search_results = db.search(
    # query="what is the termination period",
    query=query,
    top_k=3,
    on_error=lambda error_msg: print(f"Error! {error_msg}"))

for result in search_results:
    print(result.text)
    # print(result.context(radius=1))
    if loc_map_dict.get(result.source, None) == None:
        print(result.source)
    else:
        print(loc_map_dict.get(result.source, None))
    # print(result.metadata)
    print('************')

Enter prompt: Niemann. What do you do, then, when you are hoarse? I. Oh, I practise and see whether it still troubles me. Niem. Indeed; and what do you practise? I. Long, slow scales. Niem. Even if you are hoarse? I. Yes; if I want to sing, or have to, I try it. Niem. Well, what are they? Show me. The great scale, the infallible cure.
Thetrueartofsonghasalwaysbeenpossessedandwillalwaysbepossessedby suchindividualsasaredoweredbynaturewithallthatisneedfulforit--thatis healthyvocalorgans uninjuredbyvicioushabitsofspeech;agoodear atalent forsinging intelligence industry andenergy. Informertimeseightyearsweredevotedtothestudyofsinging--atthePrague Conservatory forinstance.Mostofthemistakesandmisunderstandingsofthe pupil could be discovered before he secured an engagement  and the teacher could spend so much time in correcting them that the pupil learned to pass judgmentonhimselfproperly.
/content/drive/MyDrive/thirdai_hack/books/How-to-Sing.pdf
************
1.E.7.Donotchargeafeeforaccessto 