In [None]:
# Install the third-party packages this notebook imports or shells out to.
# NOTE(review): none of these installs pins a version, so a fresh run resolves
# whatever is current at that moment — consider pinning for reproducibility.
!pip install langchain
!pip install torch
!pip install sentence_transformers
!pip install faiss-gpu
!pip install huggingface-hub
!pip install pypdf
!pip -q install accelerate
# Force a from-source CMake build of llama-cpp-python with LLAMA_CUBLAS turned on.
# NOTE(review): presumably this enables the GPU offload requested later via
# n_gpu_layers — confirm the flag is still accepted by the version that gets installed.
!CMAKE_ARGS="-DLLAMA_CUBLAS=on" FORCE_CMAKE=1 pip install llama-cpp-python
# Install transformers directly from the GitHub repository rather than from PyPI.
!pip -q install git+https://github.com/huggingface/transformers


In [None]:
from langchain.chains import RetrievalQA
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms import LlamaCpp
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.document_loaders import PyPDFLoader
import torch

In [None]:
# Prefer the first CUDA GPU; fall back to the CPU when CUDA is unavailable.
# (The previous fallback was "cuda", so the availability check had no effect.)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [None]:
# Load the source PDF through PyPDFLoader; `data` is what loader.load() returns.
PDF_PATH = "/kaggle/input/llmdataset/data.pdf"
loader = PyPDFLoader(PDF_PATH)
data = loader.load()

In [None]:
# print(data)

In [None]:
# Step 05: Split the extracted data into text chunks
# (up to 10000 characters per chunk, 20 characters shared between neighbours).
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=20,
    chunk_size=10000,
)
text_chunks = text_splitter.split_documents(data)


In [None]:
# Step 06: Download the sentence-embedding model.
# Run it on the GPU when CUDA is available and on the CPU otherwise, instead of
# unconditionally requesting 'cuda' (which fails on a CPU-only session).
embedding_device = "cuda" if torch.cuda.is_available() else "cpu"
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    model_kwargs={"device": embedding_device},
)


In [None]:
# Select the first GPU and release unused cached GPU memory before indexing.
# Guarded so the cell does nothing (instead of raising) when CUDA is unavailable.
if torch.cuda.is_available():
    torch.cuda.set_device(torch.device("cuda:0"))
    torch.cuda.empty_cache()

In [None]:
# Step 08: Create embeddings for each text chunk and index them in a FAISS store.
vector_store = FAISS.from_documents(
    documents=text_chunks,
    embedding=embeddings,
)

In [None]:
# Install the gdown command-line tool with conda (-y answers the confirmation
# prompt automatically); the following cell invokes `gdown`.
!conda install -y gdown


In [None]:
!gdown --id 1-0OrbMiJffk2fwu31mCgef68R4KwsIeB

In [None]:
# Import Model: load the local GGUF file through the LlamaCpp wrapper.
n_batch = 512
n_gpu_layers = 40  # Change this value based on your model and your GPU VRAM pool.

llm = LlamaCpp(
    model_path="/kaggle/working/mistral-7b-instruct-v0.1.Q4_K_S.gguf",
    n_ctx=4096,
    n_gpu_layers=n_gpu_layers,
    n_batch=n_batch,
    temperature=0.75,
    top_p=1,
    streaming=False,
    verbose=True,
)

In [None]:
# Retrieval QA chain: the retriever is configured with k=2 and the chain uses
# the "stuff" chain type to hand the retrieved chunks to the LLM.
retriever = vector_store.as_retriever(search_kwargs={"k": 2})
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
)

In [None]:
# Ask the QA chain for each section of the paper.
# The six formerly copy-pasted cells are driven from one table, so adding or
# rewording a section is a single-line change. Questions run in the same order
# as before and the prompt strings are unchanged.
SECTION_QUERIES = {
    "abstract": "give me abstract of this research paper.",
    "introduction": "give me introduction of this research paper.",
    "methodology_used": "give me methodology used of this research paper.",
    "results": "give me results of this research paper.",
    "conclusion": "give me conclusion of this research paper.",
    "future_scope": "give me future scope of this research paper.",
}

section_answers = {}
for section_name, query in SECTION_QUERIES.items():
    section_answers[section_name] = qa.run(query)

# Keep the individual variables that later cells read.
abstract = section_answers["abstract"]
introduction = section_answers["introduction"]
methodology_used = section_answers["methodology_used"]
results = section_answers["results"]
conclusion = section_answers["conclusion"]
future_scope = section_answers["future_scope"]

In [None]:
# Assemble the narration script as "HEADING: body" segments.
# Segments are joined with a single space so a section body no longer runs
# straight into the next heading (e.g. "...paper.INTRODUCTION:"), and every
# heading, including FUTURE SCOPE, is followed by ": " consistently.
section_texts = [
    ("ABSTRACT", abstract),
    ("INTRODUCTION", introduction),
    ("METHODOLOGY USED", methodology_used),
    ("RESULTS", results),
    ("CONCLUSION", conclusion),
    ("FUTURE SCOPE", future_scope),
]
mytext = " ".join(
    f"{heading}: {body.strip()}" for heading, body in section_texts
)


In [None]:
# Install Bark directly from the suno-ai GitHub repository (no version/commit pinned).
!pip install git+https://github.com/suno-ai/bark.git

In [None]:
# Bark text-to-speech entry points plus the notebook audio player widget.
from bark import SAMPLE_RATE, generate_audio, preload_models
from IPython.display import Audio

# NOTE(review): presumably loads (and on first use downloads) the Bark model
# weights up front so later generate_audio calls do not pay that cost — confirm
# against the Bark documentation.
preload_models()

In [None]:
def split_text(text, max_length):
    """Greedily pack whitespace-separated words into chunks of limited length.

    Words are never broken: a single word longer than ``max_length`` becomes
    its own (over-long) chunk. Runs of whitespace in ``text`` collapse to a
    single space in the output.

    Args:
        text: The string to split.
        max_length: Maximum number of characters per chunk.

    Returns:
        A list of non-empty chunk strings in original word order; an empty
        list when ``text`` contains no words.
    """
    chunks = []
    current_words = []
    current_length = 0  # Length of " ".join(current_words).

    for word in text.split():
        # One extra character for the joining space once the chunk has content.
        separator = 1 if current_words else 0
        if current_length + separator + len(word) <= max_length:
            current_words.append(word)
            current_length += separator + len(word)
            continue

        # The word does not fit. Flush only when there is something to flush,
        # so an over-long first word no longer produces a leading "" chunk.
        if current_words:
            chunks.append(" ".join(current_words))
        current_words = [word]
        current_length = len(word)

    if current_words:
        chunks.append(" ".join(current_words))

    return chunks

# Cut the assembled script into chunks of at most 130 characters; the
# speech-generation cells below iterate over `result`.
max_length = 130
text = mytext
result = split_text(text=text, max_length=max_length)


In [None]:
# Generate one audio array per text chunk with the "v2/en_speaker_6" history prompt.
audio_array_results = [
    generate_audio(chunk, history_prompt="v2/en_speaker_6")
    for chunk in result
]


In [None]:
import numpy as np

In [None]:
# Join the per-chunk arrays into one waveform and render an inline player.
output = np.concatenate(audio_array_results, axis=0)
Audio(data=output, rate=SAMPLE_RATE)

In [None]:
# Generate the same chunks again with the "v2/en_speaker_9" history prompt.
audio_array_results_fem = [
    generate_audio(chunk, history_prompt="v2/en_speaker_9")
    for chunk in result
]


In [None]:
# Join the "v2/en_speaker_9" arrays into one waveform (rebinding `output`)
# and render an inline player.
output = np.concatenate(audio_array_results_fem, axis=0)
Audio(data=output, rate=SAMPLE_RATE)

In [None]:
from scipy.io.wavfile import write


In [None]:
# Save whatever `output` currently holds to output.wav at Bark's sample rate.
write(filename="output.wav", rate=SAMPLE_RATE, data=output)