In [1]:
import ollama
import os
import json
import numpy as np
from numpy.linalg import norm
import PyPDF2
from docx import Document
import pandas as pd
import streamlit as st

# Read a plain-text (.txt) file
def read_txt(filename):
    """Return the whole content of a text file as a single string.

    The file is decoded as ``utf-8-sig``, so a UTF-8 byte-order mark at the
    start of the file is not included in the returned text.
    """
    with open(filename, "r", encoding="utf-8-sig") as source:
        contents = source.read()
    return contents

# Read a PDF file
def read_pdf(filename):
    """Return the text of every page of a PDF, one page per line group.

    Pages are joined with a newline so the last word of one page does not
    fuse with the first word of the next. Pages that yield no text are
    skipped.

    Args:
        filename: Path of the PDF file to read.

    Returns:
        The extracted text of all non-empty pages joined by ``"\\n"``.
    """
    with open(filename, "rb") as f:
        pdf_reader = PyPDF2.PdfReader(f)
        # `or ""` guards against a page whose extraction yields nothing.
        page_texts = [page.extract_text() or "" for page in pdf_reader.pages]
    return "\n".join(text for text in page_texts if text)

# Read a DOCX file
def read_docx(filename):
    """Return the text of a DOCX document, one paragraph per line.

    Every paragraph of the document (including empty ones) contributes its
    ``text``; the pieces are joined with a newline.
    """
    paragraph_texts = [para.text for para in Document(filename).paragraphs]
    return "\n".join(paragraph_texts)

# Read a semicolon-delimited CSV file without treating any row as a header,
# replacing missing cells with "0"
def read_csv(filename):
    """Render a ``;``-delimited CSV file as text, one line per row.

    The first row (index 0) is emitted after a fixed introductory line and
    without a number. Every later row is prefixed with ``'<index>. ``.
    Cells are joined with ``", "`` and missing cells appear as ``0``.

    NOTE(review): the introductory line spells "delimter" and the row prefix
    starts with an unmatched apostrophe; both look unintentional but are
    kept as-is because they are part of the generated text - confirm.
    """
    table = pd.read_csv(
        filename, header=None, delimiter=';', skip_blank_lines=True
    ).fillna("0")

    rendered = []
    for row_number, record in table.iterrows():
        cells = ", ".join(map(str, record.values))
        if row_number >= 1:
            # Data rows carry their row number
            rendered.append(f"'{row_number}. " + cells + "\n")
        else:
            # Row 0 is presented as the column-name row, without a number
            rendered.append("Ini adalah data csv dengan delimter (,)\n" + cells + "\n")

    return "".join(rendered)

# Read an Excel (.xlsx) file
def read_xlsx(filename):
    """Render an Excel sheet as text: one line per row, cells space-separated.

    The sheet is loaded with ``pd.read_excel`` using its defaults; each row's
    values are converted with ``str`` and every line ends with a newline.
    """
    sheet = pd.read_excel(filename)
    row_lines = (
        " ".join(map(str, record.values)) + "\n"
        for _, record in sheet.iterrows()
    )
    return "".join(row_lines)

def _split_paragraphs(content):
    """Group consecutive non-blank lines of `content` into paragraphs."""
    paragraphs = []
    buffer = []
    for line in content.splitlines():
        line = line.strip()
        if line:
            buffer.append(line)
        elif buffer:
            # A blank line closes the paragraph collected so far
            paragraphs.append(" ".join(buffer))
            buffer = []
    if buffer:
        paragraphs.append(" ".join(buffer))
    return paragraphs


# Extract the paragraphs of any supported file
def parse_file(filename):
    """Read a supported file and split its text into paragraphs.

    The reader is chosen from the file extension, compared
    case-insensitively so that names such as ``DATA.TXT`` are handled the
    same way as ``data.txt``. Lines are stripped; runs of non-blank lines
    are joined with a single space into one paragraph, and blank lines
    separate paragraphs.

    Args:
        filename: Path ending in .txt, .pdf, .docx, .csv or .xlsx.

    Returns:
        A list of paragraph strings (empty if the file has no text).

    Raises:
        ValueError: If the extension is not one of the supported types.
    """
    lowered = filename.lower()
    if lowered.endswith(".txt"):
        content = read_txt(filename)
    elif lowered.endswith(".pdf"):
        content = read_pdf(filename)
    elif lowered.endswith(".docx"):
        content = read_docx(filename)
    elif lowered.endswith(".csv"):
        content = read_csv(filename)
    elif lowered.endswith(".xlsx"):
        content = read_xlsx(filename)
    else:
        raise ValueError(f"Unsupported file type: {filename}")

    return _split_paragraphs(content)

# Persist embeddings to disk
def save_embeddings(filename, embeddings):
    """Write `embeddings` as JSON to ``embeddings/<filename>.json``.

    The ``embeddings`` directory is created in the current working
    directory when it does not exist yet. An existing file with the same
    name is overwritten.
    """
    cache_dir = "embeddings"
    if os.path.exists(cache_dir) is False:
        os.makedirs(cache_dir)
    serialized = json.dumps(embeddings)
    with open(f"embeddings/{filename}.json", "w") as target:
        target.write(serialized)

# Load embeddings from disk
def load_embeddings(filename):
    """Return the JSON content of ``embeddings/<filename>.json``.

    Returns ``False`` (not ``None``) when the file does not exist, so that
    callers can tell "no cache" apart from an empty cached list.
    """
    cache_path = f"embeddings/{filename}.json"
    if os.path.exists(cache_path):
        with open(cache_path, "r") as cached:
            return json.load(cached)
    return False

# Get the embeddings for a list of chunks, using the on-disk cache if valid
def get_embeddings(filename, modelname, chunks):
    """Return one embedding per chunk, computing and caching them if needed.

    A cache saved under `filename` is reused only when it holds exactly one
    vector per chunk. A cache of a different length was built from other
    content; returning it would pair vectors with the wrong paragraphs (or
    index past the end of the paragraph list), so it is recomputed and
    overwritten instead.

    Args:
        filename: Cache name passed to load_embeddings / save_embeddings.
        modelname: Name of the Ollama embedding model.
        chunks: Texts to embed, in the order the vectors are returned.

    Returns:
        A list with one embedding (list of floats) per chunk.
    """
    cached = load_embeddings(filename)
    if cached is not False and len(cached) == len(chunks):
        return cached

    embeddings = [
        ollama.embeddings(model=modelname, prompt=chunk)["embedding"]
        for chunk in chunks
    ]

    # Reduce dimensionality when the model's vectors are wider than needed
    reduced_embeddings = reduce_embeddings_dimension(embeddings)

    save_embeddings(filename, reduced_embeddings)
    return reduced_embeddings

# Dimensionality reduction with PCA (only when needed)
def reduce_embeddings_dimension(embeddings, target_dim=768):
    """Project embeddings wider than `target_dim` down to `target_dim` columns.

    Input that is empty, not two-dimensional, or already at most
    `target_dim` wide is returned unchanged (as a list). Wider input is
    mean-centred and projected onto its leading principal components,
    computed with a NumPy SVD.

    The projection is fitted on the rows passed to this call, so vectors
    reduced in separate calls do not share a coordinate system.

    Args:
        embeddings: Sequence of equal-length vectors (rows).
        target_dim: Maximum number of columns to keep. Defaults to 768.

    Returns:
        The (possibly reduced) embeddings as nested lists.

    Raises:
        ValueError: If `target_dim` is smaller than 1, or if a reduction is
            required but there are fewer rows than `target_dim` (principal
            components cannot outnumber the samples).
    """
    if target_dim < 1:
        raise ValueError(f"target_dim must be at least 1, got {target_dim}")

    embeddings_array = np.array(embeddings)

    # Nothing to reduce: empty input, a single flat vector, or narrow enough
    if embeddings_array.ndim != 2 or embeddings_array.shape[1] <= target_dim:
        return embeddings_array.tolist()

    n_samples = embeddings_array.shape[0]
    if n_samples < target_dim:
        raise ValueError(
            f"Cannot reduce to {target_dim} dimensions with only "
            f"{n_samples} sample(s)"
        )

    centered = embeddings_array.astype(float)
    centered = centered - centered.mean(axis=0)

    # Rows of `components` are the principal axes, strongest first
    _, _, components = np.linalg.svd(centered, full_matrices=False)
    components = components[:target_dim]

    # SVD signs are arbitrary; make each axis' largest entry positive so the
    # result is deterministic
    pivots = np.argmax(np.abs(components), axis=1)
    signs = np.sign(components[np.arange(components.shape[0]), pivots])
    signs[signs == 0] = 1.0
    components = components * signs[:, None]

    embeddings_array = centered @ components.T
    print(f"Reduksi dimensi menjadi: {embeddings_array.shape}")

    return embeddings_array.tolist()

# Cosine similarity ranking
def find_most_similar(needle, haystack):
    """Rank the vectors in `haystack` by cosine similarity to `needle`.

    A pair in which either vector has zero length gets a score of 0.0
    instead of NaN, so that the ordering stays well defined.

    Args:
        needle: Query vector.
        haystack: Sequence of vectors with the same length as `needle`.

    Returns:
        A list of ``(score, index)`` tuples sorted from most to least
        similar, where `index` is the position of the vector in `haystack`.
        Ties on score are ordered by descending index.
    """
    needle_norm = norm(needle)
    scored = []
    for index, item in enumerate(haystack):
        denominator = needle_norm * norm(item)
        # 0/0 would yield NaN, which cannot be ordered reliably
        score = np.dot(needle, item) / denominator if denominator else 0.0
        scored.append((score, index))
    return sorted(scored, reverse=True)

In [2]:
# Root directory that holds one sub-folder per dataset
data_root = "data/"
# subfolders = [f for f in os.listdir(data_root) if os.path.isdir(os.path.join(data_root, f))]

# Streamlit select box to choose the folder
# selected_folder = st.selectbox("Pilih folder dataset:", subfolders)

# Parse every supported file of the chosen dataset folder into paragraphs
data_folder = os.path.join(data_root, "CSV")
all_paragraphs = []
filenames = []

# os.listdir returns names in arbitrary order; sorting keeps the paragraph
# order identical from run to run.
# NOTE(review): parse_file also accepts .xlsx, which is not listed here -
# confirm whether that is intentional.
for file in sorted(os.listdir(data_folder)):
    file_path = os.path.join(data_folder, file)
    if file.lower().endswith((".txt", ".pdf", ".docx", ".csv")):
        paragraphs = parse_file(file_path)
        all_paragraphs.extend(paragraphs)
        filenames.append(file)

In [3]:
# Print the parsed paragraphs for a visual check of how the files were split.
print(all_paragraphs)

["Ini adalah data csv dengan delimter (,) Nama, total, diambil, sisa, cuti timbul, jumlah cuti bersama, jumlah cuti tahunan sebelumnya '1. Arselan Utama, 18, 3, 6, 15, 9, 3 '2. Windra Fitri Rahman, 15, 0, 6, 15, 9, 0 '3. Fatkhan Nugroho, 15, 0, 6, 15, 9, 0 '4. Ari Budiyanto, 15, 0, 6, 15, 9, 0 '5. Ridho Yuriansyah Putra, 19, 4, 6, 15, 9, 4 '6. Ferry Seloria, 21, 8, 4, 15, 9, 6 '7. Imaddudin Ishak Saifuddin, 15, 5, 1, 15, 9, 5 '8. Ramdani, 15, 0, 6, 15, 9, 0 '9. Roni Saputra, 15, 0, 6, 15, 9, 0 '10. Ahmad Taqyuddin, 15, 0, 6, 15, 9, 0 '11. Denis HP. Sarumpaet, 25, 8, 8, 21, 9, 4 '12. Rhisa Meidilla Sari, 15, 3, 3, 15, 9, 0 '13. Winda Angelina Lala, 15, 2, 4, 15, 9, 0 '14. Ega Nofiardi, 25, 9, 7, 16, 9, 9 '15. Akbar, 25, 5, 11, 24, 9, 1 '16. Andri Mardani, 25, 7, 9, 18, 9, 7 '17. Teddy Olgaraditya, 20, 7, 4, 15, 9, 5 '18. Mohammad Ichsan Andrian, 18, 6, 3, 15, 9, 3 '19. Muhammad Bahrul Ulum, 18, 3, 6, 15, 9, 3 '20. Hafiizh Septian Pristanto, 18, 3, 6, 15, 9, 3 '21. Mohamad Rosid, 15, 0, 

In [10]:
def main():
    """Run an interactive question-answering loop over the documents in data/Test.

    Every supported file in the folder is parsed into paragraphs, the
    paragraphs are embedded (or loaded from the embeddings cache), and each
    question typed by the user is answered by the chat model using the ten
    most similar paragraphs as context. Typing ``exit`` ends the loop.
    """
    SYSTEM_PROMPT = """You are an assistant that answers questions only in Bahasa Indonesia. 
    Your answers must be based solely on the provided context extracted from the documents. 
    If the answer cannot be determined from the context, respond with "Maaf, saya tidak tahu." 
    Do not include any information outside of the given context, and strictly reply in Bahasa Indonesia.

    Context:
    """

    # The same embedding model must be used for documents and questions
    embedding_model = "nomic-embed-text"
    chat_model = "llama3"
    top_k = 10  # number of paragraphs handed to the chat model as context

    # Root directory that holds one sub-folder per dataset
    data_root = "data/"
    # subfolders = [f for f in os.listdir(data_root) if os.path.isdir(os.path.join(data_root, f))]

    # Streamlit select box to choose the folder
    # selected_folder = st.selectbox("Pilih folder dataset:", subfolders)

    data_folder = os.path.join(data_root, "Test")
    all_paragraphs = []

    # Cached embeddings are matched to paragraphs by position, and
    # os.listdir returns names in arbitrary order, so the listing is sorted
    # to keep the paragraph order stable between runs.
    for file in sorted(os.listdir(data_folder)):
        file_path = os.path.join(data_folder, file)
        if file.lower().endswith((".txt", ".pdf", ".docx", ".csv")):
            all_paragraphs.extend(parse_file(file_path))

    # Without any paragraph there is nothing to embed or to search
    if not all_paragraphs:
        print(f"Tidak ada paragraf yang dapat dibaca dari folder: {data_folder}")
        return

    folder_name = os.path.basename(data_folder)

    # Name the embeddings cache after the dataset folder
    embeddings_filename = f"data_embeddings_{folder_name}"
    embeddings = get_embeddings(embeddings_filename, embedding_model, all_paragraphs)

    while True:
        prompt = input("Silakan tanya bosku? (ketik 'exit' untuk keluar) -> ").strip()

        if prompt.lower() == "exit":
            print("Exiting the assistant. Goodbye!")
            break

        # Ignore empty input instead of sending it to the models
        if not prompt:
            continue

        prompt_embedding = ollama.embeddings(model=embedding_model, prompt=prompt)["embedding"]

        # Apply the same dimensionality handling as for the document embeddings
        prompt_embedding = reduce_embeddings_dimension([prompt_embedding])[0]

        most_similar_chunks = find_most_similar(prompt_embedding, embeddings)[:top_k]

        response = ollama.chat(
            model=chat_model,
            messages=[
                {
                    "role": "system",
                    "content": SYSTEM_PROMPT
                    + "\n".join(all_paragraphs[item[1]] for item in most_similar_chunks),
                },
                {"role": "user", "content": prompt},
            ],
        )

        print("\n\n")
        print(response["message"]["content"])


if __name__ == "__main__":
    main()

Silakan tanya bosku? (ketik 'exit' untuk keluar) ->  berapa cuti freenanda bolang?





Maaf, saya tidak tahu.


Silakan tanya bosku? (ketik 'exit' untuk keluar) ->  bagaimana dengan cuti Arselan Utama





Berdasarkan data cuti PINET, Arselan Utama memiliki total cuti 18 hari, diambil 3 hari, sisanya 15 hari, cuti timbul 9 hari, dan jumlah cuti bersama 3 hari.


Silakan tanya bosku? (ketik 'exit' untuk keluar) ->  bagaimana dengan cuti Fikri Rama





Menurut data karyawan PINET, Fikri Rama Singgih memiliki jumlah cuti sebesar 19 dengan diambil 4 dan sisa 15.


Silakan tanya bosku? (ketik 'exit' untuk keluar) ->  bagaimana dengan cuti Freenanda bolang





Menurut data, Freenanda Richard Bolang memiliki total cuti 19 hari, diambil 7 hari, dan sisanya 15 hari.


Silakan tanya bosku? (ketik 'exit' untuk keluar) ->  exit


Exiting the assistant. Goodbye!


In [149]:
import pandas as pd

def csv_to_txt(input_filename, output_filename, delimiter=None):
    """Convert a CSV file with a header row into a numbered text listing.

    The output starts with the column names joined by ``", "``, followed by
    one line per data row in the form ``<n>. <value>, <value>, ...`` where
    ``n`` starts at 1. Missing cells are written as ``0``.

    Args:
        input_filename: Path of the CSV file to read.
        output_filename: Path of the text file to write (overwritten).
        delimiter: Field separator of the CSV. When ``None`` (default) it is
            detected from the first lines of the file among ``,`` ``;``
            tab and ``|``; if detection fails a comma is assumed.
    """
    import csv  # local import: this cell only imports pandas

    if delimiter is None:
        # Sniff on whole lines only, so a truncated last line cannot skew
        # the per-line delimiter counts
        with open(input_filename, "r", encoding="utf-8", errors="replace", newline="") as src:
            sample = "".join(src.readline() for _ in range(20))
        try:
            delimiter = csv.Sniffer().sniff(sample, delimiters=",;\t|").delimiter
        except csv.Error:
            # e.g. a single-column file has no delimiter to detect
            delimiter = ","

    # Read the CSV file; the first row is the header
    df = pd.read_csv(input_filename, header=0, sep=delimiter)
    df = df.fillna("0")  # Replace NaN with "0"

    with open(output_filename, 'w', encoding="utf-8") as txt_file:
        # Column labels are not guaranteed to be strings
        headers = ", ".join(map(str, df.columns))
        txt_file.write(f"{headers}\n")

        # Write the rows with a 1-based running number
        for idx, row in df.iterrows():
            row_data = ", ".join(str(value) for value in row.values)
            txt_file.write(f"{idx + 1}. {row_data}\n")

    print(f"CSV has been successfully converted to {output_filename}.")

# Example usage. Forward slashes are used because sequences such as "\C",
# "\d", "\T" and "\o" are not valid string escapes, and "/" is accepted as
# a path separator on Windows as well.
input_csv = "data/CSV/data_dummy.csv"  # Path to your CSV file
output_txt = "data/Test/output2.txt"  # Desired path for the output TXT file

csv_to_txt(input_csv, output_txt)


CSV has been successfully converted to data\Test\output2.txt.


In [148]:
import pandas as pd
import random
import faker

# Create a Faker instance used to generate random names and job titles
fake = faker.Faker()

# Build the dummy records: one [name, salary, job title] row per employee
data = []
for _ in range(50):
    nama = fake.name()
    gaji = random.randint(5000000, 15000000)  # Random salary between 5 and 15 million
    # `jabatan` was never assigned in this cell, so it raised NameError on a
    # fresh kernel; generate a job title for the "Jabatan" column.
    jabatan = fake.job()
    data.append([nama, gaji, jabatan])

# Create a DataFrame from the dummy records
df = pd.DataFrame(data, columns=["Nama", "Gaji", "Jabatan"])

# Save as CSV with ';' as delimiter. A forward-slash path is used because
# "\C" and "\d" are not valid string escapes.
df.to_csv("data/CSV/data_dummy.csv", sep=";", index=False)

print("CSV dengan data dummy telah dibuat.")


CSV dengan data dummy telah dibuat.
