In [3]:
import os

import numpy as np
import pandas as pd
import pinecone
from langchain.document_loaders.csv_loader import CSVLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from langchain.llms import OpenAI
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Pinecone
from pypdf import PdfReader
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

  from tqdm.autonotebook import tqdm


In [5]:
def read_pdf_data(pdf):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf: Source handed unchanged to ``PdfReader``.

    Returns:
        str: The extracted page texts joined with no separator; an empty
        string when the document has no pages or no extractable text.
    """
    pdf_reader = PdfReader(pdf)
    # `or ""` keeps a page that yields no text (None/empty) from raising a
    # TypeError during concatenation; join builds the result in one pass.
    return "".join(page.extract_text() or "" for page in pdf_reader.pages)

In [6]:
def split_data(text):
    """Cut a raw string into chunk documents.

    The text is first split into string chunks (chunk size 1000, overlap 50)
    and those chunks are then passed to ``create_documents`` on the same
    splitter.

    Args:
        text: The full text to split.

    Returns:
        Whatever ``RecursiveCharacterTextSplitter.create_documents`` returns
        for the list of chunks.
    """
    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=50)
    return splitter.create_documents(splitter.split_text(text))

In [7]:
def create_embeddings_load_data():
    """Create the embedding model instance used by this notebook.

    Returns:
        A ``SentenceTransformerEmbeddings`` configured with the
        "all-MiniLM-L6-v2" model.
    """
    # Alternative left disabled by the original author: OpenAIEmbeddings()
    return SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

In [15]:
def create_embeddings(df, embeddings):
    """Embed the text in column 0 and store the vectors in column 2.

    Args:
        df: DataFrame whose column ``0`` holds the values to embed.
        embeddings: Object exposing ``embed_query(text)``.

    Returns:
        The same ``df`` object that was passed in; it is modified in place
        by the addition (or overwrite) of column ``2``.
    """
    vectors = df[0].apply(embeddings.embed_query)
    df[2] = vectors
    return df

In [8]:
def push_to_pinecone(pinecone_apikey,pinecone_environment,pinecone_index_name,embeddings,docs):
    """Initialise the Pinecone client and build a vector store from documents.

    Args:
        pinecone_apikey: API key passed to ``pinecone.init``.
        pinecone_environment: Environment name passed to ``pinecone.init``.
        pinecone_index_name: Index name given to ``Pinecone.from_documents``.
        embeddings: Embedding object given to ``Pinecone.from_documents``.
        docs: Documents given to ``Pinecone.from_documents``.

    Returns:
        The object returned by ``Pinecone.from_documents``.
    """
    # The client is initialised first, before the vector store is built.
    pinecone.init(api_key=pinecone_apikey, environment=pinecone_environment)
    return Pinecone.from_documents(docs, embeddings, index_name=pinecone_index_name)


In [9]:
#Read dataset for model creation
def read_data(data):
    """Load a header-less, comma-separated, UTF-8 CSV into a DataFrame.

    Args:
        data: Path or buffer handed to ``pandas.read_csv``.

    Returns:
        pandas.DataFrame: The parsed rows; because ``header=None`` is used,
        the first line is treated as data and columns get integer labels.
    """
    csv_options = {"delimiter": ',', "header": None, "encoding": 'utf-8'}
    return pd.read_csv(data, **csv_options)

In [10]:
#Create embeddings instance
def get_embeddings(model_name="all-MiniLM-L6-v2"):
    """Create a sentence-transformer embedding instance.

    Args:
        model_name: Name of the sentence-transformers model to load.
            Defaults to "all-MiniLM-L6-v2", the value that was previously
            hard-coded, so existing zero-argument calls behave as before.

    Returns:
        A ``SentenceTransformerEmbeddings`` built with ``model_name``.
    """
    return SentenceTransformerEmbeddings(model_name=model_name)

In [20]:
#Splitting the data into train & test
def split_train_test__data(df_sample, test_size=0.25, random_state=0):
    """Split features (column 2) and labels (column 1) into train/test sets.

    Args:
        df_sample: Mapping-like object (e.g. a DataFrame) where key ``2``
            holds the feature values and key ``1`` holds the labels.
        test_size: Passed to ``train_test_split``; defaults to 0.25, the
            value that was previously hard-coded.
        random_state: Passed to ``train_test_split``; defaults to 0, the
            value that was previously hard-coded.

    Returns:
        tuple: ``(sentences_train, sentences_test, labels_train, labels_test)``
        exactly as produced by ``train_test_split``.

    Side effects:
        Prints the number of training samples.
    """
    features = list(df_sample[2])
    labels = list(df_sample[1])
    sentences_train, sentences_test, labels_train, labels_test = train_test_split(
        features,
        labels,
        test_size=test_size,
        random_state=random_state,
        shuffle=True,
    )
    # Echo the training-set size (kept from the original for a quick check).
    print(len(sentences_train))
    return sentences_train, sentences_test, labels_train, labels_test

In [21]:
#Get the accuracy score on test data
def get_score(svm_classifier,sentences_test,labels_test):
    """Evaluate a fitted classifier on test data.

    Args:
        svm_classifier: Object exposing ``score(X, y)``.
        sentences_test: Test features forwarded as ``X``.
        labels_test: Test labels forwarded as ``y``.

    Returns:
        The value returned by ``svm_classifier.score``.
    """
    return svm_classifier.score(sentences_test, labels_test)

In [11]:
import streamlit as st

from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
import joblib

In [12]:
# Seed each session-state key with an empty-string placeholder unless a value
# is already stored under that key.
for _state_key in (
    'cleaned_data',
    'sentences_train',
    'sentences_test',
    'labels_train',
    'labels_test',
    'svm_classifier',
):
    if _state_key not in st.session_state:
        st.session_state[_state_key] =''



In [13]:
# Location of the FAQ CSV used for training. The original machine-specific
# path stays as the default so existing runs are unaffected; set the
# EASA_FAQ_CSV environment variable to load the file from elsewhere.
FAQ_CSV_PATH = os.environ.get(
    'EASA_FAQ_CSV',
    '/Users/goncaavcicakmak/Desktop/pdf/clean_easa_faqs.csv',
)
our_data=read_data(FAQ_CSV_PATH)


In [18]:
# Build the embedding model, then embed column 0 of our_data into column 2.
# create_embeddings writes the new column onto the frame it receives and
# returns that same frame, so cleaned_data and our_data are one object.
embeddings=get_embeddings()
cleaned_data = create_embeddings(our_data,embeddings)

In [22]:
# Split embeddings (column 2) and labels (column 1) into train/test parts;
# the helper also prints the number of training samples.
X_train,X_test,y_train,y_test= split_train_test__data(cleaned_data)

228


In [23]:
# Two-step pipeline: StandardScaler followed by an SVC, both constructed
# with no arguments (i.e. their library defaults).
svm_classifier  = make_pipeline(StandardScaler(), SVC()) 

In [24]:
# Fit the scaler + SVC pipeline on the training partition only.
svm_classifier.fit(X_train,y_train)

In [25]:
# Score the fitted pipeline on the held-out test partition (get_score simply
# forwards to the pipeline's .score method).
accuracy_score=get_score(svm_classifier,X_test,y_test)



In [26]:
# Report the score as a percentage. Note: the value comes from the test
# split (X_test / y_test), although the message calls it "Validation".
print(f"Validation accuracy is {100*accuracy_score}%!")

Validation accuracy is 70.12987012987013%!


In [27]:
# Persist the fitted pipeline to disk in the current working directory.
# NOTE(review): the extension is 'pk1' (digit one), possibly meant to be
# 'pkl' — confirm against whatever code loads this file before renaming.
joblib.dump(svm_classifier, 'modelsvm.pk1')

['modelsvm.pk1']