In [21]:
import pandas as pd
import numpy as np
import numpy.linalg as LA # operasi baris elementer

from openpyxl import load_workbook # load data excel
from sklearn.feature_extraction.text import CountVectorizer # tf-idf
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer # tf-idf
from sklearn.metrics.pairwise import cosine_similarity # cosine similarity
from nltk.corpus import stopwords # preprocessing
from nltk.stem import PorterStemmer # preprocessing bahasa inggris
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory  # preprocessing
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory #preprocessing
import string # ya buat string


stemmer = StemmerFactory().create_stemmer()  # Sastrawi stemmer instance, created once and shared
remover = StopWordRemoverFactory().create_stop_word_remover()  # Sastrawi stop-word remover instance
translator = str.maketrans('', '', string.punctuation)  # translation table that deletes every character in string.punctuation

In [22]:
def stemmerEN(text):
    """Lowercase English text, drop stop words and punctuation, and stem each word.

    The Porter stemmer operates on a single word, so the text is stemmed
    token by token rather than as one joined string (stemming the whole
    sentence at once would leave every word but the tail untouched).

    Parameters
    ----------
    text : str
        Raw text to normalise.

    Returns
    -------
    str
        The surviving stemmed tokens joined by single spaces; an empty
        string when nothing survives.
    """
    porter = PorterStemmer()
    stop = set(stopwords.words('english'))

    stemmed_tokens = []
    for token in text.lower().split():
        # Check the raw token first so that stop-list entries stored with
        # punctuation (such as contractions with an apostrophe) still match.
        if token in stop:
            continue
        cleaned = token.translate(translator)
        # Check again once punctuation is gone: a token such as "the," only
        # matches the stop list after the comma has been stripped. Tokens
        # that were pure punctuation become empty and are dropped.
        if not cleaned or cleaned in stop:
            continue
        stemmed_tokens.append(porter.stem(cleaned))
    return ' '.join(stemmed_tokens)

def preprocess(text):
    """Normalise a text through the Indonesian pipeline, then the English one.

    The text is lowercased, passed through the Sastrawi stop-word remover
    and the Sastrawi stemmer, and the result is finally handed to
    ``stemmerEN``.

    Parameters
    ----------
    text : str
        Raw text to normalise.

    Returns
    -------
    str
        Whatever ``stemmerEN`` returns for the Sastrawi-stemmed text.
    """
    lowered = text.lower()
    without_stopwords = remover.remove(lowered)  # Sastrawi stop-word removal
    indonesian_stems = stemmer.stem(without_stopwords)
    return stemmerEN(indonesian_stems)

In [23]:
class Engine:
    """Score queries against a document collection with cosine similarity.

    Documents and queries are collected as plain strings. ``process_score``
    turns them into term-count vectors with ``CountVectorizer`` (vocabulary
    learned from the documents only) and compares every query with every
    document. The vectors are raw term counts; no TF-IDF weighting is applied.
    """

    def __init__(self):
        self.cosine_score = []  # kept for backward compatibility; process_score does not fill it
        self.train_set = []  # Documents
        self.test_set = []  # Queries

    def addDocument(self, word):
        """Append one document text to ``train_set``."""
        self.train_set.append(word)

    def setQuery(self, word):
        """Append one query text to ``test_set`` (earlier queries are kept)."""
        self.test_set.append(word)

    @staticmethod
    def _cosine(a, b):
        """Return the cosine similarity of two vectors, rounded to 3 decimals.

        An all-zero vector (for example a query that shares no term with the
        vocabulary) has no direction, so the similarity is defined as 0.0
        here instead of dividing by zero and producing NaN.
        """
        denominator = LA.norm(a) * LA.norm(b)
        if denominator == 0:
            return 0.0
        return round(float(np.inner(a, b) / denominator), 3)

    def process_score(self):
        """Compute the cosine similarity of every query against every document.

        Returns
        -------
        list[list[float]]
            One inner list per query, in the order the queries were added;
            each inner list holds one score per document, in the order the
            documents were added.
        """
        vectorizer = CountVectorizer()

        # Learn the vocabulary from the documents, then project the queries
        # onto that same vocabulary; terms unseen in the documents are ignored.
        trainVectorizerArray = vectorizer.fit_transform(self.train_set).toarray()
        testVectorizerArray = vectorizer.transform(self.test_set).toarray()

        return [
            [self._cosine(vector, testV) for vector in trainVectorizerArray]
            for testV in testVectorizerArray
        ]

In [24]:
# Load the thesis-title dataset; the 'Preprocessing' column is the text that gets scored later on.
dataset = pd.read_excel("dataset-preprocessed.xlsx")
dataset

Unnamed: 0,Judul,Pembimbing,Penguji 1,Penguji 2,Preprocessing
0,Agen Crawler Alamat Email Menggunakan Metode B...,"Andri Pranolo, S.T., M.Cs.","Murinto, S.Si., M.Kom.","Ali Tarmuji, S.T., M.Cs.",agen crawler alamat email guna metode bread fi...
1,Alat Bantu Pembelajaran Baca Al Quran Berbasis...,"Eko Aribowo, S.T., M.Kom.","Sri Winiarti, S.T., M.Cs.","Murinto, S.Si., M.Kom.",alat bantu ajar baca al quran bas multimedia b...
2,Design dan Aplikasi Kompresi Audio Wave Besert...,"Eko Aribowo, S.T., M.Kom.","Drs. Wahyu Pujiono, M.Kom.","Sri Winiarti, S.T., M.Cs.",design aplikasi kompresi audio wave serta play...
3,Implementasi Media Pembelajaran Menggunakan Ma...,"Drs. Wahyu Pujiyono, M.Kom.","Murinto, S.Si., M.Kom.","Fiftin Noviyanto, S.T., M.Cs.",implementasi media ajar guna macromedia flash ...
4,Media Pembelajaran Teori Bahasa Automata pada ...,"Drs. Wahyu Pujiyono, M. Kom.","Drs. Tedy Setiadi, M.T.","Sri Handayaningsih, S.T., M.T.",media ajar teori bahasa automata materi push a...
5,Sistem Informasi Manajemen Pelaporan Sentra In...,"Eko Aribowo, S.T., M.Kom.","Drs. Tedy Setiadi, M.T.","Ali Tarmuji, S.T., M.Cs.",sistem informasi manajemen lapor sentra indust...
6,Sistem Pakar Menentukan Hama dan Penyakit Pada...,"Sri Winiarti, S.T., M.Cs.","Dr. Abdul Fadlil, M.T.","Nur Rochmah DPA, S.T., M.Kom.",sistem pakar tentu hama sakit tanam palawija
7,Sistem Pakar Pemanfaatan multimedia untuk Peny...,"Sri Winiarti, S.T., M.Cs.","Dr. Abdul Fadlil, M.T.","Ardi Pujiyanta, M.T",sistem pakar manfaat multimedia sakit tulang
8,Sistem Pendukung Keputusan untuk Memperkirakan...,"Sri Winiarti, S.T., M.Cs.","Dewi Soyusiawaty, S.T., M.T.","Ali Tarmuji, S.T., M.Cs.",sistem dukung putus kira sedia barang
9,Sistem Pendukung Keputusan untuk Memprediksi P...,"Ir. Ardi Pujiyanta, M.T.","Ardiansyah, S.T., M.Cs.","Murinto, S.Si., M.Kom.",sistem dukung putus prediksi ambil putus beri ...


In [51]:
# Read the query sheet and keep only its 'Queries' column as a NumPy array.
queries = pd.read_excel("Queries.xlsx")['Queries'].values
queries

array(['alat bantu mengajar berbasis multimedia',
       'alat bantu mengajar berbasis web'], dtype=object)

In [53]:
# Build a fresh engine, feed it every preprocessed title and every query,
# then lay the scores out as one row per document and one column per query.
engine = Engine()

docs = list(map(str, dataset['Preprocessing']))
documentNames = ["Document_{}".format(position) for position in range(1, len(docs) + 1)]

for doc in docs:
    engine.addDocument(doc)

for query in queries:
    engine.setQuery(query)  # register the query text to be scored

# process_score() returns one list per query; transpose so queries become columns.
titles_score = engine.process_score()
ScoreDf = pd.DataFrame(titles_score).T
ScoreDf.columns = queries
ScoreDf["Documents"] = documentNames
ScoreDf["Pembimbing"] = dataset["Pembimbing"].values
ScoreDf

Unnamed: 0,alat bantu mengajar berbasis multimedia,alat bantu mengajar berbasis web,Documents,Pembimbing
0,0.0,0.0,Document_1,"Andri Pranolo, S.T., M.Cs."
1,0.5,0.5,Document_2,"Eko Aribowo, S.T., M.Kom."
2,0.0,0.0,Document_3,"Eko Aribowo, S.T., M.Kom."
3,0.0,0.0,Document_4,"Drs. Wahyu Pujiyono, M.Kom."
4,0.167,0.0,Document_5,"Drs. Wahyu Pujiyono, M. Kom."
5,0.0,0.149,Document_6,"Eko Aribowo, S.T., M.Kom."
6,0.0,0.0,Document_7,"Sri Winiarti, S.T., M.Cs."
7,0.236,0.0,Document_8,"Sri Winiarti, S.T., M.Cs."
8,0.0,0.0,Document_9,"Sri Winiarti, S.T., M.Cs."
9,0.0,0.0,Document_10,"Ir. Ardi Pujiyanta, M.T."


In [54]:
# For each query, build a ranking table: score, document name, a 0/1 label
# (1 when the score is above zero) and the supervisor, sorted by score descending.
df_listed = []
for query in queries:
    scores = ScoreDf[query]
    labels = [1 if score > 0.000 else 0 for score in scores]

    datadf = pd.DataFrame(scores)
    datadf['Documents'] = ScoreDf['Documents']
    datadf['Labels'] = labels
    datadf['Lectures'] = ScoreDf['Pembimbing'].values

    ranked = datadf.sort_values(by=[query], ascending=False)
    df_listed.append(ranked)
df_listed

[    alat bantu mengajar berbasis multimedia    Documents  Labels  \
 1                                     0.500   Document_2       1   
 7                                     0.236   Document_8       1   
 4                                     0.167   Document_5       1   
 0                                     0.000   Document_1       0   
 2                                     0.000   Document_3       0   
 3                                     0.000   Document_4       0   
 5                                     0.000   Document_6       0   
 6                                     0.000   Document_7       0   
 8                                     0.000   Document_9       0   
 9                                     0.000  Document_10       0   
 10                                    0.000  Document_11       0   
 11                                    0.000  Document_12       0   
 
                         Lectures  
 1      Eko Aribowo, S.T., M.Kom.  
 7      Sri Winiarti, S.T., M

In [55]:
# Top five rows of the ranking for the second query (index 1, following the order of `queries`).
df_listed[1].head(5)

Unnamed: 0,alat bantu mengajar berbasis web,Documents,Labels,Lectures
1,0.5,Document_2,1,"Eko Aribowo, S.T., M.Kom."
5,0.149,Document_6,1,"Eko Aribowo, S.T., M.Kom."
0,0.0,Document_1,0,"Andri Pranolo, S.T., M.Cs."
2,0.0,Document_3,0,"Eko Aribowo, S.T., M.Kom."
3,0.0,Document_4,0,"Drs. Wahyu Pujiyono, M.Kom."


In [None]:
"""
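1. Store each query in the DB -> SQLAlchemy (the original note says "No SQL", but SQLAlchemy is an SQL toolkit; confirm which store is meant)
2. Save the result under the query's name -> xlsx
3. Look up the DB through SQLAlchemy for a stored query whose name matches the input
4. If it exists, load it directly -> query + ".xlsx"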

"""