### Import Library yang akan digunakan

In [7]:
import pandas as pd
import numpy as np
import numpy.linalg as LA # operasi baris elementer

from openpyxl import load_workbook # load data excel
from sklearn.feature_extraction.text import CountVectorizer # tf-idf
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer # tf-idf
from sklearn.metrics.pairwise import cosine_similarity # cosine similarity
from nltk.corpus import stopwords # preprocessing
from nltk.stem import PorterStemmer # preprocessing bahasa inggris
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory  # preprocessing
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory #preprocessing


stemmer = StemmerFactory().create_stemmer()  # Object stemmer
remover = StopWordRemoverFactory().create_stop_word_remover()  # objek stopword

### Import Dataset yang akan digunakan

In [8]:
pd.set_option('display.max_colwidth', -1)
wb = load_workbook(filename='Daftar Skripsi.xlsx')

dataset = pd.DataFrame(wb['Data Skripsi'].values)
dataset.columns = ["Judul","Pembimbing","Penguji1","Penguji2"]
dataset = dataset.dropna(axis=0, how='any') # cleaning data
dataset

  warn(msg)


Unnamed: 0,Judul,Pembimbing,Penguji1,Penguji2
0,Agen Crawler Alamat Email Menggunakan Metode Bread First Crawling,"Andri Pranolo, S.T., M.Cs.","Murinto, S.Si., M.Kom.","Ali Tarmuji, S.T., M.Cs."
1,Alat Bantu Ajar Mata Kuliah Aljabar Linier dan Matriks pada Pokok Bahasan Determinan dan Invers Matrik,"Dewi Soyusiawaty, S.T., M.T.","Sri Handayaningsih, S.T., M.T.","Ir. Ardi Pujiyanta, M.T."
2,Alat Bantu Pembelajaran Baca Al Quran Berbasis Multimedia Berbasis Web,"Eko Aribowo, S.T., M.Kom.","Sri Winiarti, S.T., M.Cs.","Murinto, S.Si., M.Kom."
3,Alat Bantu Pembelajaran Mata Kuliah Computer Vision Pada Edge Based Segmentasi Citra Berbasis Multimedia,"Murinto, S.Si., M.Kom.","Dr. Abdul Fadlil, M.T.","Drs. Wahyu Pujiono, M.Kom."
4,Alat Bantu Riset Operasi Pada Materi Penugasan Berbasis Mutimedia,"Dr. Abdul Fadlil, M.T.","Drs. Tedy Setiadi, M.T.","Ir. Ardi Pujiyanta, M.T."
5,Alat Bantu ajar Mata kuliah Fuzzy logic pada pokok bahasan metode sugeno,"Ir. Ardi Pujiyanta, M.T.","Sri Winiarti, S.T., M.Cs.","Murinto, S.Si., M.Kom."
6,Algoritma Genetika untuk Menyelesaikan Travelling Sales Problem (TSP) Pada Pencarian Jalur Terpadu antar Kota,"Ir. Ardi Pujiyanta, M.T.","Murinto, S.Si., M.Kom.","Sri Winiarti, S.T., M.Cs."
7,Algoritma Levenshein Distance pada Commerce Distro Bravoisetees,"Rusydi Umar, S.T., M.T., Ph.D.","Drs. Tedy Setiadi, M.T.","Fiftin Noviyanto, S.T., M.Cs."
8,Analisa Dan Perancangan Sistem Pelangganan Berbasis Web Dengan Penerapan CRM S.A.,"Arfiani Nur Khusna, S.T., M.Kom.","Fiftin Noviyanto, S.T., M.Cs.","Rusydi Umar, S.T., M.T., Ph.D."
9,Analisa Perbandingan Teknik Hacking SQL Injection Pada Keamanan Web,"Dr. Imam Riyadi, M.Kom.","Eko Aribowo, S.T., M.Kom.","Herman Yuliansyah, S.T., M.Eng."


### Proses Pre Processing

In [9]:
def stemmerEN(text):
    porter = PorterStemmer()
    stop = set(stopwords.words('english'))
    text = text.lower()
    text = [i for i in text.lower().split() if i not in stop]
    text = ' '.join(text)
    preprocessed_text = text.translate(None, string.punctuation)
    text_stem = porter.stem(preprocessed_text)
    return text_stem

def preprocess(text):
    text = text.lower()
    text_clean = remover.remove(text) #fungsi hapus stopword 
    text_stem = stemmer.stem(text_clean)
    return text_stem

list_pre_judul = []
for data in dataset['Judul']:
    list_pre_judul.append(preprocess(data)) # memasukan data judul ke dalam list_pre_judul dan proses preprocessing
pre_judul = pd.DataFrame(list_pre_judul) 
pre_judul.columns = ["Judul"]
pre_judul

Unnamed: 0,Judul
0,agen crawler alamat email guna metode bread first crawling
1,alat bantu ajar mata kuliah aljabar linier matriks pokok bahas determinan invers matrik
2,alat bantu ajar baca al quran bas multimedia bas web
3,alat bantu ajar mata kuliah computer vision edge based segmentasi citra bas multimedia
4,alat bantu riset operasi materi tugas bas mutimedia
5,alat bantu ajar mata kuliah fuzzy logic pokok bahas metode sugeno
6,algoritma genetika selesai travelling sales problem tsp cari jalur padu antar kota
7,algoritma levenshein distance commerce distro bravoisetees
8,analisa ancang sistem langgan bas web terap crm s a
9,analisa banding teknik hacking sql injection aman web


### Proses Cosine Similarity

In [10]:
class Engine:
    def __init__(self):
        self.cosine_score = []
        self.train_set = []  # Documents
        self.test_set = []  # Query

    def addDocument(self, word): # fungsi untuk menambahkan dokumen dataset ke dalam list train_set
        self.train_set.append(word)

    def setQuery(self, word):  # fungsi untuk menambahkan data query ke dalam list test_Set
        self.test_set.append(word)

    def process_score(self):
        stopWords = stopwords.words('english') 
        vectorizer = CountVectorizer()

        transformer = TfidfTransformer()

        trainVectorizerArray = vectorizer.fit_transform(self.train_set).toarray() 
        # menghitung Bobot dokumen dataset dan uji dan kemudian disimpan dalam bentuk array 
        testVectorizerArray = vectorizer.transform(self.test_set).toarray()

        cx = lambda a, b: round(np.inner(a, b) / (LA.norm(a) * LA.norm(b)), 3) 
        #fungsi tanpa nama untuk normalisasi data dan definisi rumus Cosine Similarity 
        #         print testVectorizerArray
        output = []
        for i in range(0, len(testVectorizerArray)):
            output.append([])

        for vector in trainVectorizerArray:
            # print vector
            u = 0
            for testV in testVectorizerArray:
                #perhitungan Cosine Similarity dalam bentuk vector dari dataset dengan query
                #yang di masukan yang kemudian mengembalikan nilai cosine ke dalam variable
                #cosine_score dalam bentuk list.
                # print testV
                cosine = cx(vector, testV)
                #                 self.cosine_score.append(cosine)
                #                 bulatin = (round(cosine),2)
                output[u].append((cosine))
                u = u + 1
        return output
        # return testVectorizerArray

In [11]:
# Call Engine from Custom Libraries
engine = Engine()

# Define data uji
list_dokumen = [str(x) for x in pre_judul['Judul']]
list_datauji = ["rekomendasi pembimbing dan penguji skripsi menggunakan vector space model"]
columnNames = []

for i, doc in enumerate(list_dokumen):
    engine.addDocument(doc) 
    columnNames.append("Document_{}".format(i+1))
    
for doc in list_datauji:
    engine.setQuery(doc) #inputandata uji 
    
titles_score  = engine.process_score()
titlesScoreDf = pd.DataFrame(titles_score)

sort_data = titlesScoreDf.T
sort_data.columns    = ['Skor']
sort_data['Dokumen'] = columnNames

sort_data.sort_values(by=['Skor'], ascending=False)

Unnamed: 0,Skor,Dokumen
629,0.436,Document_630
283,0.369,Document_284
561,0.218,Document_562
581,0.204,Document_582
779,0.187,Document_780
529,0.183,Document_530
239,0.183,Document_240
451,0.174,Document_452
528,0.167,Document_529
537,0.167,Document_538


In [12]:
data = np.array(titles_score)

join_data = []
for i in range(0,len(data)):
    join_data.append({'Judul':dataset['Judul'], 'Pembimbing':dataset['Pembimbing'],'Penguji1':dataset['Penguji1'],'Penguji2':dataset['Penguji2'],'Skor':list(titles_score[i])})
    
join_data

[{'Judul': 0       Agen Crawler Alamat Email Menggunakan Metode Bread First Crawling                                                                                       
  1       Alat Bantu Ajar Mata Kuliah Aljabar Linier dan Matriks pada Pokok Bahasan Determinan dan Invers Matrik                                                  
  2       Alat Bantu Pembelajaran Baca Al Quran Berbasis Multimedia Berbasis Web                                                                                  
  3       Alat Bantu Pembelajaran Mata Kuliah Computer Vision Pada Edge Based Segmentasi Citra Berbasis Multimedia                                                
  4       Alat Bantu Riset Operasi Pada Materi Penugasan Berbasis Mutimedia                                                                                       
  5       Alat Bantu ajar Mata kuliah Fuzzy logic pada pokok bahasan metode sugeno                                                                                
  6       Alg