## Import Libraries

In [1]:
import numpy as np
import pandas as pd
import re
import math

import nltk
import string
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

## Read The Data

In [2]:
# Read the corpus from the Excel workbook into a DataFrame and display it.
# Later cells read the 'Title' and 'Content' columns of this frame.
data = pd.read_excel('korpusMRT.xlsx')
data

Unnamed: 0,Title,Content,Source
0,332.184 Orang Gunakan MRT Jakarta Pada Minggu ...,"Sejak beroperasi secara resmi pada Senin, 25 M...",https://www.jakartamrt.co.id/2019/03/29/332-18...
1,Ketua Wantimpres RI Apresiasi MRT Jakarta,Dalam kunjungannya ke Kantor Pusat PT MRT Jaka...,https://www.jakartamrt.co.id/2019/04/25/ketua-...
2,Masyarakat Mulai Gunakan MRT Jakarta ke Tempat...,"Dua hari setelah peresmiannya, masyarakat terl...",https://www.jakartamrt.co.id/2019/03/26/masyar...
3,MRT Jakarta Hadir di Tropis,Sebagai bagian dari upaya menyosialisasikan be...,https://www.jakartamrt.co.id/2019/03/28/mrt-ja...
4,MRT Jakarta Terapkan Perda DKI Jakarta Nomor 1...,Sebagai bagian dari upaya menjaga kebersihan d...,https://www.jakartamrt.co.id/2019/04/03/mrt-ja...
5,MRT Jakarta Terima Penghargaan K3 Kemnaker RI,Dalam acara penganugerahan “Penghargaan K3 201...,https://www.jakartamrt.co.id/2019/04/22/mrt-ja...
6,Rata-rata 78 ribu orang Per Hari Gunakan MRT J...,Hal tersebut disampaikan oleh Direktur Utama P...,https://www.jakartamrt.co.id/2019/04/12/rata-r...
7,"Selama April 2019, Tarif MRT Jakarta Dipotong ...",Operasi komersial MRT Jakarta telah dimulai pa...,https://www.jakartamrt.co.id/2019/04/01/selama...


## Preprocessing

1. Case Folding
2. Tokenization
3. Filtering
4. Stemming

In [3]:
def remove_stopwords(text, stopwords_path='stopwords.txt'):
    """Tokenize ``text`` and drop every token listed in the stopword file.

    Parameters
    ----------
    text : str
        Text to tokenize with ``nltk.word_tokenize``.
    stopwords_path : str, optional
        Path of the stopword list, one word per line. Defaults to
        ``'stopwords.txt'``.

    Returns
    -------
    list of str
        Tokens of ``text`` in their original order, without the stopwords.
    """
    with open(stopwords_path) as f:
        # A set gives constant-time membership tests; with a list the filter
        # below would scan the whole stopword list once per token.
        stopwords = {line.strip() for line in f}

    return [word for word in nltk.word_tokenize(text) if word not in stopwords]

In [4]:
def stemming(text):
    """Stem every token with the Sastrawi stemmer.

    Parameters
    ----------
    text : list of str
        Tokens to stem.

    Returns
    -------
    list of str
        ``stemmer.stem(token)`` for each token, in the same order.
    """
    # Create the stemmer on first use and keep it on the function object, so
    # the factory setup is not repeated for every document and query.
    stemmer = getattr(stemming, '_stemmer', None)
    if stemmer is None:
        stemmer = StemmerFactory().create_stemmer()
        stemming._stemmer = stemmer

    return [stemmer.stem(word) for word in text]

In [5]:
def preprocessing(text):
    """Convert a raw document or query string into a list of stemmed terms.

    Steps, in order: lower-casing, URL removal, quote/hyphen replacement,
    tokenization with stopword removal, punctuation stripping, stemming, and
    removal of tokens that ended up empty.

    Parameters
    ----------
    text : str
        Raw text.

    Returns
    -------
    list of str
        The processed terms, in order of appearance.
    """
    # case folding
    text = text.lower()

    # remove urls; the pattern is a raw string because '\S' inside a normal
    # string literal is an invalid escape sequence that Python warns about
    text = re.sub(r'http\S+', '', text)

    # replace curly quotes with a plain quote and hyphens with a space
    text = text.replace('“', '"')
    text = text.replace('”', '"')
    text = text.replace('-', ' ')

    # tokenization and remove stopwords
    tokens = remove_stopwords(text)

    # remove punctuation characters from inside each token
    tokens = [''.join(c for c in s if c not in string.punctuation) for s in tokens]

    # stemming
    tokens = stemming(tokens)

    # remove empty strings left behind by the steps above
    return list(filter(None, tokens))

## Query:

In [6]:
# The five query sentences that are ranked against the corpus documents.
q1 = "Tiket yang bisa digunakan adalah tiket harian"    
q2 = "Fasilitas di dalam stasiun juga sudah lengkap"
q3 = "MRT Jakarta selalu mengedepankan aspek keselamatan kerja"
q4 = "Selain menggunakan kereta, masyarakat juga dapat menikmati layanan gerai-gerai yang ada di stasiun"
q5 = "MRT Jakarta juga mengeluarkan dua jenis tiket, yaitu Jelajah Single Trip dan Jelajah Multi Trip"
# Collected in order; position i becomes the column "Query i+1" further down.
query = [q1, q2, q3, q4, q5]

In [7]:
# Build one term-count column per query, named "Query 1", "Query 2", ...
# The comprehension variables are deliberately not called `query`: rebinding
# that name would replace the list of query strings with a DataFrame and make
# this cell fail to reproduce its result when it is run a second time.
query_frames = [
    pd.DataFrame.from_dict(
        nltk.FreqDist(preprocessing(text)),
        orient='index',
        columns=["Query " + str(number)],
    )
    for number, text in enumerate(query, start=1)
]

# Join all columns in a single concat instead of growing the frame inside the
# loop; terms missing from a query are NaN until they are filled later on.
queries = pd.concat(query_frames, axis=1, sort=False) if query_frames else pd.DataFrame()

## Term Weighting:
- Term Frequency
- Inverse Document Frequency
- TF-IDF Weighting

## 1. Term Frequency

In [8]:
# Build one term-count column per document, labelled with the document title.
# Titles and contents are paired positionally with zip(), so this does not
# depend on `data` having the default 0..n-1 index labels.
doc_frames = [
    pd.DataFrame.from_dict(
        nltk.FreqDist(preprocessing(content)),
        orient='index',
        columns=[title],
    )
    for title, content in zip(data['Title'], data['Content'])
]

# Join all columns in a single concat instead of growing the frame inside the
# loop; terms missing from a document are NaN until they are filled later on.
tf = pd.concat(doc_frames, axis=1, sort=False) if doc_frames else pd.DataFrame()

In [9]:
# Append the query columns to the document columns; a term that does not occur
# in a document or query gets a count of 0.
# NOTE: this cell rebinds `tf`; re-run the Term Frequency cell above before
# running it again, otherwise the query columns are appended twice.
tf = pd.concat([tf, queries], axis=1, sort=False).fillna(0)

# Name the index after the concat: concat builds a new index from the union of
# both inputs, so a name set on only one input beforehand would not be kept.
tf.index.name = 'Term'
tf

Unnamed: 0,332.184 Orang Gunakan MRT Jakarta Pada Minggu Pertama Operasi,Ketua Wantimpres RI Apresiasi MRT Jakarta,Masyarakat Mulai Gunakan MRT Jakarta ke Tempat Kerja,MRT Jakarta Hadir di Tropis,MRT Jakarta Terapkan Perda DKI Jakarta Nomor 13 Tahun 2013,MRT Jakarta Terima Penghargaan K3 Kemnaker RI,Rata-rata 78 ribu orang Per Hari Gunakan MRT Jakarta,"Selama April 2019, Tarif MRT Jakarta Dipotong 50 Persen",Query 1,Query 2,Query 3,Query 4,Query 5
operasi,8.0,3.0,3.0,0.0,0.0,1.0,1.0,2.0,0.0,0.0,0.0,0.0,0.0
resmi,2.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
senin,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
maret,2.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2019,4.0,1.0,0.0,2.0,0.0,4.0,5.0,2.0,0.0,0.0,0.0,0.0,0.0
catat,2.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
kamis,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
28,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
332184,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
# Log-scaled term frequency: every non-zero count c becomes 1 + log10(c),
# zero counts stay 0.
# Zero cells are turned into NaN before the logarithm is taken, so np.log10
# is never evaluated at 0 (which yields -inf and a divide-by-zero
# RuntimeWarning); those cells keep their original 0 through mask().
# NOTE: this cell rebinds `tf`; running it twice applies the scaling twice.
nonzero = tf != 0
tf = tf.mask(nonzero, 1 + np.log10(tf.where(nonzero)))
tf

  """Entry point for launching an IPython kernel.


Unnamed: 0,332.184 Orang Gunakan MRT Jakarta Pada Minggu Pertama Operasi,Ketua Wantimpres RI Apresiasi MRT Jakarta,Masyarakat Mulai Gunakan MRT Jakarta ke Tempat Kerja,MRT Jakarta Hadir di Tropis,MRT Jakarta Terapkan Perda DKI Jakarta Nomor 13 Tahun 2013,MRT Jakarta Terima Penghargaan K3 Kemnaker RI,Rata-rata 78 ribu orang Per Hari Gunakan MRT Jakarta,"Selama April 2019, Tarif MRT Jakarta Dipotong 50 Persen",Query 1,Query 2,Query 3,Query 4,Query 5
operasi,1.903090,1.477121,1.477121,0.000000,0.000000,1.00000,1.000000,1.301030,0.0,0.0,0.0,0.0,0.00000
resmi,1.301030,0.000000,1.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.00000
senin,1.000000,0.000000,0.000000,0.000000,0.000000,1.00000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.00000
25,1.000000,1.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.00000
maret,1.301030,0.000000,0.000000,0.000000,0.000000,0.00000,1.000000,0.000000,0.0,0.0,0.0,0.0,0.00000
2019,1.602060,1.000000,0.000000,1.301030,0.000000,1.60206,1.698970,1.301030,0.0,0.0,0.0,0.0,0.00000
catat,1.301030,0.000000,0.000000,0.000000,0.000000,0.00000,1.000000,0.000000,0.0,0.0,0.0,0.0,0.00000
kamis,1.000000,1.000000,0.000000,1.000000,0.000000,0.00000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.00000
28,1.000000,0.000000,0.000000,1.000000,0.000000,0.00000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.00000
332184,1.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.00000


## 2. Inverse Document Frequency

In [11]:
# Column-wise split of the tf table: the first len(data) columns are the
# documents (train), the remaining columns are the queries (test).
train, test = tf.iloc[:, :len(data)], tf.iloc[:, len(data):]

In [12]:
cols = train.columns

# Document frequency: in how many document columns each term has a weight > 0.
df = pd.DataFrame(train[cols].gt(0).sum(axis=1), columns=['Document Frequency'])

# Inverse document frequency: log10(N / df) with N = number of documents.
# A term that occurs in no document (df == 0, i.e. it only appears in a query)
# would give log10(N / 0) = inf and later 0 * inf = NaN in the tf-idf table,
# so such terms are given an idf of 0 instead.
doc_freq = df['Document Frequency']
idf = np.log10(len(cols) / doc_freq.where(doc_freq > 0)).fillna(0)
idf = idf.to_frame('Inverse Document Frequency')

# Keep both measures side by side, one row per term.
idf = pd.concat([df, idf], axis=1)

In [13]:
# Show the document frequency and inverse document frequency of every term.
idf

Unnamed: 0,Document Frequency,Inverse Document Frequency
operasi,6,0.124939
resmi,2,0.602060
senin,2,0.602060
25,2,0.602060
maret,2,0.602060
2019,6,0.124939
catat,2,0.602060
kamis,3,0.425969
28,2,0.602060
332184,1,0.903090


## 3. TF-IDF Weighting

In [14]:
# TF-IDF weight: scale each row (term) of the tf table by that term's idf,
# matching rows by index label.
tf_idf = tf.multiply(idf['Inverse Document Frequency'], axis='index')
tf_idf.head(n=5)

Unnamed: 0,332.184 Orang Gunakan MRT Jakarta Pada Minggu Pertama Operasi,Ketua Wantimpres RI Apresiasi MRT Jakarta,Masyarakat Mulai Gunakan MRT Jakarta ke Tempat Kerja,MRT Jakarta Hadir di Tropis,MRT Jakarta Terapkan Perda DKI Jakarta Nomor 13 Tahun 2013,MRT Jakarta Terima Penghargaan K3 Kemnaker RI,Rata-rata 78 ribu orang Per Hari Gunakan MRT Jakarta,"Selama April 2019, Tarif MRT Jakarta Dipotong 50 Persen",Query 1,Query 2,Query 3,Query 4,Query 5
operasi,0.23777,0.18455,0.18455,0.0,0.0,0.124939,0.124939,0.162549,0.0,0.0,0.0,0.0,0.0
resmi,0.783298,0.0,0.60206,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
senin,0.60206,0.0,0.0,0.0,0.0,0.60206,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25,0.60206,0.60206,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
maret,0.783298,0.0,0.0,0.0,0.0,0.0,0.60206,0.0,0.0,0.0,0.0,0.0,0.0


## Normalization

In [15]:
def normalization(x):
    """Scale every column of ``x`` to unit Euclidean length.

    Parameters
    ----------
    x : pandas.DataFrame
        Numeric table with one vector per column.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same index and columns as ``x`` in which each
        column is divided by the square root of its sum of squares. ``x``
        itself is left unchanged.
    """
    # The norms are computed from the argument itself, so the function works
    # for any frame and not only for one particular global table.
    norms = np.sqrt(np.square(x).sum(axis=0))

    # An all-zero column has norm 0; divide it by 1 so it stays all zeros
    # instead of becoming NaN through 0 / 0.
    norms = norms.where(norms != 0, 1.0)

    return x.div(norms, axis='columns')

In [16]:
# Normalize every document and query column of the TF-IDF table and display
# the result.
tfidf_normalized = normalization(tf_idf)
tfidf_normalized

Unnamed: 0,332.184 Orang Gunakan MRT Jakarta Pada Minggu Pertama Operasi,Ketua Wantimpres RI Apresiasi MRT Jakarta,Masyarakat Mulai Gunakan MRT Jakarta ke Tempat Kerja,MRT Jakarta Hadir di Tropis,MRT Jakarta Terapkan Perda DKI Jakarta Nomor 13 Tahun 2013,MRT Jakarta Terima Penghargaan K3 Kemnaker RI,Rata-rata 78 ribu orang Per Hari Gunakan MRT Jakarta,"Selama April 2019, Tarif MRT Jakarta Dipotong 50 Persen",Query 1,Query 2,Query 3,Query 4,Query 5
operasi,0.023362,0.023781,0.019053,0.000000,0.000000,0.010965,0.012407,0.024149,0.0,0.000000,0.0,0.000000,0.000000
resmi,0.076962,0.000000,0.062158,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.000000
senin,0.059155,0.000000,0.000000,0.000000,0.000000,0.052839,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.000000
25,0.059155,0.077582,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.000000
maret,0.076962,0.000000,0.000000,0.000000,0.000000,0.000000,0.059786,0.000000,0.0,0.000000,0.0,0.000000,0.000000
2019,0.019666,0.016100,0.000000,0.019684,0.000000,0.017567,0.021079,0.024149,0.0,0.000000,0.0,0.000000,0.000000
catat,0.076962,0.000000,0.000000,0.000000,0.000000,0.000000,0.059786,0.000000,0.0,0.000000,0.0,0.000000,0.000000
kamis,0.041853,0.054891,0.000000,0.051583,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.000000
28,0.059155,0.000000,0.000000,0.072907,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.000000
332184,0.088732,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.000000


## Cosine Similarity

In [17]:
def cosine_similarity(a, b):
    """Return the cosine of the angle between vectors ``a`` and ``b``.

    Parameters
    ----------
    a, b : array-like of numbers
        Two vectors of equal length; elements are paired by position.

    Returns
    -------
    float
        ``dot(a, b) / (|a| * |b|)``, or ``0.0`` when either vector has zero
        length (the angle is undefined in that case).
    """
    a = np.asarray(a, dtype=float)
    b = np.asarray(b, dtype=float)

    # Dividing by the norms makes the result correct for any input; for
    # vectors that already have unit length this equals the plain dot product.
    denominator = np.linalg.norm(a) * np.linalg.norm(b)
    if denominator == 0:
        return 0.0

    return float(np.dot(a, b) / denominator)

In [18]:
# Split the normalized table column-wise: the first len(data) columns are the
# documents (train), the remaining columns are the queries (test).
# The boundary is taken from the corpus size rather than a fixed number, so
# the split stays correct when documents are added to or removed from `data`.
train = tfidf_normalized.iloc[:, :len(data)]
test = tfidf_normalized.iloc[:, len(data):]

In [19]:
# Similarity table: one column per query, one row per document (rows follow
# the column order of `train`). Each cell is cosine_similarity(query, document).
# The table is built in one step from a dict of columns instead of being
# extended with a concat for every query.
result = pd.DataFrame({
    query_name: [
        cosine_similarity(test[query_name], train[doc_name])
        for doc_name in train
    ]
    for query_name in test
})

Hasil cosine similarity dari tiap dokumen berdasarkan masing-masing query

In [20]:
# Label the rows of the similarity table with the document titles, which are
# the column labels of `train`.
result = result.set_index(train.columns)
result

Unnamed: 0,Query 1,Query 2,Query 3,Query 4,Query 5
332.184 Orang Gunakan MRT Jakarta Pada Minggu Pertama Operasi,0.061293,0.000661,0.012203,0.031363,0.097289
Ketua Wantimpres RI Apresiasi MRT Jakarta,0.030756,0.000736,0.008674,0.0092,0.005848
Masyarakat Mulai Gunakan MRT Jakarta ke Tempat Kerja,0.024641,0.000734,0.017215,0.009347,0.004685
MRT Jakarta Hadir di Tropis,0.0,0.121463,0.008151,0.008937,0.0
MRT Jakarta Terapkan Perda DKI Jakarta Nomor 13 Tahun 2013,0.023539,0.024073,0.0,0.045777,0.0
MRT Jakarta Terima Penghargaan K3 Kemnaker RI,0.0,0.0,0.191472,0.0,0.0
Rata-rata 78 ribu orang Per Hari Gunakan MRT Jakarta,0.023701,0.000334,0.011356,0.005555,0.0
"Selama April 2019, Tarif MRT Jakarta Dipotong 50 Persen",0.0957,0.02799,0.0,0.170189,0.394919


## Document Ranking

Ranking Document berdasarkan masing-masing query

In [21]:
# Documents ordered from most to least similar to Query 1.
result['Query 1'].sort_values(ascending=False).to_frame()

Unnamed: 0,Query 1
"Selama April 2019, Tarif MRT Jakarta Dipotong 50 Persen",0.0957
332.184 Orang Gunakan MRT Jakarta Pada Minggu Pertama Operasi,0.061293
Ketua Wantimpres RI Apresiasi MRT Jakarta,0.030756
Masyarakat Mulai Gunakan MRT Jakarta ke Tempat Kerja,0.024641
Rata-rata 78 ribu orang Per Hari Gunakan MRT Jakarta,0.023701
MRT Jakarta Terapkan Perda DKI Jakarta Nomor 13 Tahun 2013,0.023539
MRT Jakarta Terima Penghargaan K3 Kemnaker RI,0.0
MRT Jakarta Hadir di Tropis,0.0


In [22]:
# Documents ordered from most to least similar to Query 2.
result['Query 2'].sort_values(ascending=False).to_frame()

Unnamed: 0,Query 2
MRT Jakarta Hadir di Tropis,0.121463
"Selama April 2019, Tarif MRT Jakarta Dipotong 50 Persen",0.02799
MRT Jakarta Terapkan Perda DKI Jakarta Nomor 13 Tahun 2013,0.024073
Ketua Wantimpres RI Apresiasi MRT Jakarta,0.000736
Masyarakat Mulai Gunakan MRT Jakarta ke Tempat Kerja,0.000734
332.184 Orang Gunakan MRT Jakarta Pada Minggu Pertama Operasi,0.000661
Rata-rata 78 ribu orang Per Hari Gunakan MRT Jakarta,0.000334
MRT Jakarta Terima Penghargaan K3 Kemnaker RI,0.0


In [23]:
# Documents ordered from most to least similar to Query 3.
result['Query 3'].sort_values(ascending=False).to_frame()

Unnamed: 0,Query 3
MRT Jakarta Terima Penghargaan K3 Kemnaker RI,0.191472
Masyarakat Mulai Gunakan MRT Jakarta ke Tempat Kerja,0.017215
332.184 Orang Gunakan MRT Jakarta Pada Minggu Pertama Operasi,0.012203
Rata-rata 78 ribu orang Per Hari Gunakan MRT Jakarta,0.011356
Ketua Wantimpres RI Apresiasi MRT Jakarta,0.008674
MRT Jakarta Hadir di Tropis,0.008151
"Selama April 2019, Tarif MRT Jakarta Dipotong 50 Persen",0.0
MRT Jakarta Terapkan Perda DKI Jakarta Nomor 13 Tahun 2013,0.0


In [24]:
# Documents ordered from most to least similar to Query 4.
result['Query 4'].sort_values(ascending=False).to_frame()

Unnamed: 0,Query 4
"Selama April 2019, Tarif MRT Jakarta Dipotong 50 Persen",0.170189
MRT Jakarta Terapkan Perda DKI Jakarta Nomor 13 Tahun 2013,0.045777
332.184 Orang Gunakan MRT Jakarta Pada Minggu Pertama Operasi,0.031363
Masyarakat Mulai Gunakan MRT Jakarta ke Tempat Kerja,0.009347
Ketua Wantimpres RI Apresiasi MRT Jakarta,0.0092
MRT Jakarta Hadir di Tropis,0.008937
Rata-rata 78 ribu orang Per Hari Gunakan MRT Jakarta,0.005555
MRT Jakarta Terima Penghargaan K3 Kemnaker RI,0.0


In [25]:
# Documents ordered from most to least similar to Query 5.
result['Query 5'].sort_values(ascending=False).to_frame()

Unnamed: 0,Query 5
"Selama April 2019, Tarif MRT Jakarta Dipotong 50 Persen",0.394919
332.184 Orang Gunakan MRT Jakarta Pada Minggu Pertama Operasi,0.097289
Ketua Wantimpres RI Apresiasi MRT Jakarta,0.005848
Masyarakat Mulai Gunakan MRT Jakarta ke Tempat Kerja,0.004685
Rata-rata 78 ribu orang Per Hari Gunakan MRT Jakarta,0.0
MRT Jakarta Terima Penghargaan K3 Kemnaker RI,0.0
MRT Jakarta Terapkan Perda DKI Jakarta Nomor 13 Tahun 2013,0.0
MRT Jakarta Hadir di Tropis,0.0
