# PLAGIARISM_DETECTION

## author: GMD

### Import Library

In [1]:
import pandas as pd
import re
import os
import glob
import math

### Text Cleaner

In [2]:
def cleaner(text):
    #remove newline
    text = text.replace('\n', ' ')
    #remove multiple spaces
    text = re.sub(' +', ' ', text)
    #remove special characters and numbers
    text = re.sub('[^A-Za-z\- ]', '', text)
    
    return text.lower()

### Import dataset

In [3]:
documents = list()
#Import all .txt file in dataset folder
for filename in glob.glob(os.path.join('../dataset/', '*.txt')):
   with open(os.path.join(os.getcwd(), filename), 'r', encoding='mbcs') as f: # open in readonly mode
      documents.append(cleaner(f.read()))

In [4]:
###Kamus Indonesia
bahasa = open('../models/Indonesia.txt', 'r')
bahasa = bahasa.read().split('\n')

In [5]:
pd.DataFrame(bahasa)

Unnamed: 0,0
0,-an
1,-anda
2,-asi
3,-da
4,-el-
...,...
31435,zulu
31436,zurafah
31437,zuriah
31438,zuriat


### Hitung semua kata yang muncul

In [6]:
def computeAllWords(docs):
    all_words = set()
    for text in docs:
        for word in text.split():
            if word in bahasa:
                all_words.add(word)    
    return all_words

allWords = computeAllWords(documents)

In [7]:
pd.DataFrame(allWords)

Unnamed: 0,0
0,keliru
1,beku
2,saintis
3,kadang
4,cokelat
...,...
4159,robot
4160,instalasi
4161,terjang
4162,palu


### Hitung IDF

In [8]:
def computeIDF(allWords):
    N = len(documents)
    words_idf = dict()
    for word in allWords:
        words_idf[word] = 0

    for word in allWords:
        #Hitung nx
        nx = 0
        for doc in documents:
            if word in doc:
                nx += 1
        words_idf[word] = math.log((N + 1)/nx, 2)

    return words_idf

allWords = computeIDF(allWords)

In [None]:
pd.DataFrame(list(allWords.items()),columns = ['word','idf']) 

### Buat Data Frame

In [9]:
df = pd.DataFrame()
zeros = [0.0] * 30
for i in allWords.keys():
    df[i] = zeros

In [None]:
df

In [12]:
for i in range(len(df.index)):
    for word in allWords.keys():
        df['{}'.format(word)][i] = math.log10(documents[i].count(word) + 1) * allWords[word]

In [13]:
df

Unnamed: 0,keliru,beku,saintis,kadang,cokelat,basket,solo,para,asbak,nota,swasta,purnama,bawa,lumpur,nominasi,bumi,naskah,maulana,kam,prasejarah,saleh,natrium,aksi,teteh,feminin,membran,fenol,asal,tegal,permanen,katup,wakil,metal,produk,usus,tugas,selam,saya,menu,hormon,...,suvenir,sakit,residu,madani,agen,mampu,bongsor,enzim,lajang,tiba,ulah,sukses,bom,mentor,cakram,matriks,sake,sosis,jendela,saji,agama,dosen,detail,fluktuatif,lima,hanger,partisipasi,bulat,kerangka,debit,lakur,platform,cam,atas,jerap,robot,instalasi,terjang,palu,masak
0,0.0,0.0,0.0,0.152777,0.0,0.0,0.0,0.085443,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.287242,2.028481,0.0,0.338285,0.0,0.0,0.0,0.146104,0.0,0.0,0.0,0.0,0.049264,0.0,0.0,0.0,0.8226,0.0,0.106342,0.067712,0.057927,0.045141,0.176169,0.102804,0.0,...,0.0,0.0,0.0,0.0,0.0,0.066936,0.0,0.0,0.0,0.0,0.0,0.148939,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.336976,0.0,0.0,0.413537,0.0,0.076388,0.0,0.0,0.0,0.059998,0.0,0.0,0.0,0.042721,0.088701,0.0,1.491362,0.0,0.148939,0.0,0.792392
1,0.0,0.0,0.0,0.121073,0.0,0.0,0.0,0.067712,0.0,0.0,0.301669,0.0,0.067252,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.067252,0.0,0.0,0.0,0.0,0.063504,0.0,0.0,0.0,0.0,0.0,0.119388,0.028481,0.081311,0.052696,0.258083,0.085761,0.0,...,0.0,0.0,0.0,0.0,0.630541,0.058207,0.0,0.0,0.0,0.588272,0.380663,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.824361,0.0,0.0,0.152777,0.0,0.0,0.0,0.155092,0.0,0.0,0.0,0.036811,0.088425,0.0,0.0,0.0,0.297878,1.190332,0.792392
2,0.0,0.0,0.0,0.152777,0.0,0.0,2.064895,0.028481,0.0,0.0,0.777974,0.0,0.07487,0.0,0.0,0.0,0.0,0.0,0.169142,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.049264,0.0,0.0,0.0,0.0,0.0,0.097122,0.042721,0.0,0.042721,0.312039,0.081952,0.0,...,0.0,0.0,0.0,0.0,0.0,0.022571,0.0,0.0,0.0,0.0,0.190332,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.11115,0.0,0.0,0.0,0.0,0.0,0.449969,0.0,0.059998,0.0,0.0,0.0,0.07055,0.064418,0.0,0.0,0.0,0.236063,0.0,0.0
3,0.0,0.0,0.0,0.197461,0.0,0.0,0.0,0.080778,0.0,0.0,0.0,0.0,0.081311,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.07487,0.0,0.0,0.0,0.0,0.054218,0.0,0.0,0.0,0.190332,0.0,0.106226,0.045141,0.057927,0.033065,0.0,0.075787,0.0,...,0.0,0.0,0.0,0.0,0.0,0.067712,0.0,0.0,0.0,0.0,0.301669,0.297878,0.0,0.0,0.0,1.247152,0.0,0.0,0.0,0.596866,0.0,0.0,0.260913,0.0,0.197461,0.0,0.0,0.0,0.059998,0.0,0.0,0.0,0.085761,0.077745,0.0,0.0,0.0,0.472126,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.060492,0.0,0.0,0.301669,0.0,0.07487,0.0,0.889302,0.287242,0.0,0.0,0.0,0.0,0.0,0.0,0.086891,0.0,0.0,0.0,0.0,0.059382,0.0,0.0,0.0,0.0,0.0,0.08479,0.039978,0.146104,0.022571,0.312039,0.075266,0.0,...,0.0,0.851314,0.0,0.0,0.31527,0.060492,0.0,0.0,0.0,0.0,0.190332,0.236063,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.11115,0.653291,0.521826,0.0,0.0,0.0,0.449969,0.0,0.119996,0.0,0.0,0.0,0.036811,0.082329,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.197461,0.0,0.0,0.0,0.079956,0.0,0.537119,0.301669,0.0,0.142122,0.0,0.0,0.0,0.0,1.130412,0.765126,0.0,0.0,0.0,0.091813,0.0,0.0,0.0,0.0,0.065292,0.0,0.0,0.0,0.190332,0.0,0.100315,0.060492,0.081311,0.054218,0.384517,0.081177,0.0,...,0.0,0.0,0.0,0.0,0.31527,0.072448,0.0,0.0,0.0,0.0,0.301669,0.297878,0.0,0.0,0.0,2.732574,0.0,0.0,0.0,0.0,0.11115,0.0,0.260913,0.0,0.242145,0.0,0.0,0.0,0.059998,0.0,0.0,0.0,0.08342,0.082329,0.0,0.0,0.0,0.148939,0.0,0.0
6,0.0,0.0,0.0,0.177368,0.0,0.0,0.0,0.082329,0.0,0.0,0.0,0.0,0.120776,0.0,0.0,0.287242,0.0,1.426421,0.392736,0.0,0.0,0.0,0.096215,1.491362,0.0,0.0,0.0,0.064418,1.01424,0.0,0.0,0.301669,0.0,0.095417,0.060492,0.127218,0.059382,0.0,0.081568,0.0,...,0.0,0.0,0.0,0.0,0.0,0.074185,0.0,0.0,0.0,0.0,0.0,0.385002,0.0,0.0,0.0,2.732574,0.0,0.0,0.0,0.637824,0.176169,0.0,0.260913,0.889302,0.177368,0.0,0.0,0.0,0.059998,0.0,0.0,0.0,0.083063,0.082329,0.0,0.0,0.0,0.148939,0.0,4.127928
7,0.0,0.0,0.0,0.177368,0.0,0.0,0.0,0.090783,0.0,0.0,0.0,0.0,0.146104,0.0,0.0,0.0,0.0,0.0,0.643985,0.0,0.0,0.0,0.096215,0.0,0.0,0.0,0.0,0.063504,0.0,0.0,0.0,0.0,0.0,0.085443,0.047306,0.176315,0.074733,0.0,0.08342,0.0,...,0.0,0.0,0.0,0.0,0.31527,0.08342,0.0,0.0,0.0,0.0,0.0,0.515244,0.0,1.491362,0.0,0.0,0.0,0.0,0.0,0.735503,0.176169,0.0,0.67445,0.0,0.229165,0.0,1.839232,0.0,0.139311,0.0,0.0,0.0,0.061546,0.066131,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.152777,1.01424,0.0,0.0,0.092674,0.0,0.0,0.301669,0.0,0.081311,0.889302,0.0,0.287242,0.0,0.0,0.507427,1.491362,1.491362,1.190332,0.100198,0.0,1.491362,0.0,0.0,0.063504,0.0,0.0,0.0,0.570995,0.0,0.115765,0.061546,0.045906,0.054218,0.28732,0.086988,0.0,...,0.0,0.0,0.0,1.491362,0.0,0.058207,0.0,0.0,0.0,0.0,0.301669,0.297878,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.336976,0.11115,0.653291,0.732475,0.0,0.197461,0.0,0.0,0.0,0.119996,0.0,0.0,2.363752,0.062549,0.087284,0.0,0.0,0.0,0.297878,0.0,0.0
9,0.0,0.0,0.0,0.076388,0.0,0.0,0.0,0.062549,0.0,0.0,0.704311,0.0,0.100198,0.0,0.0,0.0,0.0,0.0,0.268084,0.0,0.0,0.0,0.110275,0.0,0.0,0.0,0.0,0.058207,0.0,0.0,0.0,0.301669,0.0,0.101229,0.022571,0.113158,0.045141,0.384517,0.08342,0.0,...,0.0,0.0,0.0,0.0,0.0,0.066936,0.0,0.0,0.0,0.588272,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.425216,0.0,0.824361,0.260913,0.0,0.152777,0.0,0.0,0.0,0.095094,0.0,0.0,0.0,0.028481,0.067712,0.0,0.0,0.0,0.148939,0.0,0.0


### Normalisasi

In [14]:
vector_length = list()
for row in df.index:
    v_len = 0
    for col in df.loc[row]:
        v_len += col
    vector_length.append(math.sqrt(v_len))

[25.47815826228121,
 20.764507179889556,
 17.90205548640491,
 21.245836323188467,
 20.075871897099876,
 25.40754920811007,
 28.181974147368827,
 23.110685724006597,
 28.875678739611264,
 17.529559185117037,
 23.07583733816295,
 20.674402528708146,
 29.23054530433784,
 31.189226421610492,
 27.66237537527083,
 25.473776285604036,
 33.63583023507045,
 34.360990522296596,
 27.836824203375244,
 24.598186469655424,
 28.95815718719993,
 19.463954323644426,
 20.34496841452018,
 24.848237578626705,
 24.47402815385024,
 22.039677149144044,
 19.312217994025293,
 29.43348275126478,
 30.40002221613974,
 18.438623154758623]