# Plagiarism Detection Notebook
## Notebook for the "Textmining" project in WS2020/2021

Sources used for code: 

* https://radimrehurek.com/gensim/auto_examples/core/run_core_concepts.html

* https://radimrehurek.com/gensim/auto_examples/core/run_corpora_and_vector_spaces.html#corpus-streaming-tutorial

In [70]:
# imports
import pprint
import gensim
from gensim import corpora
from gensim import models
from gensim import similarities
import nltk
from nltk.corpus import stopwords



In [19]:
# Small in-memory example corpus: one string per document.
# For the actual application we wouldn't load everything at once.

documents = [
    "Human machine interface for lab abc computer applications",
    "A survey of user opinion of computer system response time",
    "The EPS user interface management system",
    "System and human system engineering testing of EPS",
    "Relation of user perceived response time to error measurement",
    "The generation of random binary unordered trees",
    "The intersection graph of paths in trees",
    "Graph minors IV Widths of trees and well quasi ordering",
    "Graph minors A survey",
]

## File Handling
Functions for opening files, etc.

## Preprocessing

Preprocess the text

In [51]:
# simple function for text preprocessing: converts the text to lower case, keeps only words of 2 to 15 characters and removes English stopwords
def preprocessing(corpus):
    """Tokenize and clean every document of ``corpus``.

    Args:
        corpus: iterable of raw document strings.

    Returns:
        A list with one entry per document; each entry is the list of
        tokens that survived the length filter and the stopword filter.
    """
    # English stopword list from NLTK, as a set for fast membership tests
    english_stopwords = set(stopwords.words('english'))

    def clean(text):
        # lowercase + tokenize, dropping tokens shorter than 2 or longer than 15 characters
        tokens = gensim.utils.simple_preprocess(text, deacc=False, min_len=2, max_len=15)
        # discard stopwords
        return [token for token in tokens if token not in english_stopwords]

    return [clean(text) for text in corpus]


In [52]:
# run the preprocessing on the example documents and show the resulting token lists
processed_corpus = preprocessing(documents)
pprint.pprint(processed_corpus)

[['human', 'machine', 'interface', 'lab', 'abc', 'computer', 'applications'],
 ['survey', 'user', 'opinion', 'computer', 'system', 'response', 'time'],
 ['eps', 'user', 'interface', 'management', 'system'],
 ['system', 'human', 'system', 'engineering', 'testing', 'eps'],
 ['relation', 'user', 'perceived', 'response', 'time', 'error', 'measurement'],
 ['generation', 'random', 'binary', 'unordered', 'trees'],
 ['intersection', 'graph', 'paths', 'trees'],
 ['graph', 'minors', 'iv', 'widths', 'trees', 'well', 'quasi', 'ordering'],
 ['graph', 'minors', 'survey']]


Maybe add a filter for minimum occurrence (e.g. via `Dictionary.filter_extremes`)?

In [59]:
# convert the tokenized texts to bag-of-words vectors using a gensim Dictionary

# build the dictionary of our corpus: it assigns an integer ID to every distinct token
dictionary = corpora.Dictionary(processed_corpus)

# transform the corpus to vectors. Each vector is a list of (token ID, count) pairs, where the count is how often the token occurs in that document
bow_corpus = [dictionary.doc2bow(text) for text in processed_corpus]
pprint.pprint(bow_corpus)


[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1)],
 [(2, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1)],
 [(4, 1), (10, 1), (12, 1), (13, 1), (14, 1)],
 [(3, 1), (10, 2), (13, 1), (15, 1), (16, 1)],
 [(8, 1), (11, 1), (12, 1), (17, 1), (18, 1), (19, 1), (20, 1)],
 [(21, 1), (22, 1), (23, 1), (24, 1), (25, 1)],
 [(24, 1), (26, 1), (27, 1), (28, 1)],
 [(24, 1), (26, 1), (29, 1), (30, 1), (31, 1), (32, 1), (33, 1), (34, 1)],
 [(9, 1), (26, 1), (30, 1)]]


Create a TF-IDF model of the corpus:

In [72]:
# train the TF-IDF model on the bag-of-words corpus
tfidf = models.TfidfModel(bow_corpus)

# transform the corpus: apply the trained model to the bag-of-words vectors
corpus_tfidf = tfidf[bow_corpus]


In [75]:
# every document is now represented by a vector of (token ID, TF-IDF weight) pairs
for doc in corpus_tfidf:
    pprint.pprint(doc)

[(0, 0.4301019571350565),
 (1, 0.4301019571350565),
 (2, 0.2944198962221451),
 (3, 0.2944198962221451),
 (4, 0.2944198962221451),
 (5, 0.4301019571350565),
 (6, 0.4301019571350565)]
[(2, 0.3726494271826947),
 (7, 0.5443832091958983),
 (8, 0.3726494271826947),
 (9, 0.3726494271826947),
 (10, 0.27219160459794917),
 (11, 0.3726494271826947),
 (12, 0.27219160459794917)]
[(4, 0.438482464916089),
 (10, 0.32027755044706185),
 (12, 0.32027755044706185),
 (13, 0.438482464916089),
 (14, 0.6405551008941237)]
[(3, 0.3449874408519962),
 (10, 0.5039733231394895),
 (13, 0.3449874408519962),
 (15, 0.5039733231394895),
 (16, 0.5039733231394895)]
[(8, 0.30055933182961736),
 (11, 0.30055933182961736),
 (12, 0.21953536176370683),
 (17, 0.43907072352741366),
 (18, 0.43907072352741366),
 (19, 0.43907072352741366),
 (20, 0.43907072352741366)]
[(21, 0.48507125007266594),
 (22, 0.48507125007266594),
 (23, 0.48507125007266594),
 (24, 0.24253562503633297),
 (25, 0.48507125007266594)]
[(24, 0.31622776601683794),


## Compare Texts

In [76]:
# build a similarity index over the TF-IDF vectors of the corpus
# num_features is set to the number of entries in the dictionary
index = similarities.SparseMatrixSimilarity(tfidf[bow_corpus], num_features=len(dictionary))

In [79]:
# add a query document
# Run the query through the same preprocessing as the corpus (lowercasing,
# length filter, stopword removal) so that its tokens line up with the
# dictionary entries; with a plain str.split() capitalised or punctuated
# words would silently fail to match.
query_document = preprocessing(['system engineering'])[0]

# transform the query document to a bag-of-words vector
query_bow = dictionary.doc2bow(query_document)

# compare the query document to each document in the corpus
sims = index[tfidf[query_bow]]
print(list(enumerate(sims)))

[(0, 0.0), (1, 0.12172779), (2, 0.14323246), (3, 0.67615116), (4, 0.0), (5, 0.0), (6, 0.0), (7, 0.0), (8, 0.0)]


In [78]:
# print the documents ranked by their similarity to the query, most similar first
for document_number, score in sorted(enumerate(sims), key=lambda x: x[1], reverse=True):
    print(document_number, score)

3 0.67615116
2 0.14323246
1 0.12172779
0 0.0
4 0.0
5 0.0
6 0.0
7 0.0
8 0.0
