Import Dependencies

In [17]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


Google Drive file paths

In [18]:
path = '/content/gdrive/My Drive/datasets/{}.parquet'
file_names = ['a']
dataframes = list()

Implement Preprocess function

In [19]:
def preprocess_data(sample_size):

  #Compile a list of dataframes
  for name in file_names:
    dataframes.append(pd.read_parquet(path.format(name)))

  #Vertically concatenate dataframes
  df = pd.concat(dataframes, axis=0)

  #Drop rows where text data is null
  df = df.dropna(subset=['text']).reset_index(drop=True)

  #Sample
  df = df.sample(sample_size)

  return df

# Preprocess data and sample 100 from it
preprocessed_data = preprocess_data(100)

#Shape
print("Shape:", preprocessed_data.shape)

#Head
preprocessed_data.head()


Shape: (100, 4)


Unnamed: 0,id,title,text,categories
234856,24824289,An-Nasir Muhammad (Zaidi imam),"An-Nasir Muhammad (January 17, 1680 - August 2...","[Zaydi imams of Yemen, 1754 deaths, 1680 birth..."
314109,6444001,Anything Is Possible (Darren Ockert album),Anything Is Possible is the first solo album b...,[2005 albums]
85853,38642496,Aerostar Airport Holdings,"Aerostar Airport Holdings, LLC is the public–p...","[Airports in Puerto Rico, Public–private partn..."
174879,72513124,Algérienne sauce,Algérienne sauce is a sweet and spicy sauce wi...,"[Hot sauces, Belgian sauces *[p.]: pages]"
232212,55696207,Amy H. Herring,Amy Helen Herring is an American biostatistici...,"[Year of birth missing (living people), Living..."


Install torch and transformers

In [20]:
pip -q install torch transformers

Load Bert Model

In [21]:
import torch
from keras.preprocessing.sequence import pad_sequences
from transformers import BertTokenizer, AutoModelForSequenceClassification

model_path = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_path, do_lower_case=True)
model = AutoModelForSequenceClassification.from_pretrained(model_path, output_attentions=False, output_hidden_states=True)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Create Vector from Text

In [22]:
def create_vector(text, MAX_LEN=510):

  #Generate Inmput ids for text
  input_ids = tokenizer.encode(text, add_special_tokens=True, max_length=MAX_LEN)

  #Pad sequences with more than the Maximum length of tokens
  results = pad_sequences([input_ids], maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")

  #Remove the outer list
  input_ids = results[0]

  #Create attention mask
  attention_mask = [int(i>0) for i in input_ids]

  #Convert to tensors
  input_ids = torch.tensor(input_ids)
  attention_mask = torch.tensor(attention_mask)

  #Add an extra redundant dimension for the batch
  input_ids=input_ids.unsqueeze(0)
  attention_mask=attention_mask.unsqueeze(0)

  #Put the model n evaluation mode, meaning a feed-forward operation
  model.eval()

  #Pass the text to BERT to get the vector embedding and collect all hidden states from the 12 encoders
  with torch.no_grad():
    logits, encoded_layers = model(
        input_ids=input_ids,
        attention_mask=attention_mask,
        return_dict=False,
        token_type_ids=None
    )

  #Extract the embedding
  vector = encoded_layers[-1][0][0]

  #Detach to cpu and convert to numpy array
  vector = vector.detach().cpu().numpy()

  return (vector)


Create Vector index

In [23]:
def create_vector_index(df):
  vectors = []

  #Get overall text data
  source_data = df.text.values
  print(source_data)

  for text in tqdm(source_data):
    vector = create_vector(text)
    vectors.append(vector)

  #Create a new column called vectors and reshape the array
  df['vectors'] = vectors
  df['vectors'] = df['vectors'].apply(lambda v: np.array(v).reshape(1, -1))

  return df

vector_index = create_vector_index(preprocessed_data)

["An-Nasir Muhammad (January 17, 1680 - August 23, 1754), was a Yemeni Sayyid who twice claimed the Zaidi imamate of Yemen, in 1723 and 1727-1729\\. Muhammad bin Ishaq was a grandson of Imam al-Mahdi Ahmad (died 1681). In 1723, while staying in Mashriq, he proclaimed his da'wah (call for the imamate) under the name an-Nasir Muhammad. The proclamation was done in opposition to the current Imam al-Mutawakkil al-Qasim. However, a well-known man of letters, Muhammad bin Isma'il al-Amir, managed to bring about a reconciliation. When al- Mutawakkil al-Qasim died in 1727, an-Nasir Muhammad once again claimed the imamate from his base in Zafar, north-west of San'a. He had the support of the Hashid and Bakil tribesmen, and from the Sayyid lord of Kawkaban.R.L. Playfair, A History of Arabia Felix or Yemen. Bombay 1959, p. 115. He was opposed by the deceased Imam's son al-Mansur al-Husayn II who held San'a. The leader of the tribesmen went to parley with al-Mansur al-Husayn but was assassinated i

  0%|          | 0/100 [00:00<?, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
100%|██████████| 100/100 [03:12<00:00,  1.92s/it]


Analysis using cosine similarity

In [24]:
from sklearn.metrics.pairwise import cosine_similarity

def prepare_for_cosine_similarity(text):
  vector = create_vector(text)
  vector = np.array(vector).reshape(1, -1);
  return vector

def is_plagiarism(similarity_score, plagiarism_threshold):
  return similarity_score > plagiarism_threshold

def analyse_plagiarism(query_text, plagiarism_threshold=0.8):
  query_vector = prepare_for_cosine_similarity(query_text)

  # Create a similarity column
  vector_index['similarity'] = vector_index['vectors'].apply(lambda x: cosine_similarity(query_vector, x))
  vector_index['similarity'] = vector_index['similarity'].apply(lambda x: x[0][0])

  #Get first 3 most similar articles
  similar_articles = vector_index.sort_values(by='similarity', ascending = False)[1:4]

  #Format columns
  formatted_similar_articles = similar_articles[['id', 'title', 'similarity']].reset_index(drop=True)

  #Similrity_score
  similarity_score = formatted_similar_articles.iloc[0]['similarity']

  #Most similar article
  most_similar_article_id = formatted_similar_articles.iloc[0]['id']
  most_similar_article_title = formatted_similar_articles.iloc[0]['title']

  plagiarism_result = {
      'similarity_score': similarity_score,
      'most_similar_article_id': most_similar_article_id,
      'most_similar_article_title': most_similar_article_title,
      'is_plagiarism': is_plagiarism(similarity_score, plagiarism_threshold),
      'article_submitted': query_text
  }

  return plagiarism_result



Testing

In [25]:
query_text = input('Enter submission to check for plagiarism:')
result = analyse_plagiarism(query_text)
print(result)


Enter submission to check for plagiarism:A 25–Year Celebration Tour was the twenty-fifth concert tour by Santana in 1991, celebrating their 25th anniversary as a band. == Tour band == * Alex Ligertwood – lead vocals, rhythm guitar (through April) * Tony Lindsay – lead vocals (beginning April) * Carlos Santana – lead guitar, percussion, vocals * Chester D. Thompson – keyboards * Benny Rietveld – bass guitar * Walfredo Reyes Jr. – drums (through April) * Gaylord Birch – drums (from April to June) * Billy Johnson – drums (beginning June) * Raul Rekow – congas, bongos, percussion, vocals * Karl Perazzo – timbales, percussion, vocals (beginning April) == Set list == The tour began on January 19 at the Rock in Rio II festival within the Maracanã Stadium in Rio de Janeiro, Brazil and ended on November 3 at a Bill Graham 
{'similarity_score': 0.8127157, 'most_similar_article_id': '19221057', 'most_similar_article_title': 'A Taste for Blood', 'is_plagiarism': True, 'article_submitted': 'A 25–Ye