In [0]:
 #Feature Engineering


  # Clean and pre-process the text data.
  # Define features for comparing the similarity of an answer text and a source text, and extract similarity features.
  # Select "good" features, by analyzing the correlations between different features.
  # Create train/test .csv/.txt files that hold the relevant features and class labels for train/test data points.


In [0]:
# import libraries
import pandas as pd
import numpy as np
import os
import operator 
import re

In [0]:
#more libraries

from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import decomposition, ensemble

import pandas, xgboost, numpy, textblob, string
from keras.preprocessing import text, sequence
from keras import layers, models, optimizers
import spacy
import nltk
nltk.download('vader_lexicon')

Using TensorFlow backend.


[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


True

In [0]:
# Have to mount my drive otherwise I cannot import data from drive
from google.colab import drive
drive.mount('/content/drive/')

In [0]:
# Path of the combined dataset on the mounted Google Drive.
csv_file = '/content/drive/My Drive/Colab Notebooks/CrossLanguagePlagirism/CombinedData.csv'
# Load the raw CSV into a dataframe.
plagiarism_df = pd.read_csv(csv_file)

# Display the first ten rows as a quick look at the loaded data.
plagiarism_df.head(10)

#Convert Categorical Data to discrete 




*   0 = French
*   1 = English


*   2 = TEng
*   -1 = OG_ENG_ESSAY (original English essay)








Binary class label



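*   1 = plagiarized
*   0 = not plagiarized
*   -1 = original source text (not an answer)


*   2 = partially plagiarized (a possible future label, e.g. for translated text; the code does not produce it)
*   It may be preferable to keep the label binary, i.e. plagiarized vs. not plagiarized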







In [0]:
# Read in a csv file and return a transformed dataframe
def numerical_dataframe(csv_file='/content/drive/My Drive/Colab Notebooks/CrossLanguagePlagirism/CombinedData.csv'):
    '''Load the dataset CSV and attach numerical `Category` and `Class` columns.

       The file must provide a `Label` column. Two columns are added:
       1) `Category`: the numerical code of `Label`
          (French=0, English=1, TEng=2, OG_ENG_ESSAY=-1); any other label
          becomes NaN.
       2) `Class`: 0 where `Category` is 0, -1 where `Category` is -1
          (the original/source essays) and 1 for every other value,
          which includes unmapped (NaN) categories.
       :param csv_file: Path (or file-like object) of the CSV file to read
       :return: The loaded dataframe with `Category` and `Class` columns added'''

    def to_class(category):
        # 0 and -1 keep their value; everything else collapses to 1
        if category == 0:
            return 0
        if category == -1:
            return -1
        return 1

    label_codes = {
        "French":0,
        "English":1, # for our purposes are we considering manual translation plagiarism?
        "TEng":2,
        "OG_ENG_ESSAY":-1
    }

    frame = pd.read_csv(csv_file)
    frame['Category'] = frame.Label.map(label_codes)
    frame['Class'] = frame['Category'].apply(to_class)
    return frame

In [0]:
#informal testing, print out the results of a called function
# create new `transformed_df`
# Build the transformed dataframe; numerical_dataframe adds the numerical
# `Category` and `Class` columns to the data read from the CSV.
transformed_df = numerical_dataframe(csv_file ='/content/drive/My Drive/Colab Notebooks/CrossLanguagePlagirism/CombinedData.csv')

# Check work: display the first ten rows to inspect the `Category` and
# `Class` values that were assigned.
transformed_df.head(10)

# Helper Functions

I borrowed some of these helper functions from elsewhere

In [0]:
# Add 'datatype' column that indicates if the record is original wiki answer as 0, training data 1, test data 2, onto 
# the dataframe - uses stratified random sampling (with seed) to sample by task & plagiarism amount 

# Use function to label datatype for training 1 or test 2 
def create_datatype(df, train_value, test_value, datatype_var, compare_dfcolumn, operator_of_compare, value_of_compare,
                    sampling_number, sampling_seed):
    """Mark a subset of `df` as train or test rows, modifying `df` in place.

    The subset is the rows for which
    `operator_of_compare(df[compare_dfcolumn], value_of_compare)` is true.
    Every subset row is first given `train_value`; then up to
    `sampling_number` rows are sampled from each
    ('Category', `compare_dfcolumn`) group and given `test_value` instead.
    Rows outside the subset are left untouched.

    :param df: dataframe to label; must already contain the `datatype_var`
        column (it is dropped from the working copies) and a 'Category' column
    :param train_value: value written to `datatype_var` for non-sampled subset rows
    :param test_value: value written to `datatype_var` for sampled subset rows
    :param datatype_var: name of the column that receives the labels
    :param compare_dfcolumn: name of the column used to select the subset
    :param operator_of_compare: two-argument callable applied to the column
        and `value_of_compare` (callers in this notebook pass `operator.gt`
        and `operator.eq`)
    :param value_of_compare: right-hand value of the comparison
    :param sampling_number: maximum number of rows sampled per group
    :param sampling_seed: `random_state` passed to `DataFrame.sample`
    :return: None; the result is written into `df`
    """
    # Subsets dataframe by condition relating to statement built from:
    # 'compare_dfcolumn' 'operator_of_compare' 'value_of_compare'
    df_subset = df[operator_of_compare(df[compare_dfcolumn], value_of_compare)]
    # Drop the label column so it can be re-created below on the subset
    df_subset = df_subset.drop(columns = [datatype_var])
    
    # Prints counts by task and compare_dfcolumn for subset df
    #print("\nCounts by Task & " + compare_dfcolumn + ":\n", df_subset.groupby(['Task', compare_dfcolumn]).size().reset_index(name="Counts") )
    
    # Sets all datatype to value for training for df_subset
    df_subset.loc[:, datatype_var] = train_value
    
    # Performs stratified random sample of subset dataframe to create new df with subset values;
    # min(len(x), sampling_number) keeps the sample size valid for groups smaller than sampling_number
    df_sampled = df_subset.groupby(['Category', compare_dfcolumn], group_keys=False).apply(lambda x: x.sample(min(len(x), sampling_number), random_state = sampling_seed))
    df_sampled = df_sampled.drop(columns = [datatype_var])
    # Sets all datatype to value for test_value for df_sampled
    df_sampled.loc[:, datatype_var] = test_value
    
    # Prints counts by compare_dfcolumn for selected sample
    #print("\nCounts by "+ compare_dfcolumn + ":\n", df_sampled.groupby([compare_dfcolumn]).size().reset_index(name="Counts") )
    #print("\nSampled DF:\n",df_sampled)
    
    # The subset rows currently all hold train_value; overwrite the rows
    # chosen by the stratified sample with test_value
    for index in df_sampled.index: 
        # Labels all datatype_var columns with test_value for stratified test sample
        df_subset.loc[index, datatype_var] = test_value

    #print("\nSubset DF:\n",df_subset)
    # Adds test_value and train_value for all relevant data in main dataframe
    for index in df_subset.index:
        # Copies the train_value/test_value label of each subset row back
        # into the caller's dataframe, matching rows by index label
        df.loc[index, datatype_var] = df_subset.loc[index, datatype_var]

    # returns nothing because dataframe df already altered 

In [0]:
def train_test_dataframe(clean_df, random_seed=100):
    """Return a copy of `clean_df` with a `Datatype` column of 'orig', 'train' or 'test'.

    Rows whose `Category` is greater than 0 and rows whose `Category`
    equals 0 are each split into train/test by `create_datatype` (1 and 2
    rows sampled per group, respectively, for the test set). Rows matched
    by neither comparison keep the initial code 0 and end up as 'orig'.

    :param clean_df: dataframe with a `Category` column; it is not modified
    :param random_seed: seed forwarded to the sampling in `create_datatype`
    :return: the labelled copy
    """
    # numerical code -> readable name of each datatype
    datatype_names = {0:'orig', 1:'train', 2:'test'}

    labelled_df = clean_df.copy()

    # every row starts as 0; only rows selected below are overwritten
    labelled_df.loc[:,'Datatype'] = 0

    # (comparison against Category 0, rows sampled per group as test data)
    splits = ((operator.gt, 1), (operator.eq, 2))
    for compare_op, rows_per_group in splits:
        create_datatype(labelled_df, 1, 2, 'Datatype', 'Category', compare_op, 0, rows_per_group, random_seed)

    # replace the numerical codes with their names
    labelled_df['Datatype'] = [datatype_names[code] for code in labelled_df['Datatype']]

    return labelled_df

In [0]:
# helper function for pre-processing text given a file
def process_file(file):
    """Read an open text file and return its normalized contents.

    The text is lower-cased, every character that is not an ASCII letter
    or digit (punctuation, tabs, newlines, ...) is replaced by a space,
    and each run of spaces is collapsed to a single space. Leading and
    trailing spaces are not stripped.

    :param file: an open file-like object whose `read()` returns a string
    :return: the normalized text
    """
    # put text in all lower case letters
    all_text = file.read().lower()

    # replace all non-alphanumeric chars (this includes tabs and newlines)
    # NOTE: the class is ASCII-only, so accented letters are replaced too
    all_text = re.sub(r"[^a-zA-Z0-9]", " ", all_text)
    # collapse runs of spaces of any length so phrases are easier to match
    # later; fixed-width replacements leave doubles behind in longer runs
    all_text = re.sub(r" +", " ", all_text)

    return all_text

In [0]:
def create_text_column(df, file_directory='data/'):
    '''Reads in the files, listed in a df and returns that df with an additional column, `Text`. 
       Each file is opened as UTF-8 (undecodable bytes are ignored) and
       normalized with `process_file`. The input dataframe is not modified.
       :param df: A dataframe of file information including a column for `File`
       :param file_directory: the main directory where files are stored
       :return: A copy of `df` with the processed text in a `Text` column '''

    # create copy to modify
    text_df = df.copy()

    # store processed text, one entry per row, in row order
    processed_texts = []

    # Iterate over the file names directly rather than indexing by position
    # with labels from df.index, which fails when the index is not 0..n-1.
    for filename in df['File']:
        file_path = os.path.join(file_directory, filename)
        with open(file_path, 'r', encoding='utf-8', errors='ignore') as file:
            # standardize text using helper function
            processed_texts.append(process_file(file))

    # add column to the copied dataframe
    text_df['Text'] = processed_texts

    return text_df

# Preprocessing

In [0]:
# `text_df` is another name for `transformed_df` (no copy is made).
# NOTE(review): create_text_column is not called in the visible cells, so
# later cells that read a `Text` column presumably rely on the CSV already
# providing it — confirm.
text_df = transformed_df
# Display the first rows.
text_df.head()

# Split into Training and Test Set

In [0]:
random_seed = 1  # can change; set for reproducibility

"""

"""


# Create a new dataframe with a `Datatype` column ('train', 'test' or 'orig').
# `text_df` from the cell above is passed in; train_test_dataframe works on
# a copy, so `text_df` itself is not changed.
complete_df = train_test_dataframe(text_df, random_seed=random_seed)

# check results: display the first rows
complete_df.head()


#Feature Engineering

List of plagiarism features

1.   containment (https://s3.amazonaws.com/video.udacity-data.com/topher/2019/January/5c412841_developing-a-corpus-of-plagiarised-short-answers/developing-a-corpus-of-plagiarised-short-answers.pdf)
2.   lcs(Longest Common Subsequence)

3.   word embeddings(Elmo/Word2vec)
4.   Something for cross language similarity?(https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0126196)



5.   VADER sentiment scores (hypothesis: a machine translation might lose some of the original sentiment that a manual translation would strive to capture)
6.   Maybe look into a Semantic feature for cross language and cross translation analysis 








# Containment implementation

In [0]:
#TODO I need to get it so that MT and and Manual translations compare. what I do is have the ENG and 
#TENG article IDS to match 
 
from sklearn.feature_extraction.text import CountVectorizer

def calculate_containment(df, n, answer_filename):
  """Compute the n-gram containment of one answer text in its source text.

  Containment is the number of word n-grams the answer shares with the
  source (counting each n-gram at most as often as it occurs in both
  texts) divided by the number of n-grams in the answer.

  The source is looked up as the row with `Category == -1` that has the
  same `Task` as the answer, mirroring how the LCS cells in this notebook
  find the source text.
  NOTE(review): this assumes `df` has a `Task` column linking each answer
  to exactly one source row — confirm against the dataset.

  :param df: dataframe with `ID`, `Task`, `Category` and `Text` columns
  :param n: size of the word n-grams
  :param answer_filename: value of `ID` identifying the answer row
  :return: the containment value; 0.0 when the answer contains no n-grams
  :raises ValueError: if the `ID` or the source lookup does not match
      exactly one row (raised by `Series.item()`)
  """

  def get_answer_and_source(df: pd.DataFrame, file_name: str) -> tuple:
      """Return (answer_text, source_text) for the answer whose ID is `file_name`."""
      answer_row = df.loc[df['ID'] == file_name]
      answer_text = answer_row['Text'].item()
      task = answer_row['Task'].item()

      # The source shares the answer's task and carries Category -1.
      # (Comparing `ID` with the numeric category can never match.)
      is_source = (df['Task'] == task) & (df['Category'] == -1)
      source_text = df.loc[is_source, 'Text'].item()

      return answer_text, source_text

  answer, source = get_answer_and_source(df, answer_filename)

  # Row 0 holds the answer's n-gram counts, row 1 the source's.
  vectorizer = CountVectorizer(analyzer='word', ngram_range=(n, n))
  ngram_counts = vectorizer.fit_transform([answer, source]).toarray()

  answer_total = ngram_counts[0].sum()
  if answer_total == 0:
    # answer shorter than n words: nothing can be contained
    return 0.0

  # The element-wise minimum is the count shared by both texts.
  shared_total = np.amin(ngram_counts, axis=0).sum()
  return shared_total / answer_total



In [0]:
# Spot-check the containment feature on the first few rows.
n = 3

test_indices = range(5)

# Category of each checked row, for comparison with its containment value
category_vals = [complete_df.loc[i, 'Category'] for i in test_indices]

# n-gram containment of each checked row, looked up by its ID
containment_vals = [
    calculate_containment(complete_df, n, complete_df.loc[i, 'ID'])
    for i in test_indices
]

# label fixed: it previously began with a mis-encoded character
print('Original category values: \n', category_vals)
print()
print(str(n)+ '-gram containment values: \n', containment_vals)

# Vader implementation 

In [0]:
#TODO finish fixing 

#Sentiment Analyzer VADER
def nltk_sentiment(sentence):
    """Return the VADER polarity scores of `sentence`.

    The analyzer is created on the first call and reused afterwards, so
    it is not rebuilt for every scored text.

    :param sentence: the text to score
    :return: the result of `SentimentIntensityAnalyzer.polarity_scores`
        (the cell below reads its 'neg', 'neu', 'pos' and 'compound' keys)
    """
    from nltk.sentiment.vader import SentimentIntensityAnalyzer

    # cache the analyzer on the function object
    analyzer = getattr(nltk_sentiment, '_analyzer', None)
    if analyzer is None:
        analyzer = SentimentIntensityAnalyzer()
        nltk_sentiment._analyzer = analyzer

    return analyzer.polarity_scores(sentence)

# Score every text once; each entry is the score dict of one row.
nltk_results = [nltk_sentiment(row) for row in plagiarism_df["Text"]]

# One column per score, indexed like plagiarism_df so that the assignments
# below line up row for row.
results_df = pandas.DataFrame(nltk_results, index=plagiarism_df.index,
                              columns=['neg', 'neu', 'pos', 'compound'])

# nltk_results is a list and cannot be indexed by score name, so the
# columns are taken from results_df instead.
for score_name in ['neg', 'pos', 'compound', 'neu']:
    plagiarism_df[score_name] = results_df[score_name]

# Longest Common Subsequence

--- The longest common subsequence (LCS) is the longest sequence of words (or letters) that appears in the same order in both the source text and the answer text; the matching items do not have to be adjacent. This value is normalized by dividing by the total number of words (or letters) in the answer text.



In [0]:
def lcs_norm_word(answer_text, source_text):
  """Return the word-level longest common subsequence, normalized by answer length.

  Both texts are split on whitespace. The length of the longest sequence
  of words that occurs in the same order in both texts (not necessarily
  adjacent) is divided by the number of words in the answer.

  :param answer_text: the answer text
  :param source_text: the source text it is compared against
  :return: a float between 0 and 1; 0.0 when the answer has no words
  """
  answer_words = answer_text.split()
  source_words = source_text.split()

  # an empty answer would otherwise divide by zero
  if not answer_words:
    return 0.0

  # Dynamic programming over two rows: previous[j] is the LCS length of the
  # answer words processed so far against the first j source words.
  previous = [0] * (len(source_words) + 1)
  for answer_word in answer_words:
    current = [0]
    for j, source_word in enumerate(source_words, start=1):
      if answer_word == source_word:
        # matching word extends the subsequence ending before both words
        current.append(previous[j - 1] + 1)
      else:
        # otherwise carry over the better of skipping either word
        current.append(max(current[j - 1], previous[j]))
    previous = current

  return previous[-1] / len(answer_words)


In [0]:
# Worked example: A is an answer text and S is the source text it is compared with.
A = "i think pagerank is a link analysis algorithm used by google that uses a system of weights attached to each element of a hyperlinked set of documents"
S = "pagerank is a link analysis algorithm used by the google internet search engine that assigns a numerical weighting to each element of a hyperlinked set of documents"

# Normalized word-level LCS of the answer against the source
lcs = lcs_norm_word(A, S)
print ('LCS = ', lcs)

In [0]:
# Spot-check the normalized LCS feature on the first few rows.
test_indices = range(5)

category_vals, lcs_norm_vals = [], []

for i in test_indices:
  category_vals.append(complete_df.loc[i, 'Category'])

  # texts to compare: this row's text and the source text of the same task
  answer_text = complete_df.loc[i, 'Text']
  task = complete_df.loc[i, 'Task']

  # source texts are the rows with Class == -1
  is_source_for_task = (complete_df['Class'] == -1) & (complete_df['Task'] == task)
  source_text = complete_df.loc[is_source_for_task, 'Text'].values[0]

  lcs_norm_vals.append(lcs_norm_word(answer_text, source_text))

print ('Original category values: \n', category_vals)
print()
print('Normalized LCS values: \n', lcs_norm_vals)

# Cosine Similarity 

# Other features

In [0]:
#### Be creative with features --- these are just examples ##############       
    
#Word Count of the documents – total number of words in the documents
#Character Count of the documents – total number of characters in the documents
#Average Word Density of the documents – average length of the words used in the documents
#Puncutation Count in the Complete Essay – total number of punctuation marks in the documents
#Upper Case Count in the Complete Essay – total number of upper count words in the documents
#Title Word Count in the Complete Essay – total number of proper case (title) words in the documents
#Frequency distribution of Part of Speech Tags:
#Noun Count
#Verb Count
#Adjective Count
#Adverb Count
#Pronoun Count
        
    
        
# Here you can play with features -- add new ones; edit, remove.
# NOTE(review): `trainDF` is not defined in any cell visible in this notebook,
# and the other cells read a capitalized `Text` column rather than `text` —
# confirm which dataframe/column these example features should use.

# number of characters in each text
trainDF['char_count'] = trainDF['text'].apply(len)
# number of whitespace-separated words in each text
trainDF['word_count'] = trainDF['text'].apply(lambda x: len(x.split()))
# characters per word; the +1 keeps the denominator non-zero for empty texts
trainDF['word_density'] = trainDF['char_count'] / (trainDF['word_count']+1)
# number of characters that are in string.punctuation
trainDF['punctuation_count'] = trainDF['text'].apply(lambda x: len("".join(_ for _ in x if _ in string.punctuation))) 
# number of words for which str.istitle() is true
trainDF['title_word_count'] = trainDF['text'].apply(lambda x: len([wrd for wrd in x.split() if wrd.istitle()]))
# number of words for which str.isupper() is true
trainDF['upper_case_word_count'] = trainDF['text'].apply(lambda x: len([wrd for wrd in x.split() if wrd.isupper()]))


# Part-of-speech tags grouped into families; check_pos_tag counts the words
# whose tag is listed under the requested family.
pos_family = {
    'noun' : ['NN','NNS','NNP','NNPS'],
    'pron' : ['PRP','PRP$','WP','WP$'],
    'verb' : ['VB','VBD','VBG','VBN','VBP','VBZ'],
    'adj' :  ['JJ','JJR','JJS'],
    'adv' : ['RB','RBR','RBS','WRB']
}
from textblob import TextBlob
# function to check and get the part of speech tag count of a words in a given sentence
def check_pos_tag(x, flag):
    """Count the words of `x` whose part-of-speech tag belongs to family `flag`.

    Tagging is best-effort: if the text cannot be tagged, a RuntimeWarning
    is issued and 0 is returned, so the failure is visible instead of
    silently producing zeros.

    :param x: the text to tag
    :param flag: a key of `pos_family` ('noun', 'pron', 'verb', 'adj', 'adv')
    :return: number of tagged words whose tag is in `pos_family[flag]`
    :raises KeyError: if `flag` is not a key of `pos_family`
    """
    import warnings

    # looked up outside the try block so that a mistyped flag is reported
    family_tags = pos_family[flag]

    try:
        tagged_words = textblob.TextBlob(x).tags
    except Exception as exc:
        warnings.warn("POS tagging failed (%r); counting 0" % (exc,), RuntimeWarning)
        return 0

    # each entry of `tags` is a (word, tag) pair
    return sum(1 for _word, tag in tagged_words if tag in family_tags)

# One count column per part-of-speech family, computed with check_pos_tag.
# NOTE(review): `trainDF` is not defined in the visible cells — confirm which
# dataframe these columns belong to.
trainDF['noun_count'] = trainDF['text'].apply(lambda x: check_pos_tag(x, 'noun'))
trainDF['verb_count'] = trainDF['text'].apply(lambda x: check_pos_tag(x, 'verb'))
trainDF['adj_count'] = trainDF['text'].apply(lambda x: check_pos_tag(x, 'adj'))
trainDF['adv_count'] = trainDF['text'].apply(lambda x: check_pos_tag(x, 'adv'))
trainDF['pron_count'] = trainDF['text'].apply(lambda x: check_pos_tag(x, 'pron'))


# Create all features

In [0]:
# Instead of creating another function, this could probably also be done with apply, e.g.
#trainDF['containment'] = trainDF['text'].apply(lambda x: lcs_norm_word(answer_text, source_text)) 
# but that needs some work. For now I will create a function.

def create_containment_features(df,n, column_name=None):
  """Compute the n-gram containment feature for every row of `df`.

  Rows with `Category` greater than -1 get the value returned by
  `calculate_containment`; all other rows (the source texts) get -1, so
  the result always has one entry per row, in row order.

  :param df: dataframe with `ID` and `Category` columns (plus whatever
      `calculate_containment` reads)
  :param n: size of the word n-grams
  :param column_name: name for the feature; defaults to 'c_<n>'. It is
      not used in the returned values.
  :return: list of containment values, one per row of `df`
  """
  if column_name is None:
    column_name = 'c_'+str(n) # c_1, c_2

  containment_values = []
  for i in df.index:
    if df.loc[i, 'Category'] > -1:
      # answer row: store the computed containment of this file
      file = df.loc[i, 'ID']
      containment_values.append(calculate_containment(df, n, file))
    else:
      # source row: placeholder value
      containment_values.append(-1)

  print(str(n)+'-gram containment features created')
  return containment_values


In [0]:
# Function creates lcs feature and add it to the dataframe
def create_lcs_features(df, column_name='lcs_word'):
    """Compute the normalized word-level LCS feature for every row of `df`.

    Rows with `Category` greater than -1 are compared, via `lcs_norm_word`,
    with the first row that has `Class == -1` and the same `Task`; every
    other row gets the placeholder -1.

    :param df: dataframe with `Category`, `Class`, `Task` and `Text` columns
    :param column_name: name for the feature (not used in the returned values)
    :return: list of LCS values, one per row of `df`, in row order
    """
    lcs_values = []

    for row_label in df.index:
        # original (source) rows only get the placeholder
        if not (df.loc[row_label, 'Category'] > -1):
            lcs_values.append(-1)
            continue

        # texts to compare: this answer and the source of the same task
        answer_text = df.loc[row_label, 'Text']
        task = df.loc[row_label, 'Task']
        source_rows = df[(df['Class'] == -1)]
        source_text = source_rows[(source_rows['Task'] == task)]['Text'].values[0]

        lcs_values.append(lcs_norm_word(answer_text, source_text))

    print('LCS features created!')
    return lcs_values

# Append Features

In [0]:
# Define an ngram range
# n-gram sizes for the containment features: 1 through 6
ngram_range = range(1,7)


# The following code may take a minute to run, depending on your ngram_range
"""
DON'T MODIFY ANYTHING IN THIS CELL THAT IS BELOW THIS LINE
"""
# Feature (column) names, in the same order as the rows of all_features
features_list = []

# Feature matrix: one row per feature (one per n-gram size, plus one for
# LCS) and one column per row of complete_df
all_features = np.zeros((len(ngram_range)+1, len(complete_df)))

# Calculate features for containment for ngrams in range
i=0
for n in ngram_range:
    column_name = 'c_'+str(n)
    features_list.append(column_name)
    # create containment features; the returned list must have one value
    # per row of complete_df to fit into the matrix row
    all_features[i]=np.squeeze(create_containment_features(complete_df, n))
    i+=1

# Calculate features for LCS_Norm Words; i now points at the last matrix row
features_list.append('lcs_word')
all_features[i]= np.squeeze(create_lcs_features(complete_df))

# create a features dataframe; transposing puts one feature per column
features_df = pd.DataFrame(np.transpose(all_features), columns=features_list)

# Print all features/columns
print()
print('Features: ', features_list)
print()
1-gram con