In [13]:
# import libraries
import pandas as pd
import numpy as np
import os

from sklearn.feature_extraction.text import CountVectorizer

import helper_Arabic

from datetime import datetime

In [2]:
# Load a pair-sentences CSV and attach numeric label columns
def numerical_dataframe(csv_file):
    '''Reads a CSV file and adds two numeric label columns derived from 'plagiarism_types'.
       :param csv_file: Path to the CSV file; its first column is used as the index
       :return: The loaded dataframe with two extra columns:
                'Class'    - binary label (0 for 'no-plagiarism', 1 for every obfuscation type)
                'category' - multi-class label (0..3, one code per plagiarism type)
       Values of 'plagiarism_types' that are not listed in the mappings become NaN.'''

    # binary target: plagiarised or not
    binary_codes = {
        'no-plagiarism': 0,
        'artificial-obfuscation': 1,
        'no-obfuscation': 1,
        'simulated-obfuscation': 1,
    }
    # multi-class target: one distinct code per plagiarism type
    category_codes = {
        'no-plagiarism': 0,
        'artificial-obfuscation': 1,
        'no-obfuscation': 2,
        'simulated-obfuscation': 3,
    }

    frame = pd.read_csv(csv_file, index_col=0)
    plagiarism_types = frame['plagiarism_types']

    frame['Class'] = plagiarism_types.map(binary_codes)
    frame['category'] = plagiarism_types.map(category_codes)

    return frame

In [3]:
# Calculate the n-gram containment for one suspicious text/source text pair
def calculate_containment(n, suspicious_text, source_text):
    '''Computes the containment of word n-grams of the suspicious text in the source text.
       Containment = (n-gram counts shared by both texts) / (n-gram count of the suspicious text).
       :param n: Size of the word n-grams to compare
       :param suspicious_text: The pre-processed suspicious text
       :param source_text: The pre-processed source text
       :return: The containment value; 0.0 when the suspicious text yields no n-grams
       :raises ValueError: raised by CountVectorizer when neither text yields any n-gram
                           (empty vocabulary)
       NOTE: CountVectorizer's default token pattern only keeps tokens of two or more
       characters, so single-character words do not contribute to the n-grams.'''

    # count the n-grams of both texts over a shared vocabulary
    counts = CountVectorizer(analyzer='word', ngram_range=(n, n))
    ngrams_array = counts.fit_transform([suspicious_text, source_text]).toarray()

    suspicious_counts = ngrams_array[0]
    source_counts = ngrams_array[1]

    # a suspicious text with fewer than n usable tokens has no n-grams at all;
    # report zero containment instead of dividing 0 by 0 (which would give NaN)
    suspicious_total = suspicious_counts.sum()
    if suspicious_total == 0:
        return 0.0

    # containment calculation: element-wise minimum is the per-n-gram intersection count
    shared_total = np.minimum(suspicious_counts, source_counts).sum()

    return shared_total / suspicious_total

In [4]:
# Compute the normalized LCS given a suspicious text and a source text
def lcs_norm_word(suspicious_text, source_text):
    '''Computes the longest common subsequence of words in two texts; returns a normalized value.
       :param suspicious_text: The pre-processed suspicious text
       :param source_text: The pre-processed text of the associated source
       :return: The LCS length divided by the number of words in the suspicious text;
                0.0 when the suspicious text contains no words'''

    # split the strings into words using split() and whitespace as a separator
    suspicious_words = suspicious_text.split()
    source_words = source_text.split()

    suspicious_w_counts = len(suspicious_words)

    # nothing to normalize by: avoid 0/0 (NaN) for an empty suspicious text
    if suspicious_w_counts == 0:
        return 0.0

    # classic LCS dynamic programme, keeping only the previous row:
    # previous_row[a] is the LCS length of the source words seen so far
    # against the first `a` suspicious words (index 0 is the empty prefix)
    previous_row = [0] * (suspicious_w_counts + 1)

    for s_word in source_words:
        current_row = [0]
        for a, a_word in enumerate(suspicious_words, 1):
            if s_word == a_word:
                # matching words extend the LCS of both shorter prefixes
                current_row.append(previous_row[a - 1] + 1)
            else:
                # otherwise carry over the best result obtained by dropping one word
                current_row.append(max(previous_row[a], current_row[a - 1]))
        previous_row = current_row

    lcs = previous_row[-1]

    return lcs / suspicious_w_counts

In [5]:
def len_text(text):
    '''Returns the number of characters in the string representation of a value.
       :param text: Any value; it is converted with str() before measuring
       :return: Length of str(text)'''
    return len(str(text))

In [6]:
def num_words(text):
    '''Counts the whitespace-separated words in a string.
       :param text: The string to inspect (must provide a split() method)
       :return: Number of words'''
    words = text.split()
    return len(words)

In [7]:
def df_preprocess(df):
    '''Pre-processes both text columns and adds character/word count columns.
       :param df: Dataframe holding 'suspicious_text' and 'source_text' columns
       :return: The dataframe returned by helper_Arabic.complete_preprocess, extended with
                'len_suspicious_text', 'len_source_text',
                'num_words_suspicious_text' and 'num_words_source_text' '''

    # run the project's pre-processing on the suspicious text first, then on the source text
    for text_column in ('suspicious_text', 'source_text'):
        df = helper_Arabic.complete_preprocess(df, text_column)

    # (new column, column it is measured from, measuring function), in creation order
    derived_columns = (
        ('len_suspicious_text', 'suspicious_text', len_text),
        ('len_source_text', 'source_text', len_text),
        ('num_words_suspicious_text', 'suspicious_text', num_words),
        ('num_words_source_text', 'source_text', num_words),
    )
    for new_column, measured_column, measure in derived_columns:
        df[new_column] = df[measured_column].apply(measure)

    return df

In [8]:
def perform_task(df):
    '''Builds the similarity features for every suspicious/source text pair.
       :param df: Dataframe holding 'suspicious_text' and 'source_text' columns
       :return: A new dataframe with one row per input row (same order, fresh 0-based index)
                and the columns 'c_1' ... 'c_7' (n-gram containment for n = 1..7)
                followed by 'lcs_value' (normalized word LCS)'''

    n_grams = range(1, 8)

    col_names = ['c_' + str(n_gram) for n_gram in n_grams]
    col_names.append('lcs_value')

    # collect plain rows and build the frame once: DataFrame.append no longer exists
    # in pandas >= 2.0, and growing a frame row by row is quadratic anyway
    feature_rows = []

    for suspicious_text, source_text in zip(df['suspicious_text'], df['source_text']):
        features_values = [
            calculate_containment(n_gram, suspicious_text, source_text)
            for n_gram in n_grams
        ]
        features_values.append(lcs_norm_word(suspicious_text, source_text))
        feature_rows.append(features_values)

    return pd.DataFrame(feature_rows, columns=col_names)

In [9]:
def make_csv(x, y, file_name, file_dir):
    '''Writes labels and features side by side into a header-less, index-less CSV file.
       :param x: Feature values (anything pd.DataFrame accepts)
       :param y: Label values (anything pd.DataFrame accepts)
       :param file_name: Name of the CSV file to write
       :param file_dir: Directory the file is written into
       :return: None; prints 'file created' once the file has been written'''

    labels = pd.DataFrame(y)
    features = pd.DataFrame(x)

    # the first column holds the labels, the remaining columns are the features
    labelled_features = pd.concat([labels, features], axis=1)

    target_path = os.path.join(file_dir, file_name)
    labelled_features.to_csv(target_path, header=False, index=False)

    print('file created')

In [10]:
# train dataset: build the features and export the binary-label training file
csv_file_train = 'data/train-pair-sentences.csv'

# load the pairs and add the numeric 'Class' / 'category' label columns
df_transformed_train = numerical_dataframe(csv_file_train)
# 'obfuscation_types' is not used by any later step in this notebook
df_transformed_train.drop(columns='obfuscation_types', axis=1, inplace=True)

# pre-process both text columns and add the length / word-count columns
df_transformed_train = df_preprocess(df_transformed_train)

# containment (c_1..c_7) and normalized LCS for every pair; reused by the next cells
df_features_train = perform_task(df_transformed_train)
# subset of the computed features that is exported for the model
selected_features = ['c_1', 'c_2', 'c_7', 'lcs_value']

train_x = df_features_train[selected_features].values
# 'Class' is the binary label (0 = no plagiarism, 1 = any plagiarism type)
train_y = df_transformed_train['Class'].values

file_name_train = 'train.csv'
file_dir_train = 'data/'

# writes data/train.csv: label in the first column, selected features after it
make_csv(train_x, train_y, file_name_train, file_dir_train)

file created


In [11]:
# export every computed feature (c_1..c_7 and lcs_value) with the binary 'Class' label
train_x = df_features_train.values
train_y = df_transformed_train['Class'].values

file_name_train = 'train_all.csv'
file_dir_train = 'data/'

# writes data/train_all.csv: label in the first column, all features after it
make_csv(train_x, train_y, file_name_train, file_dir_train)

file created


In [12]:
# export the selected features with the multi-class 'category' label (codes 0..3)
selected_features = ['c_1', 'c_2', 'c_7', 'lcs_value']

train_x = df_features_train[selected_features].values
train_y = df_transformed_train['category'].values

file_name_train = 'train_multi_class.csv'
file_dir_train = 'data/'

# writes data/train_multi_class.csv: label in the first column, selected features after it
make_csv(train_x, train_y, file_name_train, file_dir_train)

file created


In [14]:
# print the wall-clock time before the run so the duration of this cell can be read off
now = datetime.now()

current_time = now.strftime("%H:%M:%S")
print("Current Time =", current_time)

# test dataset: same pipeline as for the training data
csv_file_test = 'data/test-pair-sentences.csv'

# load the pairs and add the numeric 'Class' / 'category' label columns
df_transformed_test = numerical_dataframe(csv_file_test)
# 'obfuscation_types' is not used by any later step in this notebook
df_transformed_test.drop(columns='obfuscation_types', axis=1, inplace=True)

# pre-process both text columns and add the length / word-count columns
df_transformed_test = df_preprocess(df_transformed_test)

# containment (c_1..c_7) and normalized LCS for every pair; reused by the next cells
df_features_test = perform_task(df_transformed_test)
# same feature subset as exported for the training data
selected_features = ['c_1', 'c_2', 'c_7', 'lcs_value']

test_x = df_features_test[selected_features].values
# 'Class' is the binary label (0 = no plagiarism, 1 = any plagiarism type)
test_y = df_transformed_test['Class'].values

file_name_test = 'test.csv'
file_dir_test = 'data/'

# writes data/test.csv: label in the first column, selected features after it
make_csv(test_x, test_y, file_name_test, file_dir_test)
print()

# print the wall-clock time again once the export has finished
now = datetime.now()

current_time = now.strftime("%H:%M:%S")
print("Current Time =", current_time)

Current Time = 11:45:23
file created

Current Time = 11:52:18


In [15]:
# export every computed feature (c_1..c_7 and lcs_value) with the binary 'Class' label
test_x = df_features_test.values
test_y = df_transformed_test['Class'].values

# the name must be bound to file_name_test, the variable actually passed to make_csv;
# otherwise the stale 'test.csv' value is reused and data/test.csv gets overwritten
# with the full feature set
file_name_test = 'test_all.csv'
file_dir_test = 'data/'

# writes data/test_all.csv: label in the first column, all features after it
make_csv(test_x, test_y, file_name_test, file_dir_test)

file created


In [16]:
# export the selected features with the multi-class 'category' label (codes 0..3)
selected_features = ['c_1', 'c_2', 'c_7', 'lcs_value']

test_x = df_features_test[selected_features].values
test_y = df_transformed_test['category'].values

# NOTE(review): the training counterpart of this file is written as 'train_multi_class.csv';
# '_test.csv' may be an unintended name - confirm against whatever reads this file
file_name_test = '_test.csv'
file_dir_test = 'data/'

# writes data/_test.csv: label in the first column, selected features after it
make_csv(test_x, test_y, file_name_test, file_dir_test)

file created
