In [1]:
# import libraries
import pandas as pd
import numpy as np
import os

from sklearn.feature_extraction.text import CountVectorizer

import helper_Arabic

from datetime import datetime

In [2]:
# Load a CSV of text pairs and attach numeric label columns
def numerical_dataframe(csv_file):
    """Read `csv_file` and add numeric encodings of its `plagiarism_types` column.

    :param csv_file: Path to a CSV whose first column is the index and which
        contains a `plagiarism_types` column.
    :return: The loaded DataFrame with two extra columns:
        `Class` (0 for 'no-plagiarism', 1 for every obfuscation type) and
        `category` (0..3, one code per plagiarism type). Values that are not
        in the mapping tables become NaN.
    """
    binary_codes = {
        'no-plagiarism': 0,
        'artificial-obfuscation': 1,
        'no-obfuscation': 1,
        'simulated-obfuscation': 1,
    }
    multiclass_codes = {
        'no-plagiarism': 0,
        'artificial-obfuscation': 1,
        'no-obfuscation': 2,
        'simulated-obfuscation': 3,
    }

    frame = pd.read_csv(csv_file, index_col=0)
    plagiarism_kind = frame['plagiarism_types']

    frame['Class'] = plagiarism_kind.map(binary_codes)
    frame['category'] = plagiarism_kind.map(multiclass_codes)

    return frame

In [3]:
# Calculate the n-gram containment for one suspicious text/source text pair
def calculate_containment(n, suspicious_text, source_text):
    """Compute the word n-gram containment of `suspicious_text` in `source_text`.

    Containment is the number of n-gram occurrences shared by both texts
    (element-wise minimum of the two count vectors, summed) divided by the
    total number of n-gram occurrences in the suspicious text.

    :param n: Size of the word n-grams to count.
    :param suspicious_text: Pre-processed suspicious text.
    :param source_text: Pre-processed source text.
    :return: Containment value in [0, 1]; 0.0 when the suspicious text yields
        no n-grams of size `n` (instead of NaN or an "empty vocabulary" error).
    :raises ValueError: Propagated from CountVectorizer for invalid documents.
    """
    # count the n-grams of exactly size n over a vocabulary shared by both texts
    vectorizer = CountVectorizer(analyzer='word', ngram_range=(n, n))
    try:
        ngram_counts = vectorizer.fit_transform([suspicious_text, source_text]).toarray()
    except ValueError:
        # CountVectorizer refuses to fit when neither text produces an n-gram.
        # That case simply means "nothing contained"; any other ValueError
        # (e.g. an invalid document) is re-raised unchanged.
        if vectorizer.build_analyzer()(suspicious_text):
            raise
        return 0.0

    suspicious_counts, source_counts = ngram_counts[0], ngram_counts[1]
    suspicious_total = suspicious_counts.sum()

    # a suspicious text shorter than n words has no n-grams: avoid 0/0 -> NaN
    if suspicious_total == 0:
        return 0.0

    # containment calculation
    shared_total = np.minimum(suspicious_counts, source_counts).sum()
    return shared_total / suspicious_total

In [4]:
# Compute the normalized LCS given a suspicious text and a source text
def lcs_norm_word(suspicious_text, source_text):
    '''Computes the longest common subsequence of words in two texts; returns a normalized value.
       :param suspicious_text: The pre-processed text for an answer text
       :param source_text: The pre-processed text for an answer's associated source text
       :return: LCS length divided by the number of words in the suspicious text;
                0.0 when the suspicious text contains no words'''

    # split the strings into words using split() and whitespace as a separator
    suspicious_words = suspicious_text.split()
    source_words = source_text.split()

    suspicious_w_counts = len(suspicious_words)

    # nothing to normalise by: avoid a 0/0 division that would produce NaN
    if suspicious_w_counts == 0:
        return 0.0

    # Classic LCS dynamic programme. Each row only depends on the row above it,
    # so two rolling rows are enough; this avoids allocating a full
    # (len(source) + 1) x (len(suspicious) + 1) matrix for long documents.
    previous_row = [0] * (suspicious_w_counts + 1)
    for source_word in source_words:
        current_row = [0]  # extra leading column of zeros
        for col, suspicious_word in enumerate(suspicious_words, 1):
            if source_word == suspicious_word:
                current_row.append(previous_row[col - 1] + 1)
            else:
                current_row.append(max(previous_row[col], current_row[col - 1]))
        previous_row = current_row

    lcs = previous_row[-1]

    return lcs / suspicious_w_counts

In [5]:
def len_text(text):
    """Return the number of characters in the string form of `text`.

    The value is passed through `str()` first, so non-string inputs are
    measured by the length of their string representation.
    """
    return len(str(text))

In [6]:
def num_words(text):
    """Return how many whitespace-separated words `text` contains."""
    words = text.split()
    return len(words)

In [7]:
def df_preprocess(df):
    """Preprocess both text columns and append simple length features.

    Each of `suspicious_file_text` and `source_file_text` is passed through
    `helper_Arabic.complete_preprocess` (defined outside this notebook), then
    four columns are added: character length and word count of each text.

    :param df: DataFrame holding `suspicious_file_text` and `source_file_text`.
    :return: The DataFrame returned by the preprocessing helper, with the
        `len_*` and `num_words_*` columns added.
    """
    for text_column in ('suspicious_file_text', 'source_file_text'):
        df = helper_Arabic.complete_preprocess(df, text_column)

    suspicious_col = df['suspicious_file_text']
    source_col = df['source_file_text']

    # character lengths first, then word counts (keeps the column order stable)
    df['len_suspicious_text'] = suspicious_col.apply(len_text)
    df['len_source_text'] = source_col.apply(len_text)
    df['num_words_suspicious_text'] = suspicious_col.apply(num_words)
    df['num_words_source_text'] = source_col.apply(num_words)

    return df

In [8]:
def perform_task(df):
    """Build the similarity feature table for every text pair in `df`.

    For each row, computes the n-gram containment for n = 1..7 and the
    normalized word-level LCS between `suspicious_file_text` and
    `source_file_text`.

    :param df: DataFrame with `suspicious_file_text` and `source_file_text`
        columns. It is not modified.
    :return: A new DataFrame with a fresh 0..len(df)-1 index and the columns
        `c_1` ... `c_7` and `lcs_value`, one row per input row in input order.
    """
    n_grams = range(1, 8)
    col_names = ['c_' + str(n_gram) for n_gram in n_grams] + ['lcs_value']

    # Collect plain rows and build the frame once at the end: DataFrame.append
    # was removed in pandas 2.0, and growing a frame row by row is quadratic.
    feature_rows = []
    for suspicious_text, source_text in zip(df['suspicious_file_text'],
                                            df['source_file_text']):
        features_values = [
            calculate_containment(n_gram, suspicious_text, source_text)
            for n_gram in n_grams
        ]
        features_values.append(lcs_norm_word(suspicious_text, source_text))
        feature_rows.append(features_values)

    return pd.DataFrame(feature_rows, columns=col_names)

In [9]:
def make_csv(x, y, file_name, file_dir):
    """Write labels and features side by side to `file_dir/file_name`.

    The CSV has no header and no index column; the first column holds the
    labels `y` and the remaining columns hold the features `x`.

    :param x: Feature values, one row per sample.
    :param y: Label values, one per sample.
    :param file_name: Name of the CSV file to write.
    :param file_dir: Directory the file is written into.
    """
    labels = pd.DataFrame(y)
    features = pd.DataFrame(x)

    # labels first, features after
    combined = pd.concat([labels, features], axis=1)

    target_path = os.path.join(file_dir, file_name)
    combined.to_csv(target_path, header=False, index=False)

    print('file created')

In [11]:
# Training set: load the labelled text pairs, drop the unused column and
# preprocess both text columns, printing wall-clock start/end times.
csv_file_train = 'data2/train-complete-text-pair-sentences.csv'

now = datetime.now()
current_time = now.strftime("%H:%M:%S")
print("\nTraining ====== Start Time =", current_time)

df_transformed_train = numerical_dataframe(csv_file_train)
df_transformed_train = df_transformed_train.drop(columns='obfuscation_types')
df_transformed_train = df_preprocess(df_transformed_train)

now = datetime.now()
current_time = now.strftime("%H:%M:%S")
print("\nTraining ====== End Time =", current_time)





In [12]:
# Preview the first three preprocessed training pairs
df_transformed_train.head(3)

Unnamed: 0,suspicious_reference,suspicious_offset,suspicious_lengths,source_references,source_offsets,source_lengths,plagiarism_types,suspicious_file_text,source_file_text,suspicious_text,source_text,Class,category,len_suspicious_text,len_source_text,num_words_suspicious_text,num_words_source_text
0,suspicious-document0170.txt,1120,198,source-document00054.txt,1150,198,no-obfuscation,صطف سبع علم علم فكر سلم رءد رود نهض سلم حدث ول...,عمر سحه عرض فرس وجه عقد امر جند كوف منح رقم جن...,فلا تفضح أمرنا أيها السراج - جلال الدين الرومي...,فلا تفضح أمرنا أيها السراج - جلال الدين الرومي...,1,2,1978,13453,484,3262
1,suspicious-document0171.txt,4081,1086,source-document00042.txt,4590,1086,no-obfuscation,عرف علم كبر ساذ حمد شكر رحم اله كثر ثلث سنه سه...,عبر موسيقار رحل حمد عبد وهب رءد غنء مصر وطن عر...,وفي عام 32 كان عبد الوهاب قد نضج واشتهر في كل ...,وفي عام 32 كان عبد الوهاب قد نضج واشتهر في كل ...,1,2,8879,3643,2164,879
2,suspicious-document0172.txt,4058,138,source-document00066.txt,11529,138,no-obfuscation,رحل فنن خلف قطن عبد اله قصر يغب سحه شكل كوت رء...,عمل حيه رفق احا قسو احا اخر اقس قست الي رفق رج...,"لا أعرف لها اسما, أو مدرسة, أو اسم قرية, ولكنن...","لا أعرف لها اسما, أو مدرسة, أو اسم قرية, ولكنن...",1,2,3353,7770,818,1869


In [13]:
# Number of training pairs per plagiarism type
counts_per_plagiarism_type = (
    df_transformed_train
    .groupby(['plagiarism_types'])
    .size()
    .reset_index(name="Counts")
)
display(counts_per_plagiarism_type)

Unnamed: 0,plagiarism_types,Counts
0,artificial-obfuscation,838
1,no-obfuscation,696
2,no-plagiarism,169
3,simulated-obfuscation,191


In [14]:
# Training pairs labelled 'artificial-obfuscation' (independent copy)
df_features_train_artificial_obfuscation = (
    df_transformed_train
    .loc[df_transformed_train['plagiarism_types'] == 'artificial-obfuscation']
    .copy()
)
print(df_features_train_artificial_obfuscation.shape)

(838, 17)


In [15]:
# Training pairs labelled 'no-obfuscation' (independent copy)
df_features_train_no_obfuscation = (
    df_transformed_train
    .loc[df_transformed_train['plagiarism_types'] == 'no-obfuscation']
    .copy()
)
print(df_features_train_no_obfuscation.shape)

(696, 17)


In [16]:
# Training pairs labelled 'simulated-obfuscation' (independent copy)
df_features_train_simulated_obfuscation = (
    df_transformed_train
    .loc[df_transformed_train['plagiarism_types'] == 'simulated-obfuscation']
    .copy()
)
print(df_features_train_simulated_obfuscation.shape)

(191, 17)


In [18]:
# Training pairs labelled 'no-plagiarism' (independent copy)
df_features_train_no_plagiarism = (
    df_transformed_train
    .loc[df_transformed_train['plagiarism_types'] == 'no-plagiarism']
    .copy()
)
print(df_features_train_no_plagiarism.shape)

(169, 17)


In [19]:
# Compute the containment/LCS feature table for the 'no-plagiarism' training
# pairs and print wall-clock start/end times around the call.
# NOTE: the name is rebound from the text-pair frame to the feature frame, so
# this cell cannot be re-run without first re-running the cell that builds the
# subset (the text columns perform_task reads are gone afterwards).
now = datetime.now()

current_time = now.strftime("%H:%M:%S")
print("\nTraining _no_plagiarism ====== Start Time =", current_time)

df_features_train_no_plagiarism = perform_task(df_features_train_no_plagiarism)

now = datetime.now()

current_time = now.strftime("%H:%M:%S")
print("\nTraining _no_plagiarism ====== End Time =", current_time)





In [20]:
# Compute the containment/LCS feature table for the 'simulated-obfuscation'
# training pairs and print wall-clock start/end times around the call.
# NOTE: the name is rebound from the text-pair frame to the feature frame, so
# this cell cannot be re-run without first re-running the cell that builds the
# subset (the text columns perform_task reads are gone afterwards).
now = datetime.now()

current_time = now.strftime("%H:%M:%S")
print("\nTraining simulated_obfuscation ====== Start Time =", current_time)

df_features_train_simulated_obfuscation = perform_task(df_features_train_simulated_obfuscation)

now = datetime.now()

current_time = now.strftime("%H:%M:%S")
print("\nTraining simulated_obfuscation ====== End Time =", current_time)





In [21]:
# Compute the containment/LCS feature table for the 'no-obfuscation' training
# pairs and print wall-clock start/end times around the call.
# NOTE: the name is rebound from the text-pair frame to the feature frame, so
# this cell cannot be re-run without first re-running the cell that builds the
# subset (the text columns perform_task reads are gone afterwards).
now = datetime.now()

current_time = now.strftime("%H:%M:%S")
print("\nTraining no_obfuscation ====== Start Time =", current_time)

df_features_train_no_obfuscation = perform_task(df_features_train_no_obfuscation)

now = datetime.now()

current_time = now.strftime("%H:%M:%S")
print("\nTraining no_obfuscation ====== End Time =", current_time)





In [22]:
# Compute the containment/LCS feature table for the 'artificial-obfuscation'
# training pairs and print wall-clock start/end times around the call.
# NOTE: the name is rebound from the text-pair frame to the feature frame, so
# this cell cannot be re-run without first re-running the cell that builds the
# subset (the text columns perform_task reads are gone afterwards).
now = datetime.now()

current_time = now.strftime("%H:%M:%S")
print("\nTraining artificial_obfuscation ====== Start Time =", current_time)

df_features_train_artificial_obfuscation = perform_task(df_features_train_artificial_obfuscation)

now = datetime.now()

current_time = now.strftime("%H:%M:%S")
print("\nTraining artificial_obfuscation ====== End Time =", current_time)





In [23]:
# Stack the per-type feature tables. The row order is: artificial-obfuscation,
# no-obfuscation, simulated-obfuscation, no-plagiarism.
df_features_train = pd.concat(
    [
        df_features_train_artificial_obfuscation,
        df_features_train_no_obfuscation,
        df_features_train_simulated_obfuscation,
        df_features_train_no_plagiarism,
    ],
    axis=0,
)

In [24]:
# Preview the first rows of the combined training feature table
df_features_train.head()

Unnamed: 0,c_1,c_2,c_3,c_4,c_5,c_6,c_7,lcs_value
0,0.399149,0.015198,0.0,0.0,0.0,0.0,0.0,0.09356
1,0.437956,0.018265,0.000914,0.0,0.0,0.0,0.0,0.099453
2,0.343234,0.056291,0.009967,0.003333,0.0,0.0,0.0,0.161716
3,0.235294,0.016393,0.0,0.0,0.0,0.0,0.0,0.084967
4,0.514151,0.004739,0.0,0.0,0.0,0.0,0.0,0.084906


In [25]:
selected_features = ['c_1', 'c_2', 'c_7', 'lcs_value']

# df_features_train holds the rows grouped by plagiarism type (in the order
# below), NOT in the original row order of df_transformed_train. The labels
# must be gathered in the same order, otherwise each feature row is paired
# with the label of a different text pair.
feature_block_order = ['artificial-obfuscation', 'no-obfuscation',
                       'simulated-obfuscation', 'no-plagiarism']

train_x = df_features_train[selected_features].values
train_y = pd.concat([
    df_transformed_train.loc[df_transformed_train['plagiarism_types'] == kind, 'Class']
    for kind in feature_block_order
]).values

assert len(train_x) == len(train_y), "features and labels must have the same number of rows"

file_name_train = 'train.csv'
file_dir_train = 'data2/'

make_csv(train_x, train_y, file_name_train, file_dir_train)

file created


In [26]:
# df_features_train holds the rows grouped by plagiarism type (in the order
# below), NOT in the original row order of df_transformed_train. The labels
# must be gathered in the same order, otherwise each feature row is paired
# with the label of a different text pair.
feature_block_order = ['artificial-obfuscation', 'no-obfuscation',
                       'simulated-obfuscation', 'no-plagiarism']

train_x = df_features_train.values
train_y = pd.concat([
    df_transformed_train.loc[df_transformed_train['plagiarism_types'] == kind, 'Class']
    for kind in feature_block_order
]).values

assert len(train_x) == len(train_y), "features and labels must have the same number of rows"

file_name_train = 'train_all.csv'
file_dir_train = 'data2/'

make_csv(train_x, train_y, file_name_train, file_dir_train)

file created


In [27]:
selected_features = ['c_1', 'c_2', 'c_7', 'lcs_value']

# df_features_train holds the rows grouped by plagiarism type (in the order
# below), NOT in the original row order of df_transformed_train. The labels
# must be gathered in the same order, otherwise each feature row is paired
# with the label of a different text pair.
feature_block_order = ['artificial-obfuscation', 'no-obfuscation',
                       'simulated-obfuscation', 'no-plagiarism']

train_x = df_features_train[selected_features].values
train_y = pd.concat([
    df_transformed_train.loc[df_transformed_train['plagiarism_types'] == kind, 'category']
    for kind in feature_block_order
]).values

assert len(train_x) == len(train_y), "features and labels must have the same number of rows"

file_name_train = 'train_multi_class.csv'
file_dir_train = 'data2/'

make_csv(train_x, train_y, file_name_train, file_dir_train)

file created


In [28]:
# Test set: load the labelled text pairs, drop the unused column and
# preprocess both text columns, printing wall-clock start/end times.
now = datetime.now()

current_time = now.strftime("%H:%M:%S")
print("\nTesting df_preprocess ====== Start Time =", current_time)

# test dataset
csv_file_test = 'data2/test-complete-text-pair-sentences.csv'

df_transformed_test = numerical_dataframe(csv_file_test)
df_transformed_test.drop(columns='obfuscation_types', inplace=True)

df_transformed_test = df_preprocess(df_transformed_test)

# Take a fresh timestamp: reusing the earlier `now` would print the start
# time again as the end time.
now = datetime.now()

current_time = now.strftime("%H:%M:%S")
print("\nTesting df_preprocess ====== End Time =", current_time)





In [29]:
# Test pairs labelled 'artificial-obfuscation' (independent copy)
df_features_test_artificial_obfuscation = (
    df_transformed_test
    .loc[df_transformed_test['plagiarism_types'] == 'artificial-obfuscation']
    .copy()
)
print(df_features_test_artificial_obfuscation.shape)

(840, 17)


In [30]:
# Test pairs labelled 'no-obfuscation' (independent copy)
df_features_test_no_obfuscation = (
    df_transformed_test
    .loc[df_transformed_test['plagiarism_types'] == 'no-obfuscation']
    .copy()
)
print(df_features_test_no_obfuscation.shape)

(696, 17)


In [31]:
# Test pairs labelled 'simulated-obfuscation' (independent copy)
df_features_test_simulated_obfuscation = (
    df_transformed_test
    .loc[df_transformed_test['plagiarism_types'] == 'simulated-obfuscation']
    .copy()
)
print(df_features_test_simulated_obfuscation.shape)

(191, 17)


In [32]:
# Test pairs labelled 'no-plagiarism' (independent copy)
df_features_test_no_plagiarism = (
    df_transformed_test
    .loc[df_transformed_test['plagiarism_types'] == 'no-plagiarism']
    .copy()
)
print(df_features_test_no_plagiarism.shape)

(169, 17)


In [33]:
# Compute the containment/LCS feature table for the 'no-plagiarism' test
# pairs and print wall-clock start/end times around the call.
# NOTE: the name is rebound from the text-pair frame to the feature frame, so
# this cell cannot be re-run without first re-running the cell that builds the
# subset (the text columns perform_task reads are gone afterwards).
now = datetime.now()

current_time = now.strftime("%H:%M:%S")
print("\ntesting _no_plagiarism ====== Start Time =", current_time)

df_features_test_no_plagiarism = perform_task(df_features_test_no_plagiarism)

now = datetime.now()

current_time = now.strftime("%H:%M:%S")
print("\ntesting _no_plagiarism ====== End Time =", current_time)





In [34]:
# Compute the containment/LCS feature table for the 'simulated-obfuscation'
# test pairs and print wall-clock start/end times around the call.
# NOTE: the name is rebound from the text-pair frame to the feature frame, so
# this cell cannot be re-run without first re-running the cell that builds the
# subset (the text columns perform_task reads are gone afterwards).
now = datetime.now()

current_time = now.strftime("%H:%M:%S")
print("\ntesting simulated_obfuscation ====== Start Time =", current_time)

df_features_test_simulated_obfuscation = perform_task(df_features_test_simulated_obfuscation)

now = datetime.now()

current_time = now.strftime("%H:%M:%S")
print("\ntesting simulated_obfuscation ====== End Time =", current_time)





In [35]:
# Compute the containment/LCS feature table for the 'no-obfuscation' test
# pairs and print wall-clock start/end times around the call.
# NOTE: the name is rebound from the text-pair frame to the feature frame, so
# this cell cannot be re-run without first re-running the cell that builds the
# subset (the text columns perform_task reads are gone afterwards).
now = datetime.now()

current_time = now.strftime("%H:%M:%S")
print("\ntesting no_obfuscation ====== Start Time =", current_time)

df_features_test_no_obfuscation = perform_task(df_features_test_no_obfuscation)

now = datetime.now()

current_time = now.strftime("%H:%M:%S")
print("\ntesting no_obfuscation ====== End Time =", current_time)





In [36]:
# Compute the containment/LCS feature table for the 'artificial-obfuscation'
# test pairs and print wall-clock start/end times around the call.
# NOTE: the name is rebound from the text-pair frame to the feature frame, so
# this cell cannot be re-run without first re-running the cell that builds the
# subset (the text columns perform_task reads are gone afterwards).
now = datetime.now()

current_time = now.strftime("%H:%M:%S")
print("\ntesting artificial_obfuscation ====== Start Time =", current_time)

df_features_test_artificial_obfuscation = perform_task(df_features_test_artificial_obfuscation)

now = datetime.now()

current_time = now.strftime("%H:%M:%S")
print("\ntesting artificial_obfuscation ====== End Time =", current_time)





In [51]:
# Stack the per-type feature tables. The row order is: artificial-obfuscation,
# no-obfuscation, simulated-obfuscation, no-plagiarism.
df_features_test = pd.concat(
    [
        df_features_test_artificial_obfuscation,
        df_features_test_no_obfuscation,
        df_features_test_simulated_obfuscation,
        df_features_test_no_plagiarism,
    ],
    axis=0,
)

In [52]:
# Sanity check: (number of test pairs, number of feature columns)
df_features_test.shape

(1896, 8)

In [65]:
selected_features = ['c_1', 'c_2', 'c_7', 'lcs_value']

# df_features_test holds the rows grouped by plagiarism type (in the order
# below), NOT in the original row order of df_transformed_test. The labels
# must be gathered in the same order, otherwise each feature row is paired
# with the label of a different text pair.
feature_block_order = ['artificial-obfuscation', 'no-obfuscation',
                       'simulated-obfuscation', 'no-plagiarism']

test_x = df_features_test[selected_features].values
test_y = pd.concat([
    df_transformed_test.loc[df_transformed_test['plagiarism_types'] == kind, 'Class']
    for kind in feature_block_order
]).values

assert len(test_x) == len(test_y), "features and labels must have the same number of rows"

file_name_test = 'test.csv'
file_dir_test = 'data2/'

make_csv(test_x, test_y, file_name_test, file_dir_test)

file created


In [66]:
# Sanity check: (number of test pairs, number of selected features)
test_x.shape

(1896, 4)

In [67]:
# df_features_test holds the rows grouped by plagiarism type (in the order
# below), NOT in the original row order of df_transformed_test. The labels
# must be gathered in the same order, otherwise each feature row is paired
# with the label of a different text pair.
feature_block_order = ['artificial-obfuscation', 'no-obfuscation',
                       'simulated-obfuscation', 'no-plagiarism']

test_x = df_features_test.values
test_y = pd.concat([
    df_transformed_test.loc[df_transformed_test['plagiarism_types'] == kind, 'Class']
    for kind in feature_block_order
]).values

assert len(test_x) == len(test_y), "features and labels must have the same number of rows"

file_name_test = 'test_all.csv'
file_dir_test = 'data2/'

make_csv(test_x, test_y, file_name_test, file_dir_test)

file created


In [68]:
# Defined here under its correct name so the cell does not depend on a
# `selected_features` left over from an earlier cell.
selected_features = ['c_1', 'c_2', 'c_7', 'lcs_value']

# df_features_test holds the rows grouped by plagiarism type (in the order
# below), NOT in the original row order of df_transformed_test. The labels
# must be gathered in the same order, otherwise each feature row is paired
# with the label of a different text pair.
feature_block_order = ['artificial-obfuscation', 'no-obfuscation',
                       'simulated-obfuscation', 'no-plagiarism']

test_x = df_features_test[selected_features].values
test_y = pd.concat([
    df_transformed_test.loc[df_transformed_test['plagiarism_types'] == kind, 'category']
    for kind in feature_block_order
]).values

assert len(test_x) == len(test_y), "features and labels must have the same number of rows"

file_name_test = 'test_multi_class.csv'
file_dir_test = 'data2/'

make_csv(test_x, test_y, file_name_test, file_dir_test)

file created
