In [1]:
# import libraries
import pandas as pd
import numpy as np
import os

from sklearn.feature_extraction.text import CountVectorizer

import helper_Arabic

In [2]:
# Read in a csv file and return a transformed dataframe
def numerical_dataframe(csv_file):
    '''Loads a sentence-pair CSV and attaches numeric label columns.
       :param csv_file: Path or file-like object passed to pandas.read_csv; the first
           column becomes the index and a 'plagiarism_types' column is required
       :return: The dataframe with two added columns: 'Class' (0 for no-plagiarism,
           1 for every plagiarism type) and 'category' (one code, 0-3, per type).
           Labels absent from the mappings end up as NaN.'''
    binary_codes = {'no-plagiarism': 0,
                    'artificial-obfuscation': 1,
                    'no-obfuscation': 1,
                    'simulated-obfuscation': 1}
    category_codes = {'no-plagiarism': 0,
                      'artificial-obfuscation': 1,
                      'no-obfuscation': 2,
                      'simulated-obfuscation': 3}

    frame = pd.read_csv(csv_file, index_col=0)
    labels = frame['plagiarism_types']

    # 'Class' is added before 'category' so the column order stays the same
    frame['Class'] = labels.map(binary_codes)
    frame['category'] = labels.map(category_codes)

    return frame

In [3]:
# Calculate the ngram containment for one answer file/source file pair in a df
def calculate_containment(n, suspicious_text, source_text):
    '''Computes the n-gram containment of a suspicious text within its source text.
       Containment = (n-gram counts shared by both texts) / (n-gram counts in the suspicious text).
       :param n: Size of the word n-grams to compare
       :param suspicious_text: The pre-processed suspicious text
       :param source_text: The pre-processed source text
       :return: The containment value; 0.0 when the suspicious text yields no n-gram of size n
       :raises ValueError: Propagated from CountVectorizer when neither text yields any n-gram'''
    # count the n-grams of both texts over a shared vocabulary
    counts = CountVectorizer(analyzer='word', ngram_range=(n, n))
    ngrams_array = counts.fit_transform([suspicious_text, source_text]).toarray()

    # row 0 holds the suspicious text counts, row 1 the source text counts
    suspicious_total = ngrams_array[0].sum()
    if suspicious_total == 0:
        # The suspicious text is too short for this n: dividing would give 0/0 = NaN,
        # which would then leak into the exported training features.
        return 0.0

    # containment calculation
    intersection = np.minimum(ngrams_array[0], ngrams_array[1]).sum()

    return intersection / suspicious_total

In [4]:
# Compute the normalized LCS given a suspicious text and a source text
def lcs_norm_word(suspicious_text, source_text):
    '''Computes the longest common subsequence of words in two texts; returns a normalized value.
       :param suspicious_text: The pre-processed text for an answer text
       :param source_text: The pre-processed text for an answer's associated source text
       :return: The LCS length divided by the number of words in the suspicious text;
           0.0 when the suspicious text contains no words'''

    # split the strings into words using split() and whitespace as a separator
    suspicious_words = suspicious_text.split()
    source_words = source_text.split()

    suspicious_w_counts = len(suspicious_words)
    if suspicious_w_counts == 0:
        # nothing to normalize by; avoid a 0/0 division
        return 0.0

    # Classic LCS dynamic programme. Only the previous row is ever read, so two
    # plain lists replace the full (source+1) x (suspicious+1) matrix.
    previous_row = [0] * (suspicious_w_counts + 1)
    for s_word in source_words:
        current_row = [0]  # leading zero column
        for a, a_word in enumerate(suspicious_words, 1):
            if s_word == a_word:
                current_row.append(previous_row[a - 1] + 1)
            else:
                current_row.append(max(previous_row[a], current_row[a - 1]))
        previous_row = current_row

    lcs = previous_row[suspicious_w_counts]

    return lcs / suspicious_w_counts

In [5]:
# Training sentence pairs; a relative path, so it is resolved against the working directory.
csv_file = 'data/train-pair-sentences.csv'

In [6]:
# Load the pairs with numeric labels and discard the 'obfuscation_types' column,
# which is not referenced anywhere later in this notebook.
df_transformed = numerical_dataframe(csv_file).drop(columns='obfuscation_types')

In [7]:
# Number of sentence pairs per plagiarism label
counts_per_plagiarism_type = (
    df_transformed
    .groupby('plagiarism_types')
    .size()
    .reset_index(name="Counts")
)
display(counts_per_plagiarism_type)

Unnamed: 0,plagiarism_types,Counts
0,artificial-obfuscation,838
1,no-obfuscation,696
2,no-plagiarism,169
3,simulated-obfuscation,191


In [35]:
# Pre-process the suspicious texts with the project helper.
# NOTE(review): complete_preprocess is defined in helper_Arabic (not visible here) - confirm
# what it normalises. The frame is reassigned from itself, so re-running this cell feeds
# already processed text back into the helper.
col_name = 'suspicious_text'

df_transformed = helper_Arabic.complete_preprocess(df_transformed, col_name)

In [36]:
# Pre-process the source texts with the project helper.
# NOTE(review): complete_preprocess is defined in helper_Arabic (not visible here) - confirm
# what it normalises. The frame is reassigned from itself, so re-running this cell feeds
# already processed text back into the helper.
col_name = 'source_text'

df_transformed = helper_Arabic.complete_preprocess(df_transformed, col_name)

In [37]:
def len_text(text):
    '''Returns the number of characters in the string form of a value.
       :param text: Any value; it is converted with str() before measuring
       :return: The character count of str(text)'''
    return len(str(text))

In [48]:
def num_words(text):
    '''Returns how many whitespace-separated tokens a string contains.
       :param text: The string to count words in
       :return: The number of tokens produced by text.split()'''
    tokens = text.split()
    return len(tokens)

In [38]:
# Character count of each suspicious text
df_transformed['len_suspicious_text'] = df_transformed['suspicious_text'].apply(len_text)

In [39]:
# Character count of each source text
df_transformed['len_source_text'] = df_transformed['source_text'].apply(len_text)

In [49]:
# Word count of each suspicious text
df_transformed['num_words_suspicious_text'] = df_transformed['suspicious_text'].apply(num_words)

In [50]:
# Word count of each source text
df_transformed['num_words_source_text'] = df_transformed['source_text'].apply(num_words)

In [57]:
def perform_task(df):
    '''Builds the similarity feature table for every suspicious/source text pair.
       For each row, the containment for word n-grams of size 1 to 7 and the normalized
       LCS value are computed. Prints the shape of the input and the position of every
       skipped row.
       :param df: Dataframe with string columns 'suspicious_text' and 'source_text'
       :return: A dataframe with columns c_1 ... c_7 and lcs_value, one row per kept pair,
           indexed 0..k-1. Pairs where either text is shorter than 5 characters are
           skipped, so the result can have fewer rows than df; labels taken from df must
           be filtered the same way to stay aligned.'''
    # positional 0..n-1 index so the printed row numbers are positions in df
    df_temp = df.reset_index(drop=True)
    print(df_temp.shape)

    n_grams = range(1, 8)
    col_names = ['c_' + str(n_gram) for n_gram in n_grams] + ['lcs_value']

    # Collect plain rows and build the frame once: DataFrame.append was removed in
    # pandas 2.0 and appending inside a loop copies the whole frame on every iteration.
    feature_rows = []
    text_pairs = zip(df_temp['suspicious_text'], df_temp['source_text'])

    for row_i, (suspicious_text, source_text) in enumerate(text_pairs):
        # texts this short carry no usable n-grams; report the row and skip it
        if len(suspicious_text) < 5 or len(source_text) < 5:
            print(row_i)
            continue

        features_values = [calculate_containment(n_gram, suspicious_text, source_text)
                           for n_gram in n_grams]
        features_values.append(lcs_norm_word(suspicious_text, source_text))
        feature_rows.append(features_values)

    return pd.DataFrame(feature_rows, columns=col_names)

In [58]:
# Build the feature table for the training pairs.
# NOTE(review): despite the variable name, df_transformed is not filtered by plagiarism
# type anywhere in this notebook, so this covers every label, not only artificial obfuscation.
df_features_artificial_obfuscation = perform_task(df_transformed)

(1894, 9)


In [62]:
# Work on a copy so later changes to df_features leave the perform_task result untouched.
df_features = df_features_artificial_obfuscation.copy()

In [63]:
# Pairwise correlation between the features; abs() drops the sign so only the strength
# is compared, and values are rounded to 2 decimals.
corr_matrix = df_features.corr().abs().round(2)

In [64]:
# Render the correlation table (the last expression of a cell is displayed).
corr_matrix

Unnamed: 0,c_1,c_2,c_3,c_4,c_5,c_6,c_7,lcs_value
c_1,1.0,0.37,0.36,0.36,0.36,0.36,0.36,0.34
c_2,0.37,1.0,0.99,0.98,0.96,0.94,0.93,0.91
c_3,0.36,0.99,1.0,1.0,0.99,0.97,0.96,0.92
c_4,0.36,0.98,1.0,1.0,1.0,0.99,0.98,0.92
c_5,0.36,0.96,0.99,1.0,1.0,1.0,0.99,0.92
c_6,0.36,0.94,0.97,0.99,1.0,1.0,1.0,0.92
c_7,0.36,0.93,0.96,0.98,0.99,1.0,1.0,0.91
lcs_value,0.34,0.91,0.92,0.92,0.92,0.92,0.91,1.0


In [65]:
# Feature subset exported for training.
# NOTE(review): presumably c_3..c_6 are left out because the correlation table shows each
# of them at >= 0.99 with a neighbouring n-gram size - confirm this was the selection rule.
selected_features = ['c_1', 'c_2', 'c_7', 'lcs_value']

In [66]:
# Feature matrix as a NumPy array, columns in the order given by selected_features.
train_x = df_features[selected_features].values

In [69]:
# perform_task skips every pair whose suspicious or source text is shorter than 5
# characters, so the labels are filtered with the same rule to stay row-aligned with train_x.
kept_rows = (
    (df_transformed['suspicious_text'].str.len() >= 5)
    & (df_transformed['source_text'].str.len() >= 5)
)
train_y = df_transformed.loc[kept_rows, 'Class'].values
assert len(train_y) == len(train_x), 'labels and features are out of sync'

In [71]:
# Sanity check: number of labels.
train_y.shape

(1894,)

In [73]:
def make_csv(x, y, filename):
    '''Writes labels and features to a headerless CSV file, label first.
       :param x: Features, one row per sample (array-like or dataframe)
       :param y: Labels, one per sample (array-like, series or dataframe)
       :param filename: Path of the CSV file to write
       :raises ValueError: If x and y do not hold the same number of rows'''
    # Reset both indexes: concat(axis=1) aligns on index labels, so a non-default index
    # on either input would shift rows against each other and pad the gaps with NaN.
    labels = pd.DataFrame(y).reset_index(drop=True)
    features = pd.DataFrame(x).reset_index(drop=True)

    if len(labels) != len(features):
        raise ValueError(
            'x and y must have the same number of rows, got %d features and %d labels'
            % (len(features), len(labels))
        )

    # in this concatenation the first column will be my labels, the remaining columns are features
    pd.concat([labels, features], axis=1).to_csv(filename, header=False, index=False)

    print('file created')

In [75]:
# Export the training set to the working directory: label in the first column, then the features.
file_name = 'train.csv'
make_csv(train_x, train_y, file_name)

file created


In [None]:
# Preview the first rows only instead of rendering the whole frame.
df_transformed.head()

In [None]:
# Preview the first rows only instead of rendering the whole frame.
df_features.head()

In [None]:
# n_grams exists only as a local variable inside perform_task, so a bare reference here
# raises NameError on a fresh kernel; show the same n-gram sizes explicitly instead.
list(range(1, 8))