In [1]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from keras.layers import Dense
from keras.models import Sequential
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import MinMaxScaler

In [2]:
# Folder that holds the train/test CSV files read by the loading cells.
dataset_dir = 'data2/'

# Fix NumPy's global RNG so NumPy-driven randomness repeats between runs.
# NOTE(review): only NumPy is seeded here; if the Keras backend draws weight
# initialisation / shuffling from its own RNG, it needs seeding too — confirm.
seed = 10
np.random.seed(seed)

In [3]:
# Training split: a header-less CSV located inside dataset_dir.
training_file = 'train_multi_class.csv'
#training_file = 'train.csv'

train_csv_path = os.path.join(dataset_dir, training_file)
# header=None: the first row is data, so pandas assigns integer column labels.
df_train = pd.read_csv(train_csv_path, names=None, header=None)

In [4]:
# Column 0 is the target (class) column; every remaining column is a feature.
train_y, train_x = df_train.iloc[:, 0], df_train.iloc[:, 1:]

# Sanity check: feature-matrix shape first, then label-vector shape.
print(train_x.shape, train_y.shape, sep='\n')

(1894, 4)
(1894,)


In [7]:
# Testing split: a header-less CSV located inside dataset_dir.
testing_file = 'test_multi_class.csv'
#testing_file = 'test.csv'

test_csv_path = os.path.join(dataset_dir, testing_file)
# header=None: the first row is data, so pandas assigns integer column labels.
df_test = pd.read_csv(test_csv_path, names=None, header=None)

In [8]:
# Column 0 is the target (class) column; every remaining column is a feature.
test_y, test_x = df_test.iloc[:, 0], df_test.iloc[:, 1:]

# Sanity check: feature-matrix shape first, then label-vector shape.
print(test_x.shape, test_y.shape, sep='\n')

(1896, 4)
(1896,)


In [9]:
# Baseline network definition. The instance created at the bottom of this cell
# is the `model` object that the training / evaluation cells operate on.
def baseline_model(n_features=4, n_classes=4, hidden_units=4):
    """Build and compile a small fully connected softmax classifier.

    The defaults reproduce the original fixed 4-4-4 architecture, so calling
    ``baseline_model()`` with no arguments behaves exactly as before.

    Parameters
    ----------
    n_features : int, default 4
        Number of input columns per sample.
    n_classes : int, default 4
        Number of target classes (width of the softmax output layer).
    hidden_units : int, default 4
        Width of each of the two ReLU hidden layers.

    Returns
    -------
    keras.models.Sequential
        Model compiled with the Adam optimizer, sparse categorical
        cross-entropy loss and the accuracy metric.
    """
    model = Sequential([
        # Two ReLU (Rectified Linear Unit) hidden layers.
        Dense(hidden_units, input_shape=(n_features,), activation='relu'),
        Dense(hidden_units, activation='relu'),
        # Softmax output: one score per class for multi-class classification.
        Dense(n_classes, activation='softmax'),
    ])
    model.compile(
        loss='sparse_categorical_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )
    return model

model = baseline_model()

In [122]:
# Initial training run: 100 epochs over the training split in mini-batches
# of 10, with per-epoch logging switched off.
model.fit(
    x=train_x,
    y=train_y,
    batch_size=10,
    epochs=100,
    verbose=0,
)

<keras.callbacks.History at 0x204c4d28a60>

In [123]:
#model.summary()

In [124]:
# Softmax scores for every test row (used in place of the disabled
# predict_classes call); the column index of the largest score in each row
# is taken as the predicted class.
pred_y = model.predict(test_x)
pred_classes1 = pred_y.argmax(axis=1)

In [126]:
# Evaluate the trained model on the test split and keep the result in `score`;
# verbose=1 leaves the evaluation progress output visible.
score = model.evaluate(x=test_x, y=test_y, verbose=1)



In [127]:
# Human-readable names for the integer class labels, listed in label order
# (index 0 first). Defined here so the notebook survives Restart & Run All:
# no other cell assigns target_names.
target_names = [
    'no-plagiarism',
    'artificial-obfuscation',
    'no-obfuscation',
    'simulated-obfuscation',
]

# Per-class precision / recall / F1 of the predicted labels on the test split.
print(classification_report(test_y, pred_classes1, target_names=target_names))

                        precision    recall  f1-score   support

         no-plagiarism       0.90      0.47      0.62       169
artificial-obfuscation       0.47      0.52      0.50       840
        no-obfuscation       0.33      0.41      0.36       696
 simulated-obfuscation       0.00      0.00      0.00       191

              accuracy                           0.43      1896
             macro avg       0.42      0.35      0.37      1896
          weighted avg       0.41      0.43      0.41      1896



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [130]:
# Train for 100 more epochs with batch_size=50, then re-score on the test split.
# NOTE(review): `model` is not re-created in this cell, so fit() carries on from
# whatever weights earlier cells left behind — confirm that cumulative training
# (rather than a fresh model per setting) is what this comparison intends.
model.fit(x=train_x, y=train_y, batch_size=50, epochs=100, verbose=0)

# Softmax scores per test row; the highest-scoring column is the predicted class.
pred_y = model.predict(test_x)
pred_classes1 = pred_y.argmax(axis=1)

score = model.evaluate(x=test_x, y=test_y, verbose=1)

# Blank line to separate the evaluate() output from the report.
print()
print(classification_report(y_true=test_y, y_pred=pred_classes1, target_names=target_names))


                        precision    recall  f1-score   support

         no-plagiarism       0.88      0.55      0.68       169
artificial-obfuscation       0.47      0.49      0.48       840
        no-obfuscation       0.33      0.44      0.38       696
 simulated-obfuscation       0.00      0.00      0.00       191

              accuracy                           0.43      1896
             macro avg       0.42      0.37      0.38      1896
          weighted avg       0.41      0.43      0.41      1896



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [133]:
# Train for 100 more epochs with batch_size=50, then re-score on the test split.
# NOTE(review): `model` is not re-created in this cell, so fit() carries on from
# whatever weights earlier cells left behind — confirm that cumulative training
# (rather than a fresh model per setting) is what this comparison intends.
model.fit(x=train_x, y=train_y, batch_size=50, epochs=100, verbose=0)

# Softmax scores per test row; the highest-scoring column is the predicted class.
pred_y = model.predict(test_x)
pred_classes1 = pred_y.argmax(axis=1)

score = model.evaluate(x=test_x, y=test_y, verbose=1)

# Blank line to separate the evaluate() output from the report.
print()
print(classification_report(y_true=test_y, y_pred=pred_classes1, target_names=target_names))


                        precision    recall  f1-score   support

         no-plagiarism       0.85      0.26      0.40       169
artificial-obfuscation       0.47      0.50      0.48       840
        no-obfuscation       0.32      0.44      0.37       696
 simulated-obfuscation       0.00      0.00      0.00       191

              accuracy                           0.40      1896
             macro avg       0.41      0.30      0.31      1896
          weighted avg       0.40      0.40      0.38      1896



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [134]:
# Train for 100 more epochs with batch_size=30, then re-score on the test split.
# NOTE(review): `model` is not re-created in this cell, so fit() carries on from
# whatever weights earlier cells left behind — confirm that cumulative training
# (rather than a fresh model per setting) is what this comparison intends.
model.fit(x=train_x, y=train_y, batch_size=30, epochs=100, verbose=0)

# Softmax scores per test row; the highest-scoring column is the predicted class.
pred_y = model.predict(test_x)
pred_classes1 = pred_y.argmax(axis=1)

score = model.evaluate(x=test_x, y=test_y, verbose=1)

# Blank line to separate the evaluate() output from the report.
print()
print(classification_report(y_true=test_y, y_pred=pred_classes1, target_names=target_names))


                        precision    recall  f1-score   support

         no-plagiarism       0.87      0.31      0.45       169
artificial-obfuscation       0.48      0.48      0.48       840
        no-obfuscation       0.33      0.46      0.38       696
 simulated-obfuscation       0.00      0.00      0.00       191

              accuracy                           0.41      1896
             macro avg       0.42      0.31      0.33      1896
          weighted avg       0.41      0.41      0.39      1896



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [137]:
# Train for 200 more epochs with batch_size=5, then re-score on the test split.
# NOTE(review): `model` is not re-created in this cell, so fit() carries on from
# whatever weights earlier cells left behind — confirm that cumulative training
# (rather than a fresh model per setting) is what this comparison intends.
model.fit(x=train_x, y=train_y, batch_size=5, epochs=200, verbose=0)

# Softmax scores per test row; the highest-scoring column is the predicted class.
pred_y = model.predict(test_x)
pred_classes1 = pred_y.argmax(axis=1)

score = model.evaluate(x=test_x, y=test_y, verbose=1)

# Blank line to separate the evaluate() output from the report.
print()
print(classification_report(y_true=test_y, y_pred=pred_classes1, target_names=target_names))


                        precision    recall  f1-score   support

         no-plagiarism       0.95      0.11      0.20       169
artificial-obfuscation       0.48      0.49      0.48       840
        no-obfuscation       0.32      0.47      0.38       696
 simulated-obfuscation       0.00      0.00      0.00       191

              accuracy                           0.40      1896
             macro avg       0.44      0.27      0.27      1896
          weighted avg       0.41      0.40      0.37      1896



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
