In [1]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from keras.layers import Dense
from keras.models import Sequential
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import MinMaxScaler

In [15]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix

In [2]:
# Random seed for reproducibility.
# NOTE(review): this seeds only NumPy's global RNG. Keras weight initialisation
# and batch shuffling presumably draw from the backend's own generator, so
# repeated trainings may still differ — confirm and seed the backend as well.
seed = 10
np.random.seed(seed)

# Directory that holds the train/test CSV files
dataset_dir = 'data2/'

In [3]:
# Training set actually used. 'train_multi_class.csv' used to be assigned first
# and was overwritten on the very next line, so that dead store is dropped.
training_file = 'train.csv'

# header=None: the first CSV row is data, so columns get integer labels 0..k
df_train = pd.read_csv(os.path.join(dataset_dir, training_file), header=None, names=None)

In [4]:
# Column 0 is the class label (Y); every remaining column is a feature (X)
train_y, train_x = df_train.iloc[:, 0], df_train.iloc[:, 1:]
print(train_x.shape, train_y.shape, sep='\n')

(1894, 4)
(1894,)


In [5]:
# Read in csv testing dataset.
# 'test_multi_class.csv' used to be assigned first and was overwritten on the
# very next line, so that dead store is dropped.
testing_file = 'test.csv'

# header=None: the first CSV row is data, so columns get integer labels 0..k
df_test = pd.read_csv(os.path.join(dataset_dir, testing_file), header=None, names=None)

In [6]:
# Column 0 is the class label (Y); every remaining column is a feature (X)
test_y, test_x = df_test.iloc[:, 0], df_test.iloc[:, 1:]
print(test_x.shape, test_y.shape, sep='\n')

(1896, 4)
(1896,)


In [23]:
# Single-hidden-layer binary classifier:
# 4 input features -> 100 ReLU units -> 1 sigmoid output.
# A deeper 25-20-15-10-5 ReLU stack used to be built here and was immediately
# replaced by this model without ever being compiled or trained, so that dead
# construction has been removed.
model = Sequential()
model.add(Dense(100, input_dim=4, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

In [24]:
# Binary cross-entropy pairs with the single sigmoid output; accuracy is tracked
# so that evaluate() reports it alongside the loss.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [36]:
# Train silently for 300 epochs; batch_size is not passed, so the Keras default applies
model.fit(train_x, train_y, epochs=300, verbose=0)

<keras.callbacks.History at 0x258b76a3f70>

In [37]:
# pred_y = model.predict(test_x) 
# pred_classes = np.argmax(pred_y, axis=1)

In [38]:
# pred_classes

In [39]:
# Test-set loss followed by the metrics given to compile() (here: accuracy)
score = model.evaluate(test_x, test_y, verbose=1)



In [40]:
# Printed as [loss, accuracy]
print(score)

[0.5946133136749268, 0.9171940684318542]


In [41]:
# Predicted probabilities for the test set (one sigmoid output per row)
yhat_probs = model.predict(test_x, verbose=0)

In [42]:
# Turn probabilities into hard labels: strictly above 0.5 -> 1, otherwise 0
yhat_classes = (yhat_probs > 0.5).astype("int32")

In [43]:
# Flatten the single-column prediction arrays to 1-D.
# reshape(-1) is used instead of [:, 0] so that re-running this cell on the
# already-flattened arrays is a no-op rather than an IndexError.
yhat_probs = yhat_probs.reshape(-1)
yhat_classes = yhat_classes.reshape(-1)

In [44]:
# True test labels as a NumPy array, for eyeballing against the predictions
test_y.values

array([1, 1, 1, ..., 0, 0, 0], dtype=int64)

In [45]:
# Predicted test labels, for comparison with the true labels displayed above
yhat_classes

array([1, 1, 1, ..., 1, 1, 1])

In [46]:
# All four scores compare the true test labels with the thresholded predictions
accuracy = accuracy_score(test_y.values, yhat_classes)    # (tp + tn) / (p + n)
precision = precision_score(test_y.values, yhat_classes)  # tp / (tp + fp)
recall = recall_score(test_y.values, yhat_classes)        # tp / (tp + fn)
f1 = f1_score(test_y.values, yhat_classes)                # 2 tp / (2 tp + fp + fn)

# Report every score as a percentage
print('Accuracy: %f' % (accuracy*100))
print('Precision: %f' % (precision*100))
print('Recall: %f' % (recall*100))
print('F1 score: %f' % (f1*100))

Accuracy: 91.719409
Precision: 91.844350
Recall: 99.768384
F1 score: 95.642520


In [53]:
def calculate_metrics(model, test_x2, test_y2, threshold=0.5):
    """Print accuracy, precision, recall and F1 score, each as a percentage.

    Args:
        model: Fitted model exposing ``predict(x, verbose=0)``. Its output is
            indexed as ``[:, 0]``, i.e. it must be a 2-D array whose first
            column holds the positive-class probability.
        test_x2: Features, passed unchanged to ``model.predict``.
        test_y2: True binary labels. Anything ``np.asarray`` accepts (pandas
            Series, NumPy array, list), not only pandas objects.
        threshold: Probabilities strictly above this value are labelled 1,
            all others 0. Defaults to 0.5.

    Returns:
        None. The four scores are written to stdout.
    """
    # predict probabilities, cut them at the threshold, keep column 0 as a 1-D vector
    yhat_probs = model.predict(test_x2, verbose=0)
    yhat_classes = (yhat_probs > threshold).astype("int32")[:, 0]

    # np.asarray rather than `.values`, so the labels need not be a pandas object
    y_true = np.asarray(test_y2)

    # accuracy: (tp + tn) / (p + n)
    accuracy = accuracy_score(y_true, yhat_classes)
    print('Accuracy: %f' % (accuracy*100))
    # precision tp / (tp + fp)
    precision = precision_score(y_true, yhat_classes)
    print('Precision: %f' % (precision*100))
    # recall: tp / (tp + fn)
    recall = recall_score(y_true, yhat_classes)
    print('Recall: %f' % (recall*100))
    # f1: 2 tp / (2 tp + fp + fn)
    f1 = f1_score(y_true, yhat_classes)
    print('F1 score: %f' % (f1*100))

In [60]:
def build_fit_model(train_x2, train_y2, n=100, num_epochs=300, batch_size2=0, input_dim=4):
    """Build, compile and fit a one-hidden-layer binary classifier.

    The network is ``input_dim`` inputs -> ``n`` ReLU units -> 1 sigmoid
    output, compiled with binary cross-entropy, the Adam optimizer and the
    accuracy metric, then trained silently (``verbose=0``).

    Args:
        train_x2: Training features, passed unchanged to ``model.fit``.
        train_y2: Training labels, passed unchanged to ``model.fit``.
        n: Number of units in the hidden layer.
        num_epochs: Number of training epochs.
        batch_size2: Mini-batch size. ``0`` means "do not pass batch_size",
            which leaves the choice to Keras.
        input_dim: Number of input features. Defaults to 4, the value that
            was previously hard-coded.

    Returns:
        The fitted ``Sequential`` model.
    """
    model = Sequential()
    model.add(Dense(n, input_dim=input_dim, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))

    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    # One fit call for both cases; batch_size is forwarded only when requested
    fit_kwargs = {'epochs': num_epochs, 'verbose': 0}
    if batch_size2 != 0:
        fit_kwargs['batch_size'] = batch_size2
    model.fit(train_x2, train_y2, **fit_kwargs)

    return model

In [55]:
# 200 hidden units, 300 epochs, batch size left to Keras
model = build_fit_model(train_x, train_y, n=200, num_epochs=300)

In [56]:
# Test-set scores for whatever `model` currently refers to
calculate_metrics(model, test_x, test_y)

Accuracy: 91.772152
Precision: 91.893333
Recall: 99.768384
F1 score: 95.669073


In [57]:
# 200 hidden units, 500 epochs, batch size left to Keras
model = build_fit_model(train_x, train_y, n=200, num_epochs=500)

In [58]:
# Test-set scores for whatever `model` currently refers to
calculate_metrics(model, test_x, test_y)

Accuracy: 91.983122
Precision: 92.134831
Recall: 99.710481
F1 score: 95.773081


In [78]:
# 200 hidden units, 2500 epochs; batch_size2=0 leaves the batch size to Keras
model = build_fit_model(train_x, train_y, n=200, num_epochs=2500, batch_size2=0)
# Test-set scores for the model just trained
calculate_metrics(model, test_x, test_y)

Accuracy: 91.930380
Precision: 91.995731
Recall: 99.826288
F1 score: 95.751180


In [80]:
# 300 hidden units, 500 epochs; batch_size2=0 leaves the batch size to Keras
model = build_fit_model(train_x, train_y, n=300, num_epochs=500, batch_size2=0)
# Test-set scores for the model just trained
calculate_metrics(model, test_x, test_y)

Accuracy: 92.035865
Precision: 92.274678
Recall: 99.594673
F1 score: 95.795043


In [83]:
# 300 hidden units, 2000 epochs; batch_size2=0 leaves the batch size to Keras
model = build_fit_model(train_x, train_y, n=300, num_epochs=2000, batch_size2=0)
# Test-set scores for the model just trained
calculate_metrics(model, test_x, test_y)

Accuracy: 91.455696
Precision: 91.644492
Recall: 99.710481
F1 score: 95.507488


In [85]:
# Test-set probabilities, taken straight down to a 1-D vector (column 0)
yhat_probs = model.predict(test_x, verbose=0)[:, 0]

# Hard labels: strictly above 0.5 -> 1, otherwise 0
yhat_classes = (yhat_probs > 0.5).astype("int32")

In [86]:
# Display names handed to classification_report.
# NOTE(review): presumably label 0 = 'no-plagiarism' and label 1 = 'plagiarism'
# (names are matched to the sorted labels) — confirm against the data source.
target_names = ['no-plagiarism', 'plagiarism']

In [88]:
# Per-class precision / recall / F1 for the current test-set predictions.
# classification_report is imported in the notebook's import cell rather than
# here, so every cell that uses it works after Restart & Run All.
print(classification_report(test_y, yhat_classes, target_names=target_names))

               precision    recall  f1-score   support

no-plagiarism       0.71      0.07      0.13       169
   plagiarism       0.92      1.00      0.96      1727

     accuracy                           0.91      1896
    macro avg       0.81      0.53      0.54      1896
 weighted avg       0.90      0.91      0.88      1896



In [92]:
# 300 hidden units, 500 epochs, mini-batches of 50
model = build_fit_model(train_x, train_y, n=300, num_epochs=500, batch_size2=50)
# Test-set scores for the model just trained
calculate_metrics(model, test_x, test_y)

Accuracy: 91.666667
Precision: 91.795418
Recall: 99.768384
F1 score: 95.615982


In [93]:
# Test-set probabilities as a 1-D vector, then hard labels at the 0.5 cut-off
yhat_probs = model.predict(test_x, verbose=0)[:, 0]
yhat_classes = (yhat_probs > 0.5).astype("int32")

# Per-class breakdown of the same predictions
print(classification_report(test_y, yhat_classes, target_names=target_names))

               precision    recall  f1-score   support

no-plagiarism       0.79      0.09      0.16       169
   plagiarism       0.92      1.00      0.96      1727

     accuracy                           0.92      1896
    macro avg       0.85      0.54      0.56      1896
 weighted avg       0.91      0.92      0.89      1896



In [94]:
# 300 hidden units, 1000 epochs, mini-batches of 100
model = build_fit_model(
    train_x,
    train_y,
    n=300,
    num_epochs=1000,
    batch_size2=100,
)
calculate_metrics(model, test_x, test_y)
print()

# Test-set probabilities as a 1-D vector, then hard labels at the 0.5 cut-off
yhat_probs = model.predict(test_x, verbose=0)[:, 0]
yhat_classes = (yhat_probs > 0.5).astype("int32")

# Per-class breakdown of the same predictions
print(classification_report(test_y, yhat_classes, target_names=target_names))

Accuracy: 91.877637
Precision: 92.036344
Recall: 99.710481
F1 score: 95.719844

               precision    recall  f1-score   support

no-plagiarism       0.80      0.12      0.21       169
   plagiarism       0.92      1.00      0.96      1727

     accuracy                           0.92      1896
    macro avg       0.86      0.56      0.58      1896
 weighted avg       0.91      0.92      0.89      1896



In [96]:
# Headline scores for the current model on the test set
calculate_metrics(model, test_x, test_y)
print()

# Test-set probabilities as a 1-D vector, then hard labels at the 0.5 cut-off
yhat_probs = model.predict(test_x, verbose=0)[:, 0]
yhat_classes = (yhat_probs > 0.5).astype("int32")

# Per-class breakdown of the same predictions
print(classification_report(test_y, yhat_classes, target_names=target_names))

Accuracy: 91.877637
Precision: 92.036344
Recall: 99.710481
F1 score: 95.719844

               precision    recall  f1-score   support

no-plagiarism       0.80      0.12      0.21       169
   plagiarism       0.92      1.00      0.96      1727

     accuracy                           0.92      1896
    macro avg       0.86      0.56      0.58      1896
 weighted avg       0.91      0.92      0.89      1896



In [107]:
# Quick check of the label container type before building the results frame
type(test_y)

pandas.core.series.Series

In [104]:
# Predictions wrapped in a one-column DataFrame.
# NOTE(review): df_1 is not referenced again in the visible cells — presumably
# a leftover; confirm before removing.
df_1 = pd.DataFrame(yhat_classes)

In [110]:
# Start the results table from a copy so test_x itself never gains the extra columns
df_final = test_x.copy()

In [112]:
# True labels; test_x and test_y are both slices of df_test, so their row indexes match
df_final['y_actual'] = test_y

In [113]:
# Assign the raw array so values are placed by position. Wrapping it in a new
# pd.Series would instead align on index labels and silently produce NaN for
# any row whose label is missing; a plain array raises on a length mismatch
# (e.g. if yhat_classes still holds train-set predictions).
df_final['y_predicted'] = yhat_classes

In [114]:
# Peek at the first two rows: features, true label, predicted label
df_final.head(2)

Unnamed: 0,1,2,3,4,y_actual,y_predicted
0,0.53562,0.119048,0.096515,0.229551,1,1
1,0.273102,0.022426,0.001835,0.054437,1,1


In [117]:
# Saved next to the input CSVs via dataset_dir (same path as before while
# dataset_dir is 'data2/'). No header row and no index column are written,
# so the y_actual / y_predicted column names are not stored in the file.
df_final.to_csv(os.path.join(dataset_dir, 'predicted_values.csv'), header=False, index=False)

In [115]:
# Second read of the same test CSV that df_test was loaded from.
# NOTE(review): df_test_2 is not used in the visible cells — presumably a
# leftover; confirm before removing.
df_test_2 = pd.read_csv(os.path.join(dataset_dir, testing_file), header=None, names=None)

In [95]:
# Same evaluation as for the test set, but on the data the model was fitted on
print('Train dataset\n')
calculate_metrics(model, train_x, train_y)
print()

# Train-set probabilities as a 1-D vector, then hard labels at the 0.5 cut-off
yhat_probs = model.predict(train_x, verbose=0)[:, 0]
yhat_classes = (yhat_probs > 0.5).astype("int32")

# Per-class breakdown of the same predictions
print(classification_report(train_y, yhat_classes, target_names=target_names))

Train dataset

Accuracy: 99.841605
Precision: 99.941995
Recall: 99.884058
F1 score: 99.913018

               precision    recall  f1-score   support

no-plagiarism       0.99      0.99      0.99       169
   plagiarism       1.00      1.00      1.00      1725

     accuracy                           1.00      1894
    macro avg       0.99      1.00      1.00      1894
 weighted avg       1.00      1.00      1.00      1894

