# Plagiarism Detection Model

## Steps
* Upload data to S3. 
* Define a binary classification model and a training script.
* Train your model and deploy it.
* Evaluate your deployed classifier and answer some questions about your approach.

In [1]:
import pandas as pd
import boto3
import sagemaker

## Load Data to S3

In [2]:
# Create the SageMaker session and look up the execution role; both are
# reused by later cells (uploading data and building the estimator).
sagemaker_session = sagemaker.Session()
role = sagemaker.get_execution_role()

# Default S3 bucket for this session (created if needed, per the original note).
# `bucket` holds the bucket name used for uploads and the training output path.
bucket = sagemaker_session.default_bucket()

## Upload training data to S3

In [5]:
# local directory that holds the generated feature files
data_dir = 'plagiarism_data'

# key prefix under which everything for this project is stored in S3
prefix = 'plagiarism_project'

# push the contents of data_dir to S3; the returned S3 location is kept in
# input_data so it can be handed to the training job later
input_data = sagemaker_session.upload_data(
    path=data_dir,
    bucket=bucket,
    key_prefix=prefix,
)

### Test cell

In [6]:
# Confirm that the uploaded data is in S3 under this project's prefix.
# Only keys starting with `prefix` are listed: the default bucket can hold
# objects from other work, so listing the whole bucket would let this check
# pass even when the upload above did not put anything there.
empty_check = []
for obj in boto3.resource('s3').Bucket(bucket).objects.filter(Prefix=prefix):
    empty_check.append(obj.key)
    print(obj.key)

assert len(empty_check) != 0, 'No objects found under the S3 prefix.'
print('Test passed!')

plagiarism_project/test.csv
plagiarism_project/train.csv
Test passed!


# SKLearn Model Creation

In [1]:
# Print the training script with syntax highlighting (pygmentize is the Pygments CLI).
!pygmentize source_sklearn/train.py

[34mfrom[39;49;00m [04m[36m__future__[39;49;00m [34mimport[39;49;00m print_function

[34mimport[39;49;00m [04m[36margparse[39;49;00m
[34mimport[39;49;00m [04m[36mos[39;49;00m
[34mimport[39;49;00m [04m[36mpandas[39;49;00m [34mas[39;49;00m [04m[36mpd[39;49;00m

[34mfrom[39;49;00m [04m[36msklearn[39;49;00m[04m[36m.[39;49;00m[04m[36mexternals[39;49;00m [34mimport[39;49;00m joblib

[34mfrom[39;49;00m [04m[36msklearn[39;49;00m[04m[36m.[39;49;00m[04m[36msvm[39;49;00m [34mimport[39;49;00m SVC

[37m# Provided model load function[39;49;00m
[34mdef[39;49;00m [32mmodel_fn[39;49;00m(model_dir):
    [33m"""Load model from the model_dir. This is the same model that is saved[39;49;00m
[33m    in the main if statement.[39;49;00m
[33m    """[39;49;00m
    [36mprint[39;49;00m([33m"[39;49;00m[33mLoading model.[39;49;00m[33m"[39;49;00m)
    
    [37m# load using joblib[39;49;00m
    model = joblib.load(os.path.join(model_dir, [33m"

# Create the Estimator

In [45]:
from sagemaker.sklearn.estimator import SKLearn

# S3 location where the training job writes its model artifacts
output_path = 's3://{}/{}'.format(bucket, prefix)

# Scikit-learn estimator that runs source_sklearn/train.py on one ml.m4.xlarge instance.
# NOTE(review): newer SageMaker SDK releases renamed the train_instance_* keywords —
# confirm against the installed SDK version before changing them.
estimator = SKLearn(
    entry_point='train.py',
    source_dir='source_sklearn',
    role=role,
    train_instance_count=1,
    train_instance_type='ml.m4.xlarge',
    sagemaker_session=sagemaker_session,
    output_path=output_path,
)

## EXERCISE: Train the estimator

In [46]:
%%time
estimator.fit({'train': input_data})

2020-07-20 00:06:29 Starting - Starting the training job...
2020-07-20 00:06:31 Starting - Launching requested ML instances......
2020-07-20 00:07:45 Starting - Preparing the instances for training......
2020-07-20 00:08:57 Downloading - Downloading input data
2020-07-20 00:08:57 Training - Downloading the training image...
2020-07-20 00:09:17 Uploading - Uploading generated training model[34m2020-07-20 00:09:12,605 sagemaker-containers INFO     Imported framework sagemaker_sklearn_container.training[0m
[34m2020-07-20 00:09:12,608 sagemaker-containers INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2020-07-20 00:09:12,618 sagemaker_sklearn_container.training INFO     Invoking user training script.[0m
[34m2020-07-20 00:09:12,913 sagemaker-containers INFO     Module train does not provide a setup.py. [0m
[34mGenerating setup.py[0m
[34m2020-07-20 00:09:12,914 sagemaker-containers INFO     Generating setup.cfg[0m
[34m2020-07-20 00:09:12,914 sagemaker-containers 

## EXERCISE: Deploy the trained model

In [47]:
%%time
from sagemaker.pytorch import PyTorchModel

# deploy model to create the predictor
predictor = estimator.deploy(instance_type='ml.m4.xlarge',
                                     initial_instance_count=1)

---------------!CPU times: user 256 ms, sys: 21.6 ms, total: 278 ms
Wall time: 7min 32s


# Evaluating The Model

In [48]:
import os

# load the locally stored test set; the CSV is read without a header row
test_csv_path = os.path.join(data_dir, "test.csv")
test_data = pd.read_csv(test_csv_path, header=None, names=None)

# column 0 carries the class labels, every remaining column is a feature
test_y = test_data.iloc[:, 0]
test_x = test_data.iloc[:, 1:]

## Determining the accuracy of the model

In [49]:
# Step 1: send the test features to the deployed endpoint to get class labels
test_y_preds = predictor.predict(test_x)

# sanity check: there must be exactly one prediction per test label
n_predictions = len(test_y_preds)
n_labels = len(test_y)
assert n_predictions == n_labels, 'Unexpected number of predictions.'
print('Test passed!')

Test passed!


In [52]:
# Step 2: score the predictions against the true test labels
from sklearn.metrics import accuracy_score, classification_report

true_labels = test_y.values

accuracy = accuracy_score(test_y, test_y_preds)
print('accuracy:', accuracy)

# per-class precision / recall / f1 breakdown
report = classification_report(true_labels, test_y_preds)
print(report)

# show predicted and true labels one above the other for a quick visual comparison
print('\nPredicted class labels: ')
print(test_y_preds)
print('\nTrue class labels: ')
print(true_labels)

accuracy: 0.96
              precision    recall  f1-score   support

           0       0.91      1.00      0.95        10
           1       1.00      0.93      0.97        15

   micro avg       0.96      0.96      0.96        25
   macro avg       0.95      0.97      0.96        25
weighted avg       0.96      0.96      0.96        25


Predicted class labels: 
[1 1 1 1 1 1 0 0 0 0 0 0 0 1 1 1 1 1 0 1 0 1 1 0 0]

True class labels: 
[1 1 1 1 1 1 0 0 0 0 0 0 1 1 1 1 1 1 0 1 0 1 1 0 0]


## Clean up Resources

In [53]:
# Delete the endpoint created by estimator.deploy() so it does not keep
# running (and accruing cost) after the evaluation is finished.
predictor.delete_endpoint()

### Deleting S3 bucket

In [54]:
# Optional and destructive: uncomment to delete EVERY object in `bucket`.
# NOTE(review): `bucket` is the session's default bucket, which may also hold
# data from other projects — confirm before running.
# bucket_to_delete = boto3.resource('s3').Bucket(bucket)
# bucket_to_delete.objects.all().delete()

## NLP+Classification for Plagiarism Detection Successful!

I hope you enjoyed this project as much as I did!!

Thanks for viewing :)

You can always message me on GitHub for more information on the process I used!