In [2]:
import boto3
import os
import json
import pandas as pd


from io import StringIO
from flask import Flask, request, jsonify
from dotenv import load_dotenv
from binoculars import Binoculars

In [3]:
# Build the detector once at notebook scope so the /predict handler can reuse
# it on every request. Per the cell output below, construction loads model
# checkpoint shards, so this cell is slow to run.
binoculars = Binoculars()





Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



Loading checkpoint shards:   0%|          | 0/15 [00:00<?, ?it/s]

Some weights of FalconForCausalLM were not initialized from the model checkpoint at vilsonrodrigues/falcon-7b-instruct-sharded and are newly initialized: ['lm_head.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using pad_token, but it is not set yet.


In [4]:
# Read the AWS credentials from ../credentials.env (resolved relative to the
# current working directory) and build the shared S3 client used by the
# request handlers. The path is a plain string: the original f-string prefix
# had no placeholders to interpolate.
load_dotenv("../credentials.env")
s3 = boto3.client(
    's3',
    aws_access_key_id=os.getenv('aws_access_key_id'),
    aws_secret_access_key=os.getenv('aws_secret_access_key'),
)

In [None]:
# Flask application object; the @app.route decorators below register the
# request handlers on it.
app = Flask(__name__)

# Binoculars score cut-off discovered on our datasets: a score strictly below
# this value is reported as potentially machine-generated text (MGT).
MGT_SCORE_THRESHOLD = 0.8676735347069413


@app.route('/predict', methods=['POST','GET'])
def predict():
    """Score the submitted text with Binoculars and label every score.

    Expects a JSON object body with a ``text`` field; its value is passed
    unchanged to ``binoculars.compute_score``.

    Returns:
        200 and ``{'response': [...]}`` with one label per score, each either
        ``'Potential MGT Detected'`` or ``'No MGT Detected'``.
        400 and ``{'error': 'No text provided'}`` when the body is missing,
        is not a JSON object, or carries no non-empty ``text`` value.
    """
    # silent=True yields None instead of raising when the request has no JSON
    # body (e.g. a plain GET), so the missing-input case is handled below.
    payload = request.get_json(silent=True)
    text = payload.get('text') if isinstance(payload, dict) else None
    if not text:
        # Previously this path fell through to the loop with `score_vec`
        # unbound and surfaced as an unhandled 500.
        return jsonify({'error': 'No text provided'}), 400

    scores = binoculars.compute_score(text)
    # NOTE(review): compute_score presumably returns a bare number when given
    # a single string rather than a list -- confirm against the Binoculars
    # implementation. Wrap it so the labelling below works either way.
    if not hasattr(scores, '__iter__'):
        scores = [scores]

    labels = [
        'Potential MGT Detected' if score < MGT_SCORE_THRESHOLD else 'No MGT Detected'
        for score in scores
    ]
    return jsonify({'response': labels})

@app.route('/archive', methods=['PUT'])
def archive():
    """Store one JSON summary per file in the archive bucket.

    Expects a JSON object body with three parallel lists:

    * ``filenames``  -- object key stems; ``.json`` is appended to each.
    * ``text``       -- the text belonging to each file.
    * ``mgt_status`` -- the detection result belonging to each file.

    For every entry of ``filenames`` an object ``{path}.json`` containing
    ``{'text': ..., 'mgt_status': ...}`` is written to the bucket.

    Returns:
        200 and ``{'response': 'Successfully Stored in Archive'}`` on success.
        400 and ``{'error': ...}`` when the body is not a JSON object, one of
        the three fields is not a list, or ``text`` / ``mgt_status`` has fewer
        entries than ``filenames``.
    """
    bucket_name = 'authen-text-archive'

    payload = request.get_json(silent=True)
    if not isinstance(payload, dict):
        return jsonify({'error': 'Request body must be a JSON object'}), 400

    paths = payload.get('filenames')
    mgt_status = payload.get('mgt_status')
    text = payload.get('text')

    # Validate everything before the first upload so a malformed request
    # cannot leave the bucket partially written. Requiring real lists also
    # prevents a single string from being iterated character by character.
    fields = {'filenames': paths, 'mgt_status': mgt_status, 'text': text}
    not_lists = [name for name, value in fields.items() if not isinstance(value, list)]
    if not_lists:
        return jsonify({'error': f"Expected a list for: {', '.join(not_lists)}"}), 400
    if len(text) < len(paths) or len(mgt_status) < len(paths):
        return jsonify({
            'error': "'text' and 'mgt_status' need one entry per filename"
        }), 400

    # zip stops at the shortest input, which is `paths` after the check above.
    for path, file_text, status in zip(paths, text, mgt_status):
        # default=str serialises values json cannot encode natively.
        body = json.dumps({'text': file_text, 'mgt_status': status}, default=str)

        # Upload the JSON string as an S3 object.
        s3.put_object(
            Bucket=bucket_name,
            Key=f'{path}.json',
            Body=body
        )

    return jsonify({'response': 'Successfully Stored in Archive'})
    
@app.route('/view_archive', methods=['GET'])
def view_archive():
    """Return a tabular summary of every object in the archive bucket.

    Each object is read as a JSON document with ``text`` and ``mgt_status``
    fields and turned into one row with these columns:

    * ``Student ID``             -- the part of the key before the first ``/``.
    * ``Document``               -- the second ``__``-separated segment of the
      key up to its first ``.``; the whole stem when the key has no ``__``.
    * ``text``                   -- the first 60 characters of the stored text.
    * ``Potential MGT Detected`` -- ``'Yes'`` if the stored ``mgt_status`` is
      truthy, otherwise ``'No'``.

    Returns:
        200 and ``{'response': <DataFrame.to_dict()>}`` (column -> row index
        -> value); an empty bucket yields the four columns with no rows.
        500 and ``{'error': 'Unable to read archive'}`` if listing or reading
        the bucket fails; the exception is written to the application log.
    """
    bucket_name = 'authen-text-archive'
    columns = ['Student ID', 'Document', 'text', 'mgt_status']

    try:
        rows = []

        # Reuse the module-level client instead of building new ones on every
        # request; the paginator walks the full listing page by page.
        paginator = s3.get_paginator('list_objects_v2')
        for page in paginator.paginate(Bucket=bucket_name):
            # 'Contents' is absent from the page when the bucket is empty.
            for entry in page.get('Contents', []):
                file_key = entry['Key']

                stored = s3.get_object(Bucket=bucket_name, Key=file_key)
                record = json.loads(stored['Body'].read().decode('utf-8'))

                stem = file_key.split('.')[0]
                segments = stem.split('__')

                rows.append({
                    'Student ID': file_key.split('/')[0],
                    # Fall back to the stem rather than raising on keys that
                    # do not follow the `<prefix>__<document>` pattern.
                    'Document': segments[1] if len(segments) > 1 else stem,
                    # Slicing already copes with text shorter than 60 chars.
                    'text': record['text'][:60],
                    # NOTE(review): a non-empty string status is always truthy
                    # here -- confirm clients store a boolean, not a label.
                    'mgt_status': 'Yes' if record['mgt_status'] else 'No',
                })

        # Passing `columns` fixes the column order and keeps the frame valid
        # when `rows` is empty.
        table = pd.DataFrame(rows, columns=columns).rename(
            columns={'mgt_status': 'Potential MGT Detected'}
        )
        return jsonify({'response': table.to_dict()})

    except Exception:
        # Returning nothing from a view is itself an error in Flask, so log
        # the failure and answer with an explicit 500 instead.
        app.logger.exception('Failed to read archive bucket %s', bucket_name)
        return jsonify({'error': 'Unable to read archive'}), 500


if __name__ == '__main__':
    # Start Flask's built-in server. host='0.0.0.0' listens on all addresses
    # (see the server log below), on port 5000. The log shows it serving
    # requests until interrupted with CTRL+C.
    app.run(host='0.0.0.0', port=5000)

 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on all addresses (0.0.0.0)
 * Running on http://127.0.0.1:5000
 * Running on http://172.31.41.150:5000
Press CTRL+C to quit
54.198.173.193 - - [08/Aug/2024 22:50:44] "GET /view_archive HTTP/1.1" 200 -
54.198.173.193 - - [08/Aug/2024 22:51:43] "POST /predict HTTP/1.1" 200 -
54.198.173.193 - - [08/Aug/2024 22:51:49] "PUT /archive HTTP/1.1" 200 -
54.198.173.193 - - [08/Aug/2024 22:51:55] "GET /view_archive HTTP/1.1" 200 -
198.235.24.24 - - [08/Aug/2024 23:28:11] "GET / HTTP/1.0" 404 -
54.198.173.193 - - [08/Aug/2024 23:36:52] "GET /view_archive HTTP/1.1" 200 -
54.198.173.193 - - [08/Aug/2024 23:37:18] "POST /predict HTTP/1.1" 200 -
54.198.173.193 - - [08/Aug/2024 23:37:27] "PUT /archive HTTP/1.1" 200 -
54.198.173.193 - - [08/Aug/2024 23:37:34] "GET /view_archive HTTP/1.1" 200 -
