In [2]:
import pandas as pd
import math
import os
import boto3
import gc
import torch

from binoculars import Binoculars

from tqdm import tqdm
from dotenv import load_dotenv
from io import StringIO

In [3]:
load_dotenv(f"../credentials.env")
s3 = boto3.client(
    's3',
    aws_access_key_id=os.getenv('aws_access_key_id'),
    aws_secret_access_key=os.getenv('aws_secret_access_key')
)

In [4]:
def get_body_from_s3(
    bucket_name: str,
    file_key: str,
):

    obj = s3.get_object(Bucket=bucket_name, Key=file_key)
    body = obj['Body']
    return body

def stream_csv_from_s3(
    body,
    chunk_size: int = 1_000_000,
):
    newline = '\n'.encode()
    partial_chunk = b''

    while True:
        # Combine previous unfinished chunk with current.
        chunk = partial_chunk + body.read(chunk_size)

        # Exit if no more content
        if chunk == b'':
            break
        df = None
        # Find last newline tag
        last_newline = chunk.rfind(newline)
        while df is None:
          try:
            if last_newline == -1:
                partial_chunk = chunk
                continue
            result = chunk[:last_newline + 1].decode('utf-8')
            partial_chunk = chunk[last_newline + 1:]
            # Convert the chunk to a DataFrame
            df = pd.read_csv(StringIO(result))
          except Exception as e:
            last_newline = chunk.rfind(newline, 0, last_newline)
            continue
        yield last_new_linedf

    if partial_chunk:
        result = partial_chunk.decode('utf-8')
        df = pd.read_csv(StringIO(result))
        yield -1, df

def read_csv_from_s3(
    bucket_name: str,
    file_key: str,
):
    obj = s3.get_object(Bucket=bucket_name, Key=file_key)
    body = obj['Body']
    data = body.read().decode('utf-8')
    
    # Use StringIO to convert the string data to a pandas-readable buffer
    data_buffer = StringIO(data)

    # Read the data into a pandas DataFrame
    df = pd.read_csv(data_buffer)
    return df

In [5]:
binoculars = Binoculars()





Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



Loading checkpoint shards:   0%|          | 0/15 [00:00<?, ?it/s]

Some weights of FalconForCausalLM were not initialized from the model checkpoint at vilsonrodrigues/falcon-7b-instruct-sharded and are newly initialized: ['lm_head.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using pad_token, but it is not set yet.


# Evaluate Data

## Evaluation Settings

In [6]:
filenames = [
 # 'dataset1_test.csv', # Completed
 # 'dataset1_train.csv', # Completed
 # 'dataset2_test.csv', # Completed
 # 'dataset2_train.csv', # Completed
 # 'dataset3_test.csv', # Completed
 # 'dataset3_train.csv', # Completed
 # 'dataset4_test.csv', # Completed
 # 'dataset4_train.csv', # Completed
 # 'dataset5_test.csv', # Completed
 # 'dataset5_train.csv', # Completed
 # 'dataset6_test.csv', # Completed
 # 'dataset6_train.csv',
    'machine_generated.csv',
]

bucket_name = 'training-essays'
batch_resume_index = 1401
batch_size = 4

chunk_size = 500_000

In [7]:
torch.cuda.empty_cache()
gc.collect()

24

In [9]:
for file_number, file in enumerate(filenames):
    df = read_csv_from_s3(bucket_name, file)
    df_size = len(df)
    
    num_batches = math.ceil(len(df) / batch_size)
    
    print(f'Running evaluation for {file}.')
    for i, start_idx in tqdm(enumerate(range((batch_resume_index - 1) * batch_size, len(df), batch_size)), total=num_batches - batch_resume_index + 1):
        end_idx = min(start_idx + batch_size, df_size)
        batch_df = df.iloc[start_idx: end_idx]
        
        scores = binoculars.compute_score(batch_df['text'].to_list())

        batch_df = batch_df.copy(deep=True)
        batch_df.loc[:, 'binocular_score'] = scores
        batch_df.to_csv('temp_prediction_scores.csv', index=False, header=(i + batch_resume_index - 1 == 0), mode='w' if i + batch_resume_index - 1 == 0 else 'a')
        
        try:
          name = file.split('.')[0]
          s3.upload_file('temp_prediction_scores.csv', bucket_name, f'evaluations/{name}_predictions.csv')
        except Exception as e:
          print(f"\nError occurred: {e}. \n\nResume run with {file} and 'batch_resume_index' set to {i + batch_resume_index}.")
          raise
        # Clear GPU memory if using PyTorch
        
        torch.cuda.empty_cache()
        del batch_df, scores
        gc.collect()
    del df
    os.remove('temp_prediction_scores.csv')
print('Evaluation Complete!')

Running evaluation for machine_generated.csv.


100%|█████████████████████████████████████████████████████████████████████████████████| 350/350 [10:22<00:00,  1.78s/it]

Evaluation Complete!





In [None]:
read_csv_from_s3(bucket_name, 'evaluations/dataset1_test_predictions.csv')