<a href="https://colab.research.google.com/github/danielhcg/AntiSmishGPT/blob/main/startOverGemini.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import google.generativeai as genai
import os
from google.colab import userdata
import pandas as pd
import time

In [2]:
# Fetch the Gemini API key via google.colab's userdata (stored under the name
# 'JOE-GEMINI-KEY') instead of hardcoding it in the notebook.
api_key = userdata.get('JOE-GEMINI-KEY')
genai.configure(api_key=api_key)

# Module-level model handle; detect() calls model.generate_content() on it.
model = genai.GenerativeModel('models/gemini-1.5-pro')

In [3]:
# Mount Google Drive at /content/drive; the input CSV path used further down
# lives under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
import pandas as pd
import time

# Function to process a batch of messages
def detect(messages, max_attempts=5, retry_delay=60):
    """Classify a batch of text messages as 'real', 'fake' or 'undetermined'.

    One prompt is built per message and the whole list is sent in a single
    ``model.generate_content`` call (``model`` is the module-level Gemini
    model). The reply text is split into lines and only lines that are exactly
    one of the three valid labels (case-insensitive) are kept. If the number
    of kept labels does not match the number of messages, the request is
    repeated after a pause.

    Args:
        messages: List of message texts to classify.
        max_attempts: Maximum number of API calls to make for this batch
            before giving up. ``None`` retries without limit (the previous
            behavior).
        retry_delay: Seconds to sleep between attempts.

    Returns:
        A list of lowercase labels, one per message, in reply order.

    Raises:
        RuntimeError: If no attempt produced exactly ``len(messages)`` labels
            within ``max_attempts`` calls.
    """
    # Nothing to classify: avoid spending an API request on an empty batch.
    if not messages:
        return []

    prompts = [f"Is this text message real or fake? '{message}' Respond with one word: real, fake, or undetermined, without any explanations." for message in messages]
    valid_responses = ('real', 'fake', 'undetermined')

    attempt = 0
    while True:
        attempt += 1
        response = model.generate_content(prompts)

        # Extract text from the first candidate and split it into lines
        response_text = response.candidates[0].content.parts[0].text.strip()
        labels = (line.strip().lower() for line in response_text.split('\n'))

        # Keep only lines that are exactly one of the expected labels
        results = [label for label in labels if label in valid_responses]

        # Accept the reply only when there is one label per message
        if len(results) == len(messages):
            return results

        # Bound the retries: every retry is another API request, so an
        # unbounded loop can hang forever and exhaust the request quota.
        if max_attempts is not None and attempt >= max_attempts:
            raise RuntimeError(
                f"Expected {len(messages)} results but got {len(results)} "
                f"after {attempt} attempts."
            )

        print(f"Retrying: Expected {len(messages)} results but got {len(results)}. Retrying...")
        time.sleep(retry_delay)  # Pause before retrying

# Function to process the CSV file
def process_csv(input_file):
    """Classify every message in a headerless CSV and save the labelled copy.

    The first column (column 0) is treated as the message text. Messages are
    sent to ``detect`` in batches of 20, with simple client-side throttling:
    at most 50 batches per run, a 60-second pause after every 2 batches, and
    a 60-second pause whenever the running whitespace-token count would
    exceed 32000. Rows that were never processed (daily limit reached) get an
    empty label; batches whose detection raised get the label 'error'.

    The resulting frame (original columns plus 'tokens' and 'authenticity')
    is written without index or header to ``processed_<original name>`` in
    the same directory as ``input_file``.

    Args:
        input_file: Path to the input CSV file.
    """
    # Load the data into a pandas DataFrame without headers
    df = pd.read_csv(input_file, header=None)

    # One label per row, appended batch by batch
    authenticity_results = []

    def count_tokens(text):
        # Whitespace word count used as a rough token estimate. str() keeps
        # non-string cells (e.g. NaN from an empty field) from raising.
        return len(str(text).split())

    # Calculate tokens for each message
    df['tokens'] = df[0].apply(count_tokens)

    # Batch size and rate limits
    batch_size = 20
    max_requests_per_day = 50
    max_tokens_per_minute = 32000
    max_requests_per_minute = 2

    requests_made = 0
    total_tokens_processed = 0

    for i in range(0, len(df), batch_size):
        if requests_made >= max_requests_per_day:
            print("Daily limit reached. Stopping execution.")
            break

        batch = df[0].iloc[i:i + batch_size].tolist()
        batch_tokens = df['tokens'].iloc[i:i + batch_size].sum()

        if total_tokens_processed + batch_tokens > max_tokens_per_minute:
            print("Token limit per minute reached. Waiting for the next minute.")
            time.sleep(60)  # Wait for the next minute
            total_tokens_processed = 0  # Reset the token count for the new minute

        try:
            batch_results = detect(batch)
        except Exception as e:
            print(f"Error during detection: {e}")
            # Pad with exactly one 'error' per message in this batch; the last
            # batch may be shorter than batch_size, and over-padding would make
            # the results list longer than the DataFrame.
            batch_results = ['error'] * len(batch)

        authenticity_results.extend(batch_results)
        total_tokens_processed += batch_tokens

        # Print status after processing each batch (upper bound reflects the
        # real batch length, not the nominal batch size)
        print(f"Processed inputs {i + 1} to {i + len(batch)}, Results = {batch_results}")

        # Rate limiting: ensure only 2 requests per minute
        requests_made += 1
        if requests_made % max_requests_per_minute == 0:
            time.sleep(60)
            # A full minute has elapsed, so the per-minute token window
            # starts over as well.
            total_tokens_processed = 0

    # Ensure the authenticity results list is the same length as the DataFrame
    # (rows skipped because of the daily limit get an empty label)
    if len(authenticity_results) < len(df):
        authenticity_results.extend([''] * (len(df) - len(authenticity_results)))

    # Assign results to new column
    df['authenticity'] = authenticity_results

    # Build the output path next to the input file
    input_directory = os.path.dirname(input_file)
    output_file = os.path.join(input_directory, "processed_" + os.path.basename(input_file))

    # Save the updated results back to the same directory
    df.to_csv(output_file, index=False, header=False)

    print("Processing complete. Results saved to", output_file)

# Example usage: classify the messages in a CSV stored on the mounted Drive.
# process_csv writes its output to the same directory as the input, with
# "processed_" prefixed to the file name.
input_file = '/content/drive/My Drive/anti_smish_gpt/lastPart_smishTankSet.csv'  # Update with your file path
process_csv(input_file)


Processed inputs 1 to 20, Results = ['fake', 'fake', 'fake', 'fake', 'fake', 'fake', 'fake', 'fake', 'fake', 'fake', 'fake', 'fake', 'fake', 'fake', 'fake', 'fake', 'fake', 'fake', 'fake', 'fake']
Processed inputs 21 to 40, Results = ['fake', 'fake', 'fake', 'fake', 'fake', 'undetermined', 'fake', 'fake', 'fake', 'fake', 'fake', 'fake', 'fake', 'fake', 'fake', 'fake', 'fake', 'fake', 'fake', 'fake']
Processed inputs 41 to 60, Results = ['fake', 'fake', 'fake', 'fake', 'fake', 'fake', 'fake', 'undetermined', 'fake', 'fake', 'fake', 'fake', 'fake', 'fake', 'fake', 'fake', 'fake', 'fake', 'undetermined', 'fake']
Processed inputs 61 to 80, Results = ['fake', 'fake']
Processing complete. Results saved to /content/drive/My Drive/anti_smish_gpt/processed_lastPart_smishTankSet.csv
