In [None]:
import os
import pandas as pd
import time
import matplotlib.pyplot as plt
from tqdm import tqdm
from functools import lru_cache
import json
import logging

# Switch to Google's generative AI library
from google.generativeai import models
from google.api_core.exceptions import ResourceExhausted

# Set up logging: DEBUG level, with each record formatted as
# "<timestamp> - <level name> - <message>".
logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s')


In [None]:
# Load all abstracts to be screened from a CSV file in the working directory.
# process_all_articles later reads the 'Title', 'Published Year' and
# 'Abstract' columns from this frame.
df = pd.read_csv('all_abstracts_screen.csv')

In [None]:
# Show the loaded DataFrame as the cell output for a quick visual check.
df

In [None]:
# Function to get the API key (adapt as needed for Gemini)
def get_api_key(secrets_dir=None):
    """Read the Gemini API key from ``gemini_secrets.txt``.

    Args:
        secrets_dir: Directory that contains ``gemini_secrets.txt``. When
            omitted, the original hardcoded secrets directory is used, so
            existing zero-argument calls keep working.

    Returns:
        The file contents with surrounding whitespace stripped, or ``None``
        when the file is missing or cannot be read (a message is printed in
        either case).
    """
    if secrets_dir is None:
        # Default location kept for backward compatibility; pass
        # ``secrets_dir`` explicitly to run the notebook on another machine.
        secrets_dir = '/Users/chrisjanssen/Insync/cnjanssen@tamu.edu/Google Drive/COM/Research/spaceflightreview-secrets'

    # Construct the full path to the key file
    secret_file = os.path.join(secrets_dir, 'gemini_secrets.txt')

    # Read the API key from the file
    try:
        with open(secret_file, 'r', encoding='utf-8') as file:
            return file.read().strip()
    except FileNotFoundError:
        # The messages name the file that is actually read.
        print(f"Error: gemini_secrets.txt file not found in the secrets directory: {secrets_dir}")
        return None
    except OSError:
        # IOError is an alias of OSError; covers permission and similar errors.
        print(f"Error: Unable to read the gemini_secrets.txt file: {secret_file}")
        return None

# Get the Gemini API key
gemini_api_key = get_api_key()

if gemini_api_key:
    print("API key successfully loaded.")
    # Export the key through the GOOGLE_API_KEY environment variable.
    # The key itself is deliberately NOT printed: cell outputs are saved with
    # the notebook and would leak the secret to anyone who can read the file.
    os.environ["GOOGLE_API_KEY"] = gemini_api_key
else:
    print("Failed to load API key.")



In [None]:
# Initial token count and limit (adjust based on Gemini's limits)
# These three module-level names are read and updated by process_article,
# which resets the running count once 60 seconds have passed.
running_token_count = 0
TOKEN_LIMIT = 1000000  # Adjust as needed for Gemini Flash
last_reset_time = time.time()
import google.generativeai as genai

# Configure the credentials first, then build the model that relies on them.
genai.configure(api_key=gemini_api_key)

#create model
model = genai.GenerativeModel('gemini-1.5-flash')

# One-off check with a trivial prompt; the next cell prints the reply.
response = model.generate_content("The opposite of hot is")


In [None]:
# Print the text of the reply to the test prompt sent in the previous cell.
print(response.text)

In [None]:
import os
import pandas as pd
import time
import matplotlib.pyplot as plt
from tqdm import tqdm
from functools import lru_cache
import json
import logging

# Switch to Google's generative AI library
from google.generativeai import models
from google.api_core.exceptions import ResourceExhausted

# Set up logging
logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s')

# Set up the API key for Google Gemini
# setdefault keeps a key that is already present in the environment (for
# example one exported by an earlier cell) and only falls back to the
# placeholder when nothing is set. A plain assignment here would overwrite a
# real key with the placeholder.
os.environ.setdefault('GOOGLE_API_KEY', 'YOUR_GEMINI_API_KEY')  # Replace with your actual key

# Cache the prompt
# lru_cache(maxsize=1) on a zero-argument function: the body runs on the first
# call and the cached string is returned on every later call.
# NOTE(review): the template text contains the typo "flgiht". It is left
# untouched here because the string is sent to the model at runtime.
@lru_cache(maxsize=1)
def get_cached_prompt():
    """Return the screening prompt template sent to the model.

    The template lists the inclusion and exclusion criteria for the scoping
    review and contains the placeholders ``{{Title}}``, ``{{Published_Year}}``
    and ``{{Abstract}}``, which process_article fills in with ``str.replace``.
    It asks for a ``<justification>`` block followed by a ``<decision>`` tag
    holding INCLUDE, EXCLUDE or MAYBE.

    Returns:
        str: The prompt template with its placeholders still unfilled.
    """
    return """You are assisting in screening articles for a scoping review on medical screening for Commercial Spaceflight (CSF). Your task is to determine whether to include, exclude, or mark as "maybe" each article based on the provided criteria. 
Here are the key inclusion criteria:
- Timeframe: 2000-2023
- Setting: Spaceflight experience/profile and/or analogues/simulation (e.g., suborbital, orbital, lunar, planetary, trans-atmospheric, parabolic, centrifuge, bed rest, dry immersion, head-down tilt)
- Human medical conditions related to CSF
- Language: English in full text
- Sample: Adult human participants (18 years and older) but who are commercial spaceflight participants or passengers who DO NOT meet career professional astronaut medical standards and training and who DOES NOT HAVE primary duties with operation or safety of flgiht of aircraft
- Peer-reviewed articles relevant to Commercial Spaceflight
- NASA RIDGE hazards of space flight such as 
--Radiation
--Isolation and confinement
--Distance from Earth
--Gravity (differing gravitational fields)
--Environment (hostile/closed)

Key exclusion criteria:
- Animal, in-silico, or in-vitro studies
- Pediatrics or age less than 18 years
- Full text not available in English
- Articles limited to professional astronauts or equivalent
- Gray literature, dissertations, theses, technical reports, proprietary information
- Editorials, magazine articles, or web-based/digital media
- Textbooks

Please review the following article information:

Title: <article_title>{{Title}}</article_title>

Year: <article_year>{{Published_Year}}</article_year>

Abstract: <article_abstract>{{Abstract}}</article_abstract>

Based on this information, determine whether the article should be included, excluded, or marked as "maybe" for the scoping review. Consider the inclusion and exclusion criteria carefully.

First, provide a brief justification for your decision, explaining how the article meets or fails to meet the criteria. Then, state your decision.

Format your response as follows:
<justification>
[Your justification here]
</justification>

<decision>[INCLUDE/EXCLUDE/MAYBE]</decision>"""



In [None]:
# Rough token estimate based on a whitespace word count
def estimate_tokens(text):
    """Approximate the token count of ``text`` by counting its words.

    The argument is converted with ``str`` and split on whitespace; the
    number of resulting pieces is returned.
    """
    words = str(text).split()
    return len(words)

def _is_error_result(result):
    """Return True for the ("Error: ...", 0) tuple process_article returns on failure."""
    return (
        isinstance(result, tuple)
        and len(result) == 2
        and result[1] == 0
        and isinstance(result[0], str)
        and result[0].startswith("Error:")
    )


def process_article_with_retry(title, year, abstract, max_retries=3):
    """Call process_article, retrying failed attempts with exponential backoff.

    process_article catches API errors itself and reports them as an
    ("Error: ...", 0) tuple instead of raising, so an attempt counts as failed
    both when it raises and when it returns that error tuple. Without the
    second check the retry loop would never retry.

    Args:
        title: Article title, passed through to process_article.
        year: Publication year, passed through to process_article.
        abstract: Article abstract, passed through to process_article.
        max_retries: Maximum number of attempts.

    Returns:
        tuple: The (text, tokens_used) result of the first successful attempt.
        When every attempt fails, the outcome of the last attempt is returned:
        the ("Error: ...", 0) tuple from process_article, or ("Error", 0) if
        it raised. ("Error", 0) is also returned when max_retries <= 0, so the
        caller can always unpack a 2-tuple.
    """
    last_result = ("Error", 0)
    for attempt in range(max_retries):
        try:
            logging.debug(f"Attempting to process article (Attempt {attempt + 1})")
            result = process_article(title, year, abstract)
            if not _is_error_result(result):
                return result
            # Swallowed failure: remember it and fall through to the retry path.
            last_result = result
            logging.error(f"Attempt {attempt + 1} failed: {result[0]}")
        except Exception as e:
            last_result = ("Error", 0)
            logging.error(f"Attempt {attempt + 1} failed: {e}")
        if attempt == max_retries - 1:
            logging.warning("Max retries reached. Skipping article.")
            break
        time.sleep(2 ** attempt)  # Exponential backoff
    return last_result

In [None]:
def process_article(title, year, abstract):
    """Screen one article with the Gemini model.

    Fills the prompt template with the article fields, applies the
    module-level per-minute token budget (running_token_count, TOKEN_LIMIT,
    last_reset_time), then calls the model.

    Args:
        title: Article title; missing values become an empty string.
        year: Publication year; missing values become an empty string and
            whole-number floats such as 2020.0 are rendered as "2020".
        abstract: Article abstract; missing values become an empty string.

    Returns:
        tuple: (response_text, tokens_used) on success, where tokens_used is
        the word-count estimate of the prompt. On any exception from the API
        call the tuple is ("Error: <message>", 0); the exception is logged,
        not raised.
    """
    global running_token_count, last_reset_time

    # Handle potential None values
    title = str(title) if pd.notna(title) else ""
    abstract = str(abstract) if pd.notna(abstract) else ""
    if pd.notna(year):
        # A year column that contains missing values is read as float, so
        # 2020 arrives as 2020.0; drop the spurious ".0" before prompting.
        if isinstance(year, float) and year.is_integer():
            year = str(int(year))
        else:
            year = str(year)
    else:
        year = ""

    prompt = get_cached_prompt().replace("{{Title}}", title).replace("{{Published_Year}}", year).replace("{{Abstract}}", abstract)

    tokens_used = estimate_tokens(prompt)
    current_time = time.time()

    # Check if a minute has passed since the last reset
    if current_time - last_reset_time >= 60:
        logging.debug("Resetting token count")
        running_token_count = 0
        last_reset_time = current_time

    # Check if adding these tokens would exceed the limit
    if running_token_count + tokens_used > TOKEN_LIMIT:
        # Wait out the remainder of the current 60-second window.
        sleep_time = 60 - (current_time - last_reset_time)
        if sleep_time > 0:
            logging.debug(f"Sleeping for {sleep_time} seconds to avoid rate limit")
            time.sleep(sleep_time)

        # Reset token count after the sleep
        running_token_count = 0
        last_reset_time = time.time()

    running_token_count += tokens_used

    # Call the Gemini API
    try:
        response = model.generate_content(
            prompt,
            generation_config=genai.GenerationConfig(
                temperature=0,
                max_output_tokens=3000
            )
        )
        print(response.text)
        return response.text, tokens_used
    except Exception as e:
        # Failures are reported through the return value; callers that want
        # to retry must inspect it.
        logging.error(f"Error processing article: {e}")
        return f"Error: {str(e)}", 0

In [None]:
# Process all articles in the dataframe
def process_all_articles(df):
    """Screen every row of ``df`` and attach the results as new columns.

    Each row's 'Title', 'Published Year' and 'Abstract' are passed to
    process_article_with_retry. Every 50 processed rows the results so far are
    written out through save_intermediate_results. There is a one-second pause
    after each row.

    Args:
        df: DataFrame with 'Title', 'Published Year' and 'Abstract' columns.

    Returns:
        The same DataFrame object, modified in place with two added columns:
        'AI_Decision' (response text, or "Error") and 'Tokens_Used'.
    """
    results = []
    tokens_used_list = []
    total = len(df)
    progress_bar = tqdm(total=total, desc="Processing Articles")

    try:
        # enumerate supplies the 0-based row position. The index label that
        # iterrows yields is only used in log messages, because it is not
        # guaranteed to be an integer running from 0 to n-1.
        for position, (label, row) in enumerate(df.iterrows()):
            processed = position + 1
            try:
                logging.info(f"Processing article {processed}/{total}")
                result, tokens_used = process_article_with_retry(row['Title'], row['Published Year'], row['Abstract'])
                results.append(str(result))
                tokens_used_list.append(tokens_used)
                logging.info(f"Successfully processed article {processed}")
            except Exception as e:
                logging.error(f"Error processing row {label}: {e}")
                results.append("Error")
                tokens_used_list.append(0)

            progress_bar.update(1)

            if processed % 50 == 0:
                logging.info(f"Checkpoint: Processed {processed} articles")
                logging.debug(f"Current results length: {len(results)}")
                logging.debug(f"Current tokens_used_list length: {len(tokens_used_list)}")
                save_intermediate_results(df.iloc[:processed], results, tokens_used_list, processed)

            time.sleep(1)
    finally:
        # Close the bar even when a checkpoint save or an interrupt aborts the loop.
        progress_bar.close()

    # Add results to the DataFrame
    df['AI_Decision'] = results
    df['Tokens_Used'] = tokens_used_list

    return df

def save_intermediate_results(df_slice, results, tokens_used_list, num_processed):
    """Write a checkpoint of the results gathered so far to a parquet file.

    A copy of ``df_slice`` gets 'AI_Decision' and 'Tokens_Used' columns and is
    saved as ``pt2_intermediate_results_<num_processed>.parquet``.

    Args:
        df_slice: The rows processed so far.
        results: Decision strings, expected to be one per row of ``df_slice``.
        tokens_used_list: Token counts, expected to be one per row.
        num_processed: Number of processed articles; used in the file name.

    The caller's lists are never modified. If a list is shorter than the
    frame, a padded copy is used ("Error" for results, 0 for token counts);
    if it is longer, the extra entries are ignored.
    """
    temp_df = df_slice.copy()
    n_rows = len(temp_df)
    if len(results) != n_rows:
        logging.warning(f"Mismatch in lengths. DataFrame: {n_rows}, Results: {len(results)}")
    # Build padded/truncated copies. An in-place "+=" would append to the
    # caller's lists and misalign every result added afterwards. A negative
    # repeat count yields an empty list, so over-long inputs are only sliced.
    decisions = (list(results) + ["Error"] * (n_rows - len(results)))[:n_rows]
    token_counts = (list(tokens_used_list) + [0] * (n_rows - len(tokens_used_list)))[:n_rows]
    temp_df['AI_Decision'] = decisions
    temp_df['Tokens_Used'] = token_counts
    temp_df.to_parquet(f'pt2_intermediate_results_{num_processed}.parquet')
    logging.info(f"Saved pt2 intermediate results for {num_processed} articles")



# Function to plot progress
def plot_progress(df):
    """Show two figures summarising a processed DataFrame.

    The first is a line plot of the running total of the 'Tokens_Used'
    column; the second is a bar chart of how often each value occurs in the
    'AI_Decision' column.
    """
    # Figure 1: cumulative token usage over the rows.
    _, tokens_ax = plt.subplots(figsize=(12, 6))
    cumulative_tokens = df['Tokens_Used'].cumsum()
    tokens_ax.plot(cumulative_tokens)
    tokens_ax.set_title('Cumulative Tokens Used')
    tokens_ax.set_xlabel('Article Index')
    tokens_ax.set_ylabel('Total Tokens')
    tokens_ax.grid(True)
    plt.show()

    # Figure 2: count of each distinct decision value.
    _, decisions_ax = plt.subplots(figsize=(12, 6))
    decision_counts = df['AI_Decision'].value_counts()
    decision_counts.plot(kind='bar', ax=decisions_ax)
    decisions_ax.set_title('Distribution of AI Decisions')
    decisions_ax.set_xlabel('Decision')
    decisions_ax.set_ylabel('Count')
    plt.show()

In [None]:
# Screen every article (process_all_articles adds the 'AI_Decision' and
# 'Tokens_Used' columns to df), then plot cumulative token usage and the
# distribution of decisions.
df = process_all_articles(df)
plot_progress(df)