In [1]:
import re
import time

import google.generativeai as genai
import pandas as pd
from google.colab import userdata
from tqdm import tqdm

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:

# Configure Gemini API
def configure_gemini(api_key):
    """Register ``api_key`` with the Gemini SDK and build a model handle.

    Args:
        api_key: Gemini API key passed straight to ``genai.configure``.

    Returns:
        A ``genai.GenerativeModel`` bound to ``'gemini-1.5-flash'``.
    """
    genai.configure(api_key=api_key)
    # The flash variant is used deliberately: it is the faster model.
    gemini_model = genai.GenerativeModel('gemini-1.5-flash')
    return gemini_model




In [4]:
# Label generation prompt template.
# Filled in with str.format(title=..., abstract=...) by generate_paper_label.
# It asks the model to answer as `label: <text>`; extract_label_from_response
# looks for that "label:" marker when parsing the reply.
PROMPT_TEMPLATE = """
Please analyze the following academic paper and generate a unique, descriptive label that captures its main focus.
The label should be concise (3-5 words max) and suitable for use in a recommender system.

Paper Title: {title}

Abstract: {abstract}

Please provide only the label, nothing else. The label should be in the following format:
"label: [your generated label here]"
"""

In [5]:

def extract_label_from_response(response_text):
    """Extract the label from Gemini's response.

    The model is asked to answer as ``label: <text>``. The marker is matched
    case-insensitively, so ``Label: X`` and ``LABEL: X`` are handled the same
    as ``label: X``. When the marker is present, the text that follows it (up
    to a second marker, if any) is used; otherwise the whole response is used.
    Surrounding whitespace and quote characters are removed, because the
    prompt shows the expected format inside double quotes and the model may
    echo them back.

    Args:
        response_text: Raw text returned by the model.

    Returns:
        The cleaned label string (may be empty if the response was empty).
    """
    # Split on the marker ignoring case; a plain str.split("label:") would
    # miss "Label:" and the [1] lookup would raise IndexError.
    parts = re.split(r"label:", response_text, flags=re.IGNORECASE)
    candidate = parts[1] if len(parts) > 1 else response_text
    return candidate.strip().strip("\"'").strip()


def generate_paper_label(model, title, abstract, max_retries=3, retry_delay=5):
    """Generate a label for a paper using Gemini.

    Missing or blank titles/abstracts are replaced with placeholder text
    before the prompt is built. The model is called up to ``max_retries``
    times; any exception (including an empty label) counts as a failed
    attempt and is logged.

    Args:
        model: Object exposing ``generate_content(prompt)`` whose result has
            a ``.text`` attribute (as returned by ``configure_gemini``).
        title: Paper title; may be NaN/None/blank or a non-string value.
        abstract: Paper abstract; may be NaN/None/blank.
        max_retries: Maximum number of attempts.
        retry_delay: Seconds to sleep between failed attempts.

    Returns:
        The extracted label string, or ``None`` if every attempt failed
        (or ``max_retries`` is not positive).
    """
    # Handle empty titles
    if pd.isna(title) or str(title).strip() == "":
        title = "Untitled Paper"
    # A non-string title (e.g. a numeric CSV cell) cannot be sliced in the
    # log messages below; normalise once so the error handler cannot itself fail.
    title = str(title)

    # Handle empty abstracts
    if pd.isna(abstract) or str(abstract).strip() == "":
        abstract = "No abstract available"

    prompt = PROMPT_TEMPLATE.format(title=title, abstract=abstract)

    for attempt in range(max_retries):
        try:
            response = model.generate_content(prompt)
            label = extract_label_from_response(response.text)
            if not label:
                # An empty label is useless downstream; treat it as a failed attempt.
                raise ValueError("model returned an empty label")
            return label
        except Exception as e:
            print(f"Attempt {attempt + 1} failed for paper '{title[:50]}...': {str(e)}")
            if attempt < max_retries - 1:
                time.sleep(retry_delay)  # Wait before retrying
            else:
                print(f"Failed to generate label for paper '{title[:50]}...' after {max_retries} attempts")

    # All attempts failed, or max_retries <= 0 so none were made.
    return None



In [6]:


def label_papers(dataframe, api_key, start_idx=2500, num_papers=2500, save_interval=100,
                 partial_path='/content/drive/MyDrive/CSCE670 DATASET/labeled_papers_partial_2500_5000.csv',
                 output_path='/content/drive/MyDrive/CSCE670 DATASET/labeled_papers_2500_5000_final.csv'):
    """
    Label a contiguous range of papers with Gemini, checkpointing to CSV.

    If a checkpoint exists at ``partial_path`` it is loaded and used instead
    of ``dataframe``, so already-labeled rows are skipped. Only rows whose
    'label' is missing are sent to the model.

    Args:
        dataframe: Papers to label; needs 'title' and 'abstract' columns.
        api_key: Gemini API key.
        start_idx: Starting row position (2500 for second half).
        num_papers: Number of papers to process from start_idx.
        save_interval: Write a checkpoint after this many newly processed
            papers; 0 or None disables periodic checkpoints.
        partial_path: CSV used for resuming and periodic checkpoints.
        output_path: CSV written once the range has been processed.

    Returns:
        The DataFrame with the 'label' column filled in where labeling succeeded.
    """
    # Load partial results if they exist. Only "no usable checkpoint" is
    # treated as a fresh start; any other error (parse failure, interrupt)
    # propagates instead of being silently swallowed.
    try:
        df = pd.read_csv(partial_path)
        print("Resuming from partial progress...")
    except (FileNotFoundError, pd.errors.EmptyDataError):
        df = dataframe.copy()

    if 'label' not in df.columns:
        df['label'] = None
    # An all-empty label column is read back from CSV as float64; use object
    # dtype so string labels can be assigned without a dtype conflict.
    df['label'] = df['label'].astype(object)

    model = configure_gemini(api_key)

    # Calculate processing range
    end_idx = min(start_idx + num_papers, len(df))
    print(f"Processing papers {start_idx} to {end_idx-1}...")

    # Translate row positions to index labels so df.at works for any index
    # (identical to range(start_idx, end_idx) for the default RangeIndex).
    row_labels = df.index[start_idx:end_idx]

    processed_count = 0
    for idx in tqdm(row_labels, total=len(row_labels)):
        if pd.isna(df.at[idx, 'label']):
            label = generate_paper_label(model, df.at[idx, 'title'], df.at[idx, 'abstract'])
            df.at[idx, 'label'] = label
            processed_count += 1

            # Save progress periodically
            if save_interval and processed_count % save_interval == 0:
                df.to_csv(partial_path, index=False)
                print(f"\nSaved progress after {processed_count} papers")

    # Final save
    df.to_csv(output_path, index=False)
    return df


In [25]:
def relabel_unlabeled_papers(input_csv_path, output_csv_path, api_key, start_idx=2500, save_interval=50):
    """
    Processes only unlabeled papers in an existing CSV.

    Rows before ``start_idx`` are never touched. Every row from ``start_idx``
    onward whose 'label' is missing is sent to Gemini. Progress is written to
    ``output_csv_path`` periodically and once more at the end, including when
    the loop stops early on an unexpected error.

    Args:
        input_csv_path: Path to your partially labeled CSV
        output_csv_path: Where to save results (can be same as input)
        api_key: Your Gemini API key
        start_idx: Row position from which to look for unlabeled papers.
        save_interval: Write a checkpoint after this many processed papers;
            0 or None disables periodic checkpoints.

    Returns:
        The full DataFrame, including rows before ``start_idx``.
    """
    # Load existing data
    df = pd.read_csv(input_csv_path)
    if 'label' not in df.columns:
        df['label'] = None
    # An all-empty label column is read back from CSV as float64; use object
    # dtype so string labels can be assigned without a dtype conflict.
    df['label'] = df['label'].astype(object)

    # Initialize model
    model = configure_gemini(api_key)

    # Only look at rows from start_idx onwards
    df_subset = df.iloc[start_idx:]

    # Find unlabeled rows within that subset
    unlabeled = df_subset['label'].isna()
    num_unlabeled = int(unlabeled.sum())

    print(f"Found {num_unlabeled} unlabeled papers to process from row {start_idx} onward...")

    processed_count = 0
    for idx in tqdm(df_subset[unlabeled].index, total=num_unlabeled):
        try:
            label = generate_paper_label(
                model,
                df.at[idx, 'title'],
                df.at[idx, 'abstract']
            )
            df.at[idx, 'label'] = label
            processed_count += 1

            # Checkpoint on progress, not on the row index: unlabeled rows can
            # be sparse, so an index-based condition might never fire.
            if save_interval and processed_count % save_interval == 0:
                df.to_csv(output_csv_path, index=False)
        except Exception as e:
            print(f"Critical error at index {idx}: {str(e)[:200]}")
            break

    # Final save
    df.to_csv(output_csv_path, index=False)
    return df

# Usage: re-run labeling for the rows that are still unlabeled in an
# existing results CSV and write the outcome to a new file.
if __name__ == "__main__":
    # Load your API key (looked up under the name 'GOOGLE_API_KEY' via
    # google.colab's userdata helper)
    GEMINI_API_KEY = userdata.get('GOOGLE_API_KEY')

    # Process unlabeled papers: reads the "final5" CSV and writes "final6",
    # so the input file is left untouched. The returned DataFrame is discarded.
    relabel_unlabeled_papers(
        input_csv_path='/content/drive/MyDrive/CSCE670 DATASET/labeled_papers_2500_5000_final5.csv',
        output_csv_path='/content/drive/MyDrive/CSCE670 DATASET/labeled_papers_2500_5000_final6.csv',
        api_key=GEMINI_API_KEY
    )

Found 4 unlabeled papers to process from row 2500 onward...


100%|██████████| 4/4 [00:01<00:00,  2.42it/s]


In [None]:

# if __name__ == "__main__":
#     # Load full dataset
#     ## PATH TO YOUR DATASET 5000 PAPERS CSV
#     df = pd.read_csv('/content/drive/MyDrive/CSCE670 DATASET/5000_papers.csv')

#     # Get API key
#     try:
#         GEMINI_API_KEY = userdata.get('GOOGLE_API_KEY')
#     except Exception as e:
#         raise ValueError("Please set up your Gemini API key in Colab secrets first") from e

#     # Process second half (2500-5000)
#     labeled_df = label_papers(df, GEMINI_API_KEY, start_idx=2500)

#     # Stats
#     success = labeled_df.iloc[2500:]['label'].notna().sum()
#     print(f"\nCompleted! Successfully labeled {success}/2500 papers in second half")
#     print("Sample results from second half:")
#     print(labeled_df.iloc[2500:2505][['title', 'label']])

In [23]:
# Merge the two labeling runs into a single file.
# df1: CSV from the first-half run; df2: CSV from the 2500-5000 run.
df1 = pd.read_csv('/content/drive/MyDrive/CSCE670 DATASET/labeled_papers_first_2500_COMPLETED_2.csv')
df2 = pd.read_csv('/content/drive/MyDrive/CSCE670 DATASET/labeled_papers_2500_5000_final6.csv')

# Start from a copy of df1 so df1 itself is not modified.
final_df = df1.copy()

# Update the 'label' column from df2 for rows 2500 onward.
# .loc slices by index label and the assignment aligns on the index, so this
# assumes df1 and df2 list the same papers in the same row order — TODO confirm.
final_df.loc[2500:, 'label'] = df2.loc[2500:, 'label']

# Write the combined labels without the index column.
final_df.to_csv('/content/drive/MyDrive/CSCE670 DATASET/all_labels.csv', index=False)

# for label in df_subset['label']:
#     print(label)

In [24]:
# Sanity check on the merged frame: boolean mask of rows whose 'label' is missing.
unlabeled = final_df['label'].isna()

# Summing the mask counts the True entries, i.e. the papers still without a label.
print(f"Found {unlabeled.sum()} unlabeled papers")

Found 29 unlabeled papers
