In [3]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, MWETokenizer  # Multi-Word Expression tokenizer
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from collections import Counter

# Ensure necessary NLTK data is downloaded
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

# Define keywords and their related reasons for inclusion
keywords_to_reasons = {
    "psychosocial": "Relevant to psychosocial factors.",
    "mental health": "Pertains to mental health issues.",
    "LGBTQ+": "Directly involves LGBTQ+ topics.",
    "rural": "Focuses on rural settings.",
    "urban": "Related to urban contexts.",
    "Atlanta": "Specific to the Atlanta area.",
    "transtheoretical model": "Involves the transtheoretical model of change.",
    "stages of change": "Addresses stages of change in health behavior.",
    "health services": "Concerns health service utilization.",
    "interventions": "Discusses mental health interventions.",
    "culturally competent": "Mentions culturally competent approaches.",
    "SGMA": "Related to sexual and gender minority adolescents."
}

# Load the dataset
file_path = '/Users/jeremyfeagan/Library/Mobile Documents/com~apple~CloudDocs/Education/Science:Health:Math/HSCI3000 Intervention Paper Ref.xlsx'
df = pd.read_excel(file_path)

# Initialize WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    """Preprocess text by tokenizing, removing stopwords, lemmatizing, and handling multi-word expressions."""
    stop_words = set(stopwords.words('english'))
    tokens = word_tokenize(text.lower())
    tagged_tokens = pos_tag(tokens)
    lemmatized_tokens = [lemmatizer.lemmatize(token, pos='v' if tag.startswith('V') else 'n') 
                         for token, tag in tagged_tokens if token not in stop_words and token.isalpha()]
    return lemmatized_tokens

def determine_relevance_and_reason(row):
    """Determine the relevance of each row based on keywords and provide a reason for inclusion or exclusion."""
    abstract_note = row["Abstract Note"] if pd.notnull(row["Abstract Note"]) else ""
    processed_text = preprocess_text(abstract_note)
    
    # Track the occurrence of keywords to decide on inclusion/exclusion
    keyword_hits = Counter({keyword: processed_text.count(keyword) for keyword in keywords_to_reasons.keys()})
    
    include_keywords = {k: v for k, v in keyword_hits.items() if v > 0}
    if include_keywords:
        return "Include", "Include: " + "; ".join([keywords_to_reasons[k] for k in include_keywords.keys()])
    else:
        return "Maybe", "Exclude: Does not match specified keywords; further review needed."

# Enhance tag generation to include classification outcome
def generate_tags(title, classification):
    """Generate cohesive and uniform tags for each title, including the classification outcome."""
    processed_title = preprocess_text(title)
    generated_tags = [classification]  # Include the classification as a tag
    
    # Add keyword-based tags
    for word in processed_title:
        if word in keywords_to_reasons:
            generated_tags.append(word)
    
    return '; '.join(set(generated_tags))  # Use set to avoid duplicate tags

# Apply the adjusted functions
df[["Include in Research Paper", "Include/Exclude Reason"]] = df.apply(
    lambda row: pd.Series(determine_relevance_and_reason(row)), axis=1)
df['Generated Tags'] = df.apply(lambda row: generate_tags(row['Title'], row['Include in Research Paper']), axis=1)

# Display the results
print(df[['Title', 'Include in Research Paper', 'Include/Exclude Reason', 'Generated Tags']].head())

# Save the DataFrame to a CSV file
output_file_path = '/Users/jeremyfeagan/Library/Mobile Documents/com~apple~CloudDocs/Education/Science:Health:Math/final_classified_references.csv'
df.to_csv(output_file_path, index=False)


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/jeremyfeagan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jeremyfeagan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/jeremyfeagan/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/jeremyfeagan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


                                               Title  \
0  Frequencies and patterns of adverse childhood ...   
1  "Automatic assumption of your gender, sexualit...   
2  A retrospective, phenomenological study of fam...   
3     Achieving health equity for LGBTQ+ adolescents   
4  Adolescent experiences of sexting: A systemati...   

  Include in Research Paper  \
0                   Include   
1                   Include   
2                   Include   
3                     Maybe   
4                     Maybe   

                              Include/Exclude Reason Generated Tags  
0                Include: Focuses on rural settings.        Include  
1  Include: Focuses on rural settings.; Related t...        Include  
2         Include: Relevant to psychosocial factors.        Include  
3  Exclude: Does not match specified keywords; fu...          Maybe  
4  Exclude: Does not match specified keywords; fu...          Maybe  
