In [None]:
import os
import glob
import re
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

In [10]:
def analyze_lecture_importance(base_data_path: str, lecture_number: int, top_n: int = 15):
    """
    Performs inter-note TF-IDF analysis for a single lecture to find the most 
    commonly discussed and unique terms.

    Args:
        base_data_path: The root directory for the data (e.g., 'data/').
        lecture_number: The specific lecture folder to analyze (e.g., 1 for 'lecture-1').
        top_n: The number of most important terms to extract.
    """
    
   
    lecture_folder = os.path.join(base_data_path, f'lecture-{lecture_number}')
    print(f"--- Analyzing Notes in: {lecture_folder} ---")

    # 2. Load and Clean the Data
    note_files = glob.glob(os.path.join(lecture_folder, '*.md'))
    student_notes = []
    
    if not note_files:
        print("No student notes found for this lecture.")
        return

    for file_path in note_files:
        with open(file_path, 'r', encoding='utf-8') as f:
            raw_text = f.read()
            
            # Basic Cleaning: Remove Markdown, lowercasing
            # Adapt the cleaning based on how messy your student notes are:
            cleaned_text = raw_text.lower()
            cleaned_text = re.sub(r'[#*>-]', ' ', cleaned_text)  # Remove common MD symbols
            cleaned_text = re.sub(r'[^a-z\s]', '', cleaned_text) # Keep only letters and spaces
            
            student_notes.append(cleaned_text)

    # 3. TF-IDF Calculation
    # We use a CountVectorizer inside TfidfVectorizer
    vectorizer = TfidfVectorizer(
        stop_words='english',
        ngram_range=(1, 3),  # Look for single words and two-word phrases
        min_df=1             # Term must appear in at least 2 different notes
    )

    tfidf_matrix = vectorizer.fit_transform(student_notes)
    
    # 4. Extract and Rank Terms by Mean Score
    feature_names = vectorizer.get_feature_names_out()
    
    # Calculate the average TF-IDF score for each term across all student notes
    mean_tfidf_scores = np.mean(tfidf_matrix.toarray(), axis=0)

    # Get the indices of the highest scores (descending order)
    sorted_indices = mean_tfidf_scores.argsort()[::-1]
    
    top_terms_scores = [
        (feature_names[i], mean_tfidf_scores[i])
        for i in sorted_indices[:top_n]
    ]
    
    return top_terms_scores

# --- Example Usage ---
# Assuming your folders are set up like data/lecture-1/, data/lecture-2/, etc.
# analysis_results_L1 = analyze_lecture_importance(base_data_path='data', lecture_number=1)
# analysis_results_L2 = analyze_lecture_importance(base_data_path='data', lecture_number=2)

In [11]:
analyze_lecture_importance("data\generative-ai" ,lecture_number=6, top_n=10)

--- Analyzing Notes in: data\generative-ai\lecture-6 ---


  analyze_lecture_importance("data\generative-ai" ,lecture_number=6, top_n=10)


[('tsub', 0.19003764663023748),
 ('thetasub', 0.17003368382705455),
 ('reward', 0.13450541246492845),
 ('gradient', 0.12418464339057514),
 ('model', 0.118670289710606),
 ('value', 0.1099516591010474),
 ('policy', 0.09732081327631441),
 ('rank', 0.09677308174620392),
 ('pi', 0.08148569052199195),
 ('function', 0.07436919837722808)]

#### Words are individually not the optimal, we can try something like bag of words or n-grams
