In [None]:
import re
import gc
from multiprocessing import Pool, cpu_count

from sentence_transformers import SentenceTransformer

import pandas as pd
import swifter 

from tqdm import tqdm  

In [None]:
# Run configuration read by the processing cell below.
file_path = "QC23_Dataset.csv"  # Input CSV, streamed with pd.read_csv in chunks
output_parquet_path = "embedded_output.parquet"  # Output parquet; a '--no-text' variant is derived from this name
chunk_size = 10000  # Number of CSV rows read per chunk
num_students = 25000  # Target number of distinct student_user_id values to collect

In [None]:
def preprocessing_text(s):
    """Normalize a free-text answer before embedding.

    The text is lower-cased, every run of whitespace is collapsed to a
    single space, and the punctuation characters ``. , ; ! ?`` are removed.

    Args:
        s: The raw value from the answer column. Anything that is not a
            ``str`` (e.g. NaN for a missing answer) is treated as empty.

    Returns:
        The normalized string, or ``''`` when ``s`` is not a string.
    """
    if not isinstance(s, str):
        return ''
    s = s.lower()
    # Note: the pattern must be `\s+`, not the character class `[\\s+]`,
    # which would match a backslash, the letter "s" or "+" instead.
    s = re.sub(r'\s+', ' ', s)
    s = re.sub(r'[.,;!?]', '', s)
    return s

In [None]:
# Sentence-embedding model; the processing cell calls model.encode on the
# answer texts and on the problem bodies.
model = SentenceTransformer('paraphrase-MiniLM-L3-v2')

In [None]:
%%time
collected_students = set()
collecting = True
all_chunks = []
chunk_index = 0;
no_data_chunk_counter = 0
max_no_data_chunks = 1000;

with tqdm(desc="Processing chunks") as pbar:
    for chunk in pd.read_csv(
        file_path,
        usecols=['student_user_id', 'problem_id', 'problem_type_id', 'skill_code', 'answer_text', 'correctness', 'problem_log_id', 'problem_start_time', 'problem_end_time', 'hint_count', 'problem_body', 'teacher_id', 'first_action_is_attempt'],
        chunksize=chunk_size,
        low_memory=True,
        on_bad_lines='skip'
    ):
        try:
            chunk = chunk.dropna(subset=['skill_code'])
            chunk['ID'] = list(range(chunk_index * chunk_size, chunk_index * chunk_size + len(chunk)))
            # Only collect new students if we haven't reached the target
            if collecting:
                new_students_mask = ~chunk['student_user_id'].isin(collected_students)
                new_students = chunk.loc[new_students_mask, 'student_user_id'].unique()
                needed = num_students - len(collected_students)
                if needed > 0:
                    selected_new = new_students[:needed]
                    collected_students.update(selected_new)
                if len(collected_students) >= num_students:
                    collecting = False
            
            # Filter
            chunk = chunk[chunk['student_user_id'].isin(collected_students)]
            if chunk.empty:
                no_data_chunk_counter += 1
                if no_data_chunk_counter >= max_no_data_chunks:
                    print(f"No relevant data found in {max_no_data_chunks} consecutive chunks. Stopping early.")
                    break
                pbar.update(1)
                continue
            
            # Preprocess
            chunk['answer_text'] = chunk['answer_text'].swifter.apply(preprocessing_text)
            unique_texts = chunk['answer_text'].drop_duplicates().tolist()
    
            # Embed
            embeddings = model.encode(chunk['answer_text'].tolist(), show_progress_bar=True)
            embeddings_body = model.encode(chunk['problem_body'].tolist(), show_progress_bar=True)
    
            embedded_chunk = pd.DataFrame({
                "ID": chunk['ID'].values,
                "problem_log_id": chunk['problem_log_id'].values,
                "teacher_id": chunk['teacher_id'].values,
                "problem_id": chunk['problem_id'].values,
                "problem_type_id": chunk['problem_type_id'].values,
                "first_action_is_attempt": chunk['first_action_is_attempt'].values,
                "hint_count": chunk['hint_count'].values,
                "answer_text": chunk['answer_text'].values,
                "problem_body": chunk['problem_body'].values,
                "student_user_id": chunk['student_user_id'].values,
                "skill_code": chunk['skill_code'].values,
                "correctness": chunk['correctness'].values,
                "problem_start_time": chunk['problem_start_time'].values,
                "problem_end_time": chunk['problem_end_time'].values,
                "embedding_response": list(embeddings),
                "embedding_problembody": list(embeddings_body),
            })
    
            all_chunks.append(embedded_chunk)
        except Exception as e:
            print(f"\nSkipping chunk {chunk_index} due to error: {str(e)}")
        del chunk, embeddings, embedded_chunk
        gc.collect()
        chunk_index += 1
        pbar.update(1)

final_df = pd.concat(all_chunks, ignore_index=True)
final_df.to_parquet(
    output_parquet_path,
    index=False,
    engine='pyarrow',
    compression='snappy'
)

final_df = pd.concat(all_chunks, ignore_index=True)
final_df.drop(columns=['answer_text', 'problem_body']).to_parquet(
    output_parquet_path.replace('.parquet', '--no-text.parquet'),
    index=False,
    engine='pyarrow',
    compression='snappy'
)