## Step 1: Create a Google Drive Service

In [21]:
import os
from google.oauth2.credentials import Credentials
from googleapiclient.discovery import build

In [22]:
def create_service(token_file_name, api_service_name, api_version, scopes):

    working_dir = os.getcwd()
    token_dir   = 'token_files'
    creds = Credentials.from_authorized_user_file(os.path.join(working_dir, token_dir, token_file_name), scopes)
    
    try:
        service = build(api_service_name, api_version, credentials=creds, static_discovery=False)
        print(api_service_name, api_version, 'service created successfully')
        return service
    except Exception as e:
        print(e)
        print(f'Failed to create service instance for {api_service_name}')
        os.remove(os.path.join(working_dir, token_dir, token_file_name))
        return None

To create a Google Drive service, you need to have the following files:
- token file for Google Drive

If you don't have it, please create one using the `create_token_files.ipynb` routine.

In [23]:
gdrive_service = create_service(
    token_file_name  = 'token_drive_v3_t1.json', 
    api_service_name = 'drive', 
    api_version      = 'v3', 
    scopes           = ['https://www.googleapis.com/auth/drive']
)

drive v3 service created successfully


In [24]:
buckets = {
    'recordings'  : '1z2nxlaCc6QPY9yIwUMIz_58-2DagVSJc',
    'transcripts' : '14c_oSCVHEkSBb59-68qJlJmSu9-WYqfO',
    'feedback'    : '1FKJ5dez8TJAoSIiCtVR6P1rYTrwSE0TT'
}

In [25]:
def check_date_dirs(selected_date, parent_id):
    query= f"parents = '{parent_id}'"
    current_dates = gdrive_service.files().list(supportsAllDrives=True, includeItemsFromAllDrives=True, q=query).execute()
    if selected_date.replace(' ', '-') not in [x['name'] for x in current_dates['files']]:
        print('No directory found for the selected date')
        return True
    else:
        raise Exception('Directory is already present!!!')

In [26]:
def create_date_subfolder(selected_date, parent_id):

    if check_date_dirs(selected_date, parent_id):
        folder_name = selected_date.replace(' ', '-')
        file_metadata = {
            'name': folder_name,
            'mimeType': 'application/vnd.google-apps.folder',
            'parents': [f'{parent_id}']
        }
        gdrive_service.files().create(body=file_metadata, supportsAllDrives=True, fields='id').execute()

In [27]:
def get_dir_id(selected_date, parent_id):

    query= f"parents = '{parent_id}'"
    date_fmt = selected_date.replace(' ', '-')

    folder_elements = (
        gdrive_service
        .files()
        .list(supportsAllDrives=True, includeItemsFromAllDrives=True, q=query)
        .execute()
    )
    dates_dict = {x['name'] : x['id'] for x in folder_elements['files']}

    if date_fmt in [k for k,_ in dates_dict.items()]:
        return dates_dict[date_fmt]
    else:
        raise Exception('Selected date not found in bucket. Please create the directory first')

In [28]:
def check_if_empty(parent_id):

    query= f"parents = '{parent_id}'"
    folder_elements = (
        gdrive_service
        .files()
        .list(supportsAllDrives=True, includeItemsFromAllDrives=True, q=query)
        .execute()
    )
    if folder_elements['files']:
        return False
    else:
        return True

## Step 2: Access and download the SurveyCTO data

To access the data, you need the following information:

1. SurveyCTO username
2. SurveyCTO server name
3. SurveyCTO password

Create an environmental file with this information

In [29]:
import pandas as pd
from io import StringIO
from pysurveycto import SurveyCTOObject
from dotenv import load_dotenv

load_dotenv()
scto = SurveyCTOObject(
    server_name = os.getenv("SCTO_server"), 
    username    = os.getenv("SCTO_user"), 
    password    = os.getenv("SCTO_password")
)
form_data = scto.get_form_data(
    form_id     = 'llamadas', 
    format      = 'csv'
)
SCTO_data = pd.read_csv(StringIO(form_data))
SCTO_data['date_short'] = pd.to_datetime(SCTO_data['starttime']).dt.strftime('%B %d')

In [30]:
SCTO_data

Unnamed: 0,SubmissionDate,starttime,endtime,deviceid,devicephonenum,username,device_info,duration,caseid,id_estudiante,nombre,idioma,llamada,instanceID,formdef_version,review_quality,review_status,KEY,date_short
0,"Feb 14, 2025 3:06:31 AM","Feb 14, 2025 3:04:57 AM","Feb 14, 2025 3:06:19 AM",9761fdb37964719f,,laurita.buttner@gmail.com,Redmi|23117RA68G|14|SurveyCTO Collect 2.81.4 (...,82,,,Prueba de tutoría,,https://tutoriastel.surveycto.com/view/submiss...,uuid:775a057e-9673-4c0a-bb4a-9c1336c5554d,2502051553,,APPROVED,uuid:775a057e-9673-4c0a-bb4a-9c1336c5554d,February 14
1,"Feb 25, 2025 9:23:37 PM","Feb 25, 2025 9:21:52 PM","Feb 25, 2025 9:23:31 PM",d492488efa7ca428,,yorkespin@gmail.com,samsung|SM-A155M|14|SurveyCTO Collect 2.81.4 (...,99,,PY12345,ESTUDIANTE DE PRUEBA,español,https://tutoriastel.surveycto.com/view/submiss...,uuid:10c4650a-f9c5-4b31-a6a5-5cd5c989161c,2502251134,,APPROVED,uuid:10c4650a-f9c5-4b31-a6a5-5cd5c989161c,February 25
2,"Feb 25, 2025 10:05:21 PM","Feb 25, 2025 9:43:39 PM","Feb 25, 2025 10:05:13 PM",14c2db1d12aafb4b,,abareiro,Redmi|23106RN0DA|14|SurveyCTO Collect 2.81.4 (...,1280,,PY13352,GONZALEZ ACOSTA MISAEL JAVIER,español,https://tutoriastel.surveycto.com/view/submiss...,uuid:4836a771-a89c-4a14-9328-1c37a3457c65,2502251134,,APPROVED,uuid:4836a771-a89c-4a14-9328-1c37a3457c65,February 25
3,"Feb 28, 2025 8:18:49 PM","Feb 28, 2025 8:17:37 PM","Feb 28, 2025 8:18:43 PM",088248b5d497c6d1,,cobregon,samsung|SM-A235M|14|SurveyCTO Collect 2.81.4 (...,66,,PY12759,ANDINO VERA JUDITH CONCEPCION,español,https://tutoriastel.surveycto.com/view/submiss...,uuid:5211a5ab-9971-432e-89e6-078143bd1555,2502251134,,APPROVED,uuid:5211a5ab-9971-432e-89e6-078143bd1555,February 28
4,"Feb 28, 2025 8:59:23 PM","Feb 28, 2025 8:57:10 PM","Feb 28, 2025 8:59:09 PM",b72323fbf55e914f,,lmayans,Multilaser|M7_WIFI|11|SurveyCTO Collect 2.81.4...,120,,PY13170,VAZQUEZ LUANA JAZMIN,español,https://tutoriastel.surveycto.com/view/submiss...,uuid:d610aa21-aace-476b-9272-5326d2c0e92f,2502251134,,APPROVED,uuid:d610aa21-aace-476b-9272-5326d2c0e92f,February 28
5,"Mar 2, 2025 12:24:26 AM","Feb 25, 2025 9:42:45 PM","Mar 2, 2025 12:23:32 AM",e9831274f03d3a74,,lariel,Redmi|2303CRA44A|14|SurveyCTO Collect 2.81.4 (...,124,,PY12672,LEZCANO VERA JAZMIN ABIGAIL,español,https://tutoriastel.surveycto.com/view/submiss...,uuid:0385c229-4785-4661-a51b-1aa0fad8cf1d,2502251134,,APPROVED,uuid:0385c229-4785-4661-a51b-1aa0fad8cf1d,February 25


In [10]:
available_dates = SCTO_data['date_short'].unique()
print(available_dates)

['February 14' 'February 25' 'February 28']


## Step 3: Delimit your cases by date

**!!!Select one available date from the list above!!!**

In [32]:
selected_date = 'February 28'
filtered_data = SCTO_data[SCTO_data['date_short'] == selected_date]
filtered_data

Unnamed: 0,SubmissionDate,starttime,endtime,deviceid,devicephonenum,username,device_info,duration,caseid,id_estudiante,nombre,idioma,llamada,instanceID,formdef_version,review_quality,review_status,KEY,date_short
3,"Feb 28, 2025 8:18:49 PM","Feb 28, 2025 8:17:37 PM","Feb 28, 2025 8:18:43 PM",088248b5d497c6d1,,cobregon,samsung|SM-A235M|14|SurveyCTO Collect 2.81.4 (...,66,,PY12759,ANDINO VERA JUDITH CONCEPCION,español,https://tutoriastel.surveycto.com/view/submiss...,uuid:5211a5ab-9971-432e-89e6-078143bd1555,2502251134,,APPROVED,uuid:5211a5ab-9971-432e-89e6-078143bd1555,February 28
4,"Feb 28, 2025 8:59:23 PM","Feb 28, 2025 8:57:10 PM","Feb 28, 2025 8:59:09 PM",b72323fbf55e914f,,lmayans,Multilaser|M7_WIFI|11|SurveyCTO Collect 2.81.4...,120,,PY13170,VAZQUEZ LUANA JAZMIN,español,https://tutoriastel.surveycto.com/view/submiss...,uuid:d610aa21-aace-476b-9272-5326d2c0e92f,2502251134,,APPROVED,uuid:d610aa21-aace-476b-9272-5326d2c0e92f,February 28


Play an audio sample -> The first recording from your filtered data

In [33]:
import ffmpeg
import tempfile

sample_url = filtered_data['llamada'].iloc[0]
sample_audio_bytes = scto.get_attachment(sample_url)

# Save bytes to a temp file
with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as temp_audio:
    temp_audio.write(sample_audio_bytes)
    temp_sample_audio_path = temp_audio.name

# Play audio using ffplay (part of ffmpeg)
os.system(f'ffplay -nodisp -autoexit {temp_sample_audio_path}')


ffplay version 7.1 Copyright (c) 2003-2024 the FFmpeg developers
  built with Apple clang version 16.0.0 (clang-1600.0.26.4)
  configuration: --prefix=/opt/homebrew/Cellar/ffmpeg/7.1_4 --enable-shared --enable-pthreads --enable-version3 --cc=clang --host-cflags= --host-ldflags='-Wl,-ld_classic' --enable-ffplay --enable-gnutls --enable-gpl --enable-libaom --enable-libaribb24 --enable-libbluray --enable-libdav1d --enable-libharfbuzz --enable-libjxl --enable-libmp3lame --enable-libopus --enable-librav1e --enable-librist --enable-librubberband --enable-libsnappy --enable-libsrt --enable-libssh --enable-libsvtav1 --enable-libtesseract --enable-libtheora --enable-libvidstab --enable-libvmaf --enable-libvorbis --enable-libvpx --enable-libwebp --enable-libx264 --enable-libx265 --enable-libxml2 --enable-libxvid --enable-lzma --enable-libfontconfig --enable-libfreetype --enable-frei0r --enable-libass --enable-libopencore-amrnb --enable-libopencore-amrwb --enable-libopenjpeg --enable-libspeex --e

31488

## Step 4: Process the recordings

Processing the recordings will:

1. Upload the recording to Google Drive
2. Send the recording to the AssemblyAI API

In [26]:
# Use the Google Drive service to create a subfolder
try:
    create_date_subfolder(selected_date, buckets['recordings'])
except Exception as e:
    print(e)
try:
    create_date_subfolder(selected_date, buckets['transcripts'])
except Exception as e:
    print(e)

Directory is already present!!!
Directory is already present!!!


In [27]:
from googleapiclient.http import MediaFileUpload
import assemblyai as aai

aai.settings.api_key = os.getenv("aai_key")

aai_config = aai.TranscriptionConfig(
    speech_model      = aai.SpeechModel.best,
    speaker_labels    = True,
    # speakers_expected = n_speakers,
    language_code     = "es"
)

transcriber = aai.Transcriber()

In [28]:
def process_recordings(selected_date, force = False):

    dir_id_recs = get_dir_id(selected_date, buckets['recordings'])
    dir_id_tras = get_dir_id(selected_date, buckets['transcripts'])

    if (check_if_empty(dir_id_recs) and check_if_empty(dir_id_tras)) or force:
        
        results = []

        for i, row in filtered_data.iterrows():
            print(f'Processing recording {i+1} of {len(filtered_data)}')
            audio_url   = row['llamada']
            audio_bytes = scto.get_attachment(audio_url)
            audio_name  = f'{row['username']}_{row['id_estudiante']}.mp3'

            with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as temp_audio:
                temp_audio.write(audio_bytes)
                temp_audio_path = temp_audio.name

            # Part 1: uploading recording to Google Drive
            audio_metadata = {
                'name': audio_name,
                'parents': [f'{dir_id_recs}']
            }
            
            media = MediaFileUpload(temp_audio_path, mimetype='audio/mpeg')
            gdrive_service.files().create(
                body       = audio_metadata, 
                media_body = media, 
                fields     = 'id', 
                supportsAllDrives = True
            ).execute()
            print(f'File {audio_name} uploaded successfully')

            # Part 2: sending recording to AssemblyAI for transcription
            transcript = transcriber.transcribe(
                temp_audio_path,
                config = aai_config
            )
            if transcript.status == aai.TranscriptStatus.completed:
                print(f"Transcription was completed successfully!")
            if transcript.status == aai.TranscriptStatus.error:
                print(f"Transcription failed: {transcript.error}")

            transcript_uts  = [f"Speaker {utterance.speaker}: {utterance.text}" for utterance in transcript.utterances]
            full_transcript = "\n".join(transcript_uts)
            transcript_name = f'{row['username']}_{row['id_estudiante']}.txt'

            results.append({
                'username'      : row['username'],
                'id_estudiante' : row['id_estudiante'],
                'transcript'    : full_transcript
            })

            with tempfile.NamedTemporaryFile(delete=False, suffix=".txt") as temp_transcript:
                temp_transcript.write(full_transcript.encode('utf-8'))
                temp_transcript_path = temp_transcript.name

            transcript_metadata = {
                'name'     : transcript_name,
                'parents'  : [f'{dir_id_tras}']
            }

            tfile = MediaFileUpload(temp_transcript_path, mimetype='text/plain')
            gdrive_service.files().create(
                body       = transcript_metadata, 
                media_body = tfile, 
                fields     = 'id', 
                supportsAllDrives=True
            ).execute()
            print(f'File {transcript_name} uploaded successfully')

            os.remove(temp_audio_path)
            os.remove(temp_transcript_path)

            print(f'======================================================================')

        return results

    else:
        raise Exception('Target directory is not empty. Make sure you are targeting the correct folder ID')

**!!!THERE IS RESTRICTION IN PLACE WERE!!!**

- The code will first check if the target folder in Google drive IS EMPTY or NOT EMPTY... If the target folder is **NOT EMPTY**, the code will raise an exception.
- If you want to **bypass** this behavior, pass `force=True`in the code bellow.

In [31]:
# Processing recordings... make sure to have the correct date selected
transcripts_list = process_recordings(selected_date, force=True)

Processing recording 2 of 2
File yorkespin@gmail.com_PY12345.mp3 uploaded successfully
Transcription was completed successfully!
File yorkespin@gmail.com_PY12345.txt uploaded successfully
Processing recording 3 of 2
File abareiro_PY13352.mp3 uploaded successfully
Transcription was completed successfully!
File abareiro_PY13352.txt uploaded successfully


## Step 5: Pass the transcript to an OpenAI endpoint

In [None]:
def gen_system_prompt():
    
    prompt = """
    Eres un experto consultor en educación especializado en tutoría y métodos de enseñanza efectivos. Recibirás transcripciones de sesiones 
    de tutoría en matemáticas entre un tutor y un estudiante de primaria (cuarto a sexto grado). Tu tarea es analizar la metodología de 
    enseñanza del tutor y proporcionar recomendaciones prácticas para mejorar su desempeño.

    Tu análisis debe incluir:

    1. Evaluación de técnicas de explicación
        
        - Analiza la claridad, estructura y precisión de las explicaciones.
        - Evalúa el uso de ejemplos, analogías y estrategias didácticas.
        - Determina si las explicaciones están adaptadas al nivel de comprensión del estudiante.
    
    2. Evaluación del compromiso y la eficacia pedagógica

        - Observa cómo el tutor involucra al estudiante y verifica su comprensión.
        - Identifica si usa métodos interactivos o reflexivos para reforzar el aprendizaje.
    
    3. Retroalimentación constructiva

        - Proporciona sugerencias específicas y accionables para mejorar su enseñanza.
        - Considera prácticas como simplificar conceptos, usar ejemplos relevantes, resumir puntos clave y fomentar la participación.
        - Asegúrate de que la retroalimentación sea objetiva, alentadora y orientada al crecimiento profesional del tutor.
    
    4. Consideraciones y contexto

        - Basa tu evaluación en la transcripción proporcionada y en buenas prácticas pedagógicas.
        - Mantén un tono claro, breve y amable, ya que el tutor leerá directamente las recomendaciones.
    
    Tu respuesta debe consistir exclusivamente en un conjunto de recomendaciones derivadas de tu evaluación.
    """

    return prompt

In [None]:
def gen_context_prompt(full_transcript):
   
    prompt = f"""
    La siguiente es una transcripción de una sesión de tutoría de matemáticas entre un tutor y un estudiante de cuarto a sexto grado de 
    primaria:

    [INICIO DE LA TRANSCRIPCIÓN]

    {full_transcript}

    [FIN DE LA TRANSCRIPCIÓN]

    La transcripción puede contener errores y omisiones, especialmente en intervenciones cortas. No te enfoques en errores gramaticales o 
    tipográficos. Además, la transcripción separa las intervenciones por interlocutor, pero no especifica quién es el tutor y quién es el 
    estudiante. Deberás inferirlo según el contenido.

    INSTRUCCIONES:

    Con base en la transcripción, evalúa las técnicas de explicación, el compromiso y la eficacia pedagógica del tutor. Luego, elabora un 
    conjunto estructurado de recomendaciones breves y específicas para mejorar su enseñanza.

    Asegúrate de que tu retroalimentación sea:

    - Objetiva, alentadora y enfocada en el crecimiento profesional del tutor.
    - Limitada a 800 caracteres.
    - Escrita en formato de lista con viñetas, sin títulos ni encabezados.
    - Directa y clara, ya que será leída directamente por el tutor.

    ASPECTOS A EVALUAR:

    - Estructura de la tutoría: ¿Hay una introducción clara y un cierre efectivo?
    - Interacción: ¿El estudiante participa activamente? ¿El tutor verifica su comprensión? ¿Cómo responde el tutor a las preguntas y dudas?
    - Técnicas pedagógicas: ¿Se usan ejemplos relevantes y estrategias efectivas?
    - Manejo del tiempo: ¿El tutor ajusta el ritmo y aprovecha los 30 minutos disponibles?
    - Aspectos específicos: Claridad de explicaciones, detección de confusiones, motivación del estudiante, uso de preguntas guía.

    CONSIDERACIONES ADICIONALES:

    - Concéntrate en la forma en que el tutor enseña, no en modificar el contenido de la guía de ejercicios.
    - Evita resaltar aspectos positivos, salvo para contrastarlos con áreas de mejora.
    - Recuerda que el estudiante tiene entre 8 y 10 años, por lo que las explicaciones deben ser adecuadas a su nivel.
    - Enfoca tu set de recomendaciones EXCLUSIVAMENTE en los aspectos a mejorar y en las acciones específicas para lograrlo.
    - Procura ser amable en tus recomendaciones.

    Gracias por tu valiosa contribución a la mejora de la calidad educativa.
    """

    return prompt

In [35]:
# Use the Google Drive service to create a subfolder
try:
    create_date_subfolder(selected_date, buckets['feedback'])
except Exception as e:
    print(e)

Directory is already present!!!


In [36]:
from openai import OpenAI

def get_feedback(selected_date, transcripts, force = False):

    dir_id_feed = get_dir_id(selected_date, buckets['feedback'])

    if check_if_empty(dir_id_feed) or force:
        
        client = OpenAI(
            api_key = os.getenv("OPENAI_API_KEY")
        )

        for item in transcripts:

            print(f'Processing recording {item['username']}_{item['id_estudiante']}')

            history = [
                {"role": "system", "content": gen_system_prompt()},
                {"role": "user",   "content": gen_context_prompt(item['transcript'])}
            ]

            chat_completion = client.chat.completions.create(
                messages = history,
                model    = "gpt-4o-2024-08-06"
            )

            feedback = chat_completion.choices[0].message.content
            feedback_name = f'{item['username']}_{item['id_estudiante']}.txt'

            with tempfile.NamedTemporaryFile(delete=False, suffix=".txt") as temp_feedback:
                temp_feedback.write(feedback.encode('utf-8'))
                temp_feedback_path = temp_feedback.name

            feedback_metadata = {
                'name'     : feedback_name,
                'parents'  : [f'{dir_id_feed}']
            }

            tfile = MediaFileUpload(temp_feedback_path, mimetype='text/plain')
            gdrive_service.files().create(
                body       = feedback_metadata, 
                media_body = tfile, 
                fields     = 'id', 
                supportsAllDrives=True
            ).execute()
            print(f'File {feedback_name} uploaded successfully')

            os.remove(temp_feedback_path)

            print(f'======================================================================')

    else:
        raise Exception('Target directory is not empty. Make sure you are targeting the correct folder ID')

**!!!THERE IS RESTRICTION IN PLACE WERE!!!**

- The code will first check if the target folder in Google drive IS EMPTY or NOT EMPTY... If the target folder is **NOT EMPTY**, the code will raise an exception.
- If you want to **bypass** this behavior, pass `force=True`in the code bellow.

In [38]:
# Processing transcripts... make sure to have the correct date selected
get_feedback(selected_date, transcripts_list, force=True)

Processing recording yorkespin@gmail.com_PY12345
File yorkespin@gmail.com_PY12345.txt uploaded successfully
Processing recording abareiro_PY13352
File abareiro_PY13352.txt uploaded successfully
