In [27]:
import base64
from io import BytesIO
import pandas as pd
import re
import pyarrow.parquet as pq
import pyarrow as pa
from PIL import Image

# Function to convert answer letters into zero-based choice indices
def convert_choice_format(choices):
    """Translate answer letters into zero-based choice indices.

    ``choices`` is converted to text and scanned one character at a time.
    Each uppercase letter A-G is replaced by its position (A -> 0, ..., G -> 6);
    every other character (separators, lowercase letters, digits, ...) is
    skipped. A missing value, as judged by ``pd.isna``, yields an empty list.
    """
    if pd.isna(choices):
        return []

    letter_to_index = {
        letter: position for position, letter in enumerate("ABCDEFG")
    }
    indices = []
    for symbol in str(choices):
        position = letter_to_index.get(symbol)
        if position is not None:
            indices.append(position)
    return indices
import ast
# Function to ensure each entry ends up as a list of choices
def wrap_in_list(entry):
    """Return ``entry`` as a list of choices.

    Parameters:
        entry: Usually the text of a Python list literal read from the CSV
            (e.g. ``"['a', 'b']"``). Lists/tuples, missing values and plain
            text are accepted as well.

    Returns:
        list: The parsed list for a list/tuple literal; ``[]`` for a missing
        value; otherwise a one-element list holding the parsed scalar or,
        when the text is not a Python literal at all, the raw entry.
    """
    # Check for sequences first: pd.isna() on a list returns an array, whose
    # truth value is ambiguous and would raise inside the `if` below.
    if isinstance(entry, (list, tuple)):
        return list(entry)
    if pd.isna(entry):  # Handle NaN values
        return []
    try:
        parsed = ast.literal_eval(entry)
    except (ValueError, SyntaxError):
        # Not a Python literal (e.g. free text): treat it as a single choice.
        return [entry]
    if isinstance(parsed, (list, tuple)):
        return list(parsed)
    # A lone literal such as "'text'" parses to a scalar; wrap it so callers
    # always receive a list.
    return [parsed]

# Read CSV file
# (alternative input: 'data/extracted_questions_answers_200_301.csv')
# NOTE(review): this reads the 350_701 extract, yet the rows are labelled
# 'CCNA-200-301' below -- confirm which exam this run is meant for.
csv_file = 'data/extracted_questions_answers_350_701.csv'
df = pd.read_csv(csv_file)

# Remove rows whose raw Answers cell is the literal text '[]'. Copy the result
# so the column assignments below act on an independent frame rather than on a
# filtered slice (avoids pandas' SettingWithCopyWarning).
df = df[df['Answers'] != '[]'].copy()

# Create new columns with required names and formats
df['question'] = df['Question']
df['choices'] = df['Answers'].apply(wrap_in_list)
df['answer'] = df['Correct Answer'].apply(convert_choice_format)
df['image'] = df['Image'].astype(str)
df['Exam'] = 'CCNA-200-301'

# Remove rows left without any choices (e.g. a missing Answers cell, which
# wrap_in_list turns into []). 'choices' holds parsed values here, so a
# comparison with the string '[]' would never match; test the length instead.
df = df[df['choices'].str.len() > 0]

# Select required columns for the parquet file
df = df[['question', 'choices', 'answer', 'image', 'Exam']]

# Drop all rows without an image: astype(str) above turned missing values
# into the text 'nan'.
df = df[df['image'] != 'nan']

# Keep one row per distinct image and renumber the index.
df = df.drop_duplicates(subset=['image'], ignore_index=True)

# Drop images whose encoded string is 6000 characters or shorter
df = df[df['image'].str.len() > 6000]

# Keep only questions that mention an exhibit. na=False treats a missing
# question as "no match" so the mask stays boolean instead of containing NaN.
df = df[df['question'].str.contains('exhibit', case=False, na=False)]

# Function to re-encode a base64 image as PNG
def change_image_format(base64_string):
    """Return the image in ``base64_string`` as base64-encoded PNG data.

    Parameters:
        base64_string: Base64 text of an image file in any format Pillow can
            open.

    Returns:
        str: ``base64_string`` itself when the image is already PNG; otherwise
        the base64 text (UTF-8 decoded) of the image re-saved as PNG.

    Errors from ``base64.b64decode`` (malformed base64) and from
    ``Image.open`` (data that is not a recognisable image) propagate to the
    caller.
    """
    # Decode and open once; the same image object serves both the format
    # check and the conversion.
    image = Image.open(BytesIO(base64.b64decode(base64_string)))
    if image.format == 'PNG':
        return base64_string

    # Pillow's PNG writer cannot store CMYK pixels (a mode JPEG files may
    # use), so switch to RGB before saving.
    if image.mode == 'CMYK':
        image = image.convert('RGB')

    png_buffer = BytesIO()
    image.save(png_buffer, format="PNG")
    return base64.b64encode(png_buffer.getvalue()).decode('utf-8')

# Apply the function to the 'image' column. assign() builds a new frame, so
# nothing is written into a filtered slice of an earlier DataFrame.
df = df.assign(image=df['image'].apply(change_image_format))
table = pa.Table.from_pandas(df)
parquet_file_path = './data/200_301_CCNA_images.parquet'
# NOTE(review): the write below is commented out, so the message that follows
# is printed without any file being produced -- re-enable the write (or adjust
# the message) before relying on it.
#pq.write_table(table, parquet_file_path)
print(f"Parquet file {parquet_file_path} created successfully.")
# Show the first remaining row's choices. Use a positional lookup: rows were
# filtered after the last index reset, so the label 0 may no longer exist.
df['choices'].iloc[0]


Parquet file ./data/200_301_CCNA_images.parquet created successfully.


['create an SNMP pull mechanism for managing AMP',
 'gather network telemetry information from AMP for endpoints',
 'get the process and PID information from the computers in the network',
 'gather the network interface information about the computers AMP sees']