In [24]:
import base64
from io import BytesIO
import pandas as pd
import re
import pyarrow.parquet as pq
import pyarrow as pa
from PIL import Image

# Translate answer letters (A-G) into zero-based choice indices
def convert_choice_format(choices):
    """Return the zero-based indices of the answer letters found in *choices*.

    The value is stringified and scanned character by character; every
    uppercase letter from ``A`` to ``G`` contributes its position
    (``A`` -> 0 ... ``G`` -> 6), in order of appearance.  Any other character
    (separators, lowercase letters, digits) is skipped.

    A missing value (``None`` / NaN) yields an empty list.
    """
    if pd.isna(choices):
        return []

    letter_to_index = {letter: index for index, letter in enumerate("ABCDEFG")}

    indices = []
    for symbol in str(choices):
        if symbol in letter_to_index:
            indices.append(letter_to_index[symbol])
    return indices
import ast
# Function to turn a stringified list cell into a real Python list
def wrap_in_list(entry):
    """Return *entry* as a list.

    * A list or tuple is returned as a new list (it is already parsed).
    * A missing value (``None`` / NaN) yields an empty list.
    * Anything else is parsed with ``ast.literal_eval``; a list/tuple literal
      becomes a list, and any other literal (e.g. a single quoted string) is
      wrapped in a one-element list so the result is always a list.

    Raises:
        ValueError, SyntaxError: propagated from ``ast.literal_eval`` when the
            entry is not a valid Python literal.
    """
    # Check for sequences first: pd.isna() on a list returns an array, which
    # cannot be used directly as an ``if`` condition.
    if isinstance(entry, (list, tuple)):
        return list(entry)
    if pd.isna(entry):
        return []

    parsed = ast.literal_eval(entry)
    if isinstance(parsed, (list, tuple)):
        return list(parsed)
    return [parsed]

# Read CSV file
csv_file = 'data/extracted_questions_answers_200_301.csv'
df = pd.read_csv(csv_file)

# Drop rows whose raw 'Answers' cell is the literal text '[]' (no choices).
# .copy() gives an independent frame so the column assignments below do not
# write into a slice of the frame returned by read_csv.
df = df[df['Answers'] != '[]'].copy()

# Create new columns with required names and formats
df['question'] = df['Question']
df['choices'] = df['Answers'].apply(wrap_in_list)
df['answer'] = df['Correct Answer'].apply(convert_choice_format)
df['image'] = df['Image'].astype(str)
df['Exam'] = 'CCNA-200-301'

# Drop rows whose parsed choices are empty.  'choices' holds list objects at
# this point, so comparing against the string '[]' would never match; test
# the list length instead.
df = df[df['choices'].str.len() > 0]

# Select required columns for the parquet file
df = df[['question', 'choices', 'answer', 'image', 'Exam']]

# Drop rows without an image: a missing 'Image' cell became the text 'nan'
# when the column was cast to str above.
df = df[df['image'] != 'nan']

# Keep only the first row for each distinct image payload.
df = df.drop_duplicates(subset=['image'], ignore_index=True)

# Drop images whose encoded string is 6000 characters or shorter.
MIN_IMAGE_B64_LENGTH = 6000
df = df[df['image'].str.len() > MIN_IMAGE_B64_LENGTH]

# Keep only questions that mention an exhibit (case-insensitive).
# na=False treats a missing question as "no match" instead of producing a
# NaN mask, which boolean indexing rejects.  The trailing .copy() lets later
# code add columns without modifying a slice.
df = df[df['question'].str.contains('exhibit', case=False, na=False)].copy()

# Function to decode a base64 image and get its dimensions
def get_image_size(base64_string):
    """Return the ``(width, height)`` of a base64-encoded image.

    Args:
        base64_string: The image file contents, base64-encoded.

    Returns:
        The ``size`` attribute of the opened PIL image, a
        ``(width, height)`` tuple.

    Any error raised by ``base64.b64decode`` or ``Image.open`` for invalid
    input propagates to the caller.
    """
    image_data = base64.b64decode(base64_string)
    # The context manager closes the image as soon as the size has been read.
    with Image.open(BytesIO(image_data)) as image:
        return image.size

# Decode each image once and record its dimensions.
# Building the columns from a list (instead of unpacking zip(*...)) also works
# when no rows survived the earlier filters, where the unpacking would raise
# ValueError.  assign() returns a new frame, so nothing is written into a
# slice of an earlier frame.
image_sizes = [get_image_size(encoded) for encoded in df['image']]
df = df.assign(
    width=[width for width, _ in image_sizes],
    height=[height for _, height in image_sizes],
)

# Calculating statistics
width_stats = df['width'].describe()
height_stats = df['height'].describe()

print("Width Statistics:")
print(width_stats)
print("\nHeight Statistics:")
print(height_stats)

# Save to parquet file
# NOTE(review): df has been filtered, so its index is no longer a plain
# 0..n-1 range; from_pandas may store it as an extra index column. Confirm
# whether preserve_index=False is wanted.
table = pa.Table.from_pandas(df)
parquet_file_path = './data/200_301_CCNA_images.parquet'
pq.write_table(table, parquet_file_path)
print(f"Parquet file {parquet_file_path} created successfully.")

Width Statistics:
count      70.000000
mean      629.814286
std       183.542776
min       322.000000
25%       486.500000
50%       599.000000
75%       763.250000
max      1000.000000
Name: width, dtype: float64

Height Statistics:
count     70.000000
mean     385.414286
std      176.060307
min       91.000000
25%      261.000000
50%      373.000000
75%      492.750000
max      867.000000
Name: height, dtype: float64
Parquet file ./data/200_301_CCNA_images.parquet created successfully.
