In [1]:
import json
import pandas as pd
import numpy as np

## 1. Create Dataframe

Create dataframes from training, validation, and test json files.

In [2]:
trainDataJson = json.load((open('../data/annotations/train.json')))
valDataJson = json.load((open('../data/annotations/val.json')))
testDataJson = json.load((open('../data/annotations/test.json')))

### 1.1. Training Dataset

In [3]:
train_images = trainDataJson['images']
train_images_df = pd.DataFrame(train_images)
train_images_df.head()

Unnamed: 0,file_name,vizwiz_url,id,text_detected
0,VizWiz_train_00000000.jpg,https://ivc.ischool.utexas.edu/VizWiz_visualiz...,0,True
1,VizWiz_train_00000001.jpg,https://ivc.ischool.utexas.edu/VizWiz_visualiz...,1,True
2,VizWiz_train_00000002.jpg,https://ivc.ischool.utexas.edu/VizWiz_visualiz...,2,True
3,VizWiz_train_00000003.jpg,https://ivc.ischool.utexas.edu/VizWiz_visualiz...,3,True
4,VizWiz_train_00000004.jpg,https://ivc.ischool.utexas.edu/VizWiz_visualiz...,4,True


In [4]:
train_annots = trainDataJson['annotations']
train_annots_df = pd.DataFrame(train_annots)
train_annots_df.head()

Unnamed: 0,caption,image_id,is_precanned,is_rejected,id,text_detected
0,ITS IS A BASIL LEAVES CONTAINER ITS CONTAINS T...,0,False,False,0,True
1,A green and white plastic condiment bottle con...,0,False,False,1,True
2,Quality issues are too severe to recognize vis...,0,True,True,2,True
3,A bottle of spices in a plastic container layi...,0,False,False,3,True
4,some basil leaves in a container on a counter,0,False,False,4,True


### 1.2. Validation Dataset

In [5]:
val_images = valDataJson['images']
val_images_df = pd.DataFrame(val_images)
val_images_df.head()

Unnamed: 0,file_name,vizwiz_url,id,text_detected
0,VizWiz_val_00000000.jpg,https://ivc.ischool.utexas.edu/VizWiz_visualiz...,23431,True
1,VizWiz_val_00000001.jpg,https://ivc.ischool.utexas.edu/VizWiz_visualiz...,23432,True
2,VizWiz_val_00000002.jpg,https://ivc.ischool.utexas.edu/VizWiz_visualiz...,23433,True
3,VizWiz_val_00000003.jpg,https://ivc.ischool.utexas.edu/VizWiz_visualiz...,23434,True
4,VizWiz_val_00000004.jpg,https://ivc.ischool.utexas.edu/VizWiz_visualiz...,23435,True


In [6]:
val_annots = valDataJson['annotations']
val_annots_df = pd.DataFrame(val_annots)
val_annots_df.head()

Unnamed: 0,caption,image_id,is_precanned,is_rejected,id,text_detected
0,A computer screen shows a repair prompt on the...,23431,False,False,117155,True
1,a computer screen with a repair automatically ...,23431,False,False,117156,True
2,partial computer screen showing the need of re...,23431,False,False,117157,True
3,Part of a computer monitor showing a computer ...,23431,False,False,117158,True
4,The top of a laptop with a blue background and...,23431,False,False,117159,True


### 1.3. Test Dataset

In [7]:
test_images = testDataJson['images']
test_images_df = pd.DataFrame(test_images)
test_images_df.head()

Unnamed: 0,file_name,vizwiz_url,id,text_detected
0,VizWiz_test_00000000.jpg,https://ivc.ischool.utexas.edu/VizWiz_visualiz...,31181,True
1,VizWiz_test_00000001.jpg,https://ivc.ischool.utexas.edu/VizWiz_visualiz...,31182,False
2,VizWiz_test_00000002.jpg,https://ivc.ischool.utexas.edu/VizWiz_visualiz...,31183,True
3,VizWiz_test_00000003.jpg,https://ivc.ischool.utexas.edu/VizWiz_visualiz...,31184,True
4,VizWiz_test_00000004.jpg,https://ivc.ischool.utexas.edu/VizWiz_visualiz...,31185,True


### <i> Summary </i>

In [8]:
tr_imgs_info = ["Training Images", train_images_df.shape[0], train_images_df.columns.values] 
val_imgs_info = ["Validation Images", val_images_df.shape[0], val_images_df.columns.values]
te_imgs_info = ["Test Images", test_images_df.shape[0], test_images_df.columns.values]

tr_annots_info = ["Training Captions", train_annots_df.shape[0], train_annots_df.columns.values]
val_annots_info = ["Validation Captions", val_annots_df.shape[0], val_annots_df.columns.values]
te_annots_info = ["Test Captions", 0, np.nan]

data = [tr_imgs_info, val_imgs_info, te_imgs_info, tr_annots_info, val_annots_info, te_annots_info]
pd.DataFrame(data, columns=["Dataset", "Count", "Attributes"])

Unnamed: 0,Dataset,Count,Attributes
0,Training Images,23431,"[file_name, vizwiz_url, id, text_detected]"
1,Validation Images,7750,"[file_name, vizwiz_url, id, text_detected]"
2,Test Images,8000,"[file_name, vizwiz_url, id, text_detected]"
3,Training Captions,117155,"[caption, image_id, is_precanned, is_rejected,..."
4,Validation Captions,38750,"[caption, image_id, is_precanned, is_rejected,..."
5,Test Captions,0,


Training dataset contains 23,431 images with 117,155 captions. Validation dataset contains 7,750 images with 38,750 captions. Test dataset contains 8,000 images.

Images dataset for training, validation, and test have 4 attributes as follows: 
- file_name: Name of the image file
- vizwiz_url: URL of the image file
- id: Identifier of the image file
- text_detected: Boolean value to represent the presence of text in the image

Annotations dataset for training and validation have 6 attributes as follows:

- caption: Caption of the image.
- image_id: Identifier of the image, which is associated with the caption.
- is_precanned: 
- is_rejected:


## 2. Merge Dataframes

Create a single dataframe that combines the information of images and its annotations.

In [9]:
tr_imgs_df = train_images_df.rename(columns={'id': 'image_id'}, inplace=False)
tr_captions_df = tr_imgs_df.merge(train_annots_df, how='inner', on='image_id')
tr_captions_df.shape

(117155, 9)

## 3. Filter Data

The quality of images from the dataset are categorized as follows:

<b> 1) No quality flaws </b>

Image content is recognizable for sighted people to caption the image.

<b> 2) Quality flaws </b>

The image content is not fully recognizable but some information can be retrieved from it. The categories of quality flaws are as follows:

- Blur: Is the image blurry?
- Bright: Is the image too bright?
- Dark: Is the image too dark?
- Obscured: Is the scene obscured by the photographer?
- Framing: Are parts of necessary items missing from the image?
- Rotation: Does the image need to be rotated for proper viewing?

<b> 3) Not captionable </b>

The quality issues are too severe to recognize visual content.

## 3.1 Images with no quality flaws

In [10]:
tr_captionable_df = tr_captions_df[~tr_captions_df.caption.str.contains('Quality issues are too severe', case=False)]
tr_captionable_df.query('caption = "A green bag of potato chips from the store"')

ValueError: cannot assign without a target object

## 3.2 Poor quality images for caption generation

In [None]:
tr_not_captionable_df = tr_captions_df[tr_captions_df.caption.str.contains('Quality issues are too severe', case=False)]
tr_not_captionable_df.shape

## 4. Find total captions per image

In [None]:
tr_grouped_captions = tr_captions_df.groupby(['image_id'])

In [None]:
print(f'Total number of training images: {len(tr_grouped_captions.groups)}')
print(f'Training captions per image: {tr_grouped_captions.groups.size()}')

In [None]:
txt_detected_df = grouped_captions['text_detected']
txt_detected_df.head()

## Analyze Data

In [None]:
new_df = captions_df.reset_index()
unique_images_count = new_df['image_id'].unique()
print(f'There are {len(unique_images_count)} images.')

#### Total number of unique images

In [None]:
attr_count_df = grouped_captions.count()
type(attr_count_df)

#### Total number of captions for each image

## Write Data to CSV

In [None]:
captions_df.to_csv(r'captions.csv', index=True)

## Data Visualization

In [None]:
import matplotlib.pyplot as plt

%matplotlib inline

In [None]:
df = captions_df.text_detected.value_counts()
df.plot.barh()
plt.title('How many images have text in it?')
plt.xlabel("No. of images")
plt.ylabel("Contains text")
plt.show()
;