# Data Preprocessing

## Imports

In [0]:
import pandas as pd
from skimage import io,color
from matplotlib import pyplot as plt
import cv2
import os
import numpy as np

## Connect to GDrive

In [2]:
# Mount Google Drive inside the Colab VM at /gdrive (asks for an authorization
# code, as shown in the cell output).
from google.colab import drive
drive.mount('/gdrive')
# Make /gdrive the working directory: the resource paths used later in this
# notebook are relative ('My Drive/...') and resolve against it.
%cd /gdrive

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /gdrive
/gdrive


## Core Code

In [0]:
# Resources path
# NOTE: base_filepath is a relative path, so it resolves against the current
# working directory (set to /gdrive by the `%cd` in the Drive cell).
base_filepath = 'My Drive/SoccerAI/train_resources'
# CSV listing the available classes (read below as columns 'id' and 'label').
class_description_file = 'class-descriptions-boxable.csv'
# CSV with the labeled bounding boxes (image id + box coordinates).
labeled_bbox_file = 'train-annotations-bbox.csv'
# CSV with one entry per image and its metadata (URL, author, title, ...).
train_image_file = 'train-images-boxable-with-rotation.csv'

In [0]:

# Load the three CSV resources stored directly under base_filepath.

# Available classes. The file is read with header=None, so the two column
# names ('id', 'label') are supplied here.
classes_df = pd.read_csv(
    f'{base_filepath}/{class_description_file}',
    names=['id', 'label'],
    header=None,
)

# Labeled data: image id + bounding-box coordinates.
labeled_bbox_df = pd.read_csv(f'{base_filepath}/{labeled_bbox_file}')

# Images with their metadata (author, URL, title, ...).
train_image_df = pd.read_csv(f'{base_filepath}/{train_image_file}')

In [5]:
# Only the Person and Football classes are used in this notebook.
classes_of_interest = ['Person', 'Football']

# Human-readable label -> class id, taken from the first row of classes_df
# whose 'label' matches.
classes_label = {
    wanted_label: classes_df.loc[classes_df['label'] == wanted_label, 'id'].values[0]
    for wanted_label in classes_of_interest
}

print(classes_label)

# Reverse lookup: class id -> human-readable label.
revert_dict = {class_id: label for label, class_id in classes_label.items()}

{'Person': '/m/01g317', 'Football': '/m/01226z'}


In [0]:
# Upper bound on the number of distinct images kept per class (kept small for
# the moment; raise it to use more of the dataset).
MAX_IMAGES_PER_CLASS = 1000
# Fixed seed so the shuffle below yields the same row order on every run.
SHUFFLE_SEED = 42

# For every class of interest, keep the metadata rows of the first
# MAX_IMAGES_PER_CLASS images that have at least one bounding box of that class.
classes_df_dict = {}
for class_of_interest in classes_of_interest:
    class_id = classes_label[class_of_interest]
    bbox_df = labeled_bbox_df[labeled_bbox_df['LabelName'] == class_id]
    img_id = bbox_df['ImageID'].unique()[:MAX_IMAGES_PER_CLASS]
    img_df = (
        train_image_df[train_image_df['ImageID'].isin(img_id)]
        .sample(frac=1, random_state=SHUFFLE_SEED)  # reproducible shuffle
        .reset_index(drop=True)
    )
    classes_df_dict[class_of_interest] = img_df

Index(['ImageID', 'Subset', 'OriginalURL', 'OriginalLandingURL', 'License',
       'AuthorProfileURL', 'Author', 'Title', 'OriginalSize', 'OriginalMD5',
       'Thumbnail300KURL', 'Rotation'],
      dtype='object')


In [0]:
# Download every selected image to <base_filepath>/<class name>/<ImageID>.jpg.
# The cell can be re-run safely: existing class folders are reused and images
# that are already on disk are not fetched again.
for class_of_interest in classes_of_interest:
    class_dir = base_filepath + '/' + class_of_interest
    os.makedirs(class_dir, exist_ok=True)  # os.mkdir would raise on a second run

    failed_downloads = 0
    for index, row in classes_df_dict.get(class_of_interest).iterrows():
        target_path = class_dir + '/' + row['ImageID'] + '.jpg'
        if os.path.exists(target_path):
            continue  # already downloaded by a previous run
        try:
            img = io.imread(row['OriginalURL'])
            io.imsave(target_path, img)
        except Exception:
            # Best effort: dead links and images that cannot be read or saved
            # are skipped. Catching Exception (not a bare except) still lets
            # a keyboard interrupt stop the cell.
            failed_downloads += 1
            # Do not leave a partially written file behind: it would be
            # mistaken for a valid image by the skip check above.
            if os.path.exists(target_path):
                os.remove(target_path)

    print('%s: %d image(s) could not be downloaded' % (class_of_interest, failed_downloads))


  " Skipping tag %s" % (size, len(data), tag))
  " Skipping tag %s" % (size, len(data), tag))
  " Skipping tag %s" % (size, len(data), tag))


In [0]:
# Gather, per class, the bounding boxes of the images that are actually on
# disk, then stack everything into a single dataframe.
train_img_per_class = {}
for class_of_interest in classes_of_interest:
    class_dir = base_filepath + '/' + class_of_interest + '/'
    # Image id = file name without its extension. Only .jpg files are
    # considered so that stray files in the folder are ignored.
    image_id = [
        os.path.splitext(image)[0]
        for image in os.listdir(class_dir)
        if image.lower().endswith('.jpg')
    ]
    # Keep only the boxes of this class that belong to a downloaded image.
    train_bbox_df = labeled_bbox_df[
        labeled_bbox_df['ImageID'].isin(image_id)
        & labeled_bbox_df['LabelName'].isin([classes_label.get(class_of_interest)])
    ]
    train_img_per_class[class_of_interest] = train_bbox_df

# DataFrame.append was removed in pandas 2.0; pd.concat stacks the per-class
# frames in one pass and keeps the original row index, as append did.
train_img_df = pd.concat(
    [train_img_per_class[class_name] for class_name in classes_of_interest]
)


In [0]:
# Split into train and test sets at the image level: for each class, the first
# 20% of its distinct image ids go to the test set, every other row is train.
test_id = []
for class_of_interest in classes_of_interest:
    class_image_ids = train_img_per_class.get(class_of_interest)['ImageID'].unique().tolist()
    holdout_count = int(0.2 * len(class_image_ids))
    test_id.extend(class_image_ids[:holdout_count])

# One boolean mask, used as-is for test and negated for train.
is_test_row = train_img_df['ImageID'].isin(test_id)
test_df = train_img_df[is_test_row]
train_df = train_img_df[~is_test_row]

In [8]:
def write_annotation_file(bbox_df, annotation_path):
    """Write one comma-separated line per bounding box of ``bbox_df``.

    Line format::

        ImageID,image path,XMin,YMin,XMax,YMax,class name

    Coordinates are written exactly as stored in ``bbox_df``.

    Side effect: an image that loads as a 2-D (grayscale) array is converted
    to RGB and saved back over its original file.

    Args:
        bbox_df: dataframe with the columns 'ImageID', 'LabelName', 'XMin',
            'XMax', 'YMin' and 'YMax'.
        annotation_path: path of the text file to create (overwritten if it
            already exists).
    """
    # Each image can own many boxes; remember which files were already
    # checked so every image is read from disk only once.
    checked_files = set()
    # `with` guarantees the file is closed even if an image fails to load.
    with open(annotation_path, 'w') as annotation_file:
        for index, row in bbox_df.iterrows():
            class_name = revert_dict.get(row['LabelName'])
            filename = base_filepath + '/' + class_name + '/' + row['ImageID'] + '.jpg'

            if filename not in checked_files:
                img = io.imread(filename)
                if img.ndim == 2:
                    # Grayscale image: rewrite it with three channels.
                    io.imsave(filename, color.gray2rgb(img))
                checked_files.add(filename)

            fields = [
                row['ImageID'],
                filename,
                str(row['XMin']),
                str(row['YMin']),
                str(row['XMax']),
                str(row['YMax']),
                class_name,
            ]
            annotation_file.write(','.join(fields) + '\n')


write_annotation_file(train_df, base_filepath + "/train_annotation.txt")
write_annotation_file(test_df, base_filepath + "/test_annotation.txt")

  " Skipping tag %s" % (size, len(data), tag))
  " Skipping tag %s" % (size, len(data), tag))
  " Skipping tag %s" % (size, len(data), tag))
