# Libraries

In [None]:
import getpass
import os
import shutil
import xml.etree.cElementTree as ET
from zipfile import ZipFile

import numpy as np
import pandas as pd
from lxml import etree

# Extracting zip file

In [None]:
# Archive that holds the images and the train/test CSV files.
file_name = "dataset.zip"

# Open the archive read-only. The handle is deliberately NOT named `zip`:
# notebook variables are kernel-global, so that name would replace the
# builtin zip() with a closed ZipFile for every cell executed afterwards.
with ZipFile(file_name, 'r') as archive:
  # List the archive contents so the reader can see what is being unpacked.
  archive.printdir()

  # Unpack everything into the current working directory.
  print('Extracting all the files now...')
  archive.extractall()
  print('Done!')

# Bifurcate into Train and Test images

In [None]:
# One sub-folder per split. makedirs(..., exist_ok=True) also creates missing
# parent folders and is a no-op when the folder already exists, so the cell
# can be re-run safely and does not fail if './dataset/images' is absent.
paths = ['./dataset/images/Train', './dataset/images/Test']
for path in paths:
  os.makedirs(path, exist_ok=True)

In [None]:
# Load the annotation tables for both splits.
train_df = pd.read_csv('./dataset/train.csv')
test_df = pd.read_csv('./dataset/test.csv')

# An image can appear on several rows, so count distinct image paths
# rather than rows.
n_train_images = len(train_df.image_path.unique())
n_test_images = len(test_df.image_path.unique())

print('Training file contains :', n_train_images, 'images')
print('Testing file contains :', n_test_images, 'images')

Training file contains : 7874 images
Testing file contains : 2092 images


In [None]:
# Move every image listed in a split's CSV into that split's folder.
#
# - Iterate over *unique* paths: the CSVs can list the same image on several
#   rows, and moving a file a second time raises because the source is gone.
# - Handle errors per file: a single missing image must not silently abort
#   the rest of the split (or the whole test split), which is what a blanket
#   `except: pass` around both loops did.
# - Skip files that are already in place so the cell can be re-run.
split_dirs = [
    (train_df, './dataset/images/Train/'),
    (test_df, './dataset/images/Test/'),
]
for split_df, dest_dir in split_dirs:
  for image_path in split_df.image_path.dropna().unique():
    src = './dataset/images/' + image_path
    dst = dest_dir + image_path
    if os.path.exists(dst):
      continue  # already moved by a previous run
    try:
      shutil.move(src, dst)
    except OSError as e:
      # shutil.Error is an OSError subclass, so this covers both.
      print('Could not move', src, '->', dst, ':', e)

# Convert annotations to PASCAL VOC format

In [None]:
# Folder that receives one PASCAL VOC XML file per training image.
# makedirs(..., exist_ok=True) creates missing parents and is a no-op on
# re-run, so no separate existence check is needed.
path = './dataset/images/Train_xml'
os.makedirs(path, exist_ok=True)

In [None]:
# Image size written into every annotation file (same for all images here).
height = 1080
width = 1920
depth = 3

# Every box coordinate from the CSV is multiplied by this factor before it is
# written. NOTE(review): presumably the CSV boxes were annotated at half the
# 1920x1080 frame size -- confirm against the dataset description.
BBOX_SCALE = 2

XML_DIR = './dataset/images/Train_xml/'


def _build_annotation(image_name, boxes):
  """Build the PASCAL VOC <annotation> element for one image.

  image_name -- value of the `image_path` column, stored as <filename>.
  boxes      -- DataFrame rows for that image; each row becomes one <object>
                using its `name`, `xmin`, `ymin`, `xmax`, `ymax` columns.

  Returns the root ET.Element.
  """
  annotation = ET.Element('annotation')
  ET.SubElement(annotation, 'folder').text = 'images'
  ET.SubElement(annotation, 'filename').text = image_name
  ET.SubElement(annotation, 'segmented').text = '0'

  size = ET.SubElement(annotation, 'size')
  ET.SubElement(size, 'width').text = str(width)
  ET.SubElement(size, 'height').text = str(height)
  ET.SubElement(size, 'depth').text = str(depth)

  # Plain tuples (name=None) avoid any column-name mangling by itertuples.
  box_columns = boxes[['name', 'xmin', 'ymin', 'xmax', 'ymax']]
  for name, xmin, ymin, xmax, ymax in box_columns.itertuples(index=False, name=None):
    ob = ET.SubElement(annotation, 'object')
    ET.SubElement(ob, 'name').text = name
    ET.SubElement(ob, 'pose').text = 'Unspecified'
    ET.SubElement(ob, 'truncated').text = '0'
    ET.SubElement(ob, 'difficult').text = '0'
    bbox = ET.SubElement(ob, 'bndbox')
    ET.SubElement(bbox, 'xmin').text = str(xmin * BBOX_SCALE)
    ET.SubElement(bbox, 'ymin').text = str(ymin * BBOX_SCALE)
    ET.SubElement(bbox, 'xmax').text = str(xmax * BBOX_SCALE)
    ET.SubElement(bbox, 'ymax').text = str(ymax * BBOX_SCALE)

  return annotation


# groupby collects ALL rows of an image even when they are not adjacent in
# the CSV; walking the rows with a manual cursor would start a new file for a
# later run of the same image and overwrite the earlier boxes.
# sort=False keeps images in order of first appearance.
for image_name, boxes in train_df.groupby('image_path', sort=False):
  try:
    annotation = _build_annotation(image_name, boxes)
    # Base name is everything before the first '.', the same rule the upload
    # cell uses to locate the XML file, so the two must stay in sync.
    fileName = image_name.split('.')[0]
    tree = ET.ElementTree(annotation)
    tree.write(XML_DIR + fileName + ".xml", encoding='utf8')
  except Exception as e:
    # Report which image failed and carry on with the remaining images.
    print(e, 'image_path :', image_name)

# Uploading data to Roboflow

In [None]:
!pip install requests-toolbelt

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting requests-toolbelt
  Downloading requests_toolbelt-0.10.1-py2.py3-none-any.whl (54 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m54.5/54.5 KB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: requests-toolbelt
Successfully installed requests-toolbelt-0.10.1


In [None]:
import requests
import base64
import io
from PIL import Image
from requests_toolbelt.multipart.encoder import MultipartEncoder

In [None]:
# Upload every training image to Roboflow, then attach its PASCAL VOC
# annotation to the image id returned by the upload call.

ROBOFLOW_DATASET_URL = "https://api.roboflow.com/dataset/theme1-de2c9"
REQUEST_TIMEOUT = 60  # seconds; without a timeout a stalled request blocks the loop forever

# The API key must not live in the notebook. Read it from the environment,
# or prompt for it (input is not echoed). Any key that was previously
# committed in this cell should be treated as leaked and rotated.
api_key = os.environ.get("ROBOFLOW_API_KEY") or getpass.getpass("Roboflow API key: ")
if not api_key:
  raise ValueError("A Roboflow API key is required (set ROBOFLOW_API_KEY).")

count = 0
for filename in train_df.image_path.unique():
  try:
    # Load the image with PIL and re-encode it as a JPEG held in memory.
    # The context manager closes the underlying file handle.
    with Image.open("./dataset/images/Train/" + filename) as image:
      buffered = io.BytesIO()
      image.convert("RGB").save(buffered, quality=90, format="JPEG")

    # The upload endpoint takes the image as a base64 string in the body.
    img_str = base64.b64encode(buffered.getvalue()).decode("ascii")

    # `params` lets requests URL-encode the values, so file names containing
    # spaces, '&' or '#' no longer corrupt the query string.
    r = requests.post(
        ROBOFLOW_DATASET_URL + "/upload",
        params={"api_key": api_key, "name": filename, "split": "train"},
        data=img_str,
        headers={"Content-Type": "application/x-www-form-urlencoded"},
        timeout=REQUEST_TIMEOUT,
    )
    if not r.ok:
      raise RuntimeError("image upload failed (HTTP %s): %s" % (r.status_code, r.text))

    # Id of the uploaded image; the annotation is attached to it.
    img_id = r.json()['id']

    # Same base-name rule the XML conversion cell used when writing the file.
    annotation_filename = filename.split('.')[0] + '.xml'

    # Read the annotation as raw bytes: the file is sent exactly as written,
    # independent of the platform's default text encoding, and the handle is
    # closed right away.
    with open("./dataset/images/Train_xml/" + annotation_filename, "rb") as annotation_file:
      annotation_bytes = annotation_file.read()

    r = requests.post(
        ROBOFLOW_DATASET_URL + "/annotate/" + img_id,
        params={"api_key": api_key, "name": annotation_filename},
        data=annotation_bytes,
        headers={"Content-Type": "text/plain"},
        timeout=REQUEST_TIMEOUT,
    )
    if not r.ok:
      raise RuntimeError("annotation upload failed (HTTP %s): %s" % (r.status_code, r.text))

  except Exception as e:
    # Network errors can embed the full request URL (including the key) in
    # their message, so redact the key before it reaches the cell output.
    print(str(e).replace(api_key, '***'), 'filename : ', filename)

  # Progress marker every 500 images (failed images are counted too).
  count += 1
  if count % 500 == 0:
    print(count)