In [41]:
# Load the data into a pandas dataframe:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [55]:
# roomifAI_dir = '/Users/aqborromeo/Documents/MTECH-AI/scripts/NUS/NUS-ISS/Modules/PRS/PracticeModule/PRS-PM-2023-07-01-GRP5-roomifAI/SystemCode/data_processing'
roomifAI_dir = '/content/drive/MyDrive/MTECH_IS_Project/SEM02'



In [56]:
!pip install gitpython



In [57]:
# importing libary
import zipfile
import os
import shutil
import requests
import git
import re
import pandas as pd
import logging

In [58]:
# Define the git repo url and the destination directory
git_repo_url = "https://github.com/valexande/IKEA-Dataset.git" # Change this to your git repo url
dest_dir = roomifAI_dir + "/dataset" # Change this to your destination directory
log_dir = roomifAI_dir + "/logs" # Change this to your prepared log directory

# Create Directory if don't exist
os.makedirs(dest_dir, exist_ok=True)
print(dest_dir)


# Configure logging
log_file = log_dir + '/get_dataset.log'
logging.basicConfig(filename=log_file, level=logging.INFO, format='%(asctime)s - %(levelname)s: %(message)s')

# Create Directory if don't exist
os.makedirs(log_dir, exist_ok=True)
print(log_dir)


/content/drive/MyDrive/MTECH_IS_Project/SEM02/dataset
/content/drive/MyDrive/MTECH_IS_Project/SEM02/logs


In [59]:
# Define a function that will download and unzip a git repo
def download_git_repo(url, dest):
    # Create a temporary directory to store the downloaded zip file
    temp_dir = roomifAI_dir + "/temp/ikea_dir" # Change this to your temporary directory
    temp_ikea_dir = os.path.abspath(temp_dir)

    if not os.path.exists(temp_dir):
        os.makedirs(temp_dir)
    elif not os.path.exists(dest_dir):
        os.makedirs(dest_dir)
    elif not os.path.exists(log_dir):
        os.makedirs(log_dir)
    else:
        shutil.rmtree(temp_dir) # Delete the temporary directory and its contents
        shutil.rmtree(dest_dir) # Delete the temporary directory and its contents
        os.makedirs(temp_dir) # Re-creating a empty directory
        os.makedirs(dest_dir) # Re-creating a empty directory

    print(dest_dir)
    file_name = url.split("/")[-1] # Get the file name from the url
    zip_path = os.path.join(temp_dir, file_name) # Get the full path of the zip file
    response = git.Git(temp_dir).clone(url) # Download the git repo  from the url

    return(temp_ikea_dir)

In [60]:
# Define the function that will unzip the zip files
def unzip_ikeads(url, get_temp_ikea_dir):
  temp_dir = get_temp_ikea_dir # Get the value of the temp directory from previous function
  file_name = url.split("/")[-1] # Get the file name from the url
  print(file_name)
  zip_path = os.path.join(temp_dir, file_name) # Get the full path of the zip file
  print(zip_path)
  # print (zip_path) # this include ".git" ikeatemp/IKEA-Dataset.git
  temp_ds_path = zip_path.split(".")[0] # spliting to get the relative path and directory name
  print(temp_ds_path)
  # Get the list of files in the source directory
  files = os.listdir(temp_ds_path)
  print(files)

  # Loop through the files and unzip them if they are zip files
  for file in files:
    if file.endswith(".zip"): # Check if the file is a zip file
      file_path = os.path.join(temp_ds_path, file) # Get the full path of the file
      zip_file = zipfile.ZipFile(file_path) # Create a ZipFile object
      zip_file.extractall(temp_dir) # Extract all the contents to the destination directory
      list_des = os.listdir(temp_dir)
      zip_file.close() # Close the ZipFile object

  return temp_dir


In [61]:
# Function to get all jpg file paths in a directory and its subdirectories
def list_jpg_files(get_temp_ikea_dir):
    jpg_files = []
    for root, dirs, files in os.walk(get_temp_ikea_dir):
        for file in files:
            if file.lower().endswith(".jpg"):
                jpg_files.append(os.path.join(root, file))
    return jpg_files # This is the list to call when you want to ge the list of path


# Function that will put the list of path and add them in a dataframe and process them
def df_jpg_files(get_temp_ikea_dir):
    # Get the list of jpg file paths
    jpg_file_paths = list_jpg_files(get_temp_ikea_dir)

    # Create a DataFrame and process the paths
    jpg_df = pd.DataFrame({'FilePath': jpg_file_paths})

    # Replace backslashes with forward slashes, convert to lowercase, and replace spaces with underscores
    jpg_df['FilePath'] = jpg_df['FilePath'].apply(lambda x: x.replace("/", ",").lower().replace(" ", "_"))

    # Split the 'FilePath' column into separate columns using a comma as delimiter
    jpg_df['Columns'] = jpg_df['FilePath'].str.split(',')

    # Determine the maximum number of path parts across all rows
    max_path_parts = jpg_df['Columns'].apply(len).max()

    # Create separate columns for each path part
    for i in range(max_path_parts):
      jpg_df[f'Column_{i+1}'] = jpg_df['Columns'].apply(lambda x: x[i] if len(x) > i else None)

    # Drop the intermediate 'PathParts' column
    jpg_df = jpg_df.drop(columns=['Columns','FilePath','Column_1','Column_2','Column_3','Column_4','Column_5','Column_6','Column_7','Column_8'])

    # print(jpg_df)
    return jpg_file_paths

# Function to get all txt file paths in a directory and its subdirectories
def list_txt_files(get_temp_ikea_dir):
    txt_files = []
    for root, dirs, files in os.walk(get_temp_ikea_dir):
        for file in files:
            if file.lower().endswith(".txt"):
                txt_files.append(os.path.join(root, file))
    return txt_files # This is the list to call when you want to ge the list of path

# Function that will put the list of path and add them in a dataframe and process them
def df_txt_files(get_temp_ikea_dir):
    # Get the list of jpg file paths
    txt_file_paths = list_txt_files(get_temp_ikea_dir)

    # Create a DataFrame and process the paths
    txt_df = pd.DataFrame({'FilePath': txt_file_paths})

    # Replace backslashes with forward slashes, convert to lowercase, and replace spaces with underscores
    txt_df['FilePath'] = txt_df['FilePath'].apply(lambda x: x.replace("/", ",").lower().replace(" ", "_"))

    # # Split the 'FilePath' column into separate columns using a comma as delimiter
    # txt_df[['Column1', 'Column2', 'Column3', 'Column4', 'Column5']] = txt_df['FilePath'].str.split('/', expand=True)

    # Split the 'FilePath' column into separate columns using a comma as delimiter
    txt_df['Columns'] = txt_df['FilePath'].str.split(',')

    # Determine the maximum number of path parts across all rows
    max_path_parts = txt_df['Columns'].apply(len).max()

    # Create separate columns for each path part
    for i in range(max_path_parts):
        txt_df[f'Column_{i+1}'] = txt_df['Columns'].apply(lambda x: x[i] if len(x) > i else None)

    # Drop the intermediate 'PathParts' column
    txt_df = txt_df.drop(columns=['Columns','FilePath','Column_1','Column_2','Column_3','Column_4','Column_5','Column_6','Column_7','Column_8'])

    # print(txt_df)
    return txt_file_paths

In [66]:
# Function to move jpg files based on the source file path to a centralize location

# Function to move files based on the source file path
def move_files(jpg_path_list, dest_dir):

    # Destination Create the directory where all consolidated jpg files be located
    destination_dir = dest_dir + "/ikeads_all_images"

    # Create Directory if don't exist
    os.makedirs(destination_dir, exist_ok=True)
    print(destination_dir)

    # Create a DataFrame and process the paths
    source_df = pd.DataFrame({'FilePath': jpg_path_list})


    # df = pd.read_csv('your_dataframe.csv')  # Replace with your DataFrame source

    for index, row in source_df.iterrows():
        source_file = row['FilePath']  # Replace with the actual column name containing the source file paths
        if not os.path.exists(source_file):
            logging.error(f"Source file not found: {source_file}")
            continue

        destination_file = os.path.join(destination_dir, os.path.basename(source_file))
        try:
            shutil.move(source_file, destination_file)
            logging.info(f"Moved file: {source_file} to {destination_file}")
        except Exception as e:
            logging.error(f"Error moving file: {source_file} - {str(e)}")

    return destination_dir


In [63]:
# Function that will cleanup unused files
def housekeeping(temp_ikea_dir):
    shutil.rmtree(temp_ikea_dir) # Delete the temporary directory and its contents
    shutil.rmtree(dest_dir) # Decom dataset directory and its contents NOTE: ONLY ENABLE THIS WHEN YOU DONT NEED THE DS ANYMORE!

In [67]:

def main():

    # Call the first function with the git repo url and destination directory as arguments
    get_temp_ikea_dir = download_git_repo(git_repo_url, dest_dir)

    # # Call function to unzip
    unzip_ikea_files = unzip_ikeads(git_repo_url, get_temp_ikea_dir)

    # Call Function which will get the jpg and txt and
    jpg_path_list = df_jpg_files(unzip_ikea_files)
    txt_path_list = df_txt_files(unzip_ikea_files)

    # # Call the function what will consolidate the all files in one directory
    ikeads_all_images = move_files(jpg_path_list, dest_dir)

    count = len([name for name in os.listdir(ikeads_all_images) if os.path.isfile(os.path.join(ikeads_all_images, name))])
    print('Total Number of images:', count)
    img_list = os.listdir(ikeads_all_images)
    print('Images Name:',img_list)

    # # Call the function which will clean up temporary files
    # housekeeping(get_temp_ikea_dir)

    # Close the logging handler
    logging.shutdown()


if __name__ == '__main__':
   main()


/content/drive/MyDrive/MTECH_IS_Project/SEM02/dataset
IKEA-Dataset.git
/content/drive/MyDrive/MTECH_IS_Project/SEM02/temp/ikea_dir/IKEA-Dataset.git
/content/drive/MyDrive/MTECH_IS_Project/SEM02/temp/ikea_dir/IKEA-Dataset
['.git', 'Baby Children.zip', 'Bathroom.zip', 'Bedroom.zip', 'Dining Room.zip', 'Hallway.zip', 'HomeOffice.zip', 'IKEARequest.py', 'Kitchen 1.zip', 'Kitchen 2.zip', 'Laundry.zip', 'Living Room 1.zip', 'Living Room2.zip', 'README.md']
/content/drive/MyDrive/MTECH_IS_Project/SEM02/dataset/ikeads_all_images
Total Number of images: 10686
Images Name: ['00319526.jpg', '00421411.jpg', '20213883.jpg', '20421293.jpg', '30178006.jpg', '30208348.jpg', '30421297.jpg', '30421301.jpg', '40084861.jpg', '40199287.jpg', '40414140.jpg', '40426925.jpg', '60137571.jpg', '60368726.jpg', '60426934.jpg', '70137575.jpg', '80307223.jpg', '80421332.jpg', '00364076.jpg', '00440235.jpg', '10319719.jpg', '20364037.jpg', '27470460.jpg', '40427109.jpg', '70373039.jpg', '80421252.jpg', '00028508.jpg