In [1]:
# Global

import os
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
import shutil
from natsort import natsort_keygen
import datetime as dt
from IPython.display import clear_output
import time

# Custom
from file_manager import File_Manager
from df_manager import DF_Manager

In [2]:

# Publishers that get their own top-level folder (canonical spelling).
PUBLISHER_LIST = [
    'Abstract Studio',
    'Archaia',
    'Archie Comics',
    'Aspen MLT',
    'Avatar Press',
    'Boom! Studios',
    'Dark Horse Comics',
    'DC Comics',
    'Dynamite Entertainment',
    'IDW Publishing',
    'Image',
    'Marvel',
    'Top Cow',
    'Vertigo',
    'Wildstorm',
    'Zenescope Entertainment',
]

# Comparison keys for PUBLISHER_LIST, index-aligned with it:
# lower-cased and with "!" stripped.
PL_LOWER = [name.replace("!", "").lower() for name in PUBLISHER_LIST]

# Parse Files in HOME_PATH

In [3]:
# Raw string: the backslashes are literal path separators, not escape sequences
# ("\-" and "\_" are invalid escapes and trigger a warning in a normal string).
HOME_PATH = r"D:\-=_Comics_=-\__New Stuff"
#####################################################################
# Get the file list from the folder that is to be emptied

# Cache files that are deleted on sight and never treated as comics
JUNK_FILES = ('Thumbs.db', 'covers.db')

tree = []

# Creates a list of (folder path, file names) for all subfolders that contain files
for (path, dirs, files) in os.walk(HOME_PATH, topdown=True):
    for junk in JUNK_FILES:
        if junk in files:
            files.remove(junk)
            os.remove(os.path.join(path, junk))
    if files:
        tree.append((path, files))

In [4]:
###################################################################
# Open each comic file and retrieve its metadata
# File check to make sure they are readable and no errors

FILE_MANAGER = File_Manager()
ERROR_LOG = []
# Files that are not comic archives. They are reported after the scan because
# clear_output(wait=True) wipes anything printed while the progress display runs.
SKIPPED_FILES = []
COMIC_EXTENSIONS = (".cbr", ".cbz")

total_folders = len(tree)
time_start = time.time()

for current_folder, (directory, files) in enumerate(tree):
    total_files = len(files)

    for current_file, file in enumerate(files):
        time_now = str(dt.timedelta(seconds = (time.time() - time_start)))

        # Progress display, replaced in place on every file
        print(f"Progress:  {current_folder}/{total_folders} {(current_folder/total_folders)*100:.2f}%  -  {time_now}")
        print(f"{directory}  -  {current_file}/{total_files} {(current_file/total_files)*100:.2f}%")
        print(file)
        clear_output(wait=True)

        full_path = os.path.join(directory, file)
        if file.lower().endswith(COMIC_EXTENSIONS):
            FILE_MANAGER.parse_file(full_path, file)
        else:
            SKIPPED_FILES.append(full_path)

for skipped_path in SKIPPED_FILES:
    print("Not CBR/CBZ - ", skipped_path)

FILE_MANAGER.print_errors()

Number of import errors:    0


In [5]:
# Turn the parsed records (a list of dictionaries) into a DataFrame for processing
df = pd.DataFrame(data=FILE_MANAGER.FILE_LIST)
# Single Folder Run - snapshot of the import
df.to_csv(path_or_buf='file_list.csv')

## Import from csv (If working outside home)
# Import from last csv instead of reprocessing
# df = pd.read_csv('file_list.csv')



In [49]:
# Global Folder Run
# df.to_csv('comic_list.csv')
# df

# df_global['Year'] = df_global['Year'].astype(int)

# Data Check

In [6]:
# Tidy the Series / Volume / Number columns ahead of the type checks.
# Results are assigned back to the column instead of using chained inplace=True,
# which does not write through to df under pandas Copy-on-Write.
df['Series'] = df['Series'].str.strip()
# Checking Volume numbers: a missing volume is stored as 0
df['Volume'] = df['Volume'].fillna(0)

# Fixing issue number caveats: non-numeric issue "numbers" and their numeric stand-ins
SPECIAL_ISSUE_NUMBERS = {'½': '0.5', '1½': '1.5', '∞': '999', 'Omega': '1'}


def _normalize_issue_number(raw):
    """Return the issue number `raw` as numeric text.

    - Values listed in SPECIAL_ISSUE_NUMBERS are replaced by their stand-in.
    - Letters are dropped and the value is marked with a trailing ".1", so
      "10AU" becomes "10.1" and "25.BEY" becomes "25.1".
    - A value made up of letters only (for example "nan" for a missing number)
      is returned unchanged, so the float conversion check can report it
      instead of this step failing on an empty string.
    """
    text = str(raw)
    if text in SPECIAL_ISSUE_NUMBERS:
        return SPECIAL_ISSUE_NUMBERS[text]

    kept = "".join(ch for ch in text if not ch.isalpha())
    if len(kept) == len(text):
        # No letters at all: nothing to fix
        return text
    if not kept:
        return text
    return kept + ("1" if kept.endswith(".") else ".1")


# Fixes issue numbers like "10AU" or "25.BEY"
# (map works on the values, so it does not depend on the index being 0..n-1)
df['Number'] = df['Number'].map(_normalize_issue_number)


# Debug frames start as empty DataFrame instances (not the DataFrame class),
# so `.empty` and iteration behave as expected even when no problem was found.
debug_day = pd.DataFrame()
debug_year = pd.DataFrame()
debug_number = pd.DataFrame()
debug_volume = pd.DataFrame()

print("======================Debug Check=======================")


def _check_cast(column, dtype, plural, type_name):
    """Try to convert df[column] to `dtype` and print the outcome.

    Prints "Passed - ..." on success. On failure prints "Failed - ..." plus the
    error, leaves the column as it was, and returns False.
    """
    try:
        df[column] = df[column].astype(dtype)
    except Exception as e:
        print(f"Failed - {plural} to {type_name}")
        print(f"{column} Error - ", e)
        return False
    print(f"Passed - {plural} to {type_name}")
    return True


_check_cast('Volume', int, 'Volumes', 'int')
_check_cast('Number', float, 'Numbers', 'float')
_check_cast('Year', int, 'Years', 'int')
_check_cast('Month', int, 'Months', 'int')

# When the Day conversion fails, list the rows whose Day is missing
if not _check_cast('Day', int, 'Days', 'int'):
    debug_day = df[df['Day'].isna()]
    if not debug_day.empty:
        print("========Days with NaN values===========")
        for row in debug_day.itertuples():
            print(row.Index, row.Series, row.Number, "  -  ", row.FilePath)

# Fixes capitalization errors in publisher names.
# Every Publisher value is compared in normalized form (lower-case, "!" removed)
# with PL_LOWER and, on a match, replaced by the spelling in PUBLISHER_LIST.
# Only the Publisher column is touched.
_canonical_publishers = dict(zip(PL_LOWER, PUBLISHER_LIST))


def _canonical_publisher(name):
    """Return the PUBLISHER_LIST spelling of `name`, or `name` itself if it is not listed."""
    if not isinstance(name, str):
        # e.g. NaN for a missing publisher: nothing to normalize
        return name
    return _canonical_publishers.get(name.lower().replace("!", ""), name)


df['Publisher'] = df['Publisher'].map(_canonical_publisher)

# Fix Publisher Names (substring replacements)
df['Publisher'] = df['Publisher'].replace("/", '&', regex=True)
df['Publisher'] = df['Publisher'].replace("BOOM! Studios", 'Boom! Studios', regex=True)

# Imprints are filed under their parent publisher (whole-value replacements)
IMPRINT_PARENTS = {
    # DC Imprints
    'I.W. Publishing': 'DC Comics',
    # Marvel Imprints
    'Marvel Digital Comics Unlimited': 'Marvel',
    'Marvel Knights': 'Marvel',
    'Max': 'Marvel',
    'Max Comics': 'Marvel',
    'Timely': 'Marvel',
    'Marvel Soleil': 'Marvel',
    'Marvel UK': 'Marvel',
    'Epic': 'Marvel',
    'Scholastic Book Services': 'Marvel',
    # Image Imprints
    'Shadowline': 'Image',
    'Skybound': 'Image',
}
df['Publisher'] = df['Publisher'].replace(IMPRINT_PARENTS)

# Report which publishers have their own folder ("Big") and which do not ("Misc")
publishers = df.Publisher.unique().tolist()
pub_big = [p for p in publishers if p in PUBLISHER_LIST]
pub_misc = [p for p in publishers if p not in PUBLISHER_LIST]
print('=======Publisher List=========')
print("Big: ", [*pub_big])
print("Misc: ", [*pub_misc])

# Show every distinct volume number found in the import
print('========Volume List========')
volume_values = df['Volume'].unique().tolist()
print(volume_values)

# Volumes above 10 are listed by file path so they can be checked by hand
suspicious_volume = df['Volume'] > 10
debug_volume = df[suspicious_volume]
if len(debug_volume.index) != 0:
    print("=========Files with problem Volumes=========")
    for problem_path in debug_volume['FilePath']:
        print(problem_path)

Passed - Volumes to int
Passed - Numbers to float
Passed - Years to int
Passed - Months to int
Passed - Days to int
Big:  ['DC Comics', 'Boom! Studios', 'Marvel', 'Dynamite Entertainment', 'Image']
Misc:  ['Vault Comics']
[0]


In [7]:
# The freshly imported rows stay reachable under their own name
df_import = df

# Load the permanent db, put the imported rows in front of it and
# natural-sort the combined frame; the index is rebuilt as 0..n-1.
sort_columns = ['Publisher', 'Series', 'Volume', 'Number', 'Year', 'Month', 'Day']
df_global = (
    pd.concat(
        [
            df_import,
            pd.read_csv(
                'comic_list.csv',
                index_col=0,
                dtype={
                    'Publisher': str,
                    'Series': str,
                    'Volume': int,
                    'Number': float,
                    'Year': int,
                    'Month': int,
                    'Day': int,
                    'FilePath': str,
                    'NewPath': str,
                },
            ),
        ],
        ignore_index=True,
    )
    .sort_values(
        by=sort_columns,
        ascending=[True] * len(sort_columns),
        key=natsort_keygen(),
    )
    .reset_index(drop=True)
)

# Debugging Area

In [None]:
# Debug view: display the cleaned import frame
df

In [None]:
# Debug view: every df_global row whose Series also occurs in df_import
df_global[df_global.Series.isin(df_import['Series'].unique().tolist())]

In [None]:
# Debug view: all rows of one series (edit the title to inspect another one)
df_global[(df_global.Series == "Justice League of America Annual")]

In [35]:
# One-off manual correction of a single FilePath.
# NOTE(review): 5059 is a row label of the df_global that existed when this was run;
# the index is rebuilt whenever df_global is re-created, so confirm the row before re-running.
df_global.at[5059, "FilePath"] = 'D:\\-=_Comics_=-\\DC Comics\\Action Comics (1938)\\Action Comics #467.cbz'

In [13]:
# One-off manual removal of rows by index label.
# NOTE(review): these labels refer to the df_global that existed when this was run;
# verify them against the current frame before re-running, or the wrong rows are dropped.
df_global = df_global.drop([2988,2989,2991,2992,2994,2995,2997,2998,3000,3001,3003,3004,3267,3268,3895,3896,3898,3899,3901,3902,3922,3923,3925,3926,12372,12373,23070,23071,24049,24050,30432,30433,34799,34800,34801,34802,34803,34804,34986,34987,34989,34990,35956,35957,38954,38955,38957,38958,46583,46584,47264,47265,55448,55449,55451,55452,55454,55455,57381,57382,59426,59427,63973,63974,63976,63977,63979,63980,65759,65760,65762,65763,71034,71035], axis='index')

In [10]:
# Debug check: print every publisher in number_publishers that has no entry in root_dict.
# NOTE(review): number_publishers and root_dict are only assigned in the
# "Create Move Dictionary" cell further down, so this cell fails on a fresh kernel
# unless that cell has been run first.
# The bare expression below is not the last statement of the cell, so it displays nothing.
number_publishers

for p in number_publishers:
    if p not in root_dict:
        print(p)

DC Comics


In [14]:
# Overwrite the permanent db ('comic_list.csv') with the current df_global
df_global.to_csv('comic_list.csv')

# Create Move Dictionary

In [8]:


# Dictionary is for sorting and debugging
#   root_dict  : publisher -> safe title -> folder key -> issue numbers filed there
#   title_dict : safe title -> folder key -> issue numbers (publishers of that title merged)
root_dict = {}
total_publishers = df_import.Publisher.unique().tolist()
for p in total_publishers:
    root_dict[p] = {}

title_dict = {}

titles=df_import.Series.unique()

# For testing
# titles = ['Deadpool & The Mercs For Money']


def _folder_key(entry, split_years):
    """Return the folder_dict key / folder suffix for one target-folder entry.

    `entry` is indexed as (year, month, day). When the year is listed in
    `split_years` the key is "year-month", otherwise it is the year itself.
    Every key is derived from the entry it names, so the key used for "the next
    folder" always matches the key that was stored for that folder.
    """
    if entry[0] in split_years:
        return f"{entry[0]}-{entry[1]}"
    return entry[0]


for title in tqdm(titles):
    # For creating folder names.  Removing special characters
    safe_title = title.replace(":", " -")
    safe_title = safe_title.replace('/', '-')
    safe_title = safe_title.replace('?', '')
    safe_title = safe_title.replace('"', '')
    safe_title = safe_title.replace('*', '')

    title_dict[safe_title] = {}

    # Filter by series
    cond_series = (df_global['Series'] == title)
    df_by_series = df_global[cond_series]

    # Creates list of unique publishers
    number_publishers = df_by_series.Publisher.unique().tolist()
    # catch for series with same name, multiple publishers.
    for p in number_publishers:
        if p not in root_dict:
            root_dict[p]={}

    for publisher in number_publishers:

        # folder key -> issue numbers, for this title under this publisher
        folder_dict = {}
        # Filter by publisher
        cond_publisher = (df_global['Publisher'] == publisher)
        df_by_publisher = df_global[cond_publisher & cond_series].copy()
        df_by_publisher.sort_values(by=['Year', 'Month', 'Day'], ascending=[True, True,True], inplace=True, key=natsort_keygen())

        # Sorting for smaller publishers
        if publisher not in PUBLISHER_LIST:
            publisher_path = f"Misc\\{publisher}"
        else:
            publisher_path = publisher

        # Check Publisher path for pre-existing folders
        target_path = f"D:\\-=_Comics_=-\\{publisher_path}\\"

        DF = DF_Manager(df_by_publisher)
        # target_folders entries are used below as (year, month, day);
        # two_1_list is used as the years that need a "year-month" key
        # (presumably years holding two issue #1s - TODO confirm in DF_Manager).
        target_folders, two_1_list  = DF.return_target_folders()
        two_1_years = list(two_1_list)
        # print(target_folders)

        # Create dictionary reference for every target folder
        for y in target_folders:
            folder_dict[_folder_key(y, two_1_years)]=[]

        #Number of Volumes Check
        volumes = df_by_publisher['Volume'].unique()

        for volume in volumes:
            # Filter by volume
            cond_volume = (df_global['Volume'] == volume)
            df_by_volume = df_global[cond_volume & cond_publisher & cond_series].copy()
            df_by_volume.sort_values(by=['Year', 'Number'], ascending=[True, True], inplace=True, key=natsort_keygen())

            years = df_by_volume['Year'].unique()

            for year in years:
                cond_year = (df_global['Year'] == year)
                df_by_year = df_global[cond_year & cond_volume & cond_publisher & cond_series].copy()

                # Problem Checker
                for row in df_by_year.itertuples():
                    try:
                        issue_date = dt.datetime(row.Year, row.Month, row.Day)
                    except ValueError:
                        # Invalid calendar date: report it and stop processing this year's rows
                        print(f"ValueError - {title} {row.Number} - {row.Year}-{row.Month}-{row.Day}")
                        break

                    issue_year = row.Year
                    issue_month = row.Month
                    issue_day = row.Day
                    issue_num = row.Number
                    # 0 is never a folder key, so an unmatched issue ends in the KeyError report below
                    save_year = 0
                    # Reset per row so the KeyError report never shows a stale or undefined value
                    next_date = None
                    # 365 * .85 days (about 10 months) is the window used for the pre-issues below
                    mod_0 = .85
                    pre_issues = (issue_num == 0 or issue_num == 1.5 or issue_num == .5 or issue_num == .1 or issue_num == -1)

                    # Sort issues by Year-Month-Day
                    for i in range(len(target_folders)):
                        tf_index = _folder_key(target_folders[i], two_1_years)

                        prev_date = dt.datetime(target_folders[i][0], target_folders[i][1], target_folders[i][2])
                        if i == len(target_folders)-1:
                            # Last folder: there is no later folder to compare against
                            if pre_issues and (prev_date - issue_date).days < 365 * mod_0:
                                save_year = tf_index
                            elif issue_date >= prev_date:
                                save_year = tf_index
                        else:
                            # Standard sort
                            tf_index_next = _folder_key(target_folders[i+1], two_1_years)
                            next_date = dt.datetime(target_folders[i+1][0], target_folders[i+1][1], target_folders[i+1][2])

                            if pre_issues and (next_date - issue_date).days < 365 * mod_0:
                                # bumps up save year to the next entry if it's released less than 10 months before issue 1
                                save_year = tf_index_next
                            elif pre_issues and (prev_date - issue_date).days < 365 * mod_0:
                                # keeps the current entry if it's released less than 10 months before issue 1
                                save_year = tf_index
                            elif prev_date <= issue_date < next_date:
                                save_year = tf_index

                    # Add to dictionary
                    try:
                        folder_dict[save_year].append(issue_num)
                    except KeyError:
                        print(f"Key Error - {row.Index} {title} {issue_num} - SaveYear {save_year} : IssueYear {issue_year} - {next_date} - {issue_date}")
                        break

                    # Saves the new correct path to df
                    move_path = f"D:\\-=_Comics_=-\\{publisher_path}\\{safe_title} ({save_year})\\"
                    df_global.at[row.Index, 'NewPath'] = move_path

        title_dict[safe_title].update(folder_dict)
        # Record only this title under this publisher; updating with the whole
        # title_dict would copy every title seen so far into the publisher.
        root_dict[publisher].setdefault(safe_title, {}).update(folder_dict)


  0%|          | 0/20 [00:00<?, ?it/s]

In [12]:
# for y in root_dict['Marvel']['Chaos War']:
#     print(y,' - ', root_dict['Marvel']['Chaos War'][y])

# root_dict

# CAUTION Permanent - Move Files

In [11]:
# Move every file of the imported series to its NewPath folder.
#   count_preexist : files skipped because the target file already exists
#   count_moved    : files moved successfully
#   dup_list       : df_global row labels of the skipped files
#   dup_files      : source paths of the skipped files
count_preexist = 0
count_moved = 0
dup_list = []
dup_files = []

df_move = df_global[df_global.Series.isin(df_import['Series'].unique().tolist())]

for row in tqdm(df_move.itertuples(), total=len(df_move)):
    move_folder = row.NewPath
    if not isinstance(move_folder, str):
        # No target folder was assigned (NewPath is empty/NaN): nothing to move to
        print(f'Error: no NewPath for {row.FilePath}')
        continue

    _, file_name = os.path.split(row.FilePath)
    # NewPath ends with a path separator, so plain concatenation gives the target file
    check_file = move_folder+file_name

    if row.FilePath != check_file:
        os.makedirs(move_folder, exist_ok=True)

        if os.path.exists(check_file):
            # print(f"Pre-Exists: {move_folder}{file_name} ")
            # row.Index is the row label; row.index would be the tuple's index() method
            dup_list.append(row.Index)
            dup_files.append(row.FilePath)
            count_preexist+=1
        else:
            try:
                shutil.move(row.FilePath, check_file)
            except OSError as err:
                # shutil.Error is a subclass of OSError, so failed moves land here too
                print(f'Error: {file_name} - {move_folder} ({err})')
            else:
                count_moved+=1
                df_global.at[row.Index, 'FilePath'] = check_file

# Left disabled on purpose: dup_list holds row labels, so the drop below can be
# enabled once the duplicates have been reviewed.
# df_global = df_global.drop(dup_list, axis='index')

df_global.to_csv('comic_list.csv')

# for f in dup_files:
#     os.remove(f)
#
# print(f"Total:  {df_move.size} -  Unmoved: {count_preexist}   -   Moved: {count_moved}   -  Duplicates: {len(dup_files)}")

0it [00:00, ?it/s]

In [12]:
# Delete old empty folders

end_tree = []
count_deleted = 0

# Creates a list of all subfolders that hold neither files nor subfolders.
# Each entry is (path, number of subfolders); the count is always 0 here, so the
# list stays in os.walk order.
for (path, dirs, files) in os.walk(HOME_PATH, topdown=True):
    if not files and not dirs:
        end_tree.append((path, len(dirs)))

for folder_path, _ in end_tree:
    try:
        os.rmdir(folder_path)
    except OSError as err:
        # Report why the folder was kept (in use, permissions, no longer empty, ...)
        print(f"{folder_path} could not be deleted. ({err})")
    else:
        count_deleted+=1
print(f"Deleted: {count_deleted}")

Deleted: 0


In [None]:
# TODO: add a check for pre-existing folders that should not be deleted.
# TODO: handle a pre-existing folder that is missing earlier issues which would lower its year value (copy, move, delete).