**Description**:  
This code helps cleaning filenames and create a new file with the required metadata information in the following structure:  
    {  
        "original_filename":"",  
        "filename_without_timestamp":"",  
        "timestamp":"",  
        "repo_owner":"",  
        "repo_name":""  
    }  

Due to inconsistencies in the repository naming, unclear names for the variables repo_owner and repo_name occured.  
The file **data/helper/helper_repos_metadata.json** was created to provide clear metadata of each repository. The names of the orginal metadata 
were not changed but manuelly checked with automated created ones from this notebook.

In [None]:
import pandas as pd # package for data manipulation
import json # package to work with .json
import os # package to work with operating system
from pathlib import Path # package for path manipulation
import re # package to work with regex pattern
import shutil # package for path manipulation
import zipfile # package to work with ZIP files

In [7]:
path = Path('../data/repo_data_zip')

In [66]:
pattern = r"_(\d{4}-\d{2}-\d{2}_\d{2}-\d{2}-\d{2}).zip"

In [None]:
data = []
for file in path.iterdir(): # iterate through all files in provided path
    if file.is_file(): # check if item is a file
        match = re.search(pattern, file.name) # search for filename with regex pattern
        timestamp = match.group(1) if match else None  # handle cases with no match
        filename_without_timestamp = re.sub(pattern, '', file.name) # remove timestamp from filename
        data.append((file.name, filename_without_timestamp, timestamp)) # append data to list

In [None]:
df = pd.DataFrame(data, columns= ['original_filename', 'filename_without_timestamp', 'timestamp']) # create a df

In [23]:
df['filename_without_timestamp']

0                                        facebook_chisel
1                             microsoft_Swin-Transformer
2                                         openai_point-e
3                                         THUDM_ChatGLM3
4      PacktPublishing_Deep-Reinforcement-Learning-Ha...
                             ...                        
196                                ArchiveBox_ArchiveBox
197                              yerfor_GeneFacePlusPlus
198                                 myshell-ai_OpenVoice
199                           facebookresearch_Detectron
200                                 zylon-ai_private-gpt
Name: filename_without_timestamp, Length: 201, dtype: object

In [None]:
df['repo_owner'] = df['filename_without_timestamp'].apply(lambda x: x.split('_')[0]) # extract repo_owner from filename_without_timestamp

In [None]:
df['repo_name'] = df['filename_without_timestamp'].apply(lambda x: x.split('_')[1]) # extract repo_name from filename_without_timestamp

In [None]:
# df.to_json('../data/helper/helper_repos_metadata_2.json', orient='records', indent=4)

### Test

In [68]:
test_str = 'ageitgey_face_recognition_2025-06-07_13-25-45.zip'
remove_timestamp = re.sub(pattern, '', test_str)
remove_timestamp

'ageitgey_face_recognition'

In [None]:
repo_owner = remove_timestamp.split('_')[0]
repo_owner

'ageitgey'

In [71]:
repo_name = remove_timestamp.split('_')[1]
repo_name

'face'

In [2]:
with open('../data/helper/helper_repos_metadata.json', 'r') as f:
    loaded_metadata = json.load(f)

In [7]:
#path_to_test = 'facebook_chisel_2025-06-07_13-23-17.zip'
path_to_test = 'cookiecutter_cookiecutter_2025-06-07_13-18-47.zip'

In [75]:
loaded_metadata[0]

{'original_filename': 'facebook_chisel_2025-06-07_13-23-17.zip',
 'filename_without_timestamp': 'facebook_chisel',
 'timestamp': '2025-06-07_13-23-17',
 'repo_owner': 'facebook',
 'repo_name': 'chisel'}

In [8]:


#cnt = 1
for dictionary in loaded_metadata:
    # if cnt >= 2:
    #     break
    if dictionary['original_filename'] == path_to_test:
        print('it fits')

        repo_owner = dictionary['repo_owner']
        repo_name = dictionary['repo_name']

    #cnt += 1


it fits


In [9]:
repo_name

'cookiecutter'