In [10]:
# %%
import pandas as pd

# Read the data
df = pd.read_csv('Data/Data.csv')
with open('Data/selected_row_ids.txt', 'r') as f:
    selected_ids = [line.strip() for line in f.readlines()]

# Convert IDs to strings
df['Id'] = df['Id'].astype(str)
selected_ids = [str(id_) for id_ in selected_ids]

# Create filtered dataframe
filtered_df = df[df['Id'].isin(selected_ids)]

# Find which IDs don't have matches
missing_ids = [id_ for id_ in selected_ids if id_ not in df['Id'].values]

print(f"Total selected IDs: {len(selected_ids)}")
print(f"Successfully matched IDs: {len(filtered_df)}")
print(f"Missing IDs: {len(missing_ids)}")
print("\nList of IDs not found in the DataFrame:")
for id_ in missing_ids:
    print(id_)

# Optional: Save the filtered DataFrame
filtered_df.to_csv('Data/filtered_data.csv', index=False)

Total selected IDs: 158
Successfully matched IDs: 152
Missing IDs: 0

List of IDs not found in the DataFrame:


In [11]:
filtered_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 152 entries, 2 to 2613
Data columns (total 26 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Unnamed: 0             152 non-null    int64  
 1   Id                     152 non-null    object 
 2   PostTypeId             152 non-null    int64  
 3   AcceptedAnswerId       35 non-null     float64
 4   ParentId               0 non-null      float64
 5   CreationDate           152 non-null    object 
 6   DeletionDate           0 non-null      float64
 7   Score                  152 non-null    int64  
 8   ViewCount              152 non-null    int64  
 9   Body                   152 non-null    object 
 10  OwnerUserId            152 non-null    float64
 11  OwnerDisplayName       0 non-null      object 
 12  LastEditorUserId       80 non-null     float64
 13  LastEditorDisplayName  0 non-null      float64
 14  LastEditDate           80 non-null     object 
 15  LastActivi

In [12]:
import pandas as pd

filtered_df = pd.read_csv('Data/filtered_data.csv')

In [13]:
filtered_df.columns

Index(['Unnamed: 0', 'Id', 'PostTypeId', 'AcceptedAnswerId', 'ParentId',
       'CreationDate', 'DeletionDate', 'Score', 'ViewCount', 'Body',
       'OwnerUserId', 'OwnerDisplayName', 'LastEditorUserId',
       'LastEditorDisplayName', 'LastEditDate', 'LastActivityDate', 'Title',
       'Tags', 'AnswerCount', 'CommentCount', 'FavoriteCount', 'ClosedDate',
       'CommunityOwnedDate', 'ContentLicense', 'CodeText', 'ImageURLs'],
      dtype='object')

In [15]:
# %%
import pandas as pd
import os
import requests
import ast
import urllib.parse
from pathlib import Path

# Create images folder if it doesn't exist
images_folder = Path('Data/images')
images_folder.mkdir(exist_ok=True)

# Read the filtered data
df = pd.read_csv('Data/filtered_data.csv')

def extract_filename_from_url(url):
    """Extract the filename from the URL"""
    return os.path.basename(urllib.parse.urlparse(url).path)

def download_images(row):
    """Download images for a single row"""
    try:
        # Convert string representation of list to actual list
        # Handle both string representations and empty cells
        if pd.isna(row['ImageURLs']) or row['ImageURLs'] == '[]':
            return
        
        urls = ast.literal_eval(row['ImageURLs'])
        if not urls:  # Skip if empty list
            return
        
        for url in urls:
            try:
                # Extract original filename
                original_filename = extract_filename_from_url(url)
                
                # Create new filename with ID prefix
                new_filename = f"{row['Id']}_{original_filename}"
                
                # Download image
                response = requests.get(url, timeout=10)
                response.raise_for_status()  # Raise exception for bad status codes
                
                # Save image
                image_path = images_folder / new_filename
                with open(image_path, 'wb') as f:
                    f.write(response.content)
                    
                print(f"Successfully downloaded: {new_filename}")
                
            except requests.RequestException as e:
                print(f"Error downloading image from {url}: {str(e)}")
            except Exception as e:
                print(f"Error processing URL {url}: {str(e)}")
                
    except (ValueError, SyntaxError) as e:
        print(f"Error parsing ImageURLs for ID {row['Id']}: {str(e)}")
    except Exception as e:
        print(f"Unexpected error processing row {row['Id']}: {str(e)}")

# %% [markdown]
# ## Download Images
# This cell will process each row and download the images

# %%
# Show total number of rows to process
print(f"Processing {len(df)} rows...")

# Process each row
for index, row in df.iterrows():
    print(f"\nProcessing row {index + 1}/{len(df)} (ID: {row['Id']})")
    download_images(row)

print("\nImage download process completed!")

# %% [markdown]
# ## Summary Statistics

# %%
# Count how many images were downloaded
downloaded_images = len(list(images_folder.glob('*.png')))
print(f"\nSummary:")
print(f"Total rows processed: {len(df)}")
print(f"Total images downloaded: {downloaded_images}")
print(f"Images saved in: {images_folder.absolute()}")

Processing 152 rows...

Processing row 1/152 (ID: 79146548)
Successfully downloaded: 79146548_MgGjdapB.png

Processing row 2/152 (ID: 79146419)
Successfully downloaded: 79146419_THwNK2Jj.png

Processing row 3/152 (ID: 79146412)
Successfully downloaded: 79146412_efq4SfvI.png

Processing row 4/152 (ID: 79146127)


Successfully downloaded: 79146127_Jp5wj6k2.png

Processing row 5/152 (ID: 79145758)
Successfully downloaded: 79145758_19LCKEF3.png

Processing row 6/152 (ID: 79145106)
Successfully downloaded: 79145106_82bgJS6T.png

Processing row 7/152 (ID: 79144988)
Successfully downloaded: 79144988_EDq0aasZ.png

Processing row 8/152 (ID: 79144491)
Successfully downloaded: 79144491_cWiaxG7g.png

Processing row 9/152 (ID: 79144476)
Successfully downloaded: 79144476_FFZhjMVo.png

Processing row 10/152 (ID: 79144187)
Successfully downloaded: 79144187_Hh8PTnOy.png

Processing row 11/152 (ID: 79144165)
Successfully downloaded: 79144165_xhBldoiI.png

Processing row 12/152 (ID: 79144096)
Successfully downloaded: 79144096_TM0RkPbJ.jpg

Processing row 13/152 (ID: 79144079)
Successfully downloaded: 79144079_wzeQJ3Y8.png

Processing row 14/152 (ID: 79144053)
Successfully downloaded: 79144053_DxV5NS4E.png

Processing row 15/152 (ID: 79144033)
Successfully downloaded: 79144033_nzNZv4PN.png

Processing row 16/152 

In [16]:
import os
import pandas as pd

def create_image_csv(image_dir, output_file):
    # Get all image files from the directory
    image_files = [f for f in os.listdir(image_dir) if f.endswith('.png')]
    
    # Create lists to store data
    image_names = []
    ids = []
    
    # Parse each filename
    for image_name in image_files:
        # Split at underscore and take the first part as ID
        id_part = image_name.split('_')[0]
        
        image_names.append(image_name)
        ids.append(id_part)
    
    # Create DataFrame
    df = pd.DataFrame({
        'image_name': image_names,
        'id': ids
    })
    
    # Save to CSV
    df.to_csv(output_file, index=False)
    
    return df

# Usage
image_dir = "./Data/images"
output_file = "./Data/metadata.csv"

df = create_image_csv(image_dir, output_file)
print("CSV file created successfully!")
print("\nFirst few rows:")
print(df.head())

CSV file created successfully!

First few rows:
              image_name        id
0  79050514_GPGUHO9Q.png  79050514
1  79042213_bZgBlMAU.png  79042213
2  79142755_eAw8IiDv.png  79142755
3  79082901_V0hdKGst.png  79082901
4  79089625_AylR3h8J.png  79089625
