In [1]:
import pandas as pd
import os
from dotenv import load_dotenv
from transformers import pipeline
from tqdm import tqdm
import time

load_dotenv()

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd
  from .autonotebook import tqdm as notebook_tqdm





True

# 1) Continue from cleaning phase, showing the cardinalities of the cleaned dataset

In [2]:
# Resolve each split's cleaned TSV path from the environment and load it right away.
multimodal_train_tsv_path = os.getenv('MULTIMODAL_TRAIN_CLEANED_WITH_CLASS_TSV')
df_train = pd.read_csv(multimodal_train_tsv_path, sep='\t')

multimodal_test_tsv_path = os.getenv('MULTIMODAL_TEST_CLEANED_WITH_CLASS_TSV')
df_test = pd.read_csv(multimodal_test_tsv_path, sep='\t')

multimodal_validation_tsv_path = os.getenv('MULTIMODAL_VAL_CLEANED_WITH_CLASS_TSV')
df_val = pd.read_csv(multimodal_validation_tsv_path, sep='\t')

# Frequency of each value of the 'class' column, per split.
train_counts = df_train['class'].value_counts()
test_counts = df_test['class'].value_counts()
val_counts = df_val['class'].value_counts()

for heading, split_counts in (
    ("Train Counts:", train_counts),
    ("\nTest Counts:", test_counts),
    ("\nValidation Counts:", val_counts),
):
    print(heading)
    print(split_counts)

# Grand totals of each class across the three splits.
all_counts = (train_counts, test_counts, val_counts)
total_pristine = sum(split_counts['pristine'] for split_counts in all_counts)
total_fake = sum(split_counts['fake'] for split_counts in all_counts)

print("\n=> Total \"pristine\": "+ str(total_pristine)+" | Total \"fake\": "+str(total_fake))

Train Counts:
class
pristine    385871
fake        157126
Name: count, dtype: int64

Test Counts:
class
pristine    41567
fake        16480
Name: count, dtype: int64

Validation Counts:
class
pristine    40909
fake        16580
Name: count, dtype: int64

=> Total "pristine": 468347 | Total "fake": 190186


# 2) Select 250.000 pristine images to be captioned from train, test and val sets

I will select the first 20.000 from test, 20.000 from val, and 210.000 from train set.

In [3]:
# Boolean masks marking the rows labelled 'pristine' in each split.
train_is_pristine = df_train['class'] == 'pristine'
test_is_pristine = df_test['class'] == 'pristine'
val_is_pristine = df_val['class'] == 'pristine'

# Keep the first 210,000 pristine rows of train and the first 20,000 of test and validation.
selected_train_pristine = df_train[train_is_pristine].head(210000)
selected_test_pristine = df_test[test_is_pristine].head(20000)
selected_val_pristine = df_val[val_is_pristine].head(20000)

for label, selection in (
    ("Selected train pristine shape: ", selected_train_pristine),
    ("Selected test pristine shape: ", selected_test_pristine),
    ("Selected val pristine shape: ", selected_val_pristine),
):
    print(label, selection.shape)


Selected train pristine shape:  (210000, 17)
Selected test pristine shape:  (20000, 17)
Selected val pristine shape:  (20000, 17)


Save the selected pristine rows of each set to CSV.

In [4]:
# to_csv does not create missing parent folders, so make sure "csv/" exists first.
os.makedirs("csv", exist_ok=True)

# Persist each selection without the DataFrame index column.
selected_train_pristine.to_csv("csv/selected_train_pristine.csv", index=False)
selected_test_pristine.to_csv("csv/selected_test_pristine.csv", index=False)
selected_val_pristine.to_csv("csv/selected_val_pristine.csv", index=False)

# 3) Define the ImageCaptioner class to be used to caption the pristine images from the sets

In [2]:
class ImageCaptioner:
    """Caption images in batches using an ``image-to-text`` pipeline.

    Attributes:
        device: The ``device`` value that was forwarded to ``pipeline``.
        captioner: The pipeline object called to generate captions.
    """

    def __init__(self, device=0, model="Salesforce/blip-image-captioning-base"):
        """Build the captioning pipeline.

        Args:
            device: Value forwarded as ``device`` to ``pipeline``.
            model: Model identifier forwarded as ``model`` to ``pipeline``.
        """
        self.device = device
        self.captioner = pipeline(
            "image-to-text",
            model=model,
            device=device,
        )

    def process_images(self, image_paths, batch_size=500, error_file="error_images.txt"):
        """Caption ``image_paths`` chunk by chunk and collect the results.

        Each chunk of ``batch_size`` paths is passed to the pipeline in one call.
        If that call raises, the whole chunk is skipped: its paths are appended
        to ``error_file`` (one per line), the failure is reported next to the
        progress bar, and processing continues with the next chunk.

        Args:
            image_paths: Sliceable sequence of image file paths.
            batch_size: Number of paths handed to the pipeline per call; must be >= 1.
            error_file: Text file that receives the paths of failed chunks.

        Returns:
            A DataFrame with columns ``image_name`` (file name without directory
            and extension) and ``caption`` (generated text). Images from failed
            chunks are absent from it.

        Raises:
            ValueError: If ``batch_size`` is smaller than 1.
        """
        if batch_size < 1:
            raise ValueError("batch_size must be a positive integer")

        total_inference_time = 0
        result_dict = {"image_name": [], "caption": []}
        # Ceiling division, so a trailing partial batch is counted in the progress label.
        total_batches = (len(image_paths) + batch_size - 1) // batch_size

        for i in range(0, len(image_paths), batch_size):
            batch_paths = image_paths[i:i + batch_size]
            batch_number = i // batch_size + 1

            with tqdm(total=len(batch_paths),
                      desc=f"Processing Batch {batch_number}/{total_batches}", unit="image") as pbar:
                start_time = time.time()

                try:
                    captions = self.captioner(batch_paths, max_new_tokens=100)
                except Exception as ex:
                    # Deliberate best-effort: one bad batch must not abort a long run.
                    # Append mode creates the file on first use, so no separate
                    # "create if missing" step is needed.
                    with open(error_file, "a") as error_file_writer:
                        error_file_writer.write("\n".join(map(str, batch_paths)) + "\n")
                    # Surface the reason instead of swallowing it silently.
                    pbar.write(
                        f"Batch {batch_number} failed ({type(ex).__name__}: {ex}); "
                        f"its paths were appended to {error_file}"
                    )
                    pbar.update(len(batch_paths))
                    continue  # Skip to the next batch

                end_time = time.time()
                total_inference_time += end_time - start_time

                for path, caption in zip(batch_paths, captions):
                    image_name = os.path.splitext(os.path.basename(path))[0]  # remove .jpg
                    result_dict["image_name"].append(image_name)
                    result_dict["caption"].append(caption[0]["generated_text"])
                    pbar.update(1)

        # Average over the images that were actually captioned: failed batches add
        # no time to the total, and an empty input must not divide by zero.
        captioned_count = len(result_dict["image_name"])
        avg_time_per_image = total_inference_time / captioned_count if captioned_count else 0.0
        print(f"\nTotal Inference Time: {total_inference_time:.2f} seconds")
        print(f"Avg Time Per Image Caption: {avg_time_per_image:.4f} seconds")

        result_df = pd.DataFrame(result_dict)
        return result_df

In [3]:
# Captioner built on the large BLIP checkpoint; used by the captioning cells below.
captioner = ImageCaptioner(model="Salesforce/blip-image-captioning-large")
# Folder holding the "<id>.jpg" image files.
DATASET_DIR = os.getenv('DATASET_DIR')
# Fall back to 500 (the default batch size of ImageCaptioner.process_images) when
# BATCH_SIZE is not set, instead of failing with the opaque TypeError of int(None).
BATCH_SIZE = int(os.getenv('BATCH_SIZE', '500'))

Define the function that is called on one set at a time to caption all of the images in that set (e.g. the training set).

In [4]:
def caption_and_save_images(df, save_path):
    """Caption the images listed in ``df['id']`` and write both captions to CSV.

    Uses the module-level ``captioner``, ``DATASET_DIR`` and ``BATCH_SIZE``.

    Args:
        df: DataFrame with at least the columns ``id`` and ``clean_title``.
        save_path: Destination of the CSV, which has the columns ``id``,
            ``original_caption`` and ``generated_caption``.
    """
    # Build "<DATASET_DIR>/<id>.jpg" for every row.
    image_files = []
    for image_id in df['id']:
        image_files.append(os.path.join(DATASET_DIR, f"{image_id}.jpg"))

    captions_df = captioner.process_images(image_paths=image_files, batch_size=BATCH_SIZE)

    # Left join keeps every input row, including those without a generated caption.
    joined = pd.merge(df, captions_df, how="left", left_on="id", right_on="image_name")

    output = joined[['id', 'clean_title', 'caption']].rename(
        columns={'clean_title': 'original_caption', 'caption': 'generated_caption'}
    )
    output.to_csv(save_path, index=False, header=True, sep=',', encoding='utf-8')

# 4) Perform Captioning

Reload the CSV of the selected pristine rows for each set.

In [2]:
# For each split: look up the CSV path in the environment, then load it.
SELECTED_TRAIN_PRISTINE_CSV_PATH = os.getenv('SELECTED_TRAIN_PRISTINE_CSV_PATH')
selected_train_pristine = pd.read_csv(SELECTED_TRAIN_PRISTINE_CSV_PATH, sep=',')

SELECTED_TEST_PRISTINE_CSV_PATH = os.getenv('SELECTED_TEST_PRISTINE_CSV_PATH')
selected_test_pristine = pd.read_csv(SELECTED_TEST_PRISTINE_CSV_PATH, sep=',')

SELECTED_VAL_PRISTINE_CSV_PATH = os.getenv('SELECTED_VAL_PRISTINE_CSV_PATH')
selected_val_pristine = pd.read_csv(SELECTED_VAL_PRISTINE_CSV_PATH, sep=',')

In [4]:
# Output CSV paths for the captioned splits, read from the environment
# (os.getenv returns None for a variable that is not set).
TRAINING_PRISTINE_CAPTIONED = os.getenv('TRAINING_PRISTINE_CAPTIONED')
TEST_PRISTINE_CAPTIONED = os.getenv('TEST_PRISTINE_CAPTIONED')
VAL_PRISTINE_CAPTIONED = os.getenv('VAL_PRISTINE_CAPTIONED')

Caption 210.000 images from training set:

In [None]:
# Caption the selected training rows and write the CSV to TRAINING_PRISTINE_CAPTIONED.
caption_and_save_images(selected_train_pristine, TRAINING_PRISTINE_CAPTIONED)

Caption 20.000 images from test set:

In [6]:
# Caption the selected test rows and write the CSV to TEST_PRISTINE_CAPTIONED.
caption_and_save_images(selected_test_pristine, TEST_PRISTINE_CAPTIONED)

Processing Batch 1/40: 100%|██████████| 500/500 [01:32<00:00,  5.38image/s] 
Processing Batch 2/40: 100%|██████████| 500/500 [01:32<00:00,  5.39image/s] 
Processing Batch 3/40: 100%|██████████| 500/500 [01:33<00:00,  5.37image/s] 
Processing Batch 4/40: 100%|██████████| 500/500 [01:34<00:00,  5.30image/s] 
Processing Batch 5/40: 100%|██████████| 500/500 [01:33<00:00,  5.35image/s] 
Processing Batch 6/40: 100%|██████████| 500/500 [01:34<00:00,  5.27image/s] 
Processing Batch 7/40: 100%|██████████| 500/500 [01:35<00:00,  5.22image/s] 
Processing Batch 8/40: 100%|██████████| 500/500 [01:35<00:00,  5.25image/s] 
Processing Batch 9/40: 100%|██████████| 500/500 [01:35<00:00,  5.25image/s] 
Processing Batch 10/40: 100%|██████████| 500/500 [01:40<00:00,  4.97image/s]  
Processing Batch 11/40: 100%|██████████| 500/500 [01:35<00:00,  5.23image/s] 
Processing Batch 12/40: 100%|██████████| 500/500 [01:35<00:00,  5.22image/s] 
Processing Batch 13/40: 100%|██████████| 500/500 [01:35<00:00,  5.26imag


Total Inference Time: 3635.98 seconds
Avg Time Per Image Caption: 0.1818 seconds





Caption 20.000 images from validation set:

In [6]:
# Caption the selected validation rows and write the CSV to VAL_PRISTINE_CAPTIONED.
caption_and_save_images(selected_val_pristine, VAL_PRISTINE_CAPTIONED)

Processing Batch 1/40: 100%|██████████| 500/500 [01:29<00:00,  5.58image/s] 
Processing Batch 2/40: 100%|██████████| 500/500 [01:24<00:00,  5.92image/s] 
Processing Batch 3/40: 100%|██████████| 500/500 [01:28<00:00,  5.64image/s] 
Processing Batch 4/40: 100%|██████████| 500/500 [01:29<00:00,  5.58image/s] 
Processing Batch 5/40: 100%|██████████| 500/500 [01:27<00:00,  5.70image/s] 
Processing Batch 6/40: 100%|██████████| 500/500 [01:25<00:00,  5.86image/s] 
Processing Batch 7/40: 100%|██████████| 500/500 [01:27<00:00,  5.72image/s] 
Processing Batch 8/40: 100%|██████████| 500/500 [01:27<00:00,  5.70image/s] 
Processing Batch 9/40: 100%|██████████| 500/500 [01:26<00:00,  5.79image/s] 
Processing Batch 10/40: 100%|██████████| 500/500 [01:25<00:00,  5.82image/s] 
Processing Batch 11/40: 100%|██████████| 500/500 [01:29<00:00,  5.59image/s] 
Processing Batch 12/40: 100%|██████████| 500/500 [01:28<00:00,  5.64image/s] 
Processing Batch 13/40: 100%|██████████| 500/500 [01:27<00:00,  5.73image


Total Inference Time: 3529.75 seconds
Avg Time Per Image Caption: 0.1765 seconds





# 5) Fix buggy captions

Clean the buggy generated captions by removing every word that contains the substring "araf".

In [None]:
df = pd.read_csv(TRAINING_PRISTINE_CAPTIONED)

# Remove words containing "araf" from the 'generated_caption' column.
# na_action='ignore' leaves missing captions (rows whose batch failed during
# captioning) as NaN; the previous str(x) call rewrote them as the text "nan".
df['generated_caption'] = df['generated_caption'].map(
    lambda caption: ' '.join(word for word in str(caption).split() if 'araf' not in word),
    na_action='ignore',
)

df.to_csv(TRAINING_PRISTINE_CAPTIONED, index=False)
# Preview last: only the final expression of a cell is displayed.
df.head()

In [None]:
df = pd.read_csv(TEST_PRISTINE_CAPTIONED)

# Remove words containing "araf" from the 'generated_caption' column.
# na_action='ignore' leaves missing captions (rows whose batch failed during
# captioning) as NaN; the previous str(x) call rewrote them as the text "nan".
df['generated_caption'] = df['generated_caption'].map(
    lambda caption: ' '.join(word for word in str(caption).split() if 'araf' not in word),
    na_action='ignore',
)

df.to_csv(TEST_PRISTINE_CAPTIONED, index=False)
# Preview last: only the final expression of a cell is displayed.
df.head()

In [6]:
df = pd.read_csv(VAL_PRISTINE_CAPTIONED)

# Remove words containing "araf" from the 'generated_caption' column.
# na_action='ignore' leaves missing captions (rows whose batch failed during
# captioning) as NaN; the previous str(x) call rewrote them as the text "nan".
df['generated_caption'] = df['generated_caption'].map(
    lambda caption: ' '.join(word for word in str(caption).split() if 'araf' not in word),
    na_action='ignore',
)

df.to_csv(VAL_PRISTINE_CAPTIONED, index=False)
# Preview last: only the final expression of a cell is displayed.
df.head()

# 6) Create the final csv for the captioning phase

Generate 3 final CSV files by merging the generated captions with some of the columns of the selected_pristine CSVs.

The resulting csv will have the following structure:

id | author | original_caption | generated_caption | num_comments | class

In [3]:
# Selected pristine rows: look up each path in the environment, then load it.
SELECTED_TRAIN_PRISTINE_CSV_PATH = os.getenv('SELECTED_TRAIN_PRISTINE_CSV_PATH')
selected_train_pristine = pd.read_csv(SELECTED_TRAIN_PRISTINE_CSV_PATH, sep=',')

SELECTED_TEST_PRISTINE_CSV_PATH = os.getenv('SELECTED_TEST_PRISTINE_CSV_PATH')
selected_test_pristine = pd.read_csv(SELECTED_TEST_PRISTINE_CSV_PATH, sep=',')

SELECTED_VAL_PRISTINE_CSV_PATH = os.getenv('SELECTED_VAL_PRISTINE_CSV_PATH')
selected_val_pristine = pd.read_csv(SELECTED_VAL_PRISTINE_CSV_PATH, sep=',')

# Captioned outputs: same pattern, one split at a time.
TRAINING_PRISTINE_CAPTIONED = os.getenv('TRAINING_PRISTINE_CAPTIONED')
train_pristine_captioned = pd.read_csv(TRAINING_PRISTINE_CAPTIONED, sep=',')

TEST_PRISTINE_CAPTIONED = os.getenv('TEST_PRISTINE_CAPTIONED')
test_pristine_captioned = pd.read_csv(TEST_PRISTINE_CAPTIONED, sep=',')

VAL_PRISTINE_CAPTIONED = os.getenv('VAL_PRISTINE_CAPTIONED')
val_pristine_captioned = pd.read_csv(VAL_PRISTINE_CAPTIONED, sep=',')

In [7]:
# Preview the first rows of the selected training rows.
selected_train_pristine.head()

Unnamed: 0,author,clean_title,created_utc,domain,hasImage,id,image_url,linked_submission_id,num_comments,score,subreddit,title,upvote_ratio,2_way_label,3_way_label,6_way_label,class
0,Alexithymia,my walgreens offbrand mucinex was engraved wit...,1551641000.0,i.imgur.com,True,awxhir,https://external-preview.redd.it/WylDbZrnbvZdB...,,2.0,12,mildlyinteresting,My Walgreens offbrand Mucinex was engraved wit...,0.84,1,0,0,pristine
1,VIDCAs17,this concerned sink with a tiny hat,1534727000.0,i.redd.it,True,98pbid,https://preview.redd.it/wsfx0gp0f5h11.jpg?widt...,,2.0,119,pareidolia,This concerned sink with a tiny hat,0.99,0,2,2,pristine
2,prometheus1123,hackers leak emails from uae ambassador to us,1496511000.0,aljazeera.com,True,6f2cy5,https://external-preview.redd.it/6fNhdbc6K1vFA...,,1.0,44,neutralnews,Hackers leak emails from UAE ambassador to US,0.92,1,0,0,pristine
3,,puppy taking in the view,1471341000.0,i.imgur.com,True,4xypkv,https://external-preview.redd.it/HLtVNhTR6wtYt...,,26.0,250,photoshopbattles,PsBattle: Puppy taking in the view,0.95,1,0,0,pristine
4,3rikR3ith,i found a face in my sheet music too,1525318000.0,i.redd.it,True,8gnet9,https://preview.redd.it/ri7ut2wn8kv01.jpg?widt...,,2.0,13,pareidolia,I found a face in my sheet music too!,0.84,0,2,2,pristine


In [6]:
# Preview the first rows of the captioned training rows.
train_pristine_captioned.head()

Unnamed: 0,id,original_caption,generated_caption
0,awxhir,my walgreens offbrand mucinex was engraved wit...,a close up of a person holding a pill in their...
1,98pbid,this concerned sink with a tiny hat,there is a white sink with a soap dispenser on it
2,6f2cy5,hackers leak emails from uae ambassador to us,man in a suit sitting in a chair in a room
3,4xypkv,puppy taking in the view,there is a dog that is sitting in the grass lo...
4,8gnet9,i found a face in my sheet music too,a close up of a sheet of music with notes and ...


Now I merge the CSVs in order to produce the 3 final ones.

In [2]:
# Destination paths of the three final merged CSVs, read from the environment
# (os.getenv returns None for a variable that is not set).
TRAIN_CAPTIONED_FINAL = os.getenv('TRAIN_CAPTIONED_FINAL')
TEST_CAPTIONED_FINAL = os.getenv('TEST_CAPTIONED_FINAL')
VAL_CAPTIONED_FINAL = os.getenv('VAL_CAPTIONED_FINAL')

In [4]:
import pandas as pd

# Columns taken from each side of the join; both sides carry the 'id' key.
metadata_part = selected_train_pristine[['id', 'author', 'num_comments', 'class']]
captions_part = train_pristine_captioned[['id', 'original_caption', 'generated_caption']]

# Inner join on 'id': only ids present on both sides are kept.
merged_df = pd.merge(metadata_part, captions_part, how='inner', on='id')

# Final column order of the exported file.
final_columns = ['id', 'author', 'original_caption', 'generated_caption', 'num_comments', 'class']
result_df = merged_df[final_columns]

result_df.to_csv(TRAIN_CAPTIONED_FINAL, index=False)
result_df.head()

Unnamed: 0,id,author,original_caption,generated_caption,num_comments,class
0,awxhir,Alexithymia,my walgreens offbrand mucinex was engraved wit...,a close up of a person holding a pill in their...,2.0,pristine
1,98pbid,VIDCAs17,this concerned sink with a tiny hat,there is a white sink with a soap dispenser on it,2.0,pristine
2,6f2cy5,prometheus1123,hackers leak emails from uae ambassador to us,man in a suit sitting in a chair in a room,1.0,pristine
3,4xypkv,,puppy taking in the view,there is a dog that is sitting in the grass lo...,26.0,pristine
4,8gnet9,3rikR3ith,i found a face in my sheet music too,a close up of a sheet of music with notes and ...,2.0,pristine


In [5]:
# Columns taken from each side of the join; both sides carry the 'id' key.
metadata_part = selected_test_pristine[['id', 'author', 'num_comments', 'class']]
captions_part = test_pristine_captioned[['id', 'original_caption', 'generated_caption']]

# Inner join on 'id': only ids present on both sides are kept.
merged_df = pd.merge(metadata_part, captions_part, how='inner', on='id')

# Final column order of the exported file.
final_columns = ['id', 'author', 'original_caption', 'generated_caption', 'num_comments', 'class']
result_df = merged_df[final_columns]

result_df.to_csv(TEST_CAPTIONED_FINAL, index=False)
result_df.head()

Unnamed: 0,id,author,original_caption,generated_caption,num_comments,class
0,c0gl7r,chaseoes,pd phoenix car thief gets instructions from yo...,a man is shown in a police photo and a yellow ...,2.0,pristine
1,c0xdqy,SFepicure,as trump accuses iran he has one problem his o...,a close up of a person in a suit and tie speak...,4.0,pristine
2,7o9rmx,fragments_from_Work,believers hezbollah,a close up of a soldier in a field with a rifle,40.0,pristine
3,bdfxf1,SovietTurnipFarmer,the rise of italian fascism circa,a screenshot of a cell phone showing a picture...,2.0,pristine
4,8g3xtm,HR_Paperstacks_402,trumps pick to lead ice who touted surge in im...,the washington post logo with a statue of abra...,1.0,pristine


In [6]:
# Columns taken from each side of the join; both sides carry the 'id' key.
metadata_part = selected_val_pristine[['id', 'author', 'num_comments', 'class']]
captions_part = val_pristine_captioned[['id', 'original_caption', 'generated_caption']]

# Inner join on 'id': only ids present on both sides are kept.
merged_df = pd.merge(metadata_part, captions_part, how='inner', on='id')

# Final column order of the exported file.
final_columns = ['id', 'author', 'original_caption', 'generated_caption', 'num_comments', 'class']
result_df = merged_df[final_columns]

result_df.to_csv(VAL_CAPTIONED_FINAL, index=False)
result_df.head()

Unnamed: 0,id,author,original_caption,generated_caption,num_comments,class
0,cypw96,singingdart7854,my xbox controller says hi,someone is holding a remote control for a vide...,4.0,pristine
1,d0bzlq,mandal0re,new image from the mandalorian,there are three people standing on a porch of ...,5.0,pristine
2,bq3yuk,Thebubster2001,this tree i found with a solo cup on it,there is a red frisbee that is stuck in the trees,8.0,pristine
3,8rsoq0,jokkstermokkster,dude id feel the same if i got a pole through ...,there is a close up of a window with a small f...,0.0,pristine
4,33sekv,alleged_redditor,japanese black pine tree,bonsai tree in a pot on a stone pedestal,2.0,pristine
