In [1]:
import zipfile as zf
files = zf.ZipFile("/content/memes.zip", 'r')
files.extractall('meme')
files.close()

In [3]:
import torch
from transformers import CLIPProcessor, CLIPModel
from PIL import Image
import os
import pandas as pd
import easyocr

# Initialize EasyOCR Reader (for English text)
reader = easyocr.Reader(['en', 'hi', 'mr'], gpu=True)

# Initialize the CLIP model and processor
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

# Define the meme image directory
meme_directory = "/content/meme/memes"

# Example function to classify toxicity based on image and text
def classify_toxicity(image_path, text):
    # Open and preprocess the image
    image = Image.open(image_path)

    # Prepare the inputs (image and text) with padding and truncation
    inputs = processor(
        text=[text],
        images=image,
        return_tensors="pt",
        padding=True,  # Padding the sequence to the maximum length
        truncation=True  # Truncating if the sequence exceeds the max length
    )

    # Get the outputs from the model
    outputs = model(**inputs)

    # Extract the image and text features
    image_features = outputs.image_embeds
    text_features = outputs.text_embeds

    # Calculate the similarity score (cosine similarity)
    similarity = torch.cosine_similarity(image_features, text_features)

    # Define a threshold for toxicity (example threshold)
    toxicity_threshold = 0.4

    # If the similarity score is below the threshold, classify as toxic
    label = "Toxic" if similarity < toxicity_threshold else "Non-toxic"

    return label, similarity.item()

# Function to extract text from image using EasyOCR
def extract_text_from_image(image_path):
    # Perform OCR using EasyOCR
    result = reader.readtext(image_path)
    # Combine all the text found in the image
    text = " ".join([entry[1] for entry in result])
    return text.strip()

# Process all meme images in the directory
results = []
for filename in os.listdir(meme_directory):
    if filename.endswith(".jpg") or filename.endswith(".png"):
        image_path = os.path.join(meme_directory, filename)

        # Extract text from image
        text = extract_text_from_image(image_path)

        # If no text was extracted, skip the image
        if not text:
            continue

        # Classify toxicity based on image and extracted text
        label, score = classify_toxicity(image_path, text)

        # Store results
        results.append({
            "image": filename,
            "extracted_text": text,
            "toxicity_label": label,
            "toxicity_score": score
        })

# Convert results into a DataFrame
df = pd.DataFrame(results)

# Save results to a CSV file
df.to_csv("/content/meme/memes/meme_toxicity_results.csv", index=False)

# Print the DataFrame if needed
print(df.head())




Progress: |██████████████████████████████████████████████████| 100.0% Complete



Progress: |██████████████████████████████████████████████████| 100.0% Complete

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/4.19k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/605M [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/592 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/862k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/525k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.22M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/389 [00:00<?, ?B/s]

              image                                     extracted_text  \
0    520_NM_pic.jpg  IGdevilmemez_81७ cooamdep0 २s८l Ic०$ Monkeys क...   
1  151_M_pic_10.jpg  philmyyy We're both dead, but if we're born ag...   
2     43_NM_pic.jpg  When my Bestfriend doesn t tie a Friendship ba...   
3    514_NM_pic.jpg  *Someone :- Tum apne doston ko kis naam se bul...   
4   506_M_pic_2.jpg  १४ Y Ola Nibbi After Sex eleosturk Aaj Usne Mu...   

  toxicity_label  toxicity_score  
0          Toxic        0.372312  
1          Toxic        0.294317  
2          Toxic        0.311084  
3          Toxic        0.308909  
4          Toxic        0.297873  


In [2]:
pip install easyocr

Collecting easyocr
  Downloading easyocr-1.7.2-py3-none-any.whl.metadata (10 kB)
Collecting python-bidi (from easyocr)
  Downloading python_bidi-0.6.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.9 kB)
Collecting pyclipper (from easyocr)
  Downloading pyclipper-1.3.0.post6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.0 kB)
Collecting ninja (from easyocr)
  Downloading ninja-1.11.1.3-py3-none-manylinux_2_12_x86_64.manylinux2010_x86_64.whl.metadata (5.3 kB)
Downloading easyocr-1.7.2-py3-none-any.whl (2.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.9/2.9 MB[0m [31m42.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading ninja-1.11.1.3-py3-none-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (422 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m422.9/422.9 kB[0m [31m28.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyclipper-1.3.0.post6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (

In [None]:
from google.colab import drive
drive.mount('/content/drive')