In [1]:
import torch
from transformers import CLIPProcessor, CLIPModel
from PIL import Image
import numpy as np

# Load the CLIP processor and model
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")


In [3]:
def classify_toxicity(text):
    # Define the label texts (toxic, non-toxic)
    labels = ["toxic", "non-toxic"]
    
    # Preprocess the text (text input for CLIP)
    inputs = processor(text=[text] * len(labels), text_pair=labels, return_tensors="pt", padding=True)
    
    # Get the text features
    outputs = model(**inputs)
    text_features = outputs.text_embeds  # Get the text embeddings from the model
    
    # Calculate cosine similarities between the input text and each label
    similarity_scores = torch.nn.functional.cosine_similarity(text_features[0], outputs.text_embeds[1:])
    
    # Get the label with the highest similarity score
    predicted_label = labels[similarity_scores.argmax().item()]
    toxicity_score = similarity_scores.max().item()
    
    return predicted_label, toxicity_score


In [5]:
import pandas as pd

# Load your CSV data
file_path = "E:\\BE Project\\archive (3)\\MIMIC2024 ee.csv"
data = pd.read_csv(file_path, encoding="latin1")

# Handle missing values
data = data.dropna(subset=["ExtractedText"])
data["ExtractedText"] = data["ExtractedText"].fillna("")

# Define a function to apply the model
def apply_clip_model(row):
    text = row["ExtractedText"]
    label, score = classify_toxicity(text)
    return label, score

# Apply the function to the dataset
data["toxicity_label"], data["toxicity_score"] = zip(*data.apply(apply_clip_model, axis=1))

# View the results
print(data.head())


ValueError: You have to specify pixel_values

In [9]:
# import torch
# from transformers import CLIPProcessor, CLIPModel
# import pandas as pd

# # Load the CLIP processor and model
# from transformers import CLIPProcessor, CLIPModel

# Load pre-trained model and processor
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch16")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch16")

def classify_toxicity(text):
    # Define the labels for toxicity classification
    labels = ["toxic", "non-toxic"]
    
    # Preprocess the text and truncate if necessary
    inputs = processor(text=[text] * len(labels), text_pair=labels, return_tensors="pt", padding=True, truncation=True, max_length=77)
    
    # Get the text features from the model
    outputs = model.get_text_features(**inputs)
    
    # Preprocess the label texts
    label_inputs = processor(text=labels, return_tensors="pt", padding=True, truncation=True, max_length=77)
    
    # Get the label features
    label_outputs = model.get_text_features(**label_inputs)
    
    # Calculate cosine similarities between the input text and each label
    cosine_similarities = torch.nn.functional.cosine_similarity(outputs, label_outputs)
    
    # Get the label with the highest similarity score
    max_similarity_idx = torch.argmax(cosine_similarities).item()
    label = labels[max_similarity_idx]
    score = cosine_similarities[max_similarity_idx].item()
    
    return label, score

# Apply the function to the dataset
data["toxicity_label"], data["toxicity_score"] = zip(*data.apply(apply_clip_model, axis=1))

# View the results
print(data.head())


# Save the results to a CSV
output_path = "E:\\BE Project\\processed_clip_toxicity_scores.csv"
data.to_csv(output_path, index=False)
print(f"Results saved to {output_path}")


config.json:   0%|          | 0.00/4.10k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


pytorch_model.bin:   0%|          | 0.00/599M [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/905 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/961k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/599M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/525k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.22M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/389 [00:00<?, ?B/s]

            FileName                                      ExtractedText  \
0   861_M_pic_12.jpg                    haldi wala doodh turmeric latte   
1  2171_M_pic_12.jpg                                à¥­OKA BOLO QDOUDDO   
2     313_NM_pic.jpg  Kuch Mahino baad ye bhi Maa ban jayegi! Taimur...   
3   1458_M_pic_1.jpg                          Mujhe to kuch aur hi laga   
4    2018_NM_pic.jpg  Teacher tum kal school kyu nhi aaye the Me kyu...   

  toxicity_label  toxicity_score  
0          toxic        0.548819  
1      non-toxic        0.779532  
2      non-toxic        0.650416  
3      non-toxic        0.769572  
4      non-toxic        0.775010  
Results saved to E:\BE Project\processed_clip_toxicity_scores.csv


In [10]:
output_path = "E:\\BE Project\\processed_clip_toxicity_scores.csv"
data.to_csv(output_path, index=False)
print(f"Results saved to {output_path}")


Results saved to E:\BE Project\processed_clip_toxicity_scores.csv


In [13]:
# View the top toxic comments
top_toxic = data[data["toxicity_label"] == "toxic"].sort_values(by="toxicity_score", ascending=False)
print(top_toxic[["ExtractedText", "toxicity_score"]])

# Filter based on a toxicity threshold
threshold = 0.7  # Define a toxicity threshold
toxic_comments = data[data["toxicity_score"] > threshold]
print(f"Number of toxic comments: {len(toxic_comments)}")


                                          ExtractedText  toxicity_score
1855                                      mind to Me My        0.875959
3374                                           0 YAIS H        0.863691
3528                                         Imemesl Ia        0.863364
3801                                              POLIC        0.863350
1166                                                  ^        0.862748
...                                                 ...             ...
4768  Bajrang Dal worker beats himself with a stick ...        0.462637
3423  HANSIKA MOTANI IN KOI MIL GAYA AAPKA SUROOR (2...        0.424970
451   Hello Mahi Bhai Hello Cheeku Bhai HEHE GEVTRAL...        0.423248
1758       Hollywood Ka Pankaj Tripathi @conlused.aatma        0.413148
2177  ARYAN KHAN ON DINNER TABLE AT MANNAT Papa hum ...        0.405317

[1303 rows x 2 columns]
Number of toxic comments: 3993


In [22]:
# import torch
# from transformers import CLIPProcessor, CLIPModel
# from PIL import Image
# import os
# import pandas as pd

# Initialize the CLIP model and processor
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

# Define the meme image directory
meme_directory = "E:/BE Project/archive (3)/memes"

# Example function to classify toxicity based on image and text
def classify_toxicity(image_path, text):
    # Open and preprocess the image
    image = Image.open(image_path)
    
    # Prepare the inputs (image and text)
    inputs = processor(text=[text], images=image, return_tensors="pt", padding=True)
    
    # Get the outputs from the model
    outputs = model(**inputs)
    
    # Extract the image and text features
    image_features = outputs.image_embeds
    text_features = outputs.text_embeds
    
    # Calculate the similarity score (cosine similarity)
    similarity = torch.cosine_similarity(image_features, text_features)
    
    # Define a threshold for toxicity (example threshold)
    toxicity_threshold = 0.7
    
    # If the similarity score is below the threshold, classify as toxic
    label = "Toxic" if similarity < toxicity_threshold else "Non-toxic"
    
    return label, similarity.item()

# Process all meme images in the directory
results = []
for filename in os.listdir(meme_directory):
    if filename.endswith(".jpg") or filename.endswith(".png"):
        image_path = os.path.join(meme_directory, filename)
        text = "some text from the meme caption"  # Replace this with the actual text from the meme
        
        label, score = classify_toxicity(image_path, text)
        
        # Store results
        results.append({
            "image": filename,
            "toxicity_label": label,
            "toxicity_score": score
        })

# Convert results into a DataFrame
df = pd.DataFrame(results)

# Save results to a CSV file
df.to_csv("E:\\BE Project\\meme_toxicity_results.csv", index=False)

# You can also print the DataFrame if needed
print(df.head())


  meme_directory = "E:\BE Project\archive (3)\Files"
  meme_directory = "E:\BE Project\archive (3)\Files"


OSError: [WinError 123] The filename, directory name, or volume label syntax is incorrect: 'E:\\BE Project\x07rchive (3)\\Files'

In [19]:
import os
print(os.getcwd())


C:\Users\SAGAR_007


In [24]:
pip install pytesseract

Note: you may need to restart the kernel to use updated packages.


In [38]:
# import torch
# from transformers import CLIPProcessor, CLIPModel
# from PIL import Image
# import os
# import pandas as pd
import pytesseract

# Path to Tesseract executable (make sure it's correctly set for your system)
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'  # Change the path accordingly

# Initialize the CLIP model and processor
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

# Define the meme image directory
meme_directory = "E:/BE Project/archive (3)/memes"

# Example function to classify toxicity based on image and text
def classify_toxicity(image_path, text):
    # Open and preprocess the image
    image = Image.open(image_path)
    
    # Prepare the inputs (image and text) with padding and truncation
    inputs = processor(
        text=[text], 
        images=image, 
        return_tensors="pt", 
        padding=True,  # Padding the sequence to the maximum length
        truncation=True  # Truncating if the sequence exceeds the max length
    )
    
    # Get the outputs from the model
    outputs = model(**inputs)
    
    # Extract the image and text features
    image_features = outputs.image_embeds
    text_features = outputs.text_embeds
    
    # Calculate the similarity score (cosine similarity)
    similarity = torch.cosine_similarity(image_features, text_features)
    
    # Define a threshold for toxicity (example threshold)
    toxicity_threshold = 0.4
    
    # If the similarity score is below the threshold, classify as toxic
    label = "Toxic" if similarity < toxicity_threshold else "Non-toxic"
    
    return label, similarity.item()

# Function to extract text from image using Tesseract OCR
def extract_text_from_image(image_path):
    image = Image.open(image_path)
    text = pytesseract.image_to_string(image)
    return text.strip()

# Process all meme images in the directory
results = []
for filename in os.listdir(meme_directory):
    if filename.endswith(".jpg") or filename.endswith(".png"):
        image_path = os.path.join(meme_directory, filename)
        
        # Extract text from image
        text = extract_text_from_image(image_path)
        
        # If no text was extracted, skip the image
        if not text:
            continue
        
        # Classify toxicity based on image and extracted text
        label, score = classify_toxicity(image_path, text)
        
        # Store results
        results.append({
            "image": filename,
            "extracted_text": text,
            "toxicity_label": label,
            "toxicity_score": score
        })

# Convert results into a DataFrame
df = pd.DataFrame(results)

# Save results to a CSV file
df.to_csv("meme_toxicity_results.csv", index=False)

# Print the DataFrame if needed
print(df.head())


              image                                     extracted_text  \
0   100_M_pic_1.jpg  Abba me ane! hona chahta hu; b!\n"jail meygahu...   
1  101_M_pic_10.jpg                     One Day with my two wife “@:@3   
2    101_NM_pic.jpg  ©\n>\n)\nis\nGS\n2\na}\n£\ni)\n=\n@\n))\n=\nc=...   
3    102_NM_pic.jpg  Cu,\nbe x oe |)\nDUNIYA HILA DENGE HUM.\nIG:@C...   
4    103_NM_pic.jpg  Bachapanika dard\n\nBhut — dard — .\n\nJb ag k...   

  toxicity_label  toxicity_score  
0          Toxic        0.234632  
1          Toxic        0.261696  
2          Toxic        0.220505  
3          Toxic        0.291710  
4          Toxic        0.302451  


In [40]:
pip install easyocr


Note: you may need to restart the kernel to use updated packages.Collecting easyocr
  Downloading easyocr-1.7.2-py3-none-any.whl.metadata (10 kB)
Collecting opencv-python-headless (from easyocr)
  Downloading opencv_python_headless-4.11.0.86-cp37-abi3-win_amd64.whl.metadata (20 kB)
Collecting python-bidi (from easyocr)
  Downloading python_bidi-0.6.3-cp312-none-win_amd64.whl.metadata (5.0 kB)
Collecting Shapely (from easyocr)
  Downloading shapely-2.0.6-cp312-cp312-win_amd64.whl.metadata (7.2 kB)
Collecting pyclipper (from easyocr)
  Downloading pyclipper-1.3.0.post6-cp312-cp312-win_amd64.whl.metadata (9.2 kB)
Collecting ninja (from easyocr)
  Downloading ninja-1.11.1.3-py3-none-win_amd64.whl.metadata (5.3 kB)
Downloading easyocr-1.7.2-py3-none-any.whl (2.9 MB)
   ---------------------------------------- 0.0/2.9 MB ? eta -:--:--
    --------------------------------------- 0.1/2.9 MB 1.7 MB/s eta 0:00:02
   --- ------------------------------------ 0.2/2.9 MB 3.0 MB/s eta 0:00:01
   ----

In [42]:
# import torch
# from transformers import CLIPProcessor, CLIPModel
# from PIL import Image
# import os
# import pandas as pd
import easyocr

# Initialize EasyOCR Reader (for English text)
reader = easyocr.Reader(['en'])

# Initialize the CLIP model and processor
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

# Define the meme image directory
meme_directory = "E:/BE Project/archive (3)/memes"

# Example function to classify toxicity based on image and text
def classify_toxicity(image_path, text):
    # Open and preprocess the image
    image = Image.open(image_path)
    
    # Prepare the inputs (image and text) with padding and truncation
    inputs = processor(
        text=[text], 
        images=image, 
        return_tensors="pt", 
        padding=True,  # Padding the sequence to the maximum length
        truncation=True  # Truncating if the sequence exceeds the max length
    )
    
    # Get the outputs from the model
    outputs = model(**inputs)
    
    # Extract the image and text features
    image_features = outputs.image_embeds
    text_features = outputs.text_embeds
    
    # Calculate the similarity score (cosine similarity)
    similarity = torch.cosine_similarity(image_features, text_features)
    
    # Define a threshold for toxicity (example threshold)
    toxicity_threshold = 0.4
    
    # If the similarity score is below the threshold, classify as toxic
    label = "Toxic" if similarity < toxicity_threshold else "Non-toxic"
    
    return label, similarity.item()

# Function to extract text from image using EasyOCR
def extract_text_from_image(image_path):
    # Perform OCR using EasyOCR
    result = reader.readtext(image_path)
    # Combine all the text found in the image
    text = " ".join([entry[1] for entry in result])
    return text.strip()

# Process all meme images in the directory
results = []
for filename in os.listdir(meme_directory):
    if filename.endswith(".jpg") or filename.endswith(".png"):
        image_path = os.path.join(meme_directory, filename)
        
        # Extract text from image
        text = extract_text_from_image(image_path)
        
        # If no text was extracted, skip the image
        if not text:
            continue
        
        # Classify toxicity based on image and extracted text
        label, score = classify_toxicity(image_path, text)
        
        # Store results
        results.append({
            "image": filename,
            "extracted_text": text,
            "toxicity_label": label,
            "toxicity_score": score
        })

# Convert results into a DataFrame
df = pd.DataFrame(results)

# Save results to a CSV file
df.to_csv("meme_toxicity_results.csv", index=False)

# Print the DataFrame if needed
print(df.head())


Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.
Downloading detection model, please wait. This may take several minutes depending upon your network connection.


Progress: |██████████████████████████████████████████████████| 100.0% Complete

Downloading recognition model, please wait. This may take several minutes depending upon your network connection.


Progress: |██████████████████████████████████████████████████| 100.0% Complete               image                                     extracted_text  \
0    100_M_pic_1.jpg  IComedyculluein Abba me arrest hona chahta hu,...   
1     100_NM_pic.jpg  IG/HASTE RAHO) Girliriendbanjao DairgMikSilk g...   
2   101_M_pic_10.jpg                           One with my two wife Day   
3     101_NM_pic.jpg  No one Relatives when come in our house Tu peh...   
4  102_M_pic_123.jpg          Unk futwe jinke 45 Bf hens 2 oGallu @yall   

  toxicity_label  toxicity_score  
0          Toxic        0.253802  
1          Toxic        0.298624  
2          Toxic        0.257361  
3          Toxic        0.340348  
4          Toxic        0.235905  
