In [None]:
pip install tensorflow numpy pillow matplotlib




In [None]:
import numpy as np
import pandas as pd
import os
import string
from keras.preprocessing.image import load_img, img_to_array
from keras.applications import ResNet50
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout, Add
from keras.optimizers import Adam
from keras.utils import pad_sequences
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from PIL import Image

# Function to load and process the image
def preprocess_image(image_path):
    """Load an image from disk and turn it into a single-image ResNet50 batch.

    Args:
        image_path: Path of the image file to load.

    Returns:
        The image resized to 224x224, converted to an array, given a leading
        batch axis, and passed through ResNet50's ``preprocess_input``.
    """
    # Local import: the notebook's import cell does not provide preprocess_input.
    from keras.applications.resnet50 import preprocess_input

    # Load the image with size 224x224 (the input_shape the extractor is built with)
    image = load_img(image_path, target_size=(224, 224))
    # Convert the image to a numpy array
    image = img_to_array(image)
    # Add the batch axis expected by model.predict
    image = np.expand_dims(image, axis=0)
    # The ImageNet ResNet50 weights expect Keras' own preprocessing
    # (RGB -> BGR and per-channel mean subtraction), not a 1/255 rescale.
    return preprocess_input(image)

# Run one image through the given feature-extraction model
def extract_features(image_path, model):
    """Return ``model.predict`` applied to the preprocessed image.

    Args:
        image_path: Path of the image file, forwarded to ``preprocess_image``.
        model: Any object exposing ``predict``; it receives the image batch.

    Returns:
        Whatever ``model.predict`` returns for the single-image batch.
    """
    batch = preprocess_image(image_path)
    return model.predict(batch)

# Build the tokenizer used for caption generation
def fit_tokenizer(captions):
    """Create a ``Tokenizer`` with default settings and fit it on ``captions``.

    Args:
        captions: Collection of caption texts handed to ``fit_on_texts``.

    Returns:
        The fitted ``Tokenizer`` instance.
    """
    caption_tokenizer = Tokenizer()
    caption_tokenizer.fit_on_texts(captions)
    return caption_tokenizer

# Create the captioning model (LSTM based model)
def create_captioning_model(vocab_size, max_length, feature_vector_length):
    """Build and compile a two-input captioning model.

    The model takes ``[image_features, token_sequence]`` (the same order
    ``generate_caption`` uses when predicting), projects both branches to 256
    units, sums them with ``Add`` and predicts the next word with a softmax
    over the vocabulary.

    Args:
        vocab_size: Number of token indices (size of embedding and output).
        max_length: Length of the padded token sequences.
        feature_vector_length: Length of each image feature vector; it must
            match the vectors passed as the first model input.

    Returns:
        A compiled Keras model (categorical cross-entropy loss, Adam).
    """
    # Local imports: the notebook's import cell does not provide Input / Model.
    from keras.layers import Input
    from keras.models import Model

    # Image branch: regularize and project the feature vector to 256 units
    image_input = Input(shape=(feature_vector_length,))
    image_branch = Dropout(0.5)(image_input)
    image_branch = Dense(256, activation='relu')(image_branch)

    # Text branch: embedding followed by the stacked LSTMs
    sequence_input = Input(shape=(max_length,))
    text_branch = Embedding(vocab_size, 256)(sequence_input)
    text_branch = LSTM(256, return_sequences=True)(text_branch)
    text_branch = Dropout(0.5)(text_branch)
    text_branch = LSTM(256)(text_branch)
    text_branch = Dense(256, activation='relu')(text_branch)

    # Add is a merge layer: it needs a list of tensors, so it cannot sit in a Sequential stack
    merged = Add()([image_branch, text_branch])
    output = Dense(vocab_size, activation='softmax')(merged)

    model = Model(inputs=[image_input, sequence_input], outputs=output)
    model.compile(loss='categorical_crossentropy', optimizer=Adam())
    return model

# Main Captioning Generation Pipeline
def generate_caption(image_path, model, tokenizer, max_length):
    """Greedily decode a caption for the image at ``image_path``.

    Args:
        image_path: Path of the image to caption.
        model: Captioning model; predicted with ``[feature, sequence]``.
        tokenizer: Fitted tokenizer whose vocabulary contains 'startseq'.
        max_length: Padded sequence length and maximum number of decode steps.

    Returns:
        The generated words joined by single spaces, without 'startseq'.

    Raises:
        KeyError: If 'startseq' is not in ``tokenizer.word_index``.
    """
    # Uses the module-level `feature_model`; it is not passed in as an argument.
    feature = extract_features(image_path, feature_model)
    # Prepare the input sequence for prediction
    caption_input = [tokenizer.word_index['startseq']]  # Add start token
    for _ in range(max_length):
        sequence = pad_sequences([caption_input], maxlen=max_length)
        prediction = model.predict([feature, sequence], verbose=0)
        predicted_index = int(np.argmax(prediction))
        # An index with no word (e.g. 0, which the vocabulary size reserves for
        # padding) ends decoding instead of raising KeyError.
        predicted_word = tokenizer.index_word.get(predicted_index)
        if predicted_word is None or predicted_word == 'endseq':
            break
        caption_input.append(predicted_index)
    # Exclude 'startseq'
    return ' '.join(tokenizer.index_word[idx] for idx in caption_input[1:])

# Prepare your image and caption dataset
# The dataset CSV needs two columns: 'image' (image path) and 'caption' (caption text).
dataset_path = 'path_to_image_caption_dataset.csv'
df = pd.read_csv(dataset_path)

image_paths = df['image'].values
# generate_caption looks up 'startseq' / 'endseq' in the tokenizer, so every
# caption is wrapped in those markers before the vocabulary is built.
captions = np.array([f"startseq {text} endseq" for text in df['caption'].values], dtype=object)

# Load ResNet50 pre-trained on ImageNet. pooling='avg' collapses the final
# feature map to one 2048-value vector per image, which matches the
# feature_vector_length given to create_captioning_model below (and avoids
# the Flatten layer, which this cell never imports).
feature_vector_length = 2048
feature_model = ResNet50(include_top=False, weights='imagenet', input_shape=(224, 224, 3), pooling='avg')

# Extract image features once per distinct image path
features = {}
for image_path in pd.unique(image_paths):
    features[image_path] = extract_features(image_path, feature_model)

# Fit the tokenizer on captions
tokenizer = fit_tokenizer(captions)
vocab_size = len(tokenizer.word_index) + 1  # Adding 1 for padding index

# Determine the maximum length of captions (whitespace-separated tokens)
max_length = max(len(caption.split()) for caption in captions)

# Create the captioning model
captioning_model = create_captioning_model(vocab_size, max_length, feature_vector_length)

# Split the dataset into train and test sets (fixed seed so the split is reproducible)
train_images, test_images, train_captions, test_captions = train_test_split(
    image_paths, captions, test_size=0.2, random_state=42
)

# Train the captioning model (This part is quite simplified, and requires fine-tuning)
# captioning_model.fit(x_train, y_train)  # You will need to prepare the inputs and targets for training

# Generate caption for a given image
# NOTE: until the training step above is implemented the model is untrained,
# so the generated caption is not meaningful.
sample_image_path = 'sample_image.jpg'
generated_caption = generate_caption(sample_image_path, captioning_model, tokenizer, max_length)

print(f"Generated Caption for {sample_image_path}: {generated_caption}")


ModuleNotFoundError: No module named 'keras.preprocessing.text'

In [None]:
!pip install tensorflow keras




In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer


In [None]:
from tensorflow.keras.layers import Flatten


In [None]:
import numpy as np
import pandas as pd
import os
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Add, Flatten
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import pad_sequences
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from PIL import Image

# Function to load and process the image
def preprocess_image(image_path):
    """Load an image from disk and turn it into a single-image ResNet50 batch.

    Args:
        image_path: Path of the image file to load.

    Returns:
        The image resized to 224x224, converted to an array, given a leading
        batch axis, and passed through ResNet50's ``preprocess_input``.
    """
    # Local import: the notebook's import cell does not provide preprocess_input.
    from tensorflow.keras.applications.resnet50 import preprocess_input

    # Load the image with size 224x224 (the input_shape the extractor is built with)
    image = load_img(image_path, target_size=(224, 224))
    # Convert the image to a numpy array
    image = img_to_array(image)
    # Add the batch axis expected by model.predict
    image = np.expand_dims(image, axis=0)
    # The ImageNet ResNet50 weights expect Keras' own preprocessing
    # (RGB -> BGR and per-channel mean subtraction), not a 1/255 rescale.
    return preprocess_input(image)

# Run one image through the given feature-extraction model
def extract_features(image_path, model):
    """Return ``model.predict`` applied to the preprocessed image.

    Args:
        image_path: Path of the image file, forwarded to ``preprocess_image``.
        model: Any object exposing ``predict``; it receives the image batch.

    Returns:
        Whatever ``model.predict`` returns for the single-image batch.
    """
    batch = preprocess_image(image_path)
    return model.predict(batch)

# Build the tokenizer used for caption generation
def fit_tokenizer(captions):
    """Create a ``Tokenizer`` with default settings and fit it on ``captions``.

    Args:
        captions: Collection of caption texts handed to ``fit_on_texts``.

    Returns:
        The fitted ``Tokenizer`` instance.
    """
    caption_tokenizer = Tokenizer()
    caption_tokenizer.fit_on_texts(captions)
    return caption_tokenizer

# Create the captioning model (LSTM based model)
def create_captioning_model(vocab_size, max_length, feature_vector_length):
    """Build and compile a two-input captioning model.

    The model takes ``[image_features, token_sequence]`` (the same order
    ``generate_caption`` uses when predicting), projects both branches to 256
    units, sums them with ``Add`` and predicts the next word with a softmax
    over the vocabulary.

    Args:
        vocab_size: Number of token indices (size of embedding and output).
        max_length: Length of the padded token sequences.
        feature_vector_length: Length of each image feature vector; it must
            match the vectors passed as the first model input.

    Returns:
        A compiled Keras model (categorical cross-entropy loss, Adam).
    """
    # Local imports: the notebook's import cell does not provide Input / Model.
    from tensorflow.keras.layers import Input
    from tensorflow.keras.models import Model

    # Image branch: regularize and project the feature vector to 256 units
    image_input = Input(shape=(feature_vector_length,))
    image_branch = Dropout(0.5)(image_input)
    image_branch = Dense(256, activation='relu')(image_branch)

    # Text branch: embedding followed by the stacked LSTMs
    sequence_input = Input(shape=(max_length,))
    text_branch = Embedding(vocab_size, 256)(sequence_input)
    text_branch = LSTM(256, return_sequences=True)(text_branch)
    text_branch = Dropout(0.5)(text_branch)
    text_branch = LSTM(256)(text_branch)
    text_branch = Dense(256, activation='relu')(text_branch)

    # Add is a merge layer: it needs a list of tensors, so it cannot sit in a Sequential stack
    merged = Add()([image_branch, text_branch])
    output = Dense(vocab_size, activation='softmax')(merged)

    model = Model(inputs=[image_input, sequence_input], outputs=output)
    model.compile(loss='categorical_crossentropy', optimizer=Adam())
    return model

# Main Captioning Generation Pipeline
def generate_caption(image_path, model, tokenizer, max_length):
    """Greedily decode a caption for the image at ``image_path``.

    Args:
        image_path: Path of the image to caption.
        model: Captioning model; predicted with ``[feature, sequence]``.
        tokenizer: Fitted tokenizer whose vocabulary contains 'startseq'.
        max_length: Padded sequence length and maximum number of decode steps.

    Returns:
        The generated words joined by single spaces, without 'startseq'.

    Raises:
        KeyError: If 'startseq' is not in ``tokenizer.word_index``.
    """
    # Uses the module-level `feature_model`; it is not passed in as an argument.
    feature = extract_features(image_path, feature_model)
    # Prepare the input sequence for prediction
    caption_input = [tokenizer.word_index['startseq']]  # Add start token
    for _ in range(max_length):
        sequence = pad_sequences([caption_input], maxlen=max_length)
        prediction = model.predict([feature, sequence], verbose=0)
        predicted_index = int(np.argmax(prediction))
        # An index with no word (e.g. 0, which the vocabulary size reserves for
        # padding) ends decoding instead of raising KeyError.
        predicted_word = tokenizer.index_word.get(predicted_index)
        if predicted_word is None or predicted_word == 'endseq':
            break
        caption_input.append(predicted_index)
    # Exclude 'startseq'
    return ' '.join(tokenizer.index_word[idx] for idx in caption_input[1:])

# Prepare your image and caption dataset
# The dataset CSV needs two columns: 'image' (image path) and 'caption' (caption text).
dataset_path = 'image_caption_dataset.csv'  # Update with the correct path to your CSV file

# Check if the dataset file exists before reading it
if not os.path.exists(dataset_path):
    raise FileNotFoundError(f"The dataset file at '{dataset_path}' does not exist.")
df = pd.read_csv(dataset_path)

image_paths = df['image'].values
# generate_caption looks up 'startseq' / 'endseq' in the tokenizer, so every
# caption is wrapped in those markers before the vocabulary is built.
captions = np.array([f"startseq {text} endseq" for text in df['caption'].values], dtype=object)

# Load ResNet50 pre-trained on ImageNet. pooling='avg' collapses the final
# feature map to one 2048-value vector per image, which matches the
# feature_vector_length given to create_captioning_model below.
feature_vector_length = 2048
feature_model = ResNet50(include_top=False, weights='imagenet', input_shape=(224, 224, 3), pooling='avg')

# Extract image features once per distinct image path
features = {}
for image_path in pd.unique(image_paths):
    features[image_path] = extract_features(image_path, feature_model)

# Fit the tokenizer on captions
tokenizer = fit_tokenizer(captions)
vocab_size = len(tokenizer.word_index) + 1  # Adding 1 for padding index

# Determine the maximum length of captions (whitespace-separated tokens)
max_length = max(len(caption.split()) for caption in captions)

# Create the captioning model
captioning_model = create_captioning_model(vocab_size, max_length, feature_vector_length)

# Split the dataset into train and test sets (fixed seed so the split is reproducible)
train_images, test_images, train_captions, test_captions = train_test_split(
    image_paths, captions, test_size=0.2, random_state=42
)

# Train the captioning model (This part is quite simplified, and requires fine-tuning)
# captioning_model.fit(x_train, y_train)  # You will need to prepare the inputs and targets for training

# Generate caption for a given image
# NOTE: until the training step above is implemented the model is untrained,
# so the generated caption is not meaningful.
sample_image_path = 'sample_image.jpg'
generated_caption = generate_caption(sample_image_path, captioning_model, tokenizer, max_length)

print(f"Generated Caption for {sample_image_path}: {generated_caption}")


FileNotFoundError: The dataset file at 'image_caption_dataset.csv' does not exist.

In [None]:
import numpy as np
import pandas as pd
import os
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Add, Flatten
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import pad_sequences
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from PIL import Image

# Function to load and process the image
def preprocess_image(image_path):
    """Load an image from disk and turn it into a single-image ResNet50 batch.

    Args:
        image_path: Path of the image file to load.

    Returns:
        The image resized to 224x224, converted to an array, given a leading
        batch axis, and passed through ResNet50's ``preprocess_input``.
    """
    # Local import: the notebook's import cell does not provide preprocess_input.
    from tensorflow.keras.applications.resnet50 import preprocess_input

    # Load the image with size 224x224 (the input_shape the extractor is built with)
    image = load_img(image_path, target_size=(224, 224))
    # Convert the image to a numpy array
    image = img_to_array(image)
    # Add the batch axis expected by model.predict
    image = np.expand_dims(image, axis=0)
    # The ImageNet ResNet50 weights expect Keras' own preprocessing
    # (RGB -> BGR and per-channel mean subtraction), not a 1/255 rescale.
    return preprocess_input(image)

# Run one image through the given feature-extraction model
def extract_features(image_path, model):
    """Return ``model.predict`` applied to the preprocessed image.

    Args:
        image_path: Path of the image file, forwarded to ``preprocess_image``.
        model: Any object exposing ``predict``; it receives the image batch.

    Returns:
        Whatever ``model.predict`` returns for the single-image batch.
    """
    batch = preprocess_image(image_path)
    return model.predict(batch)

# Build the tokenizer used for caption generation
def fit_tokenizer(captions):
    """Create a ``Tokenizer`` with default settings and fit it on ``captions``.

    Args:
        captions: Collection of caption texts handed to ``fit_on_texts``.

    Returns:
        The fitted ``Tokenizer`` instance.
    """
    caption_tokenizer = Tokenizer()
    caption_tokenizer.fit_on_texts(captions)
    return caption_tokenizer

# Create the captioning model (LSTM based model)
def create_captioning_model(vocab_size, max_length, feature_vector_length):
    """Build and compile a two-input captioning model.

    The model takes ``[image_features, token_sequence]`` (the same order
    ``generate_caption`` uses when predicting), projects both branches to 256
    units, sums them with ``Add`` and predicts the next word with a softmax
    over the vocabulary.

    Args:
        vocab_size: Number of token indices (size of embedding and output).
        max_length: Length of the padded token sequences.
        feature_vector_length: Length of each image feature vector; it must
            match the vectors passed as the first model input.

    Returns:
        A compiled Keras model (categorical cross-entropy loss, Adam).
    """
    # Local imports: the notebook's import cell does not provide Input / Model.
    from tensorflow.keras.layers import Input
    from tensorflow.keras.models import Model

    # Image branch: regularize and project the feature vector to 256 units
    image_input = Input(shape=(feature_vector_length,))
    image_branch = Dropout(0.5)(image_input)
    image_branch = Dense(256, activation='relu')(image_branch)

    # Text branch: embedding followed by the stacked LSTMs
    sequence_input = Input(shape=(max_length,))
    text_branch = Embedding(vocab_size, 256)(sequence_input)
    text_branch = LSTM(256, return_sequences=True)(text_branch)
    text_branch = Dropout(0.5)(text_branch)
    text_branch = LSTM(256)(text_branch)
    text_branch = Dense(256, activation='relu')(text_branch)

    # Add is a merge layer: it needs a list of tensors, so it cannot sit in a Sequential stack
    merged = Add()([image_branch, text_branch])
    output = Dense(vocab_size, activation='softmax')(merged)

    model = Model(inputs=[image_input, sequence_input], outputs=output)
    model.compile(loss='categorical_crossentropy', optimizer=Adam())
    return model

# Main Captioning Generation Pipeline
def generate_caption(image_path, model, tokenizer, max_length):
    """Greedily decode a caption for the image at ``image_path``.

    Args:
        image_path: Path of the image to caption.
        model: Captioning model; predicted with ``[feature, sequence]``.
        tokenizer: Fitted tokenizer whose vocabulary contains 'startseq'.
        max_length: Padded sequence length and maximum number of decode steps.

    Returns:
        The generated words joined by single spaces, without 'startseq'.

    Raises:
        KeyError: If 'startseq' is not in ``tokenizer.word_index``.
    """
    # Uses the module-level `feature_model`; it is not passed in as an argument.
    feature = extract_features(image_path, feature_model)
    # Prepare the input sequence for prediction
    caption_input = [tokenizer.word_index['startseq']]  # Add start token
    for _ in range(max_length):
        sequence = pad_sequences([caption_input], maxlen=max_length)
        prediction = model.predict([feature, sequence], verbose=0)
        predicted_index = int(np.argmax(prediction))
        # An index with no word (e.g. 0, which the vocabulary size reserves for
        # padding) ends decoding instead of raising KeyError.
        predicted_word = tokenizer.index_word.get(predicted_index)
        if predicted_word is None or predicted_word == 'endseq':
            break
        caption_input.append(predicted_index)
    # Exclude 'startseq'
    return ' '.join(tokenizer.index_word[idx] for idx in caption_input[1:])

# Prepare your image and caption dataset
# The dataset CSV needs two columns: 'image' (image path) and 'caption' (caption text).
dataset_path = 'image_caption_dataset.csv'  # Update with the correct path

# Debugging: Print the current working directory
print("Current Working Directory:", os.getcwd())

# Stop here when the file is missing. Merely pointing dataset_path somewhere
# else would leave `df` undefined and fail later with a NameError.
if not os.path.exists(dataset_path):
    raise FileNotFoundError(f"The dataset file at '{dataset_path}' does not exist.")
df = pd.read_csv(dataset_path)

image_paths = df['image'].values
# generate_caption looks up 'startseq' / 'endseq' in the tokenizer, so every
# caption is wrapped in those markers before the vocabulary is built.
captions = np.array([f"startseq {text} endseq" for text in df['caption'].values], dtype=object)

# Load ResNet50 pre-trained on ImageNet. pooling='avg' collapses the final
# feature map to one 2048-value vector per image, which matches the
# feature_vector_length given to create_captioning_model below.
feature_vector_length = 2048
feature_model = ResNet50(include_top=False, weights='imagenet', input_shape=(224, 224, 3), pooling='avg')

# Extract image features once per distinct image path
features = {}
for image_path in pd.unique(image_paths):
    features[image_path] = extract_features(image_path, feature_model)

# Fit the tokenizer on captions
tokenizer = fit_tokenizer(captions)
vocab_size = len(tokenizer.word_index) + 1  # Adding 1 for padding index

# Determine the maximum length of captions (whitespace-separated tokens)
max_length = max(len(caption.split()) for caption in captions)

# Create the captioning model
captioning_model = create_captioning_model(vocab_size, max_length, feature_vector_length)

# Split the dataset into train and test sets (fixed seed so the split is reproducible)
train_images, test_images, train_captions, test_captions = train_test_split(
    image_paths, captions, test_size=0.2, random_state=42
)

# Train the captioning model (This part is quite simplified, and requires fine-tuning)
# captioning_model.fit(x_train, y_train)  # You will need to prepare the inputs and targets for training

# Generate caption for a given image
# NOTE: until the training step above is implemented the model is untrained,
# so the generated caption is not meaningful.
sample_image_path = 'sample_image.jpg'
generated_caption = generate_caption(sample_image_path, captioning_model, tokenizer, max_length)

print(f"Generated Caption for {sample_image_path}: {generated_caption}")



Current Working Directory: /content


NameError: name 'df' is not defined

In [None]:
import numpy as np
import pandas as pd
import os
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Add, Flatten
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import pad_sequences
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from PIL import Image

# Function to load and process the image
def preprocess_image(image_path):
    """Load an image from disk and turn it into a single-image ResNet50 batch.

    Args:
        image_path: Path of the image file to load.

    Returns:
        The image resized to 224x224, converted to an array, given a leading
        batch axis, and passed through ResNet50's ``preprocess_input``.
    """
    # Local import: the notebook's import cell does not provide preprocess_input.
    from tensorflow.keras.applications.resnet50 import preprocess_input

    # Load the image with size 224x224 (the input_shape the extractor is built with)
    image = load_img(image_path, target_size=(224, 224))
    # Convert the image to a numpy array
    image = img_to_array(image)
    # Add the batch axis expected by model.predict
    image = np.expand_dims(image, axis=0)
    # The ImageNet ResNet50 weights expect Keras' own preprocessing
    # (RGB -> BGR and per-channel mean subtraction), not a 1/255 rescale.
    return preprocess_input(image)

# Run one image through the given feature-extraction model
def extract_features(image_path, model):
    """Return ``model.predict`` applied to the preprocessed image.

    Args:
        image_path: Path of the image file, forwarded to ``preprocess_image``.
        model: Any object exposing ``predict``; it receives the image batch.

    Returns:
        Whatever ``model.predict`` returns for the single-image batch.
    """
    batch = preprocess_image(image_path)
    return model.predict(batch)

# Build the tokenizer used for caption generation
def fit_tokenizer(captions):
    """Create a ``Tokenizer`` with default settings and fit it on ``captions``.

    Args:
        captions: Collection of caption texts handed to ``fit_on_texts``.

    Returns:
        The fitted ``Tokenizer`` instance.
    """
    caption_tokenizer = Tokenizer()
    caption_tokenizer.fit_on_texts(captions)
    return caption_tokenizer

# Create the captioning model (LSTM based model)
def create_captioning_model(vocab_size, max_length, feature_vector_length):
    """Build and compile a two-input captioning model.

    The model takes ``[image_features, token_sequence]`` (the same order
    ``generate_caption`` uses when predicting), projects both branches to 256
    units, sums them with ``Add`` and predicts the next word with a softmax
    over the vocabulary.

    Args:
        vocab_size: Number of token indices (size of embedding and output).
        max_length: Length of the padded token sequences.
        feature_vector_length: Length of each image feature vector; it must
            match the vectors passed as the first model input.

    Returns:
        A compiled Keras model (categorical cross-entropy loss, Adam).
    """
    # Local imports: the notebook's import cell does not provide Input / Model.
    from tensorflow.keras.layers import Input
    from tensorflow.keras.models import Model

    # Image branch: regularize and project the feature vector to 256 units
    image_input = Input(shape=(feature_vector_length,))
    image_branch = Dropout(0.5)(image_input)
    image_branch = Dense(256, activation='relu')(image_branch)

    # Text branch: embedding followed by the stacked LSTMs
    sequence_input = Input(shape=(max_length,))
    text_branch = Embedding(vocab_size, 256)(sequence_input)
    text_branch = LSTM(256, return_sequences=True)(text_branch)
    text_branch = Dropout(0.5)(text_branch)
    text_branch = LSTM(256)(text_branch)
    text_branch = Dense(256, activation='relu')(text_branch)

    # Add is a merge layer: it needs a list of tensors, so it cannot sit in a Sequential stack
    merged = Add()([image_branch, text_branch])
    output = Dense(vocab_size, activation='softmax')(merged)

    model = Model(inputs=[image_input, sequence_input], outputs=output)
    model.compile(loss='categorical_crossentropy', optimizer=Adam())
    return model

# Main Captioning Generation Pipeline
def generate_caption(image_path, model, tokenizer, max_length):
    """Greedily decode a caption for the image at ``image_path``.

    Args:
        image_path: Path of the image to caption.
        model: Captioning model; predicted with ``[feature, sequence]``.
        tokenizer: Fitted tokenizer whose vocabulary contains 'startseq'.
        max_length: Padded sequence length and maximum number of decode steps.

    Returns:
        The generated words joined by single spaces, without 'startseq'.

    Raises:
        KeyError: If 'startseq' is not in ``tokenizer.word_index``.
    """
    # Uses the module-level `feature_model`; it is not passed in as an argument.
    feature = extract_features(image_path, feature_model)
    # Prepare the input sequence for prediction
    caption_input = [tokenizer.word_index['startseq']]  # Add start token
    for _ in range(max_length):
        sequence = pad_sequences([caption_input], maxlen=max_length)
        prediction = model.predict([feature, sequence], verbose=0)
        predicted_index = int(np.argmax(prediction))
        # An index with no word (e.g. 0, which the vocabulary size reserves for
        # padding) ends decoding instead of raising KeyError.
        predicted_word = tokenizer.index_word.get(predicted_index)
        if predicted_word is None or predicted_word == 'endseq':
            break
        caption_input.append(predicted_index)
    # Exclude 'startseq'
    return ' '.join(tokenizer.index_word[idx] for idx in caption_input[1:])

# Prepare your image and caption dataset
# The dataset CSV needs two columns: 'image' (image path) and 'caption' (caption text).
dataset_path = 'image_caption_dataset.csv'  # Update with the correct path

# Debugging: Print the current working directory
print("Current Working Directory:", os.getcwd())

# Stop here when the file is missing. Merely pointing dataset_path somewhere
# else would leave `df` undefined and fail later with a NameError.
if not os.path.exists(dataset_path):
    raise FileNotFoundError(f"The dataset file at '{dataset_path}' does not exist.")
df = pd.read_csv(dataset_path)

# Now that `df` is defined, proceed with the image and caption extraction
image_paths = df['image'].values
# generate_caption looks up 'startseq' / 'endseq' in the tokenizer, so every
# caption is wrapped in those markers before the vocabulary is built.
captions = np.array([f"startseq {text} endseq" for text in df['caption'].values], dtype=object)

# Load ResNet50 pre-trained on ImageNet. pooling='avg' collapses the final
# feature map to one 2048-value vector per image, which matches the
# feature_vector_length given to create_captioning_model below.
feature_vector_length = 2048
feature_model = ResNet50(include_top=False, weights='imagenet', input_shape=(224, 224, 3), pooling='avg')

# Extract image features once per distinct image path
features = {}
for image_path in pd.unique(image_paths):
    features[image_path] = extract_features(image_path, feature_model)

# Fit the tokenizer on captions
tokenizer = fit_tokenizer(captions)
vocab_size = len(tokenizer.word_index) + 1  # Adding 1 for padding index

# Determine the maximum length of captions (whitespace-separated tokens)
max_length = max(len(caption.split()) for caption in captions)

# Create the captioning model
captioning_model = create_captioning_model(vocab_size, max_length, feature_vector_length)

# Split the dataset into train and test sets (fixed seed so the split is reproducible)
train_images, test_images, train_captions, test_captions = train_test_split(
    image_paths, captions, test_size=0.2, random_state=42
)

# Train the captioning model (This part is quite simplified, and requires fine-tuning)
# captioning_model.fit(x_train, y_train)  # You will need to prepare the inputs and targets for training

# Generate caption for a given image
# NOTE: until the training step above is implemented the model is untrained,
# so the generated caption is not meaningful.
sample_image_path = 'sample_image.jpg'
generated_caption = generate_caption(sample_image_path, captioning_model, tokenizer, max_length)

print(f"Generated Caption for {sample_image_path}: {generated_caption}")


Current Working Directory: /content
Current Working Directory: /content


NameError: name 'df' is not defined