# **"Actor vendor"**

## ***PART 1 - Data collection***


> This code retrieves information about actors from Wikidata using a SPARQL query, downloads their images, extracts EXIF metadata and dominant colors from the images, and stores all the information in a JSON file for further analysis.



In [None]:
# Installation
!pip install sparqlwrapper
!pip install ipywidgets
!pip install numpy
!pip install scikit-learn
!pip install exifread

In [None]:
import sys
import os
import requests
import shutil
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plot
import matplotlib.colors as mcolors
from SPARQLWrapper import SPARQLWrapper, JSON
from PIL import Image
from PIL.ExifTags import TAGS
from sklearn.cluster import KMeans

# Wikidata SPARQL endpoint that the query below is sent to.
endpoint_url = "https://query.wikidata.org/sparql"

print("Query asked ...")

# Query to get actor
# Selects male (P21) actors (P106) that have a .jpg image (P18) and no bound
# date of death (P570). Height, hair colour, film style, nationality, place of
# birth and career start are OPTIONAL, so they can be unbound in a row.
# The result set is capped at 100 rows by the LIMIT clause.
# NOTE(review): an actor with several values for an OPTIONAL property
# presumably produces several rows for the same image - confirm whether
# duplicate rows matter for the download step.
query = """SELECT ?actor ?actorLabel ?image ?height ?hairColor ?filmStyle ?nationality ?placeOfBirth ?careerStart ?dateOfDeath WHERE {
  ?actor wdt:P21 wd:Q6581097;                        # Men
          wdt:P106 wd:Q33999;                         # Actor
          wdt:P18 ?image.                             # Image
  OPTIONAL { ?actor wdt:P570 ?dateOfDeath }          # Death
  FILTER(!BOUND(?dateOfDeath))                        # Alive
  FILTER EXISTS { ?actor wdt:P18 ?image }            # Only with an image
  FILTER(STRENDS(str(?image), ".jpg"))                # Only jpg
  OPTIONAL { ?actor wdt:P2048 ?height. }             # Height
  OPTIONAL { ?actor wdt:P1884 ?hairColor. }          # Hair color
  OPTIONAL { ?actor wdt:P2515 ?filmStyle. }          # Kind of film
  OPTIONAL { ?actor wdt:P27 ?nationality. }          # Nationality
  OPTIONAL { ?actor wdt:P19 ?placeOfBirth. }         # Where he was born
  OPTIONAL { ?actor wdt:P2031 ?careerStart. }        # Year he debuted
  SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
}
LIMIT 100"""

# Get result of the query
def get_results(endpoint_url, query):
    """Run a SPARQL query against an endpoint and return the converted JSON result.

    Args:
        endpoint_url: URL of the SPARQL endpoint.
        query: SPARQL query text.

    Returns:
        Whatever ``SPARQLWrapper`` yields after converting the JSON response.
    """
    # Identify the client with the running interpreter's major.minor version.
    major, minor = sys.version_info[0], sys.version_info[1]
    user_agent = "WDQS-example Python/%s.%s" % (major, minor)

    client = SPARQLWrapper(endpoint_url, agent=user_agent)
    client.setQuery(query)
    client.setReturnFormat(JSON)
    response = client.query()
    return response.convert()

# Run the query and keep only the actor label and the image URL of each row.
results = get_results(endpoint_url, query)
array = [
    (result["actorLabel"]["value"], result["image"]["value"])
    for result in results["results"]["bindings"]
]
dataframe = pd.DataFrame(array, columns=["Actor", "Image"])
dataframe = dataframe.astype(dtype={"Actor": "<U200", "Image": "<U200"})

print("Query responded and formatted")

# Where we will be putting our images
output_folder = "images/"

# exist_ok=True replaces the separate isdir() check and tolerates a folder
# that already exists.
os.makedirs(output_folder, exist_ok=True)

# Make sure imagesData.json exists, but close the handle immediately: the
# file is reopened in "w" mode once the downloads are finished, and the
# original code left this append-mode handle open for the whole session.
with open("./imagesData.json", "a") as f:
    pass

print("Downloading images ...")


def simplify_color(rgb):
    """Reduce an (r, g, b) triple to the name of its strongest channel.

    Ties are resolved in the order red, green, blue.

    Args:
        rgb: Iterable of exactly three comparable channel values.

    Returns:
        'red', 'green' or 'blue'; 'Other' if no channel compares equal to
        the maximum.
    """
    red, green, blue = rgb
    strongest = max(red, green, blue)
    for name, channel in (('red', red), ('green', green), ('blue', blue)):
        if channel == strongest:
            return name
    return 'Other'


# Download images onto our output folder
def download_image(url, index):
    """Download one image into ``output_folder`` and collect its EXIF tags.

    The file is saved as ``image_<index>.<extension>``, where the extension
    is whatever follows the last dot of ``url``.

    Args:
        url: Address of the image to download.
        index: Number used to build the local file name.

    Returns:
        A ``(status_code, exif_by_file)`` tuple. ``exif_by_file`` is empty
        when the HTTP status is not 200; otherwise it maps the saved file
        path to a dict of ``{tag name: str(value)}``, or to ``None`` when
        the image carries no EXIF data.
    """
    headers = {"User-Agent": "Mozilla/5.0"}
    img_exif_data = {}

    # The with-block releases the streamed connection even if writing the
    # file fails; the timeout keeps one stalled server from hanging the run.
    with requests.get(
        url, allow_redirects=True, headers=headers, stream=True, timeout=30
    ) as request:
        status_code = request.status_code
        if status_code != 200:
            return status_code, img_exif_data

        extension = url.split(".")[-1]
        filename = os.path.join(output_folder, f"image_{index}.{extension}")  # How we want to name our file
        with open(filename, "wb") as image:
            # Let urllib3 undo gzip/deflate transfer encoding while copying.
            request.raw.decode_content = True
            shutil.copyfileobj(request.raw, image)

    # Close the image file handle as soon as the EXIF block has been read.
    with Image.open(filename) as imgfile:
        exif_data = imgfile._getexif()

    if not exif_data:  # Only if there is exif data
        img_exif_data[filename] = None
        return status_code, img_exif_data

    local_exif_data_dict = {}
    for tag, value in exif_data.items():
        if tag not in TAGS:
            continue
        if isinstance(value, bytes):
            try:
                value = value.decode('UTF-8')
            except UnicodeDecodeError:
                value = "Error in variable encoding"
        # Everything is stored as text so the result can be dumped to JSON.
        local_exif_data_dict[TAGS[tag]] = str(value)

    img_exif_data[filename] = local_exif_data_dict
    return status_code, img_exif_data

all_exif_data = []

# Every image starts as not-favorite with an unknown colour; EXIF tags are
# merged on top. This keeps each JSON entry in the same
# {image path: metadata dict} shape, whether or not EXIF data was found.
default_metadata = {"Favorite": "NotFavorite", "DominantColor1": "Undefined"}

for index, image_url in enumerate(dataframe.Image):
    code, exif = download_image(image_url, index)
    if not exif:
        # Download failed: keep a slot for this index so that list positions
        # keep matching the image numbers used in the file names.
        print(f"Download failed for {image_url} (status {code})")
        exif = {os.path.join(output_folder, f"image_{index}.jpg"): None}
    all_exif_data.append(
        {path: {**default_metadata, **(tags or {})} for path, tags in exif.items()}
    )

# Write exif data onto the json
with open("./imagesData.json", "w") as f:
    json.dump(all_exif_data, f)

print("JSON written")



print("Dominant color begin ...")

# Dominant color
n = 1  # How many dominant color we want
default_color = (255)
image_folder = "./images/"
image_files = os.listdir(image_folder)
json_file = "./imagesData.json"

# Load the JSON once, update it in memory and write it back once, instead of
# re-reading and rewriting the whole file for every image.
with open(json_file, 'r') as f:
    data = json.load(f)

# Walk the JSON entries rather than os.listdir(): the directory listing order
# is not the download order, so using its position as the image index could
# attach a colour to the wrong image.
for entry in data:
    for image_key, metadata in entry.items():
        if not image_key.endswith(('.jpg', '.jpeg')) or not os.path.isfile(image_key):
            continue  # nothing was downloaded for this entry

        with Image.open(image_key) as img:
            # One row per pixel, three columns (R, G, B).
            numarray = np.asarray(img.convert("RGB")).reshape(-1, 3)

        clusters = KMeans(n_clusters=n, n_init=2)
        clusters.fit(numarray)
        colors_list = [tuple(map(int, center)) for center in clusters.cluster_centers_]

        metadata["Favorite"] = "NotFavorite"
        for i, color in enumerate(colors_list, start=1):
            metadata[f"DominantColor{i}"] = simplify_color(color)  # Simplify colors

with open(json_file, 'w') as f:
    json.dump(data, f, indent=4)


print("Dominant color done")


## ***PART 2 AND 3 - Labeling, annotation, and analysis***

> This code creates a grid layout using the ipywidgets library to display images, checkboxes, and text input fields. It uses only the first 20 downloaded images. It allows users to mark images as favorites and add tags to them. Upon clicking the "Select" button, the selected images' metadata, including their favorite status and tags, is updated in the JSON file.


What you need to do:
* Check your favorite images so that the code below can recommend new images to you




In [None]:
import os
import json
from ipywidgets import GridspecLayout, Image, Checkbox, Text, Button, Output
# Expected locations of the first 20 downloaded images
image_folder = "./images"
image_paths = [os.path.join(image_folder, f'image_{i}.jpg') for i in range(20)]

# JSON file holding the per-image metadata
json_file = "./imagesData.json"

# Load the metadata written by the data-collection step
with open(json_file, 'r') as f:
    data = json.load(f)

# Keep the non-empty dict entries (no filtering on the "Favorite" value
# happens here) and show at most 20 of them
favorite_image_paths = [
    entry for entry in data if isinstance(entry, dict) and len(entry) > 0
]
num_images = len(favorite_image_paths[:20])

# One "Favorite" checkbox and one tag field per displayed image
checkboxes = []
tag_inputs = []
for _ in range(num_images):
    checkboxes.append(Checkbox(value=False, description='Favorite'))
    tag_inputs.append(Text(placeholder='Enter tag', description='Tag'))

# Number of images shown side by side
num_columns = 1

# Rows needed to hold every image (ceiling division)
num_rows = -(-num_images // num_columns)

# Each image takes three grid cells: picture, checkbox, tag field
layout = GridspecLayout(n_columns=num_columns * 3, n_rows=num_rows, width='1300px')

def get_selected_images(btn):
    """Persist the checkbox and tag choices and rebuild the global ``labels`` list.

    For each (checkbox, tag field) pair at position ``i``, the entry
    ``data[i]["images/image_<i>.jpg"]`` in the JSON file is updated, but only
    while that image is still marked "NotFavorite" (or has no "Favorite"
    field yet). ``labels`` receives one "Favorite"/"NotFavorite" value per
    widget and is then padded with "NotFavorite" up to the number of entries
    in the JSON file.

    Args:
        btn: The clicked button (unused).
    """
    global labels  # module-level list, replaced on every click

    with open(json_file, 'r') as f:
        data = json.load(f)

    labels = []  # reset the labels each time the button is clicked
    for i, (checkbox, tag_input) in enumerate(zip(checkboxes, tag_inputs)):
        status = "Favorite" if checkbox.value else "NotFavorite"
        labels.append(status)
        if i >= len(data):
            continue
        metadata = data[i].get(f"images/image_{i}.jpg")
        # .get() avoids a KeyError on entries that have no "Favorite" field.
        if metadata and metadata.get("Favorite", "NotFavorite") == "NotFavorite":
            metadata["Favorite"] = status
            metadata["Tag"] = tag_input.value

    # Images without a widget count as not-favorite. The padding is derived
    # from the JSON length instead of a hard-coded 80, which was only right
    # for exactly 100 entries and 20 widgets.
    labels.extend(["NotFavorite"] * max(0, len(data) - len(labels)))

    with open(json_file, 'w') as f:
        json.dump(data, f, indent=4)

    print("Data written")

def get_reset(btn):
    """Clear the favorite mark and tag of every image that has a widget.

    For each widget position ``i``, the entry
    ``data[i]["images/image_<i>.jpg"]`` is set back to "NotFavorite" with an
    empty tag if it is currently marked "Favorite". The JSON file is then
    rewritten.

    Args:
        btn: The clicked button (unused).
    """
    with open(json_file, 'r') as f:
        data = json.load(f)

    widget_count = min(len(checkboxes), len(tag_inputs))
    for i in range(min(widget_count, len(data))):
        metadata = data[i].get(f"images/image_{i}.jpg")
        # .get() avoids a KeyError on entries that have no "Favorite" field.
        if metadata and metadata.get("Favorite") == "Favorite":
            metadata["Favorite"] = "NotFavorite"
            metadata["Tag"] = ""

    with open(json_file, 'w') as f:
        json.dump(data, f, indent=4)
    print("Data written")

# Button to select images
buttonAdd = Button(description="Select")
buttonReset = Button(description="Reset")

# Output to display results
output = Output()

# Fill the grid with images, checkboxes, and text fields.
# Each image occupies three adjacent cells (picture, checkbox, tag field), so
# the starting column is a multiple of 3; without the factor the cells of
# neighbouring images would overlap as soon as num_columns is above 1.
for i in range(num_images):
    image_entry = favorite_image_paths[i]
    image_path = next(iter(image_entry))  # first key of the entry
    row = i // num_columns
    col = (i % num_columns) * 3

    # A missing file leaves the picture cell empty instead of aborting the
    # whole grid; the checkbox and tag field are still shown.
    if os.path.isfile(image_path):
        with open(image_path, "rb") as file:
            image = file.read()
        image_widget = Image(
            value=image,
            format='jpg',
            width=100,
            height=100,
        )
        layout[row, col] = image_widget
    layout[row, col + 1] = checkboxes[i]
    layout[row, col + 2] = tag_inputs[i]

# Link button click to function
buttonAdd.on_click(get_selected_images)
buttonReset.on_click(get_reset)

# Display the grid, button, and output
display(layout, buttonAdd, buttonReset, output)


## ***PART 4 - Data visualization***

> This code reads image metadata from a JSON file, extracts relevant information such as image sizes, orientations, camera models, and colors. It then visualizes the extracted data.




In [None]:
import json
import pandas as pd
import matplotlib.pyplot as plt

with open("./imagesData.json") as f:
    data = json.load(f)

image_data = []
image_sizes = []
orientations = []
colors = []
camera_models = []
favorite = []

# Extract exif data
for item in data:
    for image_path, exif_data in item.items():
        # Only dict metadata can be inspected. A plain string value would
        # pass the `in` checks below as a substring test and then fail on
        # indexing.
        if not isinstance(exif_data, dict):
            continue

        # Size
        if 'ExifImageWidth' in exif_data and 'ExifImageHeight' in exif_data:
            image_sizes.append((exif_data['ExifImageWidth'], exif_data['ExifImageHeight']))

        # Orientation
        if 'Orientation' in exif_data:
            orientations.append(exif_data['Orientation'])

        # Camera model
        if 'Model' in exif_data:
            camera_models.append(exif_data['Model'])

        # Favorite
        if 'Favorite' in exif_data:
            favorite.append(exif_data['Favorite'])

        # Year (first four characters of the timestamp)
        if 'DateTimeOriginal' in exif_data:
            image_data.append(exif_data['DateTimeOriginal'][:4])

        # Colors (simplified colour names)
        for color_key in ['DominantColor1', 'DominantColor2', 'DominantColor3']:
            if color_key in exif_data:
                colors.append(exif_data[color_key])

# Convert and clean up.
# errors='coerce' turns an unparsable year into NaT instead of raising; the
# NaT rows are dropped so the year axis keeps integer labels.
df = pd.DataFrame(
    {'DateTimeOriginal': pd.to_datetime(image_data, format='%Y', errors='coerce')}
).dropna()
sizes_df = pd.DataFrame(image_sizes, columns=['Width', 'Height'])
orientations_df = pd.DataFrame(orientations, columns=['Orientation'])
camera_models_df = pd.DataFrame(camera_models, columns=['Camera Model'])
favorite_df = pd.DataFrame(favorite, columns=['Favorite'])
color_df = pd.DataFrame(colors, columns=['Colors'])

# Grouping
grouped = df.groupby(df['DateTimeOriginal'].dt.year).size()
sizes_count = sizes_df.groupby(['Width', 'Height']).size()
orientations_count = orientations_df.groupby(['Orientation']).size()
camera_models_count = camera_models_df.groupby(['Camera Model']).size()
favorite_count = favorite_df.groupby(['Favorite']).size()
colors_count = color_df.groupby(['Colors']).size()

# Plotting
plt.figure(figsize=(25, 15))

# (counts, title, tick rotation, extra plot keywords) for the 2 x 3 grid,
# listed in subplot order.
panels = [
    (sizes_count, 'Number of images per size', 45, {}),
    (orientations_count, 'Number of images per orientation', 45, {}),
    (camera_models_count, 'Number of images per camera model', 90, {}),
    (grouped, 'Number of images per year', 45, {}),
    (colors_count, 'Number of images per dominant color', 45,
     {'xlabel': 'Dominant color', 'ylabel': 'Number'}),
    (favorite_count, 'Number of favorite images', 90, {}),
]

for position, (counts, title, rotation, extra) in enumerate(panels, start=1):
    ax = plt.subplot(2, 3, position)
    if counts.empty:
        # A bar plot of an empty Series fails, so show a placeholder panel.
        ax.set_title(title)
        ax.text(0.5, 0.5, 'No data', ha='center', va='center', transform=ax.transAxes)
        continue
    counts.plot(kind='bar', ax=ax, title=title, **extra)
    plt.xticks(rotation=rotation)

plt.tight_layout()
plt.show()


## ***PART 5 - RECOMMENDATION***



> This code loads image metadata from our JSON file, identifies a favorite image, performs K-means clustering to group similar images, and recommends similar items based on the favorite image.




In [None]:
import json
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.preprocessing import LabelEncoder
from sklearn.cluster import KMeans
import pandas as pd
from sklearn.metrics import adjusted_rand_score, homogeneity_completeness_v_measure, silhouette_score
import matplotlib.pyplot as plt
from IPython.display import Image, display
import ipywidgets as widgets

# Load image data from JSON file
with open("./imagesData.json") as f:
    image_data = json.load(f)

# Find the image marked as "Favorite"
favorite_image_data = None
for item in image_data:
    for image_path, exif_data in item.items():
        if exif_data and 'Favorite' in exif_data and exif_data['Favorite'] == "Favorite":
            favorite_image_data = exif_data
            break

if favorite_image_data is None:
    print("No favorite image found.")
    exit()

data = []

# Extract exif data
for item in image_data:
    for image_path, exif_data in item.items():
        info_to_train_list = []  # Create a new list for each image

        # Orientation
        orientation_default = ""
        if exif_data:
            info_to_train_list.append(exif_data.get('Orientation', orientation_default))
        else:
            info_to_train_list.append(orientation_default)
        # Camera model
        model_default = ""
        if exif_data:
            info_to_train_list.append(exif_data.get('Model', model_default))
        else:
            info_to_train_list.append(model_default)
        # Year
        year_default = ""
        if exif_data:
            info_to_train_list.append(exif_data.get('DateTimeOriginal', "")[:4] or year_default)
        else:
            info_to_train_list.append(year_default)
        # Colors
        color_default = ""
        if exif_data:
            info_to_train_list.append(exif_data.get('DominantColor1', color_default))  # Use default if key not present
        else:
            info_to_train_list.append(color_default)

        # Add the extracted information for the current image to data
        data.append(info_to_train_list)

# Encode categorical features and labels
label_encoders = [LabelEncoder() for _ in range(len(data[0]))]
encoded_data = []
for i, column in enumerate(zip(*data)):
    encoded_data.append(label_encoders[i].fit_transform(column))

X = list(zip(*encoded_data))  # Features
y = labels  # Labels

# Clustering
k = 7  # Number of clusters
kmeans = KMeans(n_clusters=k, n_init=10)
kmeans.fit(X)
clusters = kmeans.labels_

# Add the cluster labels to the original data
data_with_clusters = pd.DataFrame(data, columns=["Orientation", "CameraModel", "Year", "DominantColor"])
data_with_clusters["Cluster"] = clusters

# Recommendation function
def recommend_items(cluster, data_with_clusters, n=3):
    """Pick up to ``n`` random rows belonging to the given cluster.

    Args:
        cluster: Cluster id to recommend from.
        data_with_clusters: DataFrame with a "Cluster" column.
        n: Maximum number of rows to return (default 3).

    Returns:
        A DataFrame with ``min(n, cluster size)`` randomly sampled rows; it
        is empty when no row belongs to ``cluster``.
    """
    items_in_cluster = data_with_clusters[data_with_clusters["Cluster"] == cluster]
    if items_in_cluster.empty:
        return items_in_cluster
    # sample() raises when asked for more rows than exist, so cap the request.
    return items_in_cluster.sample(n=min(n, len(items_in_cluster)))

# Use favorite image data for recommendation.
# The favorite image is described with the same four fields, the same ""
# placeholder and the same 4-character year as the training rows, so every
# value is one the matching encoder has already seen. Passing the full
# timestamp, or a raw string default, would not fit the encoded feature space.
favorite_row = [
    favorite_image_data.get('Orientation', ""),
    favorite_image_data.get('Model', ""),
    favorite_image_data.get('DateTimeOriginal', "")[:4],
    favorite_image_data.get('DominantColor1', ""),
]
encoded_item = [
    encoder.transform([value])[0]
    for encoder, value in zip(label_encoders, favorite_row)
]

# Predict the cluster for the favorite image
cluster = kmeans.predict([encoded_item])[0]

# Use the predicted cluster for recommendation
recommendations = recommend_items(cluster, data_with_clusters)

# Display favorite image data
print("Favorite image data:")
print(favorite_image_data.get("Orientation", "N/A"))
print(favorite_image_data.get("Model", "N/A"))
print(favorite_image_data.get("DateTimeOriginal", "N/A")[:4])
print(favorite_image_data.get("DominantColor1", "N/A"))
print("----------------------------------------------------")
print("Recommended items based on favorite image:")
print(recommendations)
print("----------------------------------------------------")
print("Images:")

num_images = len(recommendations)

# Image path for every feature row, in the order the rows were built from
# the JSON (one row per path whose metadata is a dict or None).
image_paths = [
    key
    for item in image_data
    for key, value in item.items()
    if value is None or isinstance(value, dict)
]

# Define layout
layout = widgets.GridBox(layout=widgets.Layout(grid_template_columns="repeat(3, auto)"))

# Show the recommended images themselves: the index of `recommendations` is
# the row position in data_with_clusters, and therefore in image_paths.
recommended_widgets = []
for row_index in recommendations.index:
    image_path = image_paths[row_index]
    try:
        with open(image_path, "rb") as image_file:
            image_bytes = image_file.read()
    except FileNotFoundError:
        continue  # the image was never downloaded; nothing to show
    recommended_widgets.append(
        widgets.Image(value=image_bytes, format='jpg', width=100, height=100)
    )
layout.children = tuple(recommended_widgets)

display(layout)


print("----------------------------------------------------")
print("Metrics")

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the support vector classifier (it needs at least two classes)
classifier = svm.SVC()
if len(set(y_train)) > 1:
    classifier.fit(X_train, y_train)

    # Analyze important features (support vectors)
    support_vectors = classifier.support_vectors_
    dual_coef = classifier.dual_coef_[0]  # Dual coefficients of support vectors
else:
    print("Classifier not trained: the training labels contain a single class.")

# Calculate silhouette score (needs between 2 and n_samples - 1 clusters)
if 1 < len(set(clusters)) < len(X):
    silhouette_avg = silhouette_score(X, clusters)
    print("Silhouette Score:", silhouette_avg)
else:
    print("Silhouette Score: not defined for this number of clusters")

# Adjusted Rand Index
ari = adjusted_rand_score(y, clusters)
print("Adjusted Rand Index:", ari)

# Homogeneity, Completeness, and V-measure
homogeneity, completeness, v_measure = homogeneity_completeness_v_measure(y, clusters)
print("Homogeneity:", homogeneity)
print("Completeness:", completeness)
print("V-measure:", v_measure)