# Machine Learning logic and code
The idea is to create a recommendation system in which the user is presented with a randomized set of artworks and can 'like' any of them.
Based on the liked artworks, the system then suggests other artworks that are connected to them (e.g., same author, timeframe, wing, etc.).

However, since there is no user input, the machine learning model predicts whether an artwork from the MET is likely to be a highlight (i.e., a popular or significant piece) based on its attributes, such as medium, department, and historical context, serving as a baseline recommendation for users with no specific preferences. This model helps create a curated visit for individuals who want to experience the museum's most notable works.

In [None]:
import pandas as pd 
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
import warnings
warnings.filterwarnings("ignore")

# Load the dataset
try:
    full_df = pd.read_csv('../data/clean/full_dataset.csv', low_memory=False)
except Exception as e:
    # Every later step needs full_df, so report the failure and stop here
    # rather than carrying on with a missing (or stale) DataFrame.
    print(f"Error loading dataset: {e}")
    raise

# Helper function to process dates
def process_date(date):
    """Convert a date string to a single numeric year.

    The text is lower-cased, any 'ca.' marker is removed, and the integer
    years are extracted. One year is returned as-is; two years (a range) are
    averaged. Anything else yields NaN.

    :param date: Raw date value; only ``str`` inputs are parsed.
    :return: ``int`` for a single year, ``float`` midpoint for a two-year
        range, ``np.nan`` for non-strings or strings with zero or more than
        two numbers.
    """
    if not isinstance(date, str):
        return np.nan

    cleaned = date.lower().replace('ca.', '').strip()
    # A '-' is a minus sign only when it does not directly follow a digit.
    # Otherwise it separates the two ends of a range: "1850-1860" must be read
    # as 1850 and 1860, not as 1850 and -1860.
    years = [int(token) for token in re.findall(r'(?<!\d)-?\d+', cleaned)]

    if len(years) == 1:
        return years[0]
    if len(years) == 2:
        return (years[0] + years[1]) / 2
    return np.nan

# Derive one numeric year column per raw date column
full_df['object_date_numeric'] = full_df['object_date'].map(process_date)
full_df['artist_begin_date_numeric'] = full_df['artist_begin_date'].map(process_date)
full_df['artist_end_date_numeric'] = full_df['artist_end_date'].map(process_date)

# Column groups, one per kind of preprocessing
numerical_features = [
    'object_date_numeric',
    'artist_begin_date_numeric',
    'artist_end_date_numeric',
]
categorical_features = [
    'medium',
    'culture',
    'period',
    'classification',
    'artist_nationality',
]
# Candidate text columns; more text features can be listed here
text_features = ['title', 'object_name', 'tags']

# Preprocessing pipelines for numerical and categorical data
numerical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

categorical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

# Preprocessing pipeline for text features
text_pipeline = TfidfVectorizer(max_features=500)

# Fit transformers for each feature group
numerical_data = numerical_pipeline.fit_transform(full_df[numerical_features])
categorical_data = categorical_pipeline.fit_transform(full_df[categorical_features])
# Only 'tags' is vectorized for now. TfidfVectorizer rejects NaN documents,
# so artworks without tags are treated as empty text.
text_data = text_pipeline.fit_transform(full_df['tags'].fillna(''))

# Concatenate the preprocessed data into a single matrix
from scipy.sparse import hstack
# CSR keeps the matrix sparse while still allowing per-artwork row lookups
X_processed = hstack([numerical_data, categorical_data, text_data], format='csr')

# Function to recommend artworks based on user preferences
def recommend_artworks(user_preferences, X_processed, full_df, top_n=5):
    """
    Recommend the artworks whose feature vectors are most similar to the user's preferences.

    The preferences are encoded with the module-level, already fitted
    ``numerical_pipeline``, ``categorical_pipeline`` and ``text_pipeline``, so
    the resulting vector has the same column layout as ``X_processed``.

    :param user_preferences: Dictionary of user inputs keyed by feature column
        name (e.g., medium, culture, tags). Missing keys are treated as None.
    :param X_processed: Preprocessed feature matrix of all artworks, with rows
        in the same order as ``full_df``.
    :param full_df: Full DataFrame of artworks.
    :param top_n: Number of recommendations to return.
    :return: DataFrame of recommended artworks, most similar first.
    """
    # Ensure all expected columns are present, defaulting to None
    expected_columns = numerical_features + categorical_features + text_features
    user_df = pd.DataFrame([{col: user_preferences.get(col) for col in expected_columns}])

    # Encode each feature group with its fitted transformer
    numerical_user_data = numerical_pipeline.transform(user_df[numerical_features])
    categorical_user_data = categorical_pipeline.transform(user_df[categorical_features])
    # A missing 'tags' preference becomes an empty document; the vectorizer
    # would otherwise raise on None/NaN.
    text_user_data = text_pipeline.transform(user_df['tags'].fillna(''))

    # Concatenate the user data into a single vector
    user_vector = hstack([numerical_user_data, categorical_user_data, text_user_data])

    # cosine_similarity accepts sparse input, so the full artwork matrix never
    # has to be expanded into a dense array.
    similarity_scores = cosine_similarity(user_vector, X_processed)[0]

    # Get top N recommendations, highest similarity first
    top_indices = np.argsort(similarity_scores)[::-1][:top_n]
    return full_df.iloc[top_indices]

# Function to recommend similar artworks based on liked artworks
def recommend_similar_artworks(liked_artworks, all_data_vectors, full_df, top_n=5):
    """
    Recommend artworks based on similarity to the liked artworks.

    Each candidate is scored by its highest cosine similarity to any liked
    artwork. The liked artworks themselves are never returned.

    :param liked_artworks: DataFrame of artworks that the user likes; its index
        labels must come from ``full_df``.
    :param all_data_vectors: Preprocessed data vectors of all artworks (sparse
        matrix), with rows in the same order as ``full_df``.
    :param full_df: Full DataFrame of artworks (assumed to have a unique index).
    :param top_n: Number of artworks to recommend.
    :return: DataFrame of recommended artworks, most similar first.
    """
    # Translate index labels into row positions so the lookup stays correct
    # even when full_df does not carry a default 0..n-1 index.
    liked_positions = full_df.index.get_indexer(liked_artworks.index)
    liked_positions = np.unique(liked_positions[liked_positions >= 0])
    if liked_positions.size == 0 or top_n <= 0:
        return full_df.iloc[[]]

    # CSR supports row selection without densifying the whole matrix
    vectors = all_data_vectors.tocsr()

    # Shape (n_liked, n_artworks): similarity of every liked piece to every artwork
    similarities = cosine_similarity(vectors[liked_positions], vectors)

    # Best match per candidate across all liked pieces
    best_similarity = similarities.max(axis=0)
    # Exclude the liked artworks explicitly instead of assuming each one is
    # ranked first against itself (ties with duplicates break that assumption).
    best_similarity[liked_positions] = -np.inf

    n_candidates = max(0, min(top_n, best_similarity.size - liked_positions.size))
    top_indices = np.argsort(best_similarity)[::-1][:n_candidates]
    return full_df.iloc[top_indices]

# Example preference profile: every expected feature column is listed,
# with None for the ones the user has no opinion on.
user_preferences = dict(
    object_date_numeric=1700,
    artist_begin_date_numeric=None,
    artist_end_date_numeric=None,
    medium='Oil on canvas',
    culture='European',
    classification=None,
    artist_nationality=None,
    tags='flowers',
    period=None,
    title=None,
    object_name=None,
)

# First pass: recommendations driven purely by the stated preferences
recommended_artworks = recommend_artworks(user_preferences, X_processed, full_df, top_n=5)

print("Initial Recommended Artworks based on User Preferences:")
print(recommended_artworks.loc[:, ['title', 'medium', 'culture', 'gallery_number', 'tags', 'image_url']])

# Second pass: pretend the user liked all of the above and look for similar pieces
liked_artworks = recommended_artworks
similar_artworks = recommend_similar_artworks(liked_artworks, X_processed, full_df, top_n=5)

print("\nSimilar Artworks to Liked Pieces:")
print(similar_artworks.loc[:, ['title', 'medium', 'culture', 'gallery_number', 'tags', 'image_url']])


In [None]:
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from scipy.sparse import hstack
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
import warnings
warnings.filterwarnings("ignore")

# Load the dataset
try:
    full_df = pd.read_csv('../data/clean/full_dataset.csv', low_memory=False)
except Exception as e:
    # Every later step needs full_df, so report the failure and stop here
    # rather than carrying on with a missing (or stale) DataFrame.
    print(f"Error loading dataset: {e}")
    raise

# Helper function to process dates
def process_date(date):
    """Convert a date string to a single numeric year.

    The text is lower-cased, any 'ca.' marker is removed, and the integer
    years are extracted. One year is returned as-is; two years (a range) are
    averaged. Anything else yields NaN.

    :param date: Raw date value; only ``str`` inputs are parsed.
    :return: ``int`` for a single year, ``float`` midpoint for a two-year
        range, ``np.nan`` for non-strings or strings with zero or more than
        two numbers.
    """
    if not isinstance(date, str):
        return np.nan

    cleaned = date.lower().replace('ca.', '').strip()
    # A '-' is a minus sign only when it does not directly follow a digit.
    # Otherwise it separates the two ends of a range: "1850-1860" must be read
    # as 1850 and 1860, not as 1850 and -1860.
    years = [int(token) for token in re.findall(r'(?<!\d)-?\d+', cleaned)]

    if len(years) == 1:
        return years[0]
    if len(years) == 2:
        return (years[0] + years[1]) / 2
    return np.nan

# Derive one numeric year column per raw date column
full_df['object_date_numeric'] = full_df['object_date'].map(process_date)
full_df['artist_begin_date_numeric'] = full_df['artist_begin_date'].map(process_date)
full_df['artist_end_date_numeric'] = full_df['artist_end_date'].map(process_date)

# Column groups, one per kind of preprocessing
numerical_features = [
    'object_date_numeric',
    'artist_begin_date_numeric',
    'artist_end_date_numeric',
]
categorical_features = [
    'medium',
    'culture',
    'period',
    'classification',
    'artist_nationality',
]
text_features = ['tags', 'artist_display_name']

# Synthetic "liked" label: an artwork counts as liked (1) when ANY of the
# hand-written rules below matches, otherwise 0.
# NOTE(review): medium, culture, period and tags are also model inputs, so the
# classifier can recover these rules directly from its features.
full_df['liked'] = (
    full_df['medium'].eq('Oil on canvas')                  # medium is Oil on canvas
    | full_df['culture'].eq('European')                    # culture is European
    | full_df['tags'].str.contains('flowers', na=False)    # tags mention "flowers"
    | full_df['period'].eq('Classical period')             # Classical period
).astype(int)

# Preprocessing pipelines for numerical and categorical data
numerical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

categorical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

# Preprocessing pipeline for text features
text_pipeline = TfidfVectorizer(max_features=500, stop_words='english')

# Fit transformers for each feature group
numerical_data = numerical_pipeline.fit_transform(full_df[numerical_features])
categorical_data = categorical_pipeline.fit_transform(full_df[categorical_features])
# Only 'tags' is vectorized for now. TfidfVectorizer rejects NaN documents,
# so artworks without tags are treated as empty text.
text_data = text_pipeline.fit_transform(full_df['tags'].fillna(''))

# Combine the processed data
X_processed = hstack([numerical_data, categorical_data, text_data])
y = full_df['liked']  # Target variable

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_processed, y, test_size=0.2, random_state=42)

# Train a Random Forest model
model = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    class_weight={0: 1, 1: 20},  # errors on the "liked" class weigh 20x more
    random_state=42
)
model.fit(X_train, y_train)

# Evaluate the model
y_pred = model.predict(X_test)
print("Model Evaluation:")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(classification_report(y_test, y_pred))

# Feature importance analysis (optional)
feature_importances = model.feature_importances_

# Rebuild the column names in the same order the feature groups were stacked
# (numerical, one-hot categorical, TF-IDF terms) so each importance can be
# tied to an actual feature instead of an anonymous rank.
try:
    feature_names = (
        list(numerical_features)
        + list(categorical_pipeline.named_steps['encoder'].get_feature_names_out(categorical_features))
        + list(text_pipeline.get_feature_names_out())
    )
except (AttributeError, ValueError):
    feature_names = []
if len(feature_names) != len(feature_importances):
    # Names could not be aligned with the matrix columns; fall back to column positions
    feature_names = [f"column {i}" for i in range(len(feature_importances))]

print("\nTop Feature Importances:")
for rank, idx in enumerate(np.argsort(feature_importances)[::-1][:10], start=1):
    print(f"{rank}. {feature_names[idx]}: {feature_importances[idx]:.4f}")

# Function to recommend artworks based on ML predictions
def recommend_artworks_ml(X_processed, full_df, model, top_n=5):
    """
    Pick the artworks the trained model scores highest.

    :param X_processed: Preprocessed feature matrix, one row per artwork in ``full_df``.
    :param full_df: Full DataFrame of artworks.
    :param model: Trained ML model exposing ``predict_proba``.
    :param top_n: Number of recommendations to return.
    :return: DataFrame of recommended artworks, highest probability first.
    """
    # Second probability column = score for the "liked" class
    liked_scores = model.predict_proba(X_processed)[:, 1]

    # Ascending argsort, flipped so the best-scoring rows come first
    ranking = np.flip(np.argsort(liked_scores))

    return full_df.iloc[ranking[:top_n]]

# Score the whole catalogue with the fitted classifier and keep the five best
recommended_artworks_ml = recommend_artworks_ml(X_processed, full_df, model, top_n=5)

# Show the most informative columns of the selection
print("\nRecommended Artworks Based on ML Predictions:")
print(recommended_artworks_ml.loc[:, ['title', 'medium', 'culture', 'tags']])