In [100]:
!pip install tensorflow
!pip install tensorflow-hub
!pip install scikit-learn

Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable


# Feature Extraction
256 Features were selected as the starting point to generate the various classification models

## Model

In [101]:
from tensorflow.keras.applications import VGG16
from tensorflow.keras.models import Model
from tensorflow.keras.layers import GlobalAveragePooling2D, Dense, Dropout
from tensorflow.keras import regularizers

# Local import: the seeded initializer is only needed by this cell.
from tensorflow.keras.initializers import GlorotUniform

#variable placeholders for image dimensions
img_height = 256
img_width = 256

#seed for the randomly initialised dense head (see note below)
dense_init_seed = 42

#drop top: load the ImageNet-pretrained VGG16 without its classification head
base_model = VGG16(weights='imagenet', include_top=False, input_shape=(img_height, img_width, 3))

#freeze layers
for layer in base_model.layers:
    layer.trainable = False

#feature extraction at last layer
output = base_model.output
output = GlobalAveragePooling2D()(output) #global avg pooling layer to reduce spatial dimensions

#dense layer adds a fully connected layer with 256 units
#and sets the ReLU activation function for the layer.
#
#NOTE: nothing in this notebook trains this layer, so its weights stay at their
#random initial values. Without a fixed seed every kernel session would build a
#different random projection, and feature CSVs written in different sessions
#would not be comparable. GlorotUniform is the Dense default initializer, so
#seeding it keeps the same weight distribution while making it reproducible.
output = Dense(
    256,
    activation='relu',
    kernel_regularizer=regularizers.l2(0.01),
    kernel_initializer=GlorotUniform(seed=dense_init_seed),
)(output)

#add a dropout of 0.5 (Keras only applies dropout in training mode, so it
#should not alter the values returned by predict())
output = Dropout(0.5)(output)


#feature extraction model
feature_extraction_model = Model(inputs=base_model.input, outputs=output)

#model summary: summary() prints the table itself and returns None, so it must
#not be wrapped in print() (that only adds a stray "None" line)
feature_extraction_model.summary()

None


## Normalization of Input Images
Because the image sizes vary, all images are resized to the smallest resolution present in the data.

In [102]:
from PIL import Image
import numpy as np

def preprocess_image(img_path, target_size=(256, 256)):
    """Load an image file and return it as a fixed-size RGB array.

    Parameters
    ----------
    img_path : str or path-like
        Location of the image file to open with PIL.
    target_size : tuple of int, optional
        ``(width, height)`` the image is resized to. Defaults to ``(256, 256)``.

    Returns
    -------
    numpy.ndarray
        Pixel array with three colour channels, shape ``(height, width, 3)``.

    Raises
    ------
    OSError
        If the file cannot be opened or is not a readable image.
    """
    target_size = tuple(target_size)

    # The context manager closes the underlying file handle; convert() loads
    # the pixel data and returns an independent image, so it stays usable.
    with Image.open(img_path) as img:
        # Force three channels so grayscale, palette and RGBA files all yield
        # the (H, W, 3) layout the VGG16 input expects.
        rgb_img = img.convert('RGB')

    if rgb_img.size != target_size:
        rgb_img = rgb_img.resize(target_size, resample=Image.BICUBIC)

    #return the image array
    return np.array(rgb_img)

## Directory Traversal

This section contains the traversal function that extracts features from every image in a directory, where the location is passed as a parameter. As a precaution it was not set to traverse the entire directory, due to time and computational limitations.

NB: The original feature extraction function was modified so that a generation class column and an art style column are added to each row, with the values taken from the function's parameters.

In [103]:
import os
import pandas as pd
from tensorflow.keras.applications.vgg16 import preprocess_input

def extract_features_from_folder(input_folder_path, generation_class, art_style):
    """Extract features for every image file in a folder and append them to a CSV.

    The CSV is written next to the folder, named ``<folder>.csv``. Each row is
    ``art_style, generation_class, <feature values...>`` with no header and
    no index column.

    Parameters
    ----------
    input_folder_path : str
        Folder whose regular files are processed (sub-folders are ignored).
    generation_class : str
        Value written to the second column of every row.
    art_style : str
        Value written to the first column of every row.

    Notes
    -----
    The CSV is opened in append mode, so running this twice for the same
    folder duplicates its rows.
    """
    # normpath drops a trailing separator so 'train/x/' maps to 'train/x.csv'
    # instead of 'train/x/.csv'.
    csv_file_path = os.path.normpath(input_folder_path) + '.csv'

    # newline='' lets pandas control the line endings itself; the `with`
    # block closes the file, so no explicit close() is needed.
    with open(csv_file_path, 'a', newline='') as f:
        # sorted() gives a stable row order regardless of the filesystem.
        for filename in sorted(os.listdir(input_folder_path)):
            img_path = os.path.join(input_folder_path, filename)
            if not os.path.isfile(img_path):
                continue

            try:
                img_array = preprocess_image(img_path)
            except OSError as exc:
                # Non-image or corrupt files (e.g. .DS_Store) should not abort
                # a long extraction run after rows were already appended.
                print(f"Skipping {img_path}: {exc}")
                continue

            # Add the batch axis, then apply the VGG16 input preprocessing.
            img_array = np.expand_dims(img_array, axis=0)
            img_array = preprocess_input(img_array)

            features = feature_extraction_model.predict(img_array)

            # One row per image: features laid out horizontally, labels first.
            data_frame = pd.DataFrame(features.flatten()).transpose()
            data_frame.insert(0, 'generation_class', generation_class)
            data_frame.insert(0, 'art_style', art_style) #adds art style first

            data_frame.to_csv(f, header=False, index=False)

## Feature Extraction
As mentioned in previous sections, each directory was passed as a parameter and commented out after successful extraction over multiple sessions

### Feature Extraction by Art Style 

#### Art Nouveau

In [104]:
#extract_features_from_folder('train/AI_LD_art_nouveau', 'AI_LD', 'art_nouveau')
#extract_features_from_folder('train/AI_SD_art_nouveau', 'AI_SD', 'art_nouveau')
#extract_features_from_folder('train/art_nouveau', 'HU', 'art_nouveau')

### Experimentation and Binary Model Selection

#### Generating a Uniform Sample and Test Dataset
To maintain consistency 2 datasets were created to train and test through all models

This function gets all the csvs generated from feature extraction and enters them into a single data frame and returns it

In [105]:
import os
import pandas as pd

def gen_data_set_from_csvs(path):
    """Combine every feature CSV in a directory into one labelled DataFrame.

    Each CSV is read without a header; its first two columns are named
    ``art_style`` and ``gen_type`` and the remaining ones ``F_1 .. F_n``.

    Parameters
    ----------
    path : str
        Directory to scan (non-recursively) for files ending in ``.csv``.

    Returns
    -------
    pandas.DataFrame
        Row-wise concatenation of all CSVs with a fresh 0..N-1 index.

    Raises
    ------
    ValueError
        If the directory contains no ``.csv`` files.
    """
    # Sort the names: os.listdir order is arbitrary, and row order matters for
    # any seeded sampling performed on the result.
    csv_names = sorted(name for name in os.listdir(path) if name.endswith(".csv"))
    if not csv_names:
        raise ValueError(f"No .csv files found in {path!r}")

    frames = []
    for csv_name in csv_names:
        frame = pd.read_csv(os.path.join(path, csv_name), header=None)

        # Everything after the two label columns is a feature column.
        n_features = len(frame.columns) - 2
        frame.columns = ["art_style", "gen_type"] + [f"F_{i}" for i in range(1, n_features + 1)]

        frames.append(frame)

    concat_df = pd.concat(frames, ignore_index=True)

    # Quick sanity report: total rows and rows per generation type.
    print(len(concat_df))
    print(concat_df['gen_type'].value_counts())

    return concat_df

In [106]:
#load every feature CSV found in 'train/' into a single DataFrame
data_set = gen_data_set_from_csvs('train/')

15376
gen_type
AI_SD    5384
HU       5000
AI_LD    4992
Name: count, dtype: int64


Create a function to generate the train and test data

In [107]:
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer

def create_train_test_sets(df, style, bin_lab_1, bin_lab_2, imp_meth, samp_size, test_ratio, random_state=None):
    """Build a balanced binary train/test split for one art style.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``art_style``, ``gen_type`` and feature columns. It is not
        modified; imputation is applied to a copy.
    style : str
        Only rows whose ``art_style`` equals this value are used.
    bin_lab_1, bin_lab_2 : str
        ``gen_type`` values of the two classes. Rows of ``bin_lab_1`` are
        labelled 1 and rows of ``bin_lab_2`` are labelled 0.
    imp_meth : str
        ``strategy`` passed to ``SimpleImputer`` when the frame contains NaNs.
    samp_size : int
        Total number of rows to sample; ``samp_size // 2`` come from each class.
    test_ratio : float
        ``test_size`` passed to ``train_test_split``.
    random_state : int or None, optional
        Seed used for both the per-class sampling and the split.

    Returns
    -------
    X_train, X_test, y_train, y_test

    Raises
    ------
    ValueError
        If a class has fewer than ``samp_size // 2`` rows for ``style``.
    """
    #handle NaN in df
    if df.isna().any().any():
        # Work on a copy so the caller's frame is left untouched.
        df = df.copy()
        #handle NaNs using SimpleImputer on the numeric columns only
        # NOTE(review): the imputer is fitted on all rows, before the style
        # filter and the train/test split — confirm this is intended.
        numeric_features = df.select_dtypes(include=['number']).columns
        imputer = SimpleImputer(strategy=imp_meth)
        df[numeric_features] = imputer.fit_transform(df[numeric_features])
    else:
        print("No NaN values found.")

    #filter the df based on labels
    df_filt_by_style = df[df['art_style'] == style]
    print(df_filt_by_style['gen_type'].value_counts())

    samps_for_each = samp_size // 2

    # Sample an equal number of rows from each class
    sampled_groups = []
    for label in (bin_lab_1, bin_lab_2):
        group = df_filt_by_style[df_filt_by_style['gen_type'] == label]
        if len(group) < samps_for_each:
            raise ValueError(
                f"Requested {samps_for_each} samples of gen_type {label!r} for style "
                f"{style!r}, but only {len(group)} rows are available"
            )
        sampled_groups.append(group.sample(n=samps_for_each, random_state=random_state))

    #combine the sampled dfs; ignore_index avoids duplicated index values
    df_fused = pd.concat(sampled_groups, ignore_index=True)

    #create binary labels: 1 for bin_lab_1, 0 for everything else (bin_lab_2)
    df_fused['binary_label'] = df_fused['gen_type'].eq(bin_lab_1).astype('int64')

    #split into training and testing
    X = df_fused.drop(columns=['art_style', 'gen_type', 'binary_label'])
    y = df_fused['binary_label']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_ratio, random_state=random_state)

    # Print counts
    print("Training samples count:")
    print(y_train.value_counts())
    print("\nTesting samples count:")
    print(y_test.value_counts())

    return X_train, X_test, y_train, y_test


In [122]:
# A fixed random_state makes the per-class sampling and the train/test split
# reproducible when the notebook is re-run.
X_train, X_test, y_train, y_test = create_train_test_sets(data_set, 'art_nouveau', 'AI_SD', "HU", 'mean', 9500, 0.3, random_state=42)
print(X_train.describe())
print(X_test.describe())
print(y_train.describe())
print(y_test.describe())

No NaN values found.
gen_type
AI_SD    5384
HU       5000
AI_LD    4992
Name: count, dtype: int64
Training samples count:
binary_label
1    3342
0    3308
Name: count, dtype: int64

Testing samples count:
binary_label
0    1442
1    1408
Name: count, dtype: int64
               F_1          F_2          F_3          F_4          F_5  \
count  6650.000000  6650.000000  6650.000000  6650.000000  6650.000000   
mean      0.429646     1.664149     0.459108     0.252143     0.133277   
std       1.114030     2.256538     1.096192     0.864126     0.540079   
min       0.000000     0.000000     0.000000     0.000000     0.000000   
25%       0.000000     0.000000     0.000000     0.000000     0.000000   
50%       0.000000     0.000000     0.000000     0.000000     0.000000   
75%       0.000000     3.061466     0.168694     0.000000     0.000000   
max      10.493566    12.862696    11.044848     9.749878     6.845611   

               F_6          F_7          F_8          F_9         F_1

In [128]:
# A fixed random_state makes the per-class sampling and the train/test split
# reproducible when the notebook is re-run.
X_train, X_test, y_train, y_test = create_train_test_sets(data_set, 'art_nouveau', 'AI_SD', "HU", 'mean', 8000, 0.3, random_state=42)

No NaN values found.
gen_type
AI_SD    5384
HU       5000
AI_LD    4992
Name: count, dtype: int64
Training samples count:
binary_label
1    2813
0    2787
Name: count, dtype: int64

Testing samples count:
binary_label
0    1213
1    1187
Name: count, dtype: int64


Now that we have training and testing sets, we can evaluate various models and methods to decide which to use.

In [129]:
import time#imported to measure time to train
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from sklearn.model_selection import cross_val_predict

##### SVM

In [131]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from sklearn.model_selection import cross_val_score
import time

def test_svm(X_train, X_test, y_train, y_test, cv):
    """Fit a default SVC, then report cross-validated and hold-out metrics.

    Parameters
    ----------
    X_train, y_train
        Training features and binary labels; also used for cross-validation.
    X_test, y_test
        Hold-out features and labels used for the final evaluation.
    cv : int or cross-validation splitter
        Passed through to scikit-learn as the ``cv`` argument.

    Returns
    -------
    None
        All results are printed.
    """
    # Imported here so the function does not rely on an import made in
    # another cell.
    from sklearn.model_selection import cross_validate

    # Initialize SVM classifier
    svm_classifier = SVC()

    # Time the fit on the full training set
    start_time = time.time()
    svm_classifier.fit(X_train, y_train)
    training_time = time.time() - start_time

    # Make predictions on test set
    y_pred = svm_classifier.predict(X_test)

    # One k-fold pass yields all four metrics. cross_validate clones the
    # estimator for each fold, so the fitted classifier above is untouched.
    # Previously precision/recall/F1 were computed from the fitted model's
    # predictions on its own training data, which is not cross-validation.
    cv_results = cross_validate(
        svm_classifier,
        X_train,
        y_train,
        cv=cv,
        scoring=('accuracy', 'precision', 'recall', 'f1'),
    )
    mean_accuracy = cv_results['test_accuracy'].mean()
    precision_cv = cv_results['test_precision'].mean()
    recall_cv = cv_results['test_recall'].mean()
    f1_score_cv = cv_results['test_f1'].mean()

    # Calculate holdout accuracy
    test_accuracy = accuracy_score(y_test, y_pred)

    # Calculate confusion matrix and classification report
    confusion_mat = confusion_matrix(y_test, y_pred)
    classification_rep = classification_report(y_test, y_pred)

    # Print results
    print('Training time:', training_time)
    print('Mean accuracy from cross-validation:', mean_accuracy)
    print('Holdout accuracy:', test_accuracy)
    print('Precision (CV):', precision_cv)
    print('Recall (CV):', recall_cv)
    print('F1 Score (CV):', f1_score_cv)
    print('Confusion Matrix:\n', confusion_mat)
    print('Classification Report:\n', classification_rep)


In [134]:
# Evaluate the SVC on the current split with 10-fold cross-validation.
# NOTE(review): a perfect score on every metric is unusual — worth checking
# for leakage or a systematic difference in how each class was extracted.
test_svm(X_train, X_test, y_train, y_test, cv=10)

Training time: 0.3561129570007324
Mean accuracy from cross-validation: 1.0
Holdout accuracy: 1.0
Precision (CV): 1.0
Recall (CV): 1.0
F1 Score (CV): 1.0
Confusion Matrix:
 [[1213    0]
 [   0 1187]]
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      1213
           1       1.00      1.00      1.00      1187

    accuracy                           1.00      2400
   macro avg       1.00      1.00      1.00      2400
weighted avg       1.00      1.00      1.00      2400



## Feature Extraction II
Extracting the remaining art styles from various AI and human sources

### ukiyo_e

In [None]:
# Each entry is (image folder, generation class, art style); the folders are
# processed in the order listed.
for folder_path, gen_class, style_name in (
    ('train/AI_LD_ukiyo-e', 'AI_LD', 'ukiyo_e'),
    ('train/AI_SD_ukiyo-e', 'AI_SD', 'ukiyo_e'),
    ('train/ukiyo_e', 'HU', 'ukiyo_e'),
):
    extract_features_from_folder(folder_path, gen_class, style_name)