In [None]:
import numpy as np
from sklearn.feature_selection import SelectKBest, f_classif

# Load the noiseless smiley dataset: X holds the images, y the class labels.
X = np.load("/mnt/c/users/admin/desktop/F21DL/smiley_dataset/smiley_X.npy")#noiseless dataset
y = np.load("/mnt/c/users/admin/desktop/F21DL/smiley_dataset/smiley_Y.npy")#noiseless dataset


# Reshape the 4D image features into a 2D array (one flattened image per row)
num_samples, height, width, channels = X.shape
X_reshaped = X.reshape(num_samples, height * width * channels)

# Remove constant (zero standard deviation) features before scoring
non_constant_feature_indices = np.where(X_reshaped.std(axis=0) != 0)[0]
X_non_constant = X_reshaped[:, non_constant_feature_indices]
num_candidate_features = X_non_constant.shape[1]

# Feature budgets: (features per class) * (number of distinct labels).
# NOTE: SelectKBest ranks features globally with the multi-class ANOVA F-test;
# the "per class" figure only sizes the budget, it is not a per-class ranking.
num_classes = len(np.unique(y))
top_k_features_per_class = 3
top_k_features_total = top_k_features_per_class * num_classes
top_k_features_per_class_set_2 = 6
top_k_features_total_set_2 = top_k_features_per_class_set_2 * num_classes

# Feature Selection for Data set 1 - SelectKBest with ANOVA F-value scoring.
# k is capped so that a small image cannot make SelectKBest reject the request.
selector = SelectKBest(
    score_func=f_classif,
    k=min(top_k_features_total, num_candidate_features),
)
X_new = selector.fit_transform(X_non_constant, y)

# Map the selected columns back to indices in the full flattened image
selected_feature_indices = non_constant_feature_indices[selector.get_support(indices=True)]

# Feature Selection for Data set 2 - same scoring, twice the budget.
# Previously Data set 2 reused the Data set 1 indices, so both files were identical.
selector_set_2 = SelectKBest(
    score_func=f_classif,
    k=min(top_k_features_total_set_2, num_candidate_features),
)
selector_set_2.fit(X_non_constant, y)
selected_feature_indices_set_2 = non_constant_feature_indices[
    selector_set_2.get_support(indices=True)
]

# Create Data set 1 with a budget of three features per class
data_set_1 = X_reshaped[:, selected_feature_indices]

# Create Data set 2 with a budget of six features per class
data_set_2 = X_reshaped[:, selected_feature_indices_set_2]

# Persist both datasets; they contain feature columns only (no label column).
np.save('data_set_1.npy', data_set_1)
np.save('data_set_2.npy', data_set_2)
print("Data set 1 shape:", data_set_1.shape)
print("Data set 2 shape:", data_set_2.shape)

In [1]:
# Visualization of the newly created datasets

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Load the two feature-subset datasets and the per-sample class labels.
# The .npy datasets hold feature columns only, so the labels must come from
# the label file rather than from column 0 of the arrays.
data_set_1 = np.load('data_set_1.npy')
data_set_2 = np.load('data_set_2.npy')
class_labels = np.load("/mnt/c/users/admin/desktop/F21DL/smiley_dataset/smiley_Y.npy")

# 'data_set_1' and 'data_set_2' have shape (number of samples, number of features)

# Step 1: Data Analysis - Exploratory Data Analysis (EDA)

# Visualize the distribution of features for each class.
# NOTE(review): label -> name mapping (0=sad, 1=neutral, 2=happy) is aligned with
# the other visualization cells in this notebook; confirm against the dataset.
classes = ['sad', 'neutral', 'happy']

for class_idx, class_name in enumerate(classes):
    # Select the rows of the current class; every column is a feature
    class_mask = class_labels == class_idx
    features_class_set_1 = data_set_1[class_mask]
    features_class_set_2 = data_set_2[class_mask]

    # Plot histograms for the features of the current class for both datasets
    for feature_idx in range(features_class_set_1.shape[1]):
        plt.figure(figsize=(8, 6))
        plt.hist(features_class_set_1[:, feature_idx], bins=30, alpha=0.5, color='blue', label='Data Set 1')
        plt.hist(features_class_set_2[:, feature_idx], bins=30, alpha=0.5, color='green', label='Data Set 2')
        plt.xlabel(f'Feature {feature_idx + 1}')
        plt.ylabel('Frequency')
        plt.title(f'Histogram of Feature {feature_idx + 1} for {class_name} faces')
        plt.legend()
        plt.show()


In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Read back both feature subsets and the per-sample class labels from disk.
data_set_1 = np.load('data_set_1.npy')
data_set_2 = np.load('data_set_2.npy')
class_labels = np.load("/mnt/c/users/admin/desktop/F21DL/smiley_dataset/smiley_Y.npy")

# Both arrays are indexed as (sample, feature); class_labels is compared
# element-wise against the integer position of each name in `classes`.

# Exploratory Data Analysis: one overlaid histogram per (class, feature column).
classes = ['sad', 'neutral', 'happy']

for class_idx, class_name in enumerate(classes):
    # Boolean row mask for the samples of this class
    in_class = class_labels == class_idx

    # (rows, colour, legend text) for each dataset, drawn in this order
    layers = (
        (data_set_1[in_class], 'blue', 'Data Set 1'),
        (data_set_2[in_class], 'green', 'Data Set 2'),
    )
    column_count = layers[0][0].shape[1]

    feature_idx = 0
    while feature_idx < column_count:
        plt.figure(figsize=(8, 6))
        for class_rows, colour, legend_text in layers:
            plt.hist(class_rows[:, feature_idx], bins=30, alpha=0.5, color=colour, label=legend_text)
        plt.xlabel(f'Feature {feature_idx + 1}')
        plt.ylabel('Frequency')
        plt.title(f'Histogram of Feature {feature_idx + 1} for {class_name} faces')
        plt.legend()
        plt.show()
        feature_idx += 1


In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Load the two feature-subset datasets and the per-sample class labels.
data_set_1 = np.load('data_set_1.npy')
data_set_2 = np.load('data_set_2.npy')
class_labels = np.load("/mnt/c/users/admin/desktop/F21DL/smiley_dataset/smiley_Y.npy")

# 'data_set_1' and 'data_set_2' have shape (number of samples, number of features)

# Step 1: Data Analysis - Exploratory Data Analysis (EDA)

# Create DataFrames for data_set_1 and data_set_2.
# Every array column is a feature (labels are loaded separately), so all columns
# are kept; slicing off column 0 would silently discard the first feature.
df_data_set_1 = pd.DataFrame(data_set_1, columns=[f'Feature {i + 1}' for i in range(data_set_1.shape[1])])
df_data_set_1['Class'] = class_labels

df_data_set_2 = pd.DataFrame(data_set_2, columns=[f'Feature {i + 1}' for i in range(data_set_2.shape[1])])
df_data_set_2['Class'] = class_labels

# Visualize the first 3 / first 6 feature columns for each class using box plots.
# NOTE(review): columns appear in the order stored in the .npy files; whether the
# leading columns are the "top" ones depends on how those files were written.
classes = ['sad', 'neutral', 'happy']

for class_idx, class_name in enumerate(classes):
    # Filter the data for the current class and drop the label column so it can
    # never be drawn as if it were a feature
    data_class_set_1 = df_data_set_1[df_data_set_1['Class'] == class_idx].drop(columns='Class')
    data_class_set_2 = df_data_set_2[df_data_set_2['Class'] == class_idx].drop(columns='Class')

    # Plot box plots for the first 3 features for the current class
    plt.figure(figsize=(10, 6))
    sns.boxplot(data=data_class_set_1.iloc[:, :3], palette='Blues')
    plt.title(f'Top 3 Features for {class_name} faces (Data Set 1)')
    plt.xlabel('Feature')
    plt.ylabel('Value')
    plt.show()

    # Plot box plots for the first 6 features for the current class
    plt.figure(figsize=(12, 6))
    sns.boxplot(data=data_class_set_2.iloc[:, :6], palette='Greens')
    plt.title(f'Top 6 Features for {class_name} faces (Data Set 2)')
    plt.xlabel('Feature')
    plt.ylabel('Value')
    plt.show()


In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Load the two feature-subset datasets and the per-sample class labels.
data_set_1 = np.load('data_set_1.npy')
data_set_2 = np.load('data_set_2.npy')
class_labels = np.load("/mnt/c/users/admin/desktop/F21DL/smiley_dataset/smiley_Y.npy")

# 'data_set_1' and 'data_set_2' have shape (number of samples, number of features)

# Step 1: Data Preprocessing - Handling Missing Values

# Convert the numpy arrays to pandas DataFrames.
# Every array column is a feature (labels are loaded separately), so all columns
# are kept; slicing off column 0 would silently discard the first feature.
df_data_set_1 = pd.DataFrame(data_set_1, columns=[f'Feature {i + 1}' for i in range(data_set_1.shape[1])])
df_data_set_1['Class'] = class_labels

df_data_set_2 = pd.DataFrame(data_set_2, columns=[f'Feature {i + 1}' for i in range(data_set_2.shape[1])])
df_data_set_2['Class'] = class_labels

# Check for missing values in each dataset
print("Missing values in Data Set 1:")
print(df_data_set_1.isnull().sum())

print("\nMissing values in Data Set 2:")
print(df_data_set_2.isnull().sum())

# Handle missing values - replace NaN with the mean of the respective feature.
# Only feature columns are imputed: averaging the 'Class' label is meaningless.
feature_columns_1 = [column for column in df_data_set_1.columns if column != 'Class']
feature_columns_2 = [column for column in df_data_set_2.columns if column != 'Class']
df_data_set_1[feature_columns_1] = df_data_set_1[feature_columns_1].fillna(
    df_data_set_1[feature_columns_1].mean()
)
df_data_set_2[feature_columns_2] = df_data_set_2[feature_columns_2].fillna(
    df_data_set_2[feature_columns_2].mean()
)

# Step 2: Data Analysis - Exploratory Data Analysis (EDA)

# Visualize the first 3 / first 6 feature columns for each class using box plots.
# NOTE(review): columns appear in the order stored in the .npy files; whether the
# leading columns are the "top" ones depends on how those files were written.
classes = ['sad', 'neutral', 'happy']

for class_idx, class_name in enumerate(classes):
    # Filter the data for the current class, keeping feature columns only
    data_class_set_1 = df_data_set_1.loc[df_data_set_1['Class'] == class_idx, feature_columns_1]
    data_class_set_2 = df_data_set_2.loc[df_data_set_2['Class'] == class_idx, feature_columns_2]

    # Plot box plots for the first 3 features for the current class
    plt.figure(figsize=(10, 6))
    sns.boxplot(data=data_class_set_1.iloc[:, :3], palette='Blues')
    plt.title(f'Top 3 Features for {class_name} faces (Data Set 1)')
    plt.xlabel('Feature')
    plt.ylabel('Value')
    plt.show()

    # Plot box plots for the first 6 features for the current class
    plt.figure(figsize=(12, 6))
    sns.boxplot(data=data_class_set_2.iloc[:, :6], palette='Greens')
    plt.title(f'Top 6 Features for {class_name} faces (Data Set 2)')
    plt.xlabel('Feature')
    plt.ylabel('Value')
    plt.show()
