# Artificial Intelligence and Data Science in Operations Research
## Preprocessing Data Activity

For this activity, you will use the predefined methods below to preprocess your data. Consider which combination of methods is appropriate for the dataset, and justify both your selection of methods and your overall approach.

In [1]:
from sklearn.preprocessing import MinMaxScaler
import numpy as np

def min_max_normalization(data, feature_range=(0, 1)):
    """
    Rescales the dataset with scikit-learn's MinMaxScaler.

    The scaler is fitted on `data` itself and then applied to it, so the
    minimum and maximum used for scaling come from the input passed in.

    Parameters:
    data (array-like): The dataset to be normalized.
    feature_range (tuple): Desired (min, max) range of the transformed data.

    Returns:
    array-like: Normalized dataset.
    """
    # Fit and transform in one step; the fitted scaler is not kept.
    return MinMaxScaler(feature_range=feature_range).fit_transform(data)

# Example usage: rescale each column of a small 3x2 array into the range [0, 1].
data = np.array([[1, 2], [3, 4], [5, 6]])
normalized_data = min_max_normalization(data, feature_range=(0, 1))
print(normalized_data)


[[0.  0. ]
 [0.5 0.5]
 [1.  1. ]]


In [2]:
from sklearn.preprocessing import StandardScaler
import numpy as np

def standardize_data(data):
    """
    Standardizes the dataset (Z-score normalization) with StandardScaler.

    The scaler is fitted on `data` itself and then applied to it.

    Parameters:
    data (array-like): The dataset to be standardized.

    Returns:
    array-like: Standardized dataset.
    """
    # Fit and transform in one step; the fitted scaler is not kept.
    return StandardScaler().fit_transform(data)

# Example usage: standardize each column of a small 3x2 array
# (the printed columns are centred on 0).
data = np.array([[1, 2], [3, 4], [5, 6]])
standardized_data = standardize_data(data)
print(standardized_data)


[[-1.22474487 -1.22474487]
 [ 0.          0.        ]
 [ 1.22474487  1.22474487]]


In [3]:
from sklearn.preprocessing import Binarizer
import numpy as np

def binarize_data(data, threshold=0.0):
    """
    Converts the dataset to 0/1 values using scikit-learn's Binarizer.

    Parameters:
    data (array-like): The dataset to be binarized.
    threshold (float): Cut-off value; entries above it become 1, the rest 0.

    Returns:
    array-like: Binarized dataset.
    """
    thresholder = Binarizer(threshold=threshold)
    return thresholder.fit_transform(data)

# Example usage: with threshold=3, entries greater than 3 become 1 and
# entries less than or equal to 3 become 0.
data = np.array([[1, 2], [3, 4], [5, 6]])
binarized_data = binarize_data(data, threshold=3)
print(binarized_data)


[[0 0]
 [0 1]
 [1 1]]


In [4]:
from sklearn.preprocessing import PolynomialFeatures
import numpy as np

def add_polynomial_features(data, degree=2):
    """
    Expands the dataset into polynomial features with PolynomialFeatures.

    Parameters:
    data (array-like): The dataset to be transformed.
    degree (int): The degree of the polynomial features.

    Returns:
    array-like: Dataset with polynomial features.
    """
    expander = PolynomialFeatures(degree=degree)
    return expander.fit_transform(data)

# Example usage: for two input columns (a, b) and degree=2 the printed
# columns are 1, a, b, a^2, a*b, b^2.
data = np.array([[1, 2], [3, 4], [5, 6]])
poly_features = add_polynomial_features(data, degree=2)
print(poly_features)


[[ 1.  1.  2.  1.  2.  4.]
 [ 1.  3.  4.  9. 12. 16.]
 [ 1.  5.  6. 25. 30. 36.]]


In [6]:
from sklearn.preprocessing import LabelEncoder

def encode_labels(labels):
    """
    Maps categorical labels to integer codes between 0 and n_classes-1.

    Parameters:
    labels (array-like): The labels to be encoded.

    Returns:
    array-like: Encoded labels.
    """
    # The fitted encoder is discarded, so only the integer codes are returned.
    return LabelEncoder().fit_transform(labels)

# Example usage: each distinct string label is replaced by an integer code;
# repeated labels receive the same code.
labels = ['cat', 'dog', 'fish', 'dog', 'cat']
encoded_labels = encode_labels(labels)
print(encoded_labels)


[0 1 2 1 0]


In [7]:
from sklearn.decomposition import PCA
import numpy as np

def apply_pca(data, n_components=None):
    """
    Projects the dataset onto its principal components using PCA.

    Parameters:
    data (array-like): The dataset to be transformed.
    n_components (int, float, None or str): Number of components to keep;
        passed straight through to sklearn's PCA.

    Returns:
    array-like: Transformed dataset with principal components.
    """
    reducer = PCA(n_components=n_components)
    return reducer.fit_transform(data)

# Example usage: the three sample points lie on a straight line, so the
# second principal component printed below is all zeros.
data = np.array([[1, 2], [3, 4], [5, 6]])
pca_data = apply_pca(data, n_components=2)
print(pca_data)


[[-2.82842712  0.        ]
 [ 0.          0.        ]
 [ 2.82842712  0.        ]]


In [13]:
import numpy as np
import pandas as pd
from sklearn.datasets import make_classification

# Fisher score: per-feature ratio of between-class scatter to within-class scatter
def fisher_score(data, target):
    """
    Computes Fisher score for each feature.

    For every feature the score is the ratio of
      * between-class scatter: sum over classes of
        n_class * (class mean - overall mean) ** 2, to
      * within-class scatter: sum over classes of the squared deviations of
        the class's samples from the class mean.
    Larger scores mean the class means are further apart relative to the
    spread inside each class.

    Parameters:
    data (array-like): The dataset with features, one row per sample and one
        column per feature. Lists are accepted and converted to arrays.
    target (array-like): The target variable, one class label per sample.

    Returns:
    numpy.ndarray: Fisher scores for each feature. A feature whose
        within-class and between-class scatter are both zero (e.g. a constant
        column) scores 0.0 instead of nan; a feature with zero within-class
        scatter but non-zero between-class scatter scores inf.
    """
    # Accept plain lists / nested lists as well as arrays.
    data = np.asarray(data)
    target = np.asarray(target)

    unique_classes = np.unique(target)
    mean_overall = np.mean(data, axis=0)

    numerator = np.zeros(data.shape[1])
    denominator = np.zeros(data.shape[1])

    for cls in unique_classes:
        cls_data = data[target == cls]
        mean_cls = np.mean(cls_data, axis=0)
        # Between-class scatter, weighted by the number of samples in the class.
        numerator += cls_data.shape[0] * (mean_cls - mean_overall) ** 2
        # Within-class scatter around the class mean.
        denominator += np.sum((cls_data - mean_cls) ** 2, axis=0)

    # Division by a zero within-class scatter is handled explicitly below, so
    # suppress the RuntimeWarnings numpy would otherwise emit.
    with np.errstate(divide="ignore", invalid="ignore"):
        fisher_scores = numerator / denominator

    # 0/0 would be nan, and nan sorts to the top of a reversed argsort,
    # which would rank a feature carrying no information as the best one.
    no_information = (denominator == 0) & (numerator == 0)
    fisher_scores[no_information] = 0.0
    return fisher_scores

def rank_features_fisher(data, target):
    """
    Orders the features from highest to lowest Fisher score.

    Parameters:
    data (array-like): The dataset with features.
    target (array-like): The target variable.

    Returns:
    array-like: 1-based feature numbers, the highest-scoring feature first.
    """
    ascending_order = np.argsort(fisher_score(data, target))
    descending_order = ascending_order[::-1]
    # Shift from 0-based column indices to 1-based feature numbers.
    return descending_order + 1

# Generate a synthetic 3-class dataset: 100 samples, 10 features
# (5 informative, 2 redundant); random_state fixes the generated data.
data, target = make_classification(n_samples=100, n_features=10, n_informative=5, n_redundant=2, n_classes=3, random_state=42)

# Rank the features by Fisher score (1-based feature numbers, best first)
feature_ranking_fisher = rank_features_fisher(data, target)

print("Feature Ranking (Fisher's Linear Discriminant):", feature_ranking_fisher)

# Display the raw Fisher scores, listed in original feature order
fisher_scores = fisher_score(data, target)
print("Fisher Scores for each feature:", fisher_scores)


Feature Ranking (Fisher's Linear Discriminant): [10  1  9  2  7  8  5  6  3  4]
Fisher Scores for each feature: [0.27896741 0.19563216 0.02062529 0.0104751  0.04265937 0.04115039
 0.14824801 0.109263   0.25293161 0.61988189]


In [11]:
import numpy as np

def simple_oversample_data(data, target, random_state=None):
    """
    Applies simple random oversampling to the dataset.

    Every class that has fewer samples than the largest class is topped up
    with randomly chosen duplicates (sampling with replacement) of its own
    samples until all classes have the same count. The original samples are
    kept, in their original order, and the duplicates are appended after them.

    Parameters:
    data (array-like): The dataset with features, one row per sample.
    target (array-like): The target variable, one class label per sample.
    random_state (None, int, numpy.random.RandomState or numpy.random.Generator):
        Source of randomness. None (the default) uses numpy's global random
        state, so np.random.seed(...) still controls the draw. An int seeds a
        private RandomState, and an existing RandomState/Generator is used as is.

    Returns:
    tuple: Oversampled dataset and target, both as numpy arrays.

    Raises:
    ValueError: If data and target do not have the same number of samples.
    """
    # Accept plain lists as well as arrays (fancy indexing below needs arrays).
    data = np.asarray(data)
    target = np.asarray(target)

    if data.shape[0] != target.shape[0]:
        raise ValueError(
            "data and target must have the same number of samples "
            f"(got {data.shape[0]} and {target.shape[0]})"
        )

    # Nothing to balance in an empty dataset.
    if target.shape[0] == 0:
        return data.copy(), target.copy()

    if random_state is None:
        rng = np.random  # module-level functions backed by the global state
    elif isinstance(random_state, (np.random.RandomState, np.random.Generator)):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    unique_classes, class_counts = np.unique(target, return_counts=True)
    majority_count = class_counts.max()

    # For each under-represented class, draw as many duplicates as it is short.
    extra_index_chunks = []
    for cls, count in zip(unique_classes, class_counts):
        shortfall = int(majority_count - count)
        if shortfall > 0:
            cls_indices = np.where(target == cls)[0]
            extra_index_chunks.append(rng.choice(cls_indices, shortfall, replace=True))

    # Already balanced: return copies so callers never alias the inputs.
    if not extra_index_chunks:
        return data.copy(), target.copy()

    oversampled_indices = np.concatenate(extra_index_chunks)

    # Combine original data with the duplicated samples
    oversampled_data = np.concatenate([data, data[oversampled_indices]])
    oversampled_target = np.concatenate([target, target[oversampled_indices]])

    return oversampled_data, oversampled_target

# Example usage: class 1 has two samples and class 0 has three, so one
# class-1 row is duplicated and appended. The duplicated row is drawn at
# random without a fixed seed, so it can differ between runs.
data = np.array([[1, 2], [3, 4], [5, 6], [7, 8], [9, 10]])
target = np.array([0, 1, 0, 1, 0])
oversampled_data, oversampled_target = simple_oversample_data(data, target)
print(oversampled_data)
print(oversampled_target)


[[ 1  2]
 [ 3  4]
 [ 5  6]
 [ 7  8]
 [ 9 10]
 [ 7  8]]
[0 1 0 1 0 1]


## Now, apply these methods to the sampled subset of the Iris dataset below:

In [12]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

# Fetch the Iris dataset and pull out its feature matrix and class labels
iris = load_iris()
data, target = iris.data, iris.target

# Build a single DataFrame holding the features plus a 'target' column
iris_df = pd.DataFrame(data, columns=iris.feature_names).assign(target=target)

# Keep a reproducible random half of the rows, renumbered from 0
iris_df_sampled = iris_df.sample(frac=0.5, random_state=42).reset_index(drop=True)

# Split the sample back into a feature array and a label array
data_sampled = iris_df_sampled[iris.feature_names].to_numpy()
target_sampled = iris_df_sampled['target'].to_numpy()