In [1]:
#Ans 01:

In [2]:
# Min-max scaling is a common technique used in data preprocessing to scale numerical features to a specific
# range, usually between 0 and 1. It's calculated using the formula:
    
#     X_scaled = (X - X_min)/(X_max - X_min)
    
# where:
# X is the original value of the feature.
# X_min is the minimum value of the feature.
# X_max is the maximum value of the feature.

# This scaling method is particularly useful when the features have different scales and ranges, ensuring that all features
# contribute equally to the analysis.

# In Python, you can use the MinMaxScaler from the sklearn.preprocessing module to perform min-max scaling.

In [3]:
from sklearn.preprocessing import MinMaxScaler
import pandas as pd

# Two toy features on different scales (tens vs. hundreds)
data = {
    'Feature1': list(range(10, 60, 10)),
    'Feature2': list(range(100, 600, 100)),
}
df = pd.DataFrame(data)

# Learn each column's min/max, then map every column onto [0, 1]
scaler = MinMaxScaler()
scaler.fit(df)
scaled_data = scaler.transform(df)

# The scaler returns a bare NumPy array, so re-attach the column labels
scaled_df = pd.DataFrame(scaled_data, columns=df.columns)

# Show the frames before and after scaling
print("Original Data:", df, "\nScaled Data:", scaled_df, sep="\n")

Original Data:
   Feature1  Feature2
0        10       100
1        20       200
2        30       300
3        40       400
4        50       500

Scaled Data:
   Feature1  Feature2
0      0.00      0.00
1      0.25      0.25
2      0.50      0.50
3      0.75      0.75
4      1.00      1.00


In [4]:
#######################################################################
#Ans 02:

In [5]:
# The Unit Vector technique, also known as normalization or vector normalization, scales each sample (row) in a
# dataset to have a vector norm of 1. This technique is often used in machine learning when the direction of the individual
# samples is important, but their magnitude does not matter (for example, when comparing samples by cosine similarity).

# In essence, unit vector scaling adjusts the values within each sample (row) without considering the distribution of values
# across different features, unlike Min-Max scaling.

# The formula for unit vector scaling is:
    
#     X_scaled = X / ||X||

# where:
# X is the original vector (one sample, i.e. one row of the dataset).
# ||X|| is the Euclidean (L2) norm (magnitude) of the vector.

# In contrast, Min-Max scaling scales features to a specific range (like 0 to 1), considering the minimum and maximum values of
# each feature independently.

# Let's illustrate the difference with an example in Python, comparing unit vector scaling and Min-Max scaling:

In [6]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import Normalizer
import pandas as pd

# Toy dataset: Feature2 is always exactly ten times Feature1
data = {
    'Feature1': list(range(10, 60, 10)),
    'Feature2': list(range(100, 600, 100)),
}
df = pd.DataFrame(data)

# Column-wise rescaling: each feature is mapped onto [0, 1] independently
scaler = MinMaxScaler()
minmax_scaled_data = scaler.fit(df).transform(df)
minmax_scaled_df = pd.DataFrame(minmax_scaled_data, columns=df.columns)

# Row-wise rescaling: each sample is divided by its own L2 norm
normalizer = Normalizer(norm='l2')
unit_vector_scaled_data = normalizer.fit(df).transform(df)
unit_vector_scaled_df = pd.DataFrame(unit_vector_scaled_data, columns=df.columns)

# Print the raw frame followed by both scaled versions
print(
    "Original Data:",
    df,
    "\nMin-Max Scaled Data:",
    minmax_scaled_df,
    "\nUnit Vector Scaled Data:",
    unit_vector_scaled_df,
    sep="\n",
)

Original Data:
   Feature1  Feature2
0        10       100
1        20       200
2        30       300
3        40       400
4        50       500

Min-Max Scaled Data:
   Feature1  Feature2
0      0.00      0.00
1      0.25      0.25
2      0.50      0.50
3      0.75      0.75
4      1.00      1.00

Unit Vector Scaled Data:
   Feature1  Feature2
0  0.099504  0.995037
1  0.099504  0.995037
2  0.099504  0.995037
3  0.099504  0.995037
4  0.099504  0.995037


In [7]:
#######################################################################
#Ans 03:

In [8]:
# PCA, or Principal Component Analysis, is a technique used for dimensionality reduction in data by transforming
# the original features into a new set of orthogonal (uncorrelated) features called principal components. It aims to
# capture the maximum variance in the data while reducing the number of features.

# The steps involved in PCA are:

# Standardization: Standardize the data by subtracting the mean and scaling to unit variance.
# Compute Covariance Matrix: Calculate the covariance matrix of the standardized data.
# Compute Eigenvectors and Eigenvalues: Perform eigen decomposition on the covariance matrix to obtain eigenvectors and eigenvalues.
# Select Principal Components: Sort the eigenvectors by their corresponding eigenvalues in descending order. The eigenvectors with
# the highest eigenvalues (explained variance) are the principal components.
# Project Data onto Principal Components: Transform the data onto the new feature space formed by the selected principal components.

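# Here's an example of PCA using Python's PCA module from sklearn.decomposition. Note that sklearn's PCA only centers the
# data (it does not scale it to unit variance); the standardization step is skipped in this example because all four
# Iris features are measured in the same unit (cm):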

In [9]:
from sklearn.decomposition import PCA
from sklearn.datasets import load_iris
import pandas as pd

# Use the four numeric measurements of the Iris dataset as input
iris = load_iris()
data = iris.data
columns = [f"Feature_{number}" for number in range(1, data.shape[1] + 1)]
df = pd.DataFrame(data, columns=columns)

# Project the four original features onto the two leading principal components
pca = PCA(n_components=2)
pca_result = pca.fit_transform(df)
pca_df = pd.DataFrame(pca_result, columns=['PC1', 'PC2'])

# Share of the total variance captured by each retained component
explained_variance_ratio = pca.explained_variance_ratio_

print(
    "Original Data:",
    df.head(),
    "\nPCA Transformed Data (First 5 rows):",
    pca_df.head(),
    "\nExplained Variance Ratio:",
    explained_variance_ratio,
    sep="\n",
)

Original Data:
   Feature_1  Feature_2  Feature_3  Feature_4
0        5.1        3.5        1.4        0.2
1        4.9        3.0        1.4        0.2
2        4.7        3.2        1.3        0.2
3        4.6        3.1        1.5        0.2
4        5.0        3.6        1.4        0.2

PCA Transformed Data (First 5 rows):
        PC1       PC2
0 -2.684126  0.319397
1 -2.714142 -0.177001
2 -2.888991 -0.144949
3 -2.745343 -0.318299
4 -2.728717  0.326755

Explained Variance Ratio:
[0.92461872 0.05306648]


In [12]:
#######################################################################
#Ans 04:

In [14]:
# PCA and feature extraction are inherently related concepts. PCA can be utilized as a feature extraction technique
# wherein it extracts a reduced set of features (principal components) from the original features, effectively performing
# dimensionality reduction while retaining the most significant information.

# Feature extraction involves deriving new features from the original set of features, aiming to represent the data more
# effectively or efficiently. PCA achieves this by identifying the directions, or axes, in the data that capture the most
# variance. These axes become the principal components, which are a linear combination of the original features.

In [15]:
from sklearn.decomposition import PCA
from sklearn.datasets import load_iris
import pandas as pd

# Iris measurements serve as the original feature set
iris = load_iris()
data = iris.data
n_original_features = data.shape[1]
columns = ["Feature_" + str(position + 1) for position in range(n_original_features)]

# Wrap the raw array in a labelled DataFrame
df = pd.DataFrame(data, columns=columns)

# Extract two new features (principal components), each a linear
# combination of the original columns
pca = PCA(n_components=2)
pca_result = pca.fit_transform(df)

# Label the extracted features
pca_df = pd.DataFrame(pca_result, columns=['PC1', 'PC2'])

print("Original Data:")
print(df.head())
print("\nPCA Extracted Features (First 5 rows):")
print(pca_df.head())

Original Data:
   Feature_1  Feature_2  Feature_3  Feature_4
0        5.1        3.5        1.4        0.2
1        4.9        3.0        1.4        0.2
2        4.7        3.2        1.3        0.2
3        4.6        3.1        1.5        0.2
4        5.0        3.6        1.4        0.2

PCA Extracted Features (First 5 rows):
        PC1       PC2
0 -2.684126  0.319397
1 -2.714142 -0.177001
2 -2.888991 -0.144949
3 -2.745343 -0.318299
4 -2.728717  0.326755


In [16]:
#######################################################################
#Ans 05:

In [17]:
# In the context of building a recommendation system for a food delivery service, Min-Max scaling can be a valuable
# preprocessing step to normalize the numerical features like price, rating, and delivery time. This normalization technique
# would bring these features to a common scale, typically between 0 and 1, allowing them to contribute equally to the recommendation
# model without affecting their relative differences.


# Here's a step-by-step explanation of how Min-Max scaling can be applied to preprocess the data:

# 1. Identify Numerical Features: Identify the numerical features in the dataset that require scaling. In your case, it's likely to be
# features like price, rating, and delivery time.


# 2. Use Min-Max Scaling: Apply Min-Max scaling to these numerical features. For each feature, follow these steps:

# Compute the minimum and maximum values of the feature.

# Scale the values of the feature using the Min-Max scaling formula:
    
#     X_scaled = (X - X_min)/(X_max − X_min)
    
# where:
# X is the original value of the feature.
# X_min is the minimum value of the feature.
# X_max is the maximum value of the feature.

# 3. Implement Min-Max Scaling in Python:

In [18]:
from sklearn.preprocessing import MinMaxScaler
import pandas as pd

# Sample data: a small, made-up set of restaurant listings so that this cell runs
# on a fresh kernel. (Previously the cell read whatever `df` an earlier cell had
# left behind -- the Iris frame -- and failed with a KeyError.)
# Replace `food_df` with the real food-delivery dataset.
food_df = pd.DataFrame({
    'restaurant': ['A', 'B', 'C', 'D', 'E'],
    'price': [120.0, 250.0, 80.0, 400.0, 150.0],
    'rating': [4.2, 3.8, 4.9, 3.1, 4.5],
    'delivery_time': [30.0, 45.0, 20.0, 60.0, 25.0],  # minutes
})

# Select numerical features for scaling (non-numeric columns are left untouched)
numerical_features = ['price', 'rating', 'delivery_time']

# Initialize MinMaxScaler (default feature_range is (0, 1))
scaler = MinMaxScaler()

# Scale into a copy rather than overwriting the input, so the raw values stay
# available and re-running this cell always gives the same result
food_scaled_df = food_df.copy()
food_scaled_df[numerical_features] = scaler.fit_transform(food_df[numerical_features])

print("Original Data:")
print(food_df)
print("\nMin-Max Scaled Data:")
print(food_scaled_df)

# Sanity check: every scaled feature should span 0 to 1
print("\nMinimum of each scaled feature:")
print(food_scaled_df[numerical_features].min())
print("\nMaximum of each scaled feature:")
print(food_scaled_df[numerical_features].max())

In [19]:
# 4. Check the Scaled Data: Verify that the scaling has been applied correctly and that the values for each
# numerical feature now fall within the range of 0 to 1.

# 5. Use Scaled Data in the Recommendation System: Utilize the preprocessed, scaled data as input to your recommendation
# system model. The scaled features will contribute equally to the model without the bias caused by differing scales.

# By performing Min-Max scaling, the recommendation system can effectively use these normalized numerical features to provide
# accurate recommendations, ensuring that no single feature dominates the model due to its larger scale or range.

In [20]:
#######################################################################
#Ans 06:

In [21]:
# In the context of predicting stock prices using a dataset with numerous features like company financial data
# and market trends, PCA (Principal Component Analysis) can be instrumental in reducing the dimensionality of the dataset
# while retaining the most significant information.

# Here's a step-by-step approach on how PCA can be utilized for dimensionality reduction in a stock price prediction project:

# Data Preparation:
# Identify the features relevant to stock price prediction in your dataset, which could include financial indicators, market trends,
# stock volumes, etc.

# Standardization:
# Standardize the features to ensure they have a mean of 0 and a standard deviation of 1. This step is crucial for PCA as it works
# best with standardized data.

# Apply PCA:
# Use PCA to identify the principal components that capture the most variance in the dataset.
# Decide on the number of principal components to retain. This decision can be based on the cumulative explained variance ratio.

# Implement PCA in Python:

In [22]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import pandas as pd

# Sample data: a small, made-up table of stock indicators so that this cell runs
# on a fresh kernel. (Previously the cell read a leftover `df` and used a literal
# `...` placeholder in the column list, which raised a KeyError.)
# Replace `stock_df` with the real dataset.
stock_df = pd.DataFrame({
    'revenue_growth':      [0.12, 0.08, 0.15, -0.02, 0.05, 0.20, 0.01, 0.10],
    'pe_ratio':            [18.5, 22.1, 30.4, 12.3, 15.8, 35.2, 11.0, 20.7],
    'debt_to_equity':      [0.45, 0.80, 0.30, 1.20, 0.95, 0.25, 1.50, 0.60],
    'trading_volume':      [1.2e6, 8.5e5, 2.3e6, 4.0e5, 6.1e5, 3.1e6, 3.5e5, 1.0e6],
    'market_index_return': [0.010, -0.004, 0.015, -0.012, 0.002, 0.018, -0.009, 0.006],
    'close_price':         [102.3, 98.7, 110.4, 91.2, 95.0, 118.9, 89.5, 101.1],  # prediction target
})

# Select features for PCA (the target column is deliberately excluded)
selected_features = [
    'revenue_growth',
    'pe_ratio',
    'debt_to_equity',
    'trading_volume',
    'market_index_return',
]

# Subset the dataframe with selected features
subset_data = stock_df[selected_features]

# Standardize the data so that large-valued features (e.g. volume) do not dominate
scaler = StandardScaler()
standardized_data = scaler.fit_transform(subset_data)

# Apply PCA. Keep at most MAX_COMPONENTS components; PCA cannot return more than
# min(n_samples, n_features), so clamp the request to avoid a ValueError on
# small or narrow datasets.
MAX_COMPONENTS = 10
n_components = min(MAX_COMPONENTS, *standardized_data.shape)
pca = PCA(n_components=n_components)
pca_result = pca.fit_transform(standardized_data)

# Create a DataFrame with PCA results
pca_df = pd.DataFrame(data=pca_result, columns=[f'PC{i+1}' for i in range(pca_result.shape[1])])

print("PCA Transformed Data (First 5 rows):")
print(pca_df.head())
print("\nCumulative Explained Variance Ratio:")
print(pca.explained_variance_ratio_.cumsum())

In [23]:
# Analyze Explained Variance:
# Check the explained variance ratio (pca.explained_variance_ratio_) to understand how much variance each principal component captures.
# Consider the cumulative explained variance to decide the number of components to retain.

# Utilize Reduced Dimensionality Data:
# Use the transformed data with reduced dimensions (principal components) in your stock price prediction model.

# By applying PCA, you'll reduce the dimensionality of the dataset while retaining the most critical information captured by the
# principal components. This can lead to more efficient models by focusing on the most informative features and potentially improving
# prediction accuracy and reducing overfitting, especially when dealing with a large number of features in stock market datasets.

In [24]:
#######################################################################
#Ans 07:

In [25]:
# To perform Min-Max scaling on a dataset to transform values to a range of -1 to 1, you can use the following
# formula:
    
#     X_scaled = ((X − X_min)/(X_max − X_min)) × (max_range − min_range) + min_range
    
# In this case, the original dataset is [1, 5, 10, 15, 20], and we want to transform these values to a range of -1 to 1. The steps
# involved are:

# Compute X_min and X_max from the original dataset.
# Use the Min-Max scaling formula to transform the values to the desired range.

In [26]:
data = [1, 5, 10, 15, 20]
min_range, max_range = -1, 1

# Bounds of the observed values
X_min, X_max = min(data), max(data)

# Widths of the source interval and of the target interval
input_span = X_max - X_min
target_span = max_range - min_range

# Map each value to its relative position in [0, 1], then stretch and shift
# that position into [min_range, max_range]
scaled_data = list(
    map(
        lambda value: ((value - X_min) / input_span) * target_span + min_range,
        data,
    )
)

print("Original Data:", data)
print("Min-Max Scaled Data (-1 to 1):", scaled_data)

Original Data: [1, 5, 10, 15, 20]
Min-Max Scaled Data (-1 to 1): [-1.0, -0.5789473684210527, -0.052631578947368474, 0.4736842105263157, 1.0]


In [27]:
# This code snippet will compute the Min-Max scaling for the provided dataset [1, 5, 10, 15, 20], transforming the
# values to fit within the range of -1 to 1.

In [28]:
#######################################################################
#Ans 08:

In [29]:
# Deciding on the number of principal components to retain in PCA involves considering the trade-off between
# reducing dimensionality and retaining sufficient variance in the data. Here are steps to determine the number of
# principal components to retain for the given features: height, weight, age, gender, and blood pressure:

# 1. Standardize the Data: Encode categorical features such as gender numerically (e.g., 0/1), then scale all features to
# have zero mean and unit variance, as PCA works best on standardized data.

# 2. Apply PCA:
# Use PCA to compute the principal components from the standardized data.
# Analyze the explained variance ratio (explained_variance_ratio_) for each principal component.

# 3. Select Number of Components:
# Examine the cumulative explained variance ratio to understand how much variance is retained as the number of components increases.
# Retain enough principal components that capture a significant portion of the total variance in the data. A common rule is to retain
# components that explain a high percentage of the total variance (e.g., 95% or more).
# Alternatively, consider the "elbow" in the explained variance plot, where adding more components yields diminishing returns in
# explained variance.

# Let's assume we have standardized data and apply PCA to determine the number of principal components to retain:

In [30]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Sample data (replace this with your actual dataset): made-up patient records so
# that this cell runs on a fresh kernel. (Previously the cell read a leftover 1-D
# list named `data` from an earlier cell and failed with a ValueError.)
# Columns: height (cm), weight (kg), age (years), gender (encoded 0/1), blood pressure (mmHg)
patient_data = [
    [170, 68, 34, 0, 120],
    [182, 85, 45, 1, 135],
    [158, 54, 29, 0, 110],
    [175, 90, 52, 1, 145],
    [165, 62, 41, 0, 125],
    [190, 95, 38, 1, 130],
    [160, 70, 60, 0, 150],
    [178, 77, 27, 1, 118],
]

# Fraction of the total variance the retained components must explain
VARIANCE_THRESHOLD = 0.95

# Standardize the data
scaler = StandardScaler()
standardized_data = scaler.fit_transform(patient_data)

# Apply PCA (no n_components given, so all components are kept for inspection)
pca = PCA()
pca.fit(standardized_data)

# Get explained variance ratio
explained_variance_ratio = pca.explained_variance_ratio_

# Determine number of components to retain: the smallest count whose cumulative
# explained variance reaches the threshold. (Counting the entries that are still
# below the threshold, as before, stops one component short of the target.)
cumulative_variance = explained_variance_ratio.cumsum()
num_components = next(
    (
        count
        for count, cumulative in enumerate(cumulative_variance, start=1)
        if cumulative >= VARIANCE_THRESHOLD
    ),
    len(cumulative_variance),  # fallback: keep everything if the threshold is never reached
)

print("Explained Variance Ratio:", explained_variance_ratio)
print("Cumulative Variance Ratio:", cumulative_variance)
print("Number of Components to Retain for 95% variance:", num_components)

In [31]:
# This code snippet performs PCA on the standardized data and calculates the explained variance ratio for each
# principal component. The num_components variable is computed based on retaining 95% of the variance. You can adjust
# this threshold based on the trade-off between dimensionality reduction and retaining information.

In [33]:
#######################################################################