In [1]:
# Ans 01:

In [2]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel

# Step 1: Automated feature selection
feature_selection_model = RandomForestClassifier()  # Example feature selection model
feature_selector = SelectFromModel(feature_selection_model)

# Step 2: Numerical pipeline
numerical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),  # Impute missing values using mean
    ('scaler', StandardScaler())  # Scale the numerical columns using standardization
])

# Step 3: Categorical pipeline
categorical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),  # Impute missing values using most frequent value
    ('onehot', OneHotEncoder())  # One-hot encode the categorical columns
])

# Step 4: Combine numerical and categorical pipelines using ColumnTransformer
preprocessor = ColumnTransformer([
    ('num', numerical_pipeline, numerical_features),  # numerical_features is a list of numerical feature names
    ('cat', categorical_pipeline, categorical_features)  # categorical_features is a list of categorical feature names
])

# Step 5: Build the final pipeline with feature selection and Random Forest Classifier
pipeline = Pipeline([
    ('feature_selection', feature_selector),
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier())
])

# Step 6: Train and evaluate the model
pipeline.fit(X_train, y_train)  # X_train and y_train are the training features and labels
accuracy = pipeline.score(X_test, y_test)  # Evaluate accuracy on the test dataset

print("Accuracy of the Random Forest Classifier:", accuracy)

NameError: name 'numerical_features' is not defined

In [2]:
# Explanation of Steps:

# 1. Automated Feature Selection: A feature selection model (in this case, a Random Forest Classifier) is used to automatically select important features
# from the dataset.
# 2. Numerical Pipeline: Imputes missing values in numerical columns using the mean and standardizes the numerical columns.
# 3. Categorical Pipeline: Imputes missing values in categorical columns using the most frequent value and performs one-hot encoding on categorical columns.
# 4. ColumnTransformer: Combines the numerical and categorical pipelines to handle both types of features simultaneously.
# 5. Final Pipeline: Incorporates feature selection, data preprocessing, and the Random Forest Classifier into a single pipeline.
# 6. Train and Evaluate: The pipeline is trained on the training dataset and evaluated for accuracy on the test dataset.

# Interpretation of Results:
# The accuracy of the model on the test dataset indicates how well the Random Forest Classifier performs on unseen data. A higher accuracy suggests that the
# model generalizes well to new data.

# Possible Improvements:

# 1. Hyperparameter Tuning: Fine-tune hyperparameters of the Random Forest Classifier and other components of the pipeline using techniques like grid search or
# random search.
# 2. Feature Engineering: Experiment with different feature engineering techniques to enhance the predictive power of the model.
# 3. Model Evaluation: Consider using additional evaluation metrics (e.g., precision, recall, F1-score) to assess the model's performance comprehensively.
# 4. Feature Importance Analysis: Analyze feature importance scores to gain insights into which features contribute the most to the model's predictions and
# potentially refine the feature selection process.
# 5. Handling Imbalanced Data: If the dataset is imbalanced, consider implementing techniques such as oversampling, undersampling, or using class weights to
# address class imbalance issues.

In [3]:
#####################################################################################################################
# Ans 02:

In [4]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

In [5]:
# Load the Iris dataset
iris = load_iris()
X = iris.data
y = iris.target

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the classifiers
rf_clf = RandomForestClassifier(random_state=42)
lr_clf = LogisticRegression(random_state=42)

# Build the pipeline
pipeline = Pipeline([
    ('scaler', StandardScaler()),  # Standardize the features
    ('voting', VotingClassifier([
        ('random_forest', rf_clf),
        ('logistic_regression', lr_clf)
    ]))
])

# Train the pipeline
pipeline.fit(X_train, y_train)

# Evaluate the accuracy
y_pred = pipeline.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy of the Voting Classifier:", accuracy)

Accuracy of the Voting Classifier: 1.0


In [6]:
# In this code:

# 1. We first load the Iris dataset and split it into training and testing sets.
# 2. Then, we define two classifiers: a Random Forest Classifier and a Logistic Regression Classifier.
# 3. Next, we build a pipeline with a StandardScaler to standardize the features and a Voting Classifier to combine the predictions of the two classifiers.
# 4. Finally, we train the pipeline on the training data and evaluate its accuracy on the test data using the accuracy_score function from scikit-learn.

# This pipeline leverages the strengths of both classifiers by combining their predictions using a majority voting scheme. The accuracy of the Voting Classifier
# represents the overall performance of the ensemble model.

In [7]:
#####################################################################################################################