In [13]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from sklearn.preprocessing import PowerTransformer
from sklearn.utils import resample

In [14]:
# Load the Iris dataset
# The CSV is read with header=None, so the column labels are supplied explicitly.
column_names = ["sepal_length", "sepal_width", "petal_length", "petal_width", "class"]
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"
iris = pd.read_csv(url, names=column_names, header=None)

In [15]:
# Data Cleaning
# Handle missing data by filling missing values with the column mean.
# numeric_only=True restricts the mean to the numeric measurement columns:
# the frame also holds the string 'class' column, and averaging it raises
# TypeError on pandas >= 2.0 (and only warns on older releases).
# The 'class' column gets no fill value and is therefore left untouched.
iris.fillna(iris.mean(numeric_only=True), inplace=True)

  iris.fillna(iris.mean(), inplace=True)


In [16]:
# Remove duplicate records
# Rows that repeat an earlier row exactly are flagged (the first occurrence
# is kept) and then dropped from the frame in place.
duplicate_rows = iris.index[iris.duplicated(keep='first')]
iris.drop(index=duplicate_rows, inplace=True)

In [42]:
# Correct inaccuracies (assuming some inaccuracies are present)
# Example: make class labels consistent by trimming leading/trailing whitespace
stripped_labels = iris['class'].str.strip()
iris['class'] = stripped_labels
print(iris)

     sepal_length  sepal_width  petal_length  petal_width           class
0       -0.915509     1.019971     -1.357737    -1.335700     Iris-setosa
1       -1.157560    -0.128082     -1.357737    -1.335700     Iris-setosa
2       -1.399610     0.331139     -1.414778    -1.335700     Iris-setosa
3       -1.520635     0.101529     -1.300696    -1.335700     Iris-setosa
4       -1.036535     1.249582     -1.357737    -1.335700     Iris-setosa
..            ...          ...           ...          ...             ...
145      1.020892    -0.128082      0.809831     1.444682  Iris-virginica
146      0.536792    -1.276136      0.695748     0.915085  Iris-virginica
147      0.778842    -0.128082      0.809831     1.047484  Iris-virginica
148      0.415766     0.790361      0.923913     1.444682  Iris-virginica
149      0.052691    -0.128082      0.752789     0.782686  Iris-virginica

[147 rows x 5 columns]


In [19]:
# Data Transformation
# Standardization/Normalization
# The scaler is fit on the four measurement columns and the scaled values
# overwrite the originals in `iris`. `scaler` stays bound after this cell.
measurement_cols = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
scaler = StandardScaler()
iris[measurement_cols] = scaler.fit_transform(iris[measurement_cols])

In [43]:
# Encoding categorical variables using one-hot encoding
# Only the 'class' column is expanded; each label becomes its own
# 'class_<label>' indicator column and the other columns pass through.
iris_encoded = pd.get_dummies(data=iris, columns=['class'])
print(iris_encoded)

     sepal_length  sepal_width  petal_length  petal_width  class_Iris-setosa  \
0       -0.915509     1.019971     -1.357737    -1.335700                  1   
1       -1.157560    -0.128082     -1.357737    -1.335700                  1   
2       -1.399610     0.331139     -1.414778    -1.335700                  1   
3       -1.520635     0.101529     -1.300696    -1.335700                  1   
4       -1.036535     1.249582     -1.357737    -1.335700                  1   
..            ...          ...           ...          ...                ...   
145      1.020892    -0.128082      0.809831     1.444682                  0   
146      0.536792    -1.276136      0.695748     0.915085                  0   
147      0.778842    -0.128082      0.809831     1.047484                  0   
148      0.415766     0.790361      0.923913     1.444682                  0   
149      0.052691    -0.128082      0.752789     0.782686                  0   

     class_Iris-versicolor  class_Iris-

In [21]:
# Feature engineering: Create a new feature 'sepal_ratio'
# The measurement columns in iris_encoded were standardized earlier, so
# dividing them directly would yield a ratio of z-scores (the denominator
# crosses zero and the sign flips), not a length/width ratio.
# The fitted `scaler` is used to map the values back to their original
# units, and the ratio is computed there.
# NOTE(review): relies on `scaler` still being the StandardScaler that was fit
# on exactly these four columns, in this order — confirm if cells are reordered.
ratio_cols = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
ratio_source = pd.DataFrame(
    scaler.inverse_transform(iris_encoded[ratio_cols]),
    columns=ratio_cols,
    index=iris_encoded.index,
)
iris_encoded['sepal_ratio'] = ratio_source['sepal_length'] / ratio_source['sepal_width']

In [57]:
# Data Reduction
# Dimensionality reduction using PCA: the four measurement columns are
# projected onto two components and kept in `iris_pca`.
# Note that the printout below shows iris_encoded, not the PCA projection.
pca_inputs = iris_encoded[['sepal_length', 'sepal_width', 'petal_length', 'petal_width']]
pca = PCA(n_components=2)
iris_pca = pca.fit_transform(pca_inputs)
print("\nProcessed Data:")
print(iris_encoded.head())


Processed Data:
   sepal_length  sepal_width  petal_length  petal_width  class_Iris-setosa  \
0     -0.915509     1.019971     -1.357737      -1.3357                  1   
1     -1.157560    -0.128082     -1.357737      -1.3357                  1   
2     -1.399610     0.331139     -1.414778      -1.3357                  1   
3     -1.520635     0.101529     -1.300696      -1.3357                  1   
4     -1.036535     1.249582     -1.357737      -1.3357                  1   

   class_Iris-versicolor  class_Iris-virginica  sepal_width_transformed  \
0                      0                     0                 1.020220   
1                      0                     0                -0.070467   
2                      0                     0                 0.384710   
3                      0                     0                 0.161162   
4                      0                     0                 1.223163   

   sepal_length_scaled date_column day_of_week  month  year  
0

In [28]:
# Outlier detection and removal (using z-score as an example)
# The original statement ended after the subtraction and the next line began
# with '/', which is a SyntaxError. The whole z-score expression is now a
# single statement: (value - column mean) / column standard deviation.
outlier_cols = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
outlier_features = iris_encoded[outlier_cols]
z_scores = (outlier_features - outlier_features.mean()) / outlier_features.std()

# Keep only rows where every measurement lies within 3 standard deviations.
iris_no_outliers = iris_encoded[(z_scores.abs() < 3).all(axis=1)]

In [24]:
# Data Discretization
# Convert 'petal_length' into discrete bins
# 'petal_length' in iris_encoded is standardized, so applying the edges
# [0, 2, 5, 10] to it directly is wrong: every negative value falls outside
# all bins and becomes NaN, and no value ever reaches 'long'.
# The fitted `scaler` is used to map the column back to its original scale
# before binning.
# NOTE(review): relies on `scaler` still being the StandardScaler that was fit
# on exactly these four columns, in this order — confirm if cells are reordered.
bin_cols = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
bin_source = pd.DataFrame(
    scaler.inverse_transform(iris_encoded[bin_cols]),
    columns=bin_cols,
    index=iris_encoded.index,
)
iris_encoded['petal_length_bin'] = pd.cut(
    bin_source['petal_length'],
    bins=[0, 2, 5, 10],
    labels=['short', 'medium', 'long'],
)

In [54]:
# Handling Imbalanced Data (assuming binary classification)
# Upsample the minority class
# Only the two indicator columns named below are considered; rows belonging
# to neither of them do not appear in iris_balanced.
majority_class = 'class_Iris-setosa'  # Assuming this is the majority class
minority_class = 'class_Iris-versicolor'  # Assuming this is the minority class
in_minority = iris_encoded[minority_class] == 1
in_majority = iris_encoded[majority_class] == 1
minority_class_data = iris_encoded[in_minority]
majority_class_data = iris_encoded[in_majority]

# Draw minority rows with replacement until there are as many as majority rows.
minority_upsampled = resample(
    minority_class_data,
    replace=True,
    n_samples=majority_class_data.shape[0],
    random_state=42,
)
iris_balanced = pd.concat([majority_class_data, minority_upsampled])

# Note that the printout shows iris_encoded, not iris_balanced.
print("\nProcessed Data:")
print(iris_encoded.head())



Processed Data:
   sepal_length  sepal_width  petal_length  petal_width  class_Iris-setosa  \
0     -0.915509     1.019971     -1.357737      -1.3357                  1   
1     -1.157560    -0.128082     -1.357737      -1.3357                  1   
2     -1.399610     0.331139     -1.414778      -1.3357                  1   
3     -1.520635     0.101529     -1.300696      -1.3357                  1   
4     -1.036535     1.249582     -1.357737      -1.3357                  1   

   class_Iris-versicolor  class_Iris-virginica  sepal_width_transformed  \
0                      0                     0                 1.020220   
1                      0                     0                -0.070467   
2                      0                     0                 0.384710   
3                      0                     0                 0.161162   
4                      0                     0                 1.223163   

   sepal_length_scaled date_column day_of_week  month  year  
0

In [33]:
# Text Data Processing (if applicable)
# Bag-of-words demo: CountVectorizer tokenizes the sentences and removes
# English stop words via stop_words='english'. No lemmatization step is
# applied in this cell.
text_data = [
    "This is an example sentence.",
    "Another sentence for demonstration.",
]
vectorizer = CountVectorizer(stop_words='english')
text_matrix = vectorizer.fit_transform(text_data)


In [53]:
# Date and Time Handling (hypothetical 'date_column')
# Every row receives the same placeholder date; the calendar parts are then
# derived from that column through the .dt accessor.
demo_date = pd.to_datetime('2023-01-01')  # Hypothetical date for demonstration
iris_encoded['date_column'] = demo_date
date_parts = iris_encoded['date_column'].dt
iris_encoded['day_of_week'] = date_parts.day_name()
iris_encoded['month'] = date_parts.month
iris_encoded['year'] = date_parts.year
print("\nProcessed Data:")
print(iris_encoded.head())



Processed Data:
   sepal_length  sepal_width  petal_length  petal_width  class_Iris-setosa  \
0     -0.915509     1.019971     -1.357737      -1.3357                  1   
1     -1.157560    -0.128082     -1.357737      -1.3357                  1   
2     -1.399610     0.331139     -1.414778      -1.3357                  1   
3     -1.520635     0.101529     -1.300696      -1.3357                  1   
4     -1.036535     1.249582     -1.357737      -1.3357                  1   

   class_Iris-versicolor  class_Iris-virginica  sepal_width_transformed  \
0                      0                     0                 1.020220   
1                      0                     0                -0.070467   
2                      0                     0                 0.384710   
3                      0                     0                 0.161162   
4                      0                     0                 1.223163   

   sepal_length_scaled date_column day_of_week  month  year  
0

In [45]:
# Data Splitting
# 80/20 split with a fixed seed; every column except the target is a feature.
target_column = 'sepal_width'  # Replace with your actual target column name
split_features = iris_encoded.drop(columns=target_column)
split_target = iris_encoded[target_column]
X_train, X_test, y_train, y_test = train_test_split(
    split_features,
    split_target,
    test_size=0.2,
    random_state=42,
)


In [49]:
# Applying PowerTransformer
# A default PowerTransformer is fit on the target column and its output is
# stored alongside the original as '<target_column>_transformed'.
transformed_name = target_column + '_transformed'
target_frame = iris_encoded[[target_column]]
iris_encoded[transformed_name] = PowerTransformer().fit_transform(target_frame)

In [51]:
# Scaling the 'sepal_length' column
# A fresh StandardScaler is fit on the single column and the result is kept
# in a separate 'sepal_length_scaled' column.
sepal_length_frame = iris_encoded[['sepal_length']]
iris_encoded['sepal_length_scaled'] = StandardScaler().fit_transform(sepal_length_frame)

In [52]:
# Display the processed data
# Heading line followed by the first five rows of iris_encoded.
print("\nProcessed Data:", iris_encoded.head(), sep="\n")


Processed Data:
   sepal_length  sepal_width  petal_length  petal_width  class_Iris-setosa  \
0     -0.915509     1.019971     -1.357737      -1.3357                  1   
1     -1.157560    -0.128082     -1.357737      -1.3357                  1   
2     -1.399610     0.331139     -1.414778      -1.3357                  1   
3     -1.520635     0.101529     -1.300696      -1.3357                  1   
4     -1.036535     1.249582     -1.357737      -1.3357                  1   

   class_Iris-versicolor  class_Iris-virginica  sepal_width_transformed  \
0                      0                     0                 1.020220   
1                      0                     0                -0.070467   
2                      0                     0                 0.384710   
3                      0                     0                 0.161162   
4                      0                     0                 1.223163   

   sepal_length_scaled  
0            -0.915509  
1            