In [None]:
import pandas as pd 
import numpy as np

# Sample DataFrame with one missing numeric value and a categorical column
data = {
    'age' : [25, 30, 35, np.nan, 40],
    'income' : [50000, 60000, 80000, 70000, 90000],
    'city' : ['New York', 'Los Angeles', 'Chicago', 'Houston', 'Phoenix']
}
df = pd.DataFrame(data)

# Fill missing values in 'age' with the median.
# The result is assigned back to the column rather than calling
# fillna(..., inplace=True) on df['age']: the in-place call on a column
# selection is chained assignment, which raises a FutureWarning in pandas 2.x
# and does not update df when Copy-on-Write is enabled.
df['age'] = df['age'].fillna(df['age'].median())

# Create a new feature: income per age
df['income_per_age'] = df['income'] / df['age']

# One-hot encode the 'city' column (the original 'city' column is replaced
# by one indicator column per distinct city)
df_encoded = pd.get_dummies(df, columns = ['city'])
print(df_encoded)

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.ml.feature import StringIndexer, OneHotEncoder

# Initialize Spark session
spark = SparkSession.builder.appName("FeatureEngineering").getOrCreate()

# Create a Spark DataFrame; the row with age None is the missing value
# that gets imputed below
data = [
    (25, 50000, "New York"),
    (30, 60000, "Los Angeles"),
    (35, 70000, "Chicago"),
    (None, 80000, "Houston"),
    (40, 90000, "Phoenix")
]

columns = ["age", "income", "city"]
df_spark = spark.createDataFrame(data, columns)

# Impute missing 'age' values (here using an approximate median:
# the 0.5 quantile with a relative error of 0.001)
median_age = df_spark.approxQuantile("age", [0.5], 0.001)[0]
df_spark = df_spark.na.fill({"age" : median_age})

# Create a new feature: income per age, rounded to one decimal place.
# Spark's round is accessed through the F namespace so that importing it
# does not shadow Python's built-in round() for the rest of the notebook.
from pyspark.sql import functions as F
df_spark = df_spark.withColumn("income_per_age", F.round(F.col("income") / F.col("age"), 1))

# Encode the 'city' column as a numeric index stored in 'city_index'
indexer = StringIndexer(inputCol = "city", outputCol = "city_index")
df_spark = indexer.fit(df_spark).transform(df_spark)

df_spark.show()

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.datasets import fetch_openml

# Load MNIST dataset (handwritten digits)
mnist = fetch_openml('mnist_784', version=1, as_frame=False)
X = mnist.data[:5000] # Use a subset for faster processing

# Apply PCA to reduce dimensions from 784 to 100.
# random_state is fixed so the run is reproducible: with the default
# svd_solver='auto', scikit-learn may pick the randomized SVD solver for an
# input of this size, in which case results vary between runs unless seeded.
pca = PCA(n_components = 100, random_state = 42)
X_pca = pca.fit_transform(X)


# Fraction of the total variance retained by the 100 components (between 0 and 1)
print("Explained Variance Ratio:", np.sum(pca.explained_variance_ratio_))


# Reconstruct images from reduced PCA components (back to 784 features)
X_reconstructed = pca.inverse_transform(X_pca)


# Display original and reconstructed images side by side
fig, axes = plt.subplots(1, 2, figsize=(8, 4))


# Original image: the first sample reshaped to 28x28 pixels
axes[0].imshow(X[0].reshape(28, 28), cmap ='gray')
axes[0].set_title("Original Image")

# Reconstructed image using 100 PCA components
axes[1].imshow(X_reconstructed[0].reshape(28, 28), cmap ='gray')
axes[1].set_title("Reduced Image (PCA)")

plt.show()

In [None]:
import pandas as pd
import numpy as np

# Sample dataset
df = pd.DataFrame({
    'age' : [25, 30, 35, 40, 45],
    'income' : [50000, 60000, 70000, 80000, 90000],
    'expenses' : [20000, 25000, 30000, 35000, 40000]
})

# Create interaction features
df['income_to_age'] = df['income'] / df['age']
df['savings'] = df['income'] - df['expenses']
# Savings as a fraction of income (0-1 scale, not multiplied by 100)
df['savings_rate'] = df['savings'] / df['income']

print(df)

In [None]:
from sklearn.cluster import KMeans
import numpy as np
import pandas as pd

# Toy customer table with two numeric features.
# NOTE(review): 12000 breaks the otherwise increasing income sequence and may
# have been meant as 120000 - confirm with the author before changing the data.
df = pd.DataFrame(
    {
        "income": [40000, 50000, 60000, 80000, 100000, 12000],
        "spending_score": [60, 70, 80, 30, 20, 10],
    }
)

# Fit a 3-cluster K-Means model (seeded, 10 initializations) on both features
# and store each row's cluster label as its customer segment.
# NOTE(review): the features are used unscaled, so 'income' (tens of thousands)
# presumably dominates the distance over 'spending_score' - confirm this is intended.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
kmeans.fit(df[["income", "spending_score"]])
df["customer_segment"] = kmeans.labels_
print(df)