In [None]:
import pandas as pd

# Assumes `df` is an existing pandas DataFrame; it is not created in this cell.
# describe() is used here to compare the value ranges of the columns:
# widely differing ranges are a hint that the features need scaling.
description = df.describe()
print(description)

## Inspect the range of values for each feature in the output. If the scales are vastly different, you may need to apply scaling techniques.

In [None]:
# For Pandas DataFrame
# Flag the null entries of `df` and sum the flags (per column for a DataFrame).
null_values = df.isnull().sum()
print(null_values)

# For Scikit-learn
import numpy as np
# Assuming 'X' is your data
# Total number of NaN entries in `X`.
# NOTE(review): np.isnan presumably requires numeric input — confirm `X`
# holds no string/object values before running this.
missing_values = np.isnan(X).sum()
print(missing_values)

In [None]:
# For Pandas DataFrame
# Remove rows with any null values (result is stored separately; `df` is not reassigned)
df_cleaned = df.dropna()

# Fill null values with a specific value
# NOTE(review): `value` is a placeholder that is not defined in the visible
# cells — assign the replacement to use before running this line.
df_filled = df.fillna(value)

# For Scikit-learn
from sklearn.impute import SimpleImputer

# Fit the imputer on `X` and return the imputed copy in one step.
imputer = SimpleImputer(strategy='mean')  # Other strategies: median, most_frequent
X_imputed = imputer.fit_transform(X)


In [None]:
# For Pandas DataFrame
# Assuming 'df' is your DataFrame
# List the names of the columns stored with dtype `object`; these are
# treated as the categorical candidates.
# NOTE(review): columns using other dtypes (e.g. `category`) are presumably
# not matched by this filter — confirm that is intended.
categorical_columns = df.select_dtypes(include=['object']).columns
print(categorical_columns)


In [None]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Standardization
# The scaler is fitted on `X` and applied to that same `X` in a single call.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Normalization
# Same fit-and-apply pattern with a min-max scaler; the result is kept in a
# separate variable, so both scaled versions of `X` stay available.
min_max_scaler = MinMaxScaler()
X_normalized = min_max_scaler.fit_transform(X)

## Use feature scaling when features are on different scales. It is essential for algorithms that are sensitive to feature scale, such as gradient-descent-based algorithms. Applies to numeric data.

In [None]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

# For Pandas DataFrame
# 'categorical_column' is a placeholder name; replace it with the real column(s) to expand.
df_encoded = pd.get_dummies(df, columns=['categorical_column'])

# For Scikit-learn
# The encoder is fitted on, and applied to, all of `X`.
# NOTE(review): presumably `X` should contain only the categorical features here — confirm.
encoder = OneHotEncoder()
X_encoded = encoder.fit_transform(X)

In [None]:
from sklearn.preprocessing import LabelEncoder

# Create an instance of LabelEncoder
label_encoder = LabelEncoder()


# Alternative 1: encode one DataFrame column into a new column.
# 'categorical_column' is a placeholder name.
df['encoded_column'] = label_encoder.fit_transform(df['categorical_column'])
# or
# Alternative 2: encode `X` directly.
# NOTE(review): this refits the same `label_encoder` instance, so after this
# line it presumably no longer holds the mapping learned from the DataFrame
# column — use a separate instance if both mappings are needed.
# NOTE(review): LabelEncoder is presumably meant for 1-D label arrays; verify
# that `X` is 1-D here rather than a 2-D feature matrix.
X_encoded_label = label_encoder.fit_transform(X)

In [None]:
from sklearn.preprocessing import MinMaxScaler, MaxAbsScaler

# Scale `X` with a min-max scaler (fit and apply in one call).
# NOTE(review): `scaler` and `X_scaled` are also assigned by the
# standardization cell earlier in this file; running this cell rebinds both
# names, so the standardized result is no longer reachable through them.
scaler = MinMaxScaler()  # or MaxAbsScaler
X_scaled = scaler.fit_transform(X)

## Use range scaling when features have different ranges and need to be mapped onto the same range. Applies to numeric data.

In [None]:
from sklearn.decomposition import PCA, TruncatedSVD

# Fit a 2-component decomposition on `X` and project `X` onto it in one call.
# n_components=2 is a fixed example value; adjust it to the number of
# dimensions you want to keep.
pca = PCA(n_components=2)  # or TruncatedSVD
X_reduced = pca.fit_transform(X)

In [None]:
import pandas as pd

# Assuming 'df' is your DataFrame with a target column 'target_column'
# Count how many rows carry each distinct target value; strongly unequal
# counts indicate class imbalance.
class_counts = df['target_column'].value_counts()
print(class_counts)

In [None]:
from sklearn.utils import resample

# Assuming 'df' is your DataFrame with a target column 'target_column'
# 'majority_class' / 'minority_class' are placeholder labels: replace them
# with the actual values found in the target column.

# Separate majority and minority classes
majority_class = df[df['target_column'] == 'majority_class']
minority_class = df[df['target_column'] == 'minority_class']

# Oversample minority class: draw WITH replacement until it is as large as
# the majority class. The fixed seed makes the draw repeatable across runs
# (same seed value as the SMOTE cell in this file).
minority_class_oversampled = resample(
    minority_class,
    replace=True,
    n_samples=len(majority_class),
    random_state=42,
)

# Undersample majority class: draw WITHOUT replacement down to the size of
# the minority class, again with a fixed seed.
majority_class_undersampled = resample(
    majority_class,
    replace=False,
    n_samples=len(minority_class),
    random_state=42,
)

# Combine oversampled minority class with majority class
oversampled_df = pd.concat([majority_class, minority_class_oversampled])

# Combine undersampled majority class with minority class
undersampled_df = pd.concat([majority_class_undersampled, minority_class])

In [None]:
from imblearn.over_sampling import SMOTE

# Assuming 'X' is your feature matrix and 'y' is your target vector
# The sampler is seeded (random_state=42) so repeated runs give the same
# resampled set; features and targets are resampled together.
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

In [None]:
import numpy as np

# Assuming 'data' is your dataset
# Z-score method
def detect_outliers_zscore(data, threshold=3):
    """Locate values whose absolute z-score exceeds ``threshold``.

    Mean and standard deviation are taken along axis 0, so each column
    (feature) of a 2-D input is standardized against its own statistics
    rather than against one mean/std pooled over the whole table. For 1-D
    input this is the same as using the overall mean/std.

    Args:
        data: Numeric NumPy array, pandas Series or DataFrame. The statistics
            come from the object's own ``mean``/``std`` methods.
        threshold: Cut-off on ``|z|``; only strictly larger values are flagged.

    Returns:
        The tuple produced by ``np.where``: one index array per dimension of
        ``data`` (row indices, then column indices for 2-D input).
    """
    center = data.mean(axis=0)
    spread = data.std(axis=0)
    # A constant column has zero spread; 0/0 gives NaN, and NaN never compares
    # greater than the threshold, so such a column reports no outliers instead
    # of emitting divide/invalid warnings.
    with np.errstate(divide="ignore", invalid="ignore"):
        z_scores = np.abs((data - center) / spread)
        return np.where(z_scores > threshold)

# Run the z-score detector on `data` with its default threshold; the result
# is the index tuple returned by np.where inside the function.
outliers_zscore_indices = detect_outliers_zscore(data)

# IQR method
def detect_outliers_iqr(data, threshold=1.5):
    """Locate values lying outside the interquartile-range fences.

    Quartiles are computed along axis 0, so each column (feature) of a 2-D
    input gets its own fences instead of sharing fences derived from all
    values flattened together. For 1-D input the result is the same as
    computing the quartiles over the whole array.

    Args:
        data: Numeric NumPy array, pandas Series or DataFrame.
        threshold: Multiplier applied to the IQR to place the fences.

    Returns:
        The tuple produced by ``np.where``: one index array per dimension of
        ``data``, addressing every value below ``Q1 - threshold * IQR`` or
        above ``Q3 + threshold * IQR``.
    """
    # One call yields both quartiles; the leading axis of the result indexes
    # the requested percentiles, so it unpacks into Q1 and Q3.
    q1, q3 = np.percentile(data, [25, 75], axis=0)
    iqr = q3 - q1
    lower_bound = q1 - threshold * iqr
    upper_bound = q3 + threshold * iqr
    return np.where((data < lower_bound) | (data > upper_bound))

# Run the IQR detector on `data` with its default multiplier; the result is
# the index tuple returned by np.where inside the function.
outliers_iqr_indices = detect_outliers_iqr(data)


In [None]:
from sklearn.cluster import KMeans

# Assuming 'data' is your dataset
def detect_outliers_cluster(data, n_clusters=3, percentile=95, random_state=42):
    """Flag the rows that lie farthest from their assigned K-Means centroid.

    Args:
        data: 2-D table of numeric features, one row per sample.
        n_clusters: Number of clusters to fit.
        percentile: Rows whose distance to their centroid is strictly above
            this percentile of all distances are reported (default 95, the
            value that was previously fixed inside the function).
        random_state: Seed forwarded to ``KMeans`` so the clustering, and
            therefore the reported rows, are repeatable. Pass ``None`` for an
            unseeded run.

    Returns:
        The one-element tuple produced by ``np.where`` on the 1-D distance
        vector; element 0 holds the row indices of the flagged samples.
    """
    kmeans = KMeans(n_clusters=n_clusters, random_state=random_state)
    kmeans.fit(data)
    # Look up, for every row, the centre of the cluster it was assigned to.
    assigned_centers = kmeans.cluster_centers_[kmeans.labels_]
    distances = np.linalg.norm(np.asarray(data) - assigned_centers, axis=1)
    cutoff = np.percentile(distances, percentile)
    return np.where(distances > cutoff)

# Run the cluster-based detector on `data` with its default settings; the
# result is the index tuple returned by np.where inside the function.
outliers_cluster_indices = detect_outliers_cluster(data)


In [None]:
# Outlier treatment: either drop the affected rows or overwrite the affected
# values with a column statistic.
# NOTE(review): the indexing below assumes `data` is a 2-D NumPy array with
# one row per sample — confirm before running it on a DataFrame.

# The detector functions keep their threshold and bounds as locals, so they
# are recomputed here at cell level. The values mirror those functions'
# defaults (|z| > 3 and 1.5 * IQR).
threshold = 3
iqr_multiplier = 1.5

# Per-column z-scores. A constant column produces NaN, which is never flagged.
with np.errstate(divide="ignore", invalid="ignore"):
    zscore_outlier_mask = np.abs((data - data.mean(axis=0)) / data.std(axis=0)) > threshold

# Per-column IQR fences.
q1, q3 = np.percentile(data, [25, 75], axis=0)
lower_bound = q1 - iqr_multiplier * (q3 - q1)
upper_bound = q3 + iqr_multiplier * (q3 - q1)
iqr_outlier_mask = (data < lower_bound) | (data > upper_bound)

# np.where on a 1-D distance vector returns a one-element tuple; element 0
# holds the row indices of the cluster-based outliers.
cluster_outlier_rows = outliers_cluster_indices[0]

# Removing outliers
# A row is dropped when ANY of its features is flagged. Expressing the filter
# as "not flagged" keeps rows whose z-score is NaN (constant column) instead
# of silently discarding the whole table.
cleaned_data_zscore = data[~zscore_outlier_mask.any(axis=1)]
cleaned_data_iqr = data[~iqr_outlier_mask.any(axis=1)]
cleaned_data_cluster = np.delete(data, cluster_outlier_rows, axis=0)

# Imputing outliers with the column mean: blank the flagged values first so
# they do not contribute to the mean that replaces them.
data_zscore_imputed = np.where(zscore_outlier_mask, np.nan, data)
data_zscore_imputed = np.where(np.isnan(data_zscore_imputed), np.nanmean(data_zscore_imputed, axis=0), data_zscore_imputed)

data_iqr_imputed = np.where(iqr_outlier_mask, np.nan, data)
data_iqr_imputed = np.where(np.isnan(data_iqr_imputed), np.nanmean(data_iqr_imputed, axis=0), data_iqr_imputed)

# Replace cluster-based outliers with the column medians: every flagged row
# is overwritten as a whole, because the detector flags rows, not values.
data_cluster_imputed = data.copy()
data_cluster_imputed[cluster_outlier_rows] = np.nanmedian(data_cluster_imputed, axis=0)
