# Loading Data

In [None]:
import pandas as pd
# Column labels are supplied by hand and handed to read_csv through `names`.
col_names = [
    'age', 'gen', 'tbili', 'dbili', 'alkphos',
    'sgpt', 'sgot', 'tp', 'alb', 'ag', 'class',
]
filename = "ILPD.csv"

# header=None is what pandas already assumes when `names` is passed explicitly;
# it is spelled out here so the "no header row" assumption is visible.
data = pd.read_csv(filename, header=None, names=col_names)

In [None]:
# Preview the first rows using the notebook's rich table display instead of
# printing the whole frame as plain text.
data.head()

In [None]:
# Report the frame's dimensions as a (rows, columns) tuple.
print(data.shape)

In [None]:
# Standard deviation of every column except 'gen' and 'class'
# (the remaining columns keep the frame's original order).
data.drop(columns=['gen', 'class']).std()

In [None]:
# Column-wise maximum over every column of the frame; no column subset is
# selected here.
data.max()

In [None]:
# First quartile (25th percentile) of every column except 'gen' and 'class'.
measurement_cols = [col for col in data.columns if col not in ('gen', 'class')]
data[measurement_cols].quantile(0.25)

# Visualization

In [None]:
import matplotlib.pyplot as plt

import pandas as pd

# Reload the dataset so this cell does not depend on the state left by other cells.
filename = "ILPD.csv"
col_names = ['age', 'gen', 'tbili', 'dbili', 'alkphos', 'sgpt',
             'sgot', 'tp', 'alb', 'ag', 'class']
data = pd.read_csv(filename, names=col_names)

# Every column except the last one ('class') is treated as an input variable.
input_data = data[data.columns[:-1]]

# Histograms
input_data.hist()
# Density plots
input_data.plot(kind='density', subplots=True, layout=(3, 3), sharex=False)
# Box plots
input_data.plot(kind='box', subplots=True, layout=(3, 3), sharex=False, sharey=False)

# Display the figures created above. plt.show() is the call that renders them;
# plt.plot() with no arguments would only issue an empty plot on the current axes.
plt.show()

# Data Selection

In [None]:
import pandas as pd
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder

filename = "ILPD.csv"
col_names = ['age', 'gen', 'tbili', 'dbili', 'alkphos', 'sgpt',
             'sgot', 'tp', 'alb', 'ag', 'class']

# Read the file and replace every missing entry with 0 in a single expression.
# NOTE(review): 0 is used as the fill value for all columns -- confirm that is
# a sensible stand-in for the affected measurements.
data = pd.read_csv(filename, names=col_names).fillna(0)

# Inputs are all columns but the trailing 'class'; 'class' itself is the target.
X_raw = data.drop(columns=['class'])
y = data['class']

# Dense one-hot encoder for the 'gen' column. The keyword that requests dense
# output was renamed between scikit-learn releases, so try the newer name first.
try:
    encoder = OneHotEncoder(sparse_output=False)  # sklearn >= 1.2
except TypeError:
    encoder = OneHotEncoder(sparse=False)  # sklearn < 1.2

encoded = encoder.fit_transform(X_raw[['gen']])
gen_columns = encoder.get_feature_names_out(['gen'])
encoded_df = pd.DataFrame(encoded, columns=gen_columns)

# Side-by-side merge of the remaining columns with the encoded 'gen' columns.
# Both indexes are reset so rows line up by position.
numeric_part = X_raw.drop(columns=['gen']).reset_index(drop=True)
X = pd.concat([numeric_part, encoded_df.reset_index(drop=True)], axis=1)

# Base estimator whose fitted model drives the elimination.
logReg = LogisticRegression(solver="liblinear")

# Recursive feature elimination; no target feature count is passed, so the
# library default applies.
# NOTE(review): the inputs are passed unscaled -- confirm that is intended,
# since the ranking is derived from the fitted model.
rfe = RFE(logReg)
rfe_model = rfe.fit(X, y)

# Raw results
print("Number of features selected: ", rfe_model.n_features_)
print("Support (True=selected): ", rfe_model.support_)
print("Ranking: ", rfe_model.ranking_)

# Same results, one row per feature
features_ranking = pd.DataFrame(
    {"Feature": X.columns, "Selected": rfe_model.support_, "Ranking": rfe_model.ranking_}
)
print("\nFeature ranking:\n", features_ranking)


In [None]:
# Preview the first rows of `input_data`. This cell does not define it; it
# relies on an earlier cell having been run.
print(input_data.head())

# Missing Values

In [None]:
import pandas as pd
import numpy as np

filename = 'mammographic_masses.data'
col_names = ['BI-RADS', 'Age', 'Shape', 'Margin', 'Density', 'Severity']

# '?' is treated as the missing-value marker. Declaring it at parse time
# (rather than overwriting matching cells afterwards) lets pandas parse the
# affected columns as numbers instead of leaving them as strings.
data = pd.read_csv(filename, names=col_names, na_values='?')

# Missing-value count per column. isnull() and isna() are aliases, so both
# lines report the same numbers.
print(data.isnull().sum())
print(data.isna().sum())

# Boolean mask of the frame: True where a value is missing.
data.isnull()


# Data Transformation

In [None]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

filename = 'ILPD.csv'
col_names = ['age', 'gen', 'tbili', 'dbili', 'alkphos', 'sgpt',
             'sgot', 'tp', 'alb', 'ag', 'class']

# Read the dataset with the column labels given above.
data = pd.read_csv(filename, names=col_names)

# Everything except the trailing 'class' column is an input variable.
input_data = data.drop(columns=['class'])
print("Before OneHotEncoder:\n", input_data.head())

# Dense one-hot encoder for 'gen'. The keyword that requests dense output was
# renamed between scikit-learn releases, so the newer spelling is tried first.
try:
    encoder = OneHotEncoder(sparse_output=False)  # scikit-learn >= 1.2
except TypeError:
    encoder = OneHotEncoder(sparse=False)  # scikit-learn < 1.2

encoded = encoder.fit_transform(input_data[['gen']])

# Wrap the encoded array in a frame labelled with the encoder's feature names.
encoded_df = pd.DataFrame(encoded, columns=encoder.get_feature_names_out(['gen']))

# Replace 'gen' with its encoded columns. Both indexes are reset so the
# side-by-side concat lines rows up by position.
frames = [
    input_data.drop(columns=['gen']).reset_index(drop=True),
    encoded_df.reset_index(drop=True),
]
input_data = pd.concat(frames, axis=1)

print("\nAfter OneHotEncoder:\n", input_data.head())

In [None]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler

filename = 'ILPD.csv'
col_names = ['age', 'gen', 'tbili', 'dbili', 'alkphos', 'sgpt',
             'sgot', 'tp', 'alb', 'ag', 'class']

# Read the dataset with the column labels given above.
data = pd.read_csv(filename, names=col_names)

# Inputs are all columns but the trailing 'class'; 'class' is the output.
X_raw = data.drop(columns=['class'])
Y = data['class']

print("Before OneHotEncoder:\n", X_raw.head())

# Dense one-hot encoder for 'gen'. The keyword that requests dense output was
# renamed between scikit-learn releases, so the newer spelling is tried first.
try:
    encoder = OneHotEncoder(sparse_output=False)  # scikit-learn >= 1.2
except TypeError:
    encoder = OneHotEncoder(sparse=False)  # scikit-learn < 1.2

encoded = encoder.fit_transform(X_raw[['gen']])

# Wrap the encoded array in a frame labelled with the encoder's feature names.
gen_columns = encoder.get_feature_names_out(['gen'])
encoded_df = pd.DataFrame(encoded, columns=gen_columns)

# Replace 'gen' with its encoded columns; indexes are reset so the
# side-by-side concat aligns rows by position.
without_gen = X_raw.drop(columns=['gen']).reset_index(drop=True)
X = pd.concat([without_gen, encoded_df.reset_index(drop=True)], axis=1)

print("\nAfter OneHotEncoder:\n", X.head())

# Rescale every column of X into the [0, 1] range.
minmaxSc = MinMaxScaler(feature_range=(0, 1))
rescX = minmaxSc.fit_transform(X)

# The scaler returns a bare array; put the column labels back.
rescX_df = pd.DataFrame(rescX, columns=X.columns)

print("\nAfter MinMaxScaler (DataFrame):\n", rescX_df.head())



In [None]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, StandardScaler, RobustScaler

filename = 'ILPD.csv'
col_names = ['age', 'gen', 'tbili', 'dbili', 'alkphos', 'sgpt',
             'sgot', 'tp', 'alb', 'ag', 'class']

# Read the dataset with the column labels given above.
data = pd.read_csv(filename, names=col_names)

# Inputs are all columns but the trailing 'class'; the output is kept as a
# plain array of the 'class' values.
X_raw = data.drop(columns=['class'])
Y = data['class'].to_numpy()

# Dense one-hot encoder for 'gen'. The keyword that requests dense output was
# renamed between scikit-learn releases, so the newer spelling is tried first.
try:
    encoder = OneHotEncoder(sparse_output=False)  # sklearn >= 1.2
except TypeError:
    encoder = OneHotEncoder(sparse=False)  # sklearn < 1.2

encoded = encoder.fit_transform(X_raw[['gen']])

# Wrap the encoded array in a frame labelled with the encoder's feature names.
gen_columns = encoder.get_feature_names_out(['gen'])
encoded_df = pd.DataFrame(encoded, columns=gen_columns)

# Replace 'gen' with its encoded columns; indexes are reset so the
# side-by-side concat aligns rows by position.
without_gen = X_raw.drop(columns=['gen']).reset_index(drop=True)
X = pd.concat([without_gen, encoded_df.reset_index(drop=True)], axis=1)

print("After OneHotEncoder:\n", X.head())

# Both scalers below are fitted on, and applied to, the same array.
X_values = X.to_numpy()

# StandardScaler: fit first, then transform; show the first five rows.
stdScaler = StandardScaler()
stdScaler.fit(X_values)
rescX_std = stdScaler.transform(X_values)
print("\nAfter StandardScaler:\n", rescX_std[:5])

# RobustScaler: same procedure with the second scaler.
robScaler = RobustScaler()
robScaler.fit(X_values)
rescX_rob = robScaler.transform(X_values)
print("\nAfter RobustScaler:\n", rescX_rob[:5])

# Dimensionality Reduction

In [None]:
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.decomposition import PCA

# Load the iris data: feature matrix, integer class labels and class names.
iris = datasets.load_iris()
X, y = iris.data, iris.target
target_names = iris.target_names

# Project the features onto two principal components (fit, then transform).
pca = PCA(n_components=2)
pca.fit(X)
X_r = pca.transform(X)
print('Explained variance: ', pca.explained_variance_ratio_)

# One scatter series per class, drawn in the projected 2-D space.
plt.figure()
colors = ['indigo', 'lightseagreen', 'gold']
lw = 2
for i, (color, target_name) in enumerate(zip(colors, target_names)):
    in_class = y == i  # boolean row mask for the current class
    plt.scatter(X_r[in_class, 0], X_r[in_class, 1],
                color=color, alpha=.8, lw=lw, label=target_name)
plt.legend(loc='best', scatterpoints=1)
plt.title('PCA of Iris dataset')
plt.show()

# Imbalance Treatment

In [None]:
from collections import Counter
from sklearn.datasets import make_classification
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt
from numpy import where

# Generate an imbalanced two-class dataset (2 features for easy visualization).
X, y = make_classification(
    n_samples=10000, n_features=2, n_redundant=0,
    n_clusters_per_class=1, weights=[0.99], flip_y=0, random_state=1
)

# Class distribution before resampling.
counter = Counter(y)
print("Before SMOTE:", counter)

# Two panels side by side: original data on the left, resampled on the right.
fig, (ax_before, ax_after) = plt.subplots(1, 2, figsize=(12, 5))

# Left panel: one scatter series per class, selected with a boolean row mask.
for label in counter:
    mask = y == label
    ax_before.scatter(X[mask, 0], X[mask, 1], label=f"Class {label}", alpha=0.5)
ax_before.set_title("Before SMOTE")
ax_before.legend()

# Oversample with SMOTE. The seed is fixed so the synthetic points -- and
# therefore the right-hand plot -- are the same on every run, matching the
# seeded dataset generation above.
oversample = SMOTE(random_state=1)
X_res, y_res = oversample.fit_resample(X, y)

# Class distribution after resampling.
counter_res = Counter(y_res)
print("After SMOTE:", counter_res)

# Right panel: the resampled dataset, plotted the same way.
for label in counter_res:
    mask = y_res == label
    ax_after.scatter(X_res[mask, 0], X_res[mask, 1], label=f"Class {label}", alpha=0.5)
ax_after.set_title("After SMOTE")
ax_after.legend()

plt.show()
