<a href="https://colab.research.google.com/github/gulshan0201/DATA-Science/blob/main/ML_LAB_5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import StratifiedKFold, cross_validate, train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import make_scorer, accuracy_score, f1_score

# 1) Generate complex dataset
X, y = make_classification(
    n_samples=5000, n_features=200, n_informative=30, n_redundant=30, n_repeated=10,
    n_classes=3, weights=[0.6,0.3,0.1], flip_y=0.02, random_state=42
)

# Inject heterogeneous scales and outliers
X[:, 0:50] *= 0.1
X[:, 50:100] *= 10
X[:, 100:150] *= 100
rng = np.random.RandomState(42)
outlier_cols = slice(150, 170)
X[:, outlier_cols] += rng.standard_t(df=2, size=X[:, outlier_cols].shape) * 20

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=0)

# 2) Define scalers and models
scalers = {
    "None": None,
    "StandardScaler": StandardScaler(),
    "MinMaxScaler": MinMaxScaler(),
    "RobustScaler": RobustScaler()
}

models = {
    "KNN(k=11)": KNeighborsClassifier(n_neighbors=11),
    "SVM(RBF)": SVC(kernel='rbf', C=10, gamma='scale'),
    "LogisticRegression": LogisticRegression(max_iter=3000, multi_class='multinomial'),
    "DecisionTree": DecisionTreeClassifier(max_depth=None, random_state=42)
}

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
scoring = {'acc': make_scorer(accuracy_score), 'f1': make_scorer(f1_score, average='macro')}

def evaluate_combo(scaler_name, scaler, model_name, model):
    steps = []
    if scaler is not None:
        steps.append(('scaler', scaler))
    steps.append(('model', model))
    pipe = Pipeline(steps)
    cvres = cross_validate(pipe, X_train, y_train, cv=cv, scoring=scoring, n_jobs=-1)
    return {
        'Scaler': scaler_name,
        'Model': model_name,
        'CV_Acc_Mean': np.mean(cvres['test_acc']),
        'CV_F1_Mean': np.mean(cvres['test_f1'])
    }

results = []
for s_name, scaler in scalers.items():
    for m_name, model in models.items():
        results.append(evaluate_combo(s_name, scaler, m_name, model))

# Print sorted by Macro-F1
results_sorted = sorted(results, key=lambda d: d['CV_F1_Mean'], reverse=True)
for r in results_sorted:
    print(r)


{'Scaler': 'MinMaxScaler', 'Model': 'SVM(RBF)', 'CV_Acc_Mean': np.float64(0.844), 'CV_F1_Mean': np.float64(0.7562210596836134)}
{'Scaler': 'StandardScaler', 'Model': 'SVM(RBF)', 'CV_Acc_Mean': np.float64(0.8394999999999999), 'CV_F1_Mean': np.float64(0.7423688941292339)}
{'Scaler': 'RobustScaler', 'Model': 'SVM(RBF)', 'CV_Acc_Mean': np.float64(0.8089999999999999), 'CV_F1_Mean': np.float64(0.7085499342651727)}
{'Scaler': 'None', 'Model': 'SVM(RBF)', 'CV_Acc_Mean': np.float64(0.7835), 'CV_F1_Mean': np.float64(0.6950960269368792)}
{'Scaler': 'None', 'Model': 'LogisticRegression', 'CV_Acc_Mean': np.float64(0.7384999999999999), 'CV_F1_Mean': np.float64(0.6298296903536219)}
{'Scaler': 'StandardScaler', 'Model': 'LogisticRegression', 'CV_Acc_Mean': np.float64(0.73125), 'CV_F1_Mean': np.float64(0.616696113507028)}
{'Scaler': 'RobustScaler', 'Model': 'LogisticRegression', 'CV_Acc_Mean': np.float64(0.7305), 'CV_F1_Mean': np.float64(0.615353741718254)}
{'Scaler': 'MinMaxScaler', 'Model': 'Logistic

In [6]:
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold, cross_validate, train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import make_scorer, accuracy_score, f1_score


# 1) Load dataset
file_path = "steam_sales.csv"
df = pd.read_csv(file_path)



In [7]:
#Remove all duplicate values
df = df.drop_duplicates()

In [8]:
# Fill missing values
numeric_cols = df.select_dtypes(include=['float64', 'int64']).columns
df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].median())
categorical_cols = df.select_dtypes(include=['object']).columns
df[categorical_cols] = df[categorical_cols].fillna(df[categorical_cols].mode().iloc[0])

In [15]:
# Clean #Reviews column (remove commas and convert to int)
df['#Reviews'] = df['#Reviews'].astype(str).str.replace(',', '').str.split('.').str[0].astype(int)

In [16]:
# 3) Define Features (X) and Target (y)
y = df['Rating']

# Select numeric features
X = df[['#Reviews', 'Discount%', 'Price (€)', 'Original Price (€)', 'Windows', 'Linux', 'MacOS']]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=0)


In [18]:
# 4) Define scalers and models
scalers = {
    "None": None,
    "StandardScaler": StandardScaler(),
    "MinMaxScaler": MinMaxScaler(),
    "RobustScaler": RobustScaler()
}

models = {
    "KNN(k=11)": KNeighborsClassifier(n_neighbors=11),
    "SVM(RBF)": SVC(kernel='rbf', C=10, gamma='scale'),
    "LogisticRegression": LogisticRegression(max_iter=3000, multi_class='multinomial'),
    "DecisionTree": DecisionTreeClassifier(max_depth=None, random_state=42)
}

In [19]:
# 5) Cross-validation setup
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
scoring = {'acc': make_scorer(accuracy_score), 'f1': make_scorer(f1_score, average='macro')}

def evaluate_combo(scaler_name, scaler, model_name, model):
    steps = []
    if scaler is not None:
        steps.append(('scaler', scaler))
    steps.append(('model', model))
    pipe = Pipeline(steps)
    cvres = cross_validate(pipe, X_train, y_train, cv=cv, scoring=scoring, n_jobs=-1)
    return {
        'Scaler': scaler_name,
        'Model': model_name,
        'CV_Acc_Mean': np.mean(cvres['test_acc']),
        'CV_F1_Mean': np.mean(cvres['test_f1'])
    }

In [20]:
# 6) Run experiments
results = []
for s_name, scaler in scalers.items():
    for m_name, model in models.items():
        results.append(evaluate_combo(s_name, scaler, m_name, model))

# Sort by F1 score
results_sorted = sorted(results, key=lambda d: d['CV_F1_Mean'], reverse=True)

# Convert to DataFrame for better presentation
results_df = pd.DataFrame(results_sorted)
print(results_df)

            Scaler               Model  CV_Acc_Mean  CV_F1_Mean
0   StandardScaler        DecisionTree     0.549656    0.497867
1             None        DecisionTree     0.548787    0.496900
2     MinMaxScaler        DecisionTree     0.547917    0.496441
3     RobustScaler        DecisionTree     0.547048    0.488312
4             None           KNN(k=11)     0.532254    0.288264
5     RobustScaler           KNN(k=11)     0.540042    0.224314
6   StandardScaler            SVM(RBF)     0.559214    0.217865
7     MinMaxScaler           KNN(k=11)     0.545278    0.203766
8   StandardScaler           KNN(k=11)     0.539203    0.198396
9     MinMaxScaler            SVM(RBF)     0.563596    0.196277
10  StandardScaler  LogisticRegression     0.564469    0.193195
11            None  LogisticRegression     0.564462    0.185597
12    RobustScaler            SVM(RBF)     0.564458    0.157291
13    RobustScaler  LogisticRegression     0.562726    0.131306
14            None            SVM(RBF)  

In [21]:
# Make a perturbed copy of the dataset
X_perturbed = X.copy()
X_perturbed['#Reviews'] = X_perturbed['#Reviews'] * 1000
X_perturbed['Discount%'] = X_perturbed['Discount%'] * 0.01

# Train-test split for perturbed data
X_train_p, X_test_p, y_train_p, y_test_p = train_test_split(
    X_perturbed, y, stratify=y, test_size=0.2, random_state=0
)

# Evaluate perturbed dataset
results_perturbed = []
for s_name, scaler in scalers.items():
    for m_name, model in models.items():
        results_perturbed.append(evaluate_combo(s_name, scaler, m_name, model))

results_perturbed_sorted = sorted(results_perturbed, key=lambda d: d['CV_F1_Mean'], reverse=True)
results_perturbed_df = pd.DataFrame(results_perturbed_sorted)

# Compare with original results
comparison_df = results_df.merge(
    results_perturbed_df,
    on=['Scaler', 'Model'],
    suffixes=('_Original', '_Perturbed')
)

# Calculate sensitivity (absolute drop in F1)
comparison_df['F1_Drop'] = comparison_df['CV_F1_Mean_Original'] - comparison_df['CV_F1_Mean_Perturbed']

# Sort by sensitivity
sensitivity_df = comparison_df.sort_values(by='F1_Drop', ascending=False)

print(sensitivity_df[['Scaler', 'Model', 'CV_F1_Mean_Original', 'CV_F1_Mean_Perturbed', 'F1_Drop']])


            Scaler               Model  CV_F1_Mean_Original  \
0   StandardScaler        DecisionTree             0.497867   
1             None        DecisionTree             0.496900   
2     MinMaxScaler        DecisionTree             0.496441   
3     RobustScaler        DecisionTree             0.488312   
4             None           KNN(k=11)             0.288264   
5     RobustScaler           KNN(k=11)             0.224314   
6   StandardScaler            SVM(RBF)             0.217865   
7     MinMaxScaler           KNN(k=11)             0.203766   
8   StandardScaler           KNN(k=11)             0.198396   
9     MinMaxScaler            SVM(RBF)             0.196277   
10  StandardScaler  LogisticRegression             0.193195   
11            None  LogisticRegression             0.185597   
12    RobustScaler            SVM(RBF)             0.157291   
13    RobustScaler  LogisticRegression             0.131306   
14            None            SVM(RBF)             0.12