# **Main Assignment:**

## Write the complete code to select the best regressor and classifier for the given dataset called diamonds `(if you have a high-end machine, you can use the whole dataset; otherwise use the sample dataset provided)`, or use the Tips dataset for the regression task and the Iris dataset for the classification task.

## You have to choose all possible models with their best or possible hyperparameters and compare them with each other and select the best model for the given dataset.

## Your code should be complete and properly explained. For the layman, each and every step of the code should be commented properly.

## Your code should also save the best model in a pickle file.

## You should also write the code to load the pickle file and use it for prediction in the last snippet of the code.

In [None]:
# Classification using Iris dataset with multiple models

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
import pickle

# Load the Iris dataset as pandas objects: X holds the feature columns and
# y holds the class label of each row.
iris = load_iris(as_frame=True)
X = iris.data
y = iris.target

# Keep 80% of the rows for training and hold the rest back, so the winning
# model can be checked on data it has never seen. random_state fixes the split.
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=42)

# Preprocessing: apply StandardScaler to every column of X.
clf_preprocessor = ColumnTransformer([
    ('num', StandardScaler(), X.columns)
])

# Candidate classifiers to compare. Every model that accepts a seed gets the
# same random_state so the comparison gives the same ranking on every run
# (an unseeded decision tree can change its score from run to run).
clf_models = {
    'LogisticRegression': LogisticRegression(max_iter=200),
    'DecisionTree': DecisionTreeClassifier(random_state=42),
    'RandomForest': RandomForestClassifier(n_estimators=100, random_state=42),
    'SVC': SVC(),
    'NaiveBayes': GaussianNB()
}

# Trackers for the best model seen so far. Starting the score at -inf means
# the first model with a valid score always becomes the initial best.
best_clf_model = None
best_clf_score = float('-inf')
best_clf_name = ""

# Evaluate every classifier with 5-fold cross-validation on the training data.
for name, model in clf_models.items():
    # Chain scaling and the model so each CV fold fits the scaler only on
    # its own training part.
    clf_pipeline = Pipeline([
        ('preprocessor', clf_preprocessor),
        ('classifier', model)
    ])
    scores = cross_val_score(clf_pipeline, X_train, y_train, scoring='accuracy', cv=5)
    mean_score = scores.mean()
    print(f"{name} Accuracy (CV): {mean_score:.4f}")
    # Strict '>' keeps the earlier model when two models tie.
    if mean_score > best_clf_score:
        best_clf_score = mean_score
        best_clf_model = clf_pipeline
        best_clf_name = name

# A NaN score never compares greater than -inf, so if every model failed
# there is no winner; stop with a clear message instead of an obscure
# AttributeError on the next line.
if best_clf_model is None:
    raise RuntimeError("No classifier produced a valid cross-validation score.")

# Train the best pipeline on the full training set.
best_clf_model.fit(X_train, y_train)

# Evaluate on the held-out test data.
y_pred = best_clf_model.predict(X_test)
print(f"\nBest Classification Model: {best_clf_name}")
print("Test Accuracy:", accuracy_score(y_test, y_pred))

# Save the fitted pipeline (scaler + model) so it can be reused later
# without retraining.
with open('best_iris_classifier.pkl', 'wb') as f:
    pickle.dump(best_clf_model, f)

# # Load and predict
# with open('best_iris_classifier.pkl', 'rb') as f:
#     loaded_clf_model = pickle.load(f)

# # Predict on a few samples
# sample_clf_input = X_test.iloc[:5]
# sample_clf_pred = loaded_clf_model.predict(sample_clf_input)
# sample_clf_pred


In [None]:
# Load the classifier pipeline that was saved to disk earlier.
# NOTE: unpickling can run arbitrary code, so only load pickle files that
# you created yourself or that come from a trusted source.
with open("best_iris_classifier.pkl", "rb") as f:
    loaded_clf_model = pickle.load(f)

# Predict on the first five held-out rows.
sample_clf_input = X_test.iloc[:5]
sample_clf_pred = loaded_clf_model.predict(sample_clf_input)

# A bare expression in the middle of a cell is never displayed, so print the
# predictions explicitly.
print("Predicted classes:", sample_clf_pred)

# Accuracy on these five rows (left as the last expression so the notebook
# still shows it as the cell output).
accuracy_score(y_test.iloc[:5], sample_clf_pred)

In [None]:
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# Load the diamonds example data. seaborn registers this dataset under the
# lowercase name 'diamonds'; the capitalised spelling is not a listed
# dataset name.
data = sns.load_dataset('diamonds')
print("Full dataset shape:", data.shape)

# Work on the first 1000 rows only, to keep training fast. .copy() makes this
# an independent table, so the column assignments below do not write into a
# slice of the original frame.
data = data[:1000].copy()

# How many diamonds fall into each clarity grade in the sample.
print(data['clarity'].value_counts())

# Label encode the text columns: each distinct category is replaced by an
# integer code so the models can consume it. The encoder is refitted for
# each column, so afterwards `le` only remembers the last column ('clarity').
le = LabelEncoder()
for column in ('cut', 'color', 'clarity'):
    data[column] = le.fit_transform(data[column])
print(data.head())

# Features are every column except the price; the price is the target.
X = data.drop('price', axis=1)
y = data['price']
print(X.columns)

In [None]:
# Regression using the Diamonds dataset with multiple models
import warnings

warnings.filterwarnings("ignore")
from sklearn.datasets import load_iris
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.svm import SVC, SVR
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, r2_score
import pickle
from sklearn.preprocessing import LabelEncoder


# Load the diamonds example data. seaborn registers this dataset under the
# lowercase name "diamonds"; the capitalised spelling is not a listed
# dataset name.
data = sns.load_dataset("diamonds")

# Work on the first 1000 rows only, to keep training fast. .copy() makes this
# an independent table, so the column assignments below do not write into a
# slice of the original frame.
data = data[:1000].copy()

# Replace each text category with an integer code so the models can use it.
# The encoder is refitted per column, so `le` ends up fitted on "clarity".
le = LabelEncoder()
for column in ("cut", "color", "clarity"):
    data[column] = le.fit_transform(data[column])


# Features are every column except the price; the price is what we predict.
X = data.drop("price", axis=1)
y = data["price"]


# Keep 80% of the rows for training and hold the rest back for a final check.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.8, random_state=42
)

# Preprocessing: apply StandardScaler to every column of X.
# NOTE: the clf_* variable names and the "classifier" step name are kept from
# the classification cell so existing references keep working, even though
# everything in this cell is a regressor.
clf_preprocessor = ColumnTransformer([("num", StandardScaler(), X.columns)])

# Candidate regressors. Each key names the estimator it actually holds.
# GaussianNB was removed: it is a classifier and cannot be meaningfully
# scored with R2 on a continuous price.
clf_models = {
    "LinearRegression": LinearRegression(),
    "DecisionTree": DecisionTreeRegressor(max_depth=5, random_state=42),
    "RandomForest": RandomForestRegressor(n_estimators=100, random_state=42),
    "SVR": SVR(),
}

# Trackers for the best model seen so far. Starting the score at -inf means
# the first model with a valid score always becomes the initial best.
best_clf_model = None
best_clf_score = float("-inf")
best_clf_name = ""

# Evaluate every regressor with 5-fold cross-validation on the training data.
# R2 is the score: 1.0 is a perfect fit and higher is better.
for name, model in clf_models.items():
    clf_pipeline = Pipeline([("preprocessor", clf_preprocessor), ("classifier", model)])
    scores = cross_val_score(clf_pipeline, X_train, y_train, scoring="r2", cv=5)
    mean_score = scores.mean()
    print(f"{name} R2 (CV): {mean_score:.4f}")
    # Strict '>' keeps the earlier model when two models tie.
    if mean_score > best_clf_score:
        best_clf_score = mean_score
        best_clf_model = clf_pipeline
        best_clf_name = name

# A NaN score never compares greater than -inf, so if every model failed
# there is no winner; stop with a clear message instead of an obscure
# AttributeError on the next line.
if best_clf_model is None:
    raise RuntimeError("No regressor produced a valid cross-validation score.")

# Train the best pipeline on the full training set.
best_clf_model.fit(X_train, y_train)

# Evaluate on the held-out test data.
y_pred = best_clf_model.predict(X_test)
print(f"\nBest Regression Model: {best_clf_name}")
print("Test R2:", r2_score(y_test, y_pred))

# Save the fitted pipeline under its own file name, so it does not overwrite
# the Iris classifier stored in best_iris_classifier.pkl.
with open("best_diamonds_regressor.pkl", "wb") as f:
    pickle.dump(best_clf_model, f)

# Load the saved regressor back from disk and use it for prediction.
# NOTE: unpickling can run arbitrary code, so only load pickle files that
# you created yourself or that come from a trusted source.
with open("best_diamonds_regressor.pkl", "rb") as f:
    loaded_reg_model = pickle.load(f)

# Predict the price of the first five held-out diamonds.
sample_reg_input = X_test.iloc[:5]
sample_reg_pred = loaded_reg_model.predict(sample_reg_input)
print("Sample predicted prices:", sample_reg_pred)