# Artificial Intelligence and Data Science in Operations Research
## Machine Learning Activity

In [1]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

def train_svm(X_train, y_train, X_test, y_test,
              C=1.0, kernel='rbf', degree=3, gamma='scale',
              coef0=0.0, shrinking=True, probability=False,
              tol=1e-3, cache_size=200, class_weight=None,
              verbose=False, max_iter=-1, decision_function_shape='ovr',
              break_ties=False, random_state=None):
    """
    Fit a support vector classifier and report its accuracy on held-out data.

    The four data arguments are handed to the estimator unchanged; every
    other argument is forwarded as-is to the ``SVC`` constructor.

    Parameters:
    - X_train: Features used for fitting
    - y_train: Labels used for fitting
    - X_test: Features used for evaluation
    - y_test: Labels used for evaluation
    - C: Regularization parameter (default=1.0)
    - kernel: Kernel type used by the algorithm (default='rbf')
    - degree: Degree of the polynomial kernel function (default=3)
    - gamma: Kernel coefficient (default='scale')
    - coef0: Independent term in the kernel function (default=0.0)
    - shrinking: Whether to use the shrinking heuristic (default=True)
    - probability: Whether to enable probability estimates (default=False)
    - tol: Tolerance for the stopping criterion (default=1e-3)
    - cache_size: Size of the kernel cache (default=200)
    - class_weight: Set the parameter C of class i to class_weight[i]*C (default=None)
    - verbose: Enable verbose output (default=False)
    - max_iter: Hard limit on iterations within the solver (default=-1)
    - decision_function_shape: One-vs-rest ('ovr') or one-vs-one ('ovo') decision function (default='ovr')
    - break_ties: If true, decision_function_shape='ovr' and number of classes > 2, predict will break ties (default=False)
    - random_state: Controls the pseudo random number generation for shuffling the data (default=None)

    Returns:
    - model: The fitted SVC instance
    - test_score: Accuracy of the model's predictions on (X_test, y_test)
    """
    # Gather the hyperparameters once so the constructor call stays compact.
    svc_options = {
        'C': C,
        'kernel': kernel,
        'degree': degree,
        'gamma': gamma,
        'coef0': coef0,
        'shrinking': shrinking,
        'probability': probability,
        'tol': tol,
        'cache_size': cache_size,
        'class_weight': class_weight,
        'verbose': verbose,
        'max_iter': max_iter,
        'decision_function_shape': decision_function_shape,
        'break_ties': break_ties,
        'random_state': random_state,
    }
    classifier = SVC(**svc_options)

    # Fit on the training split, then score the predictions for the test split.
    classifier.fit(X_train, y_train)
    predictions = classifier.predict(X_test)

    return classifier, accuracy_score(y_test, predictions)


In [2]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

def train_logistic_regression(X_train, y_train, X_test, y_test,
                              penalty='l2', dual=False, tol=1e-4, C=1.0,
                              fit_intercept=True, intercept_scaling=1,
                              class_weight=None, random_state=None,
                              solver='lbfgs', max_iter=100, multi_class='auto',
                              verbose=0, warm_start=False, n_jobs=None, l1_ratio=None):
    """
    Trains a Logistic Regression model using the provided training data and evaluates it on the test data.

    Parameters:
    - X_train: Training feature set
    - y_train: Training labels
    - X_test: Test feature set
    - y_test: Test labels
    - penalty: Used to specify the norm used in the penalization (default='l2')
    - dual: Dual or primal formulation (default=False)
    - tol: Tolerance for stopping criteria (default=1e-4)
    - C: Inverse of regularization strength (default=1.0)
    - fit_intercept: Specifies if a constant (a.k.a. bias or intercept) should be added to the decision function (default=True)
    - intercept_scaling: Useful only when the solver 'liblinear' is used (default=1)
    - class_weight: Weights associated with classes (default=None)
    - random_state: Controls the random number generation for shuffling the data (default=None)
    - solver: Algorithm to use in the optimization problem (default='lbfgs')
    - max_iter: Maximum number of iterations taken for the solvers to converge (default=100)
    - multi_class: If the option chosen is 'ovr', then a binary problem is fit for each label
      (default='auto'). This estimator argument is deprecated since scikit-learn 1.5, so it is
      only forwarded when set to something other than 'auto'.
    - verbose: For the liblinear and lbfgs solvers set verbose to any positive number for verbosity (default=0)
    - warm_start: Reuse the solution of the previous call to fit as initialization (default=False)
    - n_jobs: Number of CPU cores used when parallelizing over classes if multi_class='ovr' (default=None)
    - l1_ratio: The Elastic-Net mixing parameter, with 0 <= l1_ratio <= 1 (default=None)

    Returns:
    - model: The trained Logistic Regression model
    - test_score: The accuracy score on the test set
    """

    # Hyperparameters that are always forwarded to the estimator
    estimator_kwargs = dict(
        penalty=penalty, dual=dual, tol=tol, C=C,
        fit_intercept=fit_intercept, intercept_scaling=intercept_scaling,
        class_weight=class_weight, random_state=random_state, solver=solver,
        max_iter=max_iter, verbose=verbose, warm_start=warm_start,
        n_jobs=n_jobs, l1_ratio=l1_ratio,
    )

    # Passing multi_class explicitly (even as 'auto') triggers a FutureWarning on
    # scikit-learn >= 1.5 and will fail once the argument is removed. The
    # estimator's own default already behaves like 'auto' (scikit-learn >= 0.22),
    # so only forward the argument when the caller asked for something else.
    if multi_class != 'auto':
        estimator_kwargs['multi_class'] = multi_class

    # Initialize the Logistic Regression model with the provided hyperparameters
    model = LogisticRegression(**estimator_kwargs)

    # Train the model
    model.fit(X_train, y_train)

    # Make predictions on the test set
    y_pred = model.predict(X_test)

    # Calculate the accuracy score
    test_score = accuracy_score(y_test, y_pred)

    return model, test_score


In [3]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

def train_decision_tree(X_train, y_train, X_test, y_test,
                        criterion='gini', splitter='best', max_depth=None,
                        min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0,
                        max_features=None, random_state=None, max_leaf_nodes=None,
                        min_impurity_decrease=0.0, class_weight=None, ccp_alpha=0.0):
    """
    Fit a decision tree classifier and report its accuracy on held-out data.

    The four data arguments are handed to the estimator unchanged; every
    other argument is forwarded as-is to ``DecisionTreeClassifier``.

    Parameters:
    - X_train: Features used for fitting
    - y_train: Labels used for fitting
    - X_test: Features used for evaluation
    - y_test: Labels used for evaluation
    - criterion: Function measuring the quality of a split (default='gini')
    - splitter: Strategy used to choose the split at each node (default='best')
    - max_depth: Maximum depth of the tree (default=None)
    - min_samples_split: Minimum number of samples required to split an internal node (default=2)
    - min_samples_leaf: Minimum number of samples required at a leaf node (default=1)
    - min_weight_fraction_leaf: Minimum weighted fraction of the total sample weight required at a leaf node (default=0.0)
    - max_features: Number of features considered when looking for the best split (default=None)
    - random_state: Controls the randomness of the estimator (default=None)
    - max_leaf_nodes: Grow a tree with max_leaf_nodes in best-first fashion (default=None)
    - min_impurity_decrease: A node is split if the split decreases impurity by at least this value (default=0.0)
    - class_weight: Weights associated with classes (default=None)
    - ccp_alpha: Complexity parameter used for Minimal Cost-Complexity Pruning (default=0.0)

    Returns:
    - model: The fitted DecisionTreeClassifier instance
    - test_score: Accuracy of the model's predictions on (X_test, y_test)
    """
    # Gather the hyperparameters once so the constructor call stays compact.
    tree_options = dict(
        criterion=criterion,
        splitter=splitter,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        min_weight_fraction_leaf=min_weight_fraction_leaf,
        max_features=max_features,
        random_state=random_state,
        max_leaf_nodes=max_leaf_nodes,
        min_impurity_decrease=min_impurity_decrease,
        class_weight=class_weight,
        ccp_alpha=ccp_alpha,
    )
    tree = DecisionTreeClassifier(**tree_options)

    # Fit on the training split, then score the predictions for the test split.
    tree.fit(X_train, y_train)
    predictions = tree.predict(X_test)

    return tree, accuracy_score(y_test, predictions)


In [4]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

def train_random_forest(X_train, y_train, X_test, y_test,
                        n_estimators=100, criterion='gini', max_depth=None,
                        min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0,
                        max_features='sqrt', max_leaf_nodes=None, min_impurity_decrease=0.0,
                        bootstrap=True, oob_score=False, n_jobs=None, random_state=None,
                        verbose=0, warm_start=False, class_weight=None, ccp_alpha=0.0,
                        max_samples=None):
    """
    Trains a Random Forest model using the provided training data and evaluates it on the test data.

    Parameters:
    - X_train: Training feature set
    - y_train: Training labels
    - X_test: Test feature set
    - y_test: Test labels
    - n_estimators: The number of trees in the forest (default=100)
    - criterion: The function to measure the quality of a split (default='gini')
    - max_depth: The maximum depth of the tree (default=None)
    - min_samples_split: The minimum number of samples required to split an internal node (default=2)
    - min_samples_leaf: The minimum number of samples required to be at a leaf node (default=1)
    - min_weight_fraction_leaf: The minimum weighted fraction of the sum total of weights required to be at a leaf node (default=0.0)
    - max_features: The number of features to consider when looking for the best split (default='sqrt').
      The legacy value 'auto' is still accepted and is translated to 'sqrt', which is what
      'auto' meant for classifiers before scikit-learn removed it in version 1.3.
    - max_leaf_nodes: Grow trees with max_leaf_nodes in best-first fashion (default=None)
    - min_impurity_decrease: A node will be split if this split induces a decrease of the impurity greater than or equal to this value (default=0.0)
    - bootstrap: Whether bootstrap samples are used when building trees (default=True)
    - oob_score: Whether to use out-of-bag samples to estimate the generalization accuracy (default=False)
    - n_jobs: The number of jobs to run in parallel (default=None)
    - random_state: Controls the randomness of the estimator (default=None)
    - verbose: Controls the verbosity when fitting and predicting (default=0)
    - warm_start: When set to True, reuse the solution of the previous call to fit and add more estimators to the ensemble (default=False)
    - class_weight: Weights associated with classes (default=None)
    - ccp_alpha: Complexity parameter used for Minimal Cost-Complexity Pruning (default=0.0)
    - max_samples: If bootstrap is True, the number of samples to draw from X to train each base estimator (default=None)

    Returns:
    - model: The trained Random Forest model
    - test_score: The accuracy score on the test set
    """

    # 'auto' is rejected by scikit-learn >= 1.3; for classifiers it was an alias
    # of 'sqrt', so translate it to keep older call sites working unchanged.
    if isinstance(max_features, str) and max_features == 'auto':
        max_features = 'sqrt'

    # Initialize the Random Forest model with the provided hyperparameters
    model = RandomForestClassifier(n_estimators=n_estimators, criterion=criterion, max_depth=max_depth,
                                   min_samples_split=min_samples_split, min_samples_leaf=min_samples_leaf,
                                   min_weight_fraction_leaf=min_weight_fraction_leaf, max_features=max_features,
                                   max_leaf_nodes=max_leaf_nodes, min_impurity_decrease=min_impurity_decrease,
                                   bootstrap=bootstrap, oob_score=oob_score, n_jobs=n_jobs, random_state=random_state,
                                   verbose=verbose, warm_start=warm_start, class_weight=class_weight,
                                   ccp_alpha=ccp_alpha, max_samples=max_samples)

    # Train the model
    model.fit(X_train, y_train)

    # Make predictions on the test set
    y_pred = model.predict(X_test)

    # Calculate the accuracy score
    test_score = accuracy_score(y_test, y_pred)

    return model, test_score


In [5]:
# Import necessary libraries
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

# Load the Iris dataset and pull out the feature matrix and the label vector
iris = load_iris()
X, y = iris.data, iris.target

# Hold out 20% of the samples for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Report the shape of each split
for caption, split in (
    ("X_train shape:", X_train),
    ("y_train shape:", y_train),
    ("X_test shape:", X_test),
    ("y_test shape:", y_test),
):
    print(caption, split.shape)


X_train shape: (120, 4)
y_train shape: (120,)
X_test shape: (30, 4)
y_test shape: (30,)


In [6]:
# Fit logistic regression on the training split and report test accuracy.
# NOTE(review): recent scikit-learn releases appear to warn when 'liblinear' is
# used with more than two classes — confirm against the installed version.
model, score = train_logistic_regression(
    X_train, y_train, X_test, y_test,
    solver='liblinear',
    max_iter=200,
)
print(f"Test Accuracy: {score}")

Test Accuracy: 1.0
