# Part 1: Midterm

In [None]:
# Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error



In [None]:
# Load the midterm regression dataset.
# The first CSV column is an unnamed, saved row index (without index_col it
# shows up as an extra "Unnamed: 0" column), so it is used as the DataFrame
# index instead of being kept as a data column next to X and Y.
df = pd.read_csv(
    'https://raw.githubusercontent.com/renatomaaliw3/public_files/refs/heads/master/Data%20Sets/linear-m07.csv',
    index_col=0,
)
df.head()

Unnamed: 0,X,Y
0,26.702827,107.300877
1,87.862999,311.338166
2,79.742602,291.769926
3,65.845183,237.408745
4,85.058173,321.865779


In [None]:
# Separate the predictor from the response
X = df.loc[:, ['X']]  # Independent variable (features); list selector keeps it 2-D
Y = df.loc[:, 'Y']    # Dependent variable (target); 1-D Series

# Report both shapes as a quick sanity check
print("\nShape of X:", X.shape)
print("Shape of Y:", Y.shape)


Shape of X: (500, 1)
Shape of Y: (500,)


In [None]:
# Hold out 25% of the rows for testing; the seed is fixed at 101
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.25,
    random_state=101,
)

# Report the size of every partition
for label, part in (
    ("\nShape of X_train:", X_train),
    ("Shape of X_test:", X_test),
    ("Shape of Y_train:", Y_train),
    ("Shape of Y_test:", Y_test),
):
    print(label, part.shape)


Shape of X_train: (375, 1)
Shape of X_test: (125, 1)
Shape of Y_train: (375,)
Shape of Y_test: (125,)


In [None]:
# Fit a linear regression on the training split only
model = LinearRegression()
model.fit(X=X_train, y=Y_train)

In [None]:
# Predict the held-out test rows; these predictions feed the RMSE computation below
Y_pred = model.predict(X=X_test)

In [None]:
# 1. What is the slope of the line? (2 decimal places, no rounding)
# The model was fitted on a single feature, so its only coefficient is the slope.
# The full-precision value is printed; truncate it by hand for the answer.
fitted_coefficients = model.coef_
slope = fitted_coefficients[0]
print(f"\n1. The slope of the line: {slope:}")



1. The slope of the line: 3.4838649291733614


In [None]:
# 2. What is the RMSE of the model based on test data? (2 decimal places, no rounding)
# RMSE is the square root of the mean squared error on the held-out rows.
# The full-precision value is printed; truncate it by hand for the answer.
test_mse = mean_squared_error(Y_test, Y_pred)
rmse = np.sqrt(test_mse)
print(f"2. The RMSE of the model based on test data: {rmse:}")


2. The RMSE of the model based on test data: 19.139392758345046


In [None]:
# 3. Prediction for X = 38.85
# The model was fitted on a DataFrame whose single column is named 'X', so the
# query point is built as a DataFrame with the same column name. Passing a bare
# ndarray makes scikit-learn warn that X does not have valid feature names.
X_new = pd.DataFrame({'X': [38.85]})
Y_new_pred = model.predict(X_new)
print(f"\n3. If X is 38.85, the predicted Y value is: {Y_new_pred[0]:}")


3. If X is 38.85, the predicted Y value is: 148.11113880887834


# Part 2: Finals


In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [None]:
# Read the finals classification dataset and take a first look at it
classification_csv_url = 'https://raw.githubusercontent.com/renatomaaliw3/public_files/refs/heads/master/Data%20Sets/classification-f07.csv'
df2 = pd.read_csv(classification_csv_url)

# Preview the first rows
print("Dataset Head:")
print(df2.head())

# Count the missing entries per column before any cleaning
print("\nInitial Missing Values:")
print(df2.isnull().sum())

Dataset Head:
           A          B          C  Z
0  40.181969  39.828174  43.449460  0
1  16.065769  28.541647  14.831670  1
2  26.029840  26.776941  32.934835  1
3  31.731555  21.483142  40.106356  0
4  34.707991  30.612042  40.819845  0

Initial Missing Values:
A    5
B    5
C    5
Z    0
dtype: int64


In [None]:
# Handle Missing Data (Replace with Mean)
# Each missing entry is replaced by the mean of its own column.
# NOTE(review): the means are computed on the full dataset, before the
# train/test split performed further down - confirm that this is intended.
column_means = df2.mean(numeric_only=True)
df2.fillna(column_means, inplace=True)

# Confirm that no missing entries remain
print("\nMissing Values After Imputation:")
print(df2.isnull().sum())

# Preview the cleaned data
print("\nDataset Head After Imputation:")
print(df2.head())


Missing Values After Imputation:
A    0
B    0
C    0
Z    0
dtype: int64

Dataset Head After Imputation:
           A          B          C  Z
0  40.181969  39.828174  43.449460  0
1  16.065769  28.541647  14.831670  1
2  26.029840  26.776941  32.934835  1
3  31.731555  21.483142  40.106356  0
4  34.707991  30.612042  40.819845  0


In [None]:
# Split the frame into the three predictors and the class label
feature_columns = ['A', 'B', 'C']
X = df2[feature_columns]  # Features
Y = df2['Z']              # Target variable

# Report both shapes as a quick sanity check
print("\nShape of X:", X.shape)
print("Shape of Y:", Y.shape)


Shape of X: (500, 3)
Shape of Y: (500,)


In [None]:
# Hold out 25% of the rows for testing; the seed is fixed at 101
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.25,
    random_state=101,
)

# Report the size of every partition
for label, part in (
    ("\nShape of X_train:", X_train),
    ("Shape of X_test:", X_test),
    ("Shape of Y_train:", Y_train),
    ("Shape of Y_test:", Y_test),
):
    print(label, part.shape)


Shape of X_train: (375, 3)
Shape of X_test: (125, 3)
Shape of Y_train: (375,)
Shape of Y_test: (125,)


In [None]:
# Support Vector Machine (SVM)
print("\n--- Training SVM Model ---")

# Standardise the features: the scaler is fitted on the training split only
# and then applied, unchanged, to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Candidate regularisation strengths and kernels for the grid search
param_grid_svm = dict(
    C=[0.01, 0.05, 0.1, 0.5, 1, 1.5, 2, 2.5, 3, 3.5, 4, 4.5, 5],
    kernel=['linear', 'rbf', 'poly'],
)

# 5-fold cross-validated search over the grid, scored by accuracy
svm_grid_search = GridSearchCV(
    estimator=SVC(random_state=101),
    param_grid=param_grid_svm,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
)
svm_grid_search.fit(X_train_scaled, Y_train)

# Score the best estimator found by the search on the scaled test split
best_svm_model = svm_grid_search.best_estimator_
svm_predictions = best_svm_model.predict(X_test_scaled)
svm_accuracy = accuracy_score(Y_test, svm_predictions)

print(f"SVM Best Parameters: {svm_grid_search.best_params_}")
print(f"SVM Best Cross-Validation Accuracy: {svm_grid_search.best_score_:.4f}")
print(f"SVM Test Accuracy: {svm_accuracy:.4f}")


--- Training SVM Model ---
SVM Best Parameters: {'C': 0.5, 'kernel': 'rbf'}
SVM Best Cross-Validation Accuracy: 0.9893
SVM Test Accuracy: 0.9920


In [None]:
# Decision Tree (DT)
print("\n--- Training Decision Tree Model ---")

# Fit a tree with default settings on the unscaled training features
dt_model = DecisionTreeClassifier(random_state=101)
dt_model.fit(X=X_train, y=Y_train)

# Score the fitted tree on the held-out rows
dt_predictions = dt_model.predict(X_test)
dt_accuracy = accuracy_score(y_true=Y_test, y_pred=dt_predictions)
print(f"Decision Tree Test Accuracy: {dt_accuracy:.4f}")


--- Training Decision Tree Model ---
Decision Tree Test Accuracy: 0.9760


In [None]:
# Random Forest (RF)
print("\n--- Training Random Forest Model ---")

# Grid Search Values for RF: forest size and features considered per split
param_grid_rf = dict(
    n_estimators=[11, 25, 65, 101, 151, 201, 301, 401],
    max_features=[2, 3],
)

# 5-fold cross-validated search over the grid, scored by accuracy
rf_grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=101),
    param_grid=param_grid_rf,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
)
rf_grid_search.fit(X_train, Y_train)

# Score the best estimator found by the search on the held-out rows
best_rf_model = rf_grid_search.best_estimator_
rf_predictions = best_rf_model.predict(X_test)
rf_accuracy = accuracy_score(Y_test, rf_predictions)

print(f"Random Forest Best Parameters: {rf_grid_search.best_params_}")
print(f"Random Forest Best Cross-Validation Accuracy: {rf_grid_search.best_score_:.4f}")
print(f"Random Forest Test Accuracy: {rf_accuracy:.4f}")


--- Training Random Forest Model ---
Random Forest Best Parameters: {'max_features': 2, 'n_estimators': 65}
Random Forest Best Cross-Validation Accuracy: 0.9707
Random Forest Test Accuracy: 0.9520


In [None]:
# Compare Models and Determine Best Model
print("\n--- Model Comparison ---")

# Test-set accuracy of each classifier, keyed by the short name used in the
# answer cell below.
accuracies = {
    'SVM': svm_accuracy,
    'DT': dt_accuracy,
    'RF': rf_accuracy
}

# The key with the highest accuracy; on a tie, max() keeps the first key in
# insertion order.
best_model_name = max(accuracies, key=accuracies.get)

print("\nAccuracy Scores:")
# The loop variable is deliberately NOT called `model`: that name holds the
# fitted LinearRegression from the midterm section, and overwriting it with a
# string would break those cells if they are re-run afterwards.
for model_name, acc in accuracies.items():
    print(f"- {model_name}: {acc:.4f}")
print(f"\nBest Model based on accuracy: {best_model_name}")


--- Model Comparison ---

Accuracy Scores:
- SVM: 0.9600
- DT: 0.9440
- RF: 0.9520

Best Model based on accuracy: SVM


In [None]:
# Answer Questions based on the Best Model
print("\n--- Answers to Questions ---")

# Question 4: What is the best model based on accuracy results?
print(f"4. The best model based on accuracy results is: {best_model_name}")

# The scaler, the decision tree and the random forest were all fitted on
# DataFrames, so query points are built as DataFrames with the same column
# names. Passing bare ndarrays makes scikit-learn warn that X does not have
# valid feature names.
feature_names = list(X.columns)

if best_model_name == 'SVM':
    optimal_C = svm_grid_search.best_params_['C']
    # Take only the 'kernel' entry; printing the whole best_params_ dict here
    # would not answer the question.
    best_kernel = svm_grid_search.best_params_['kernel']
    print(f"5. If your best model is SVM, the optimized value of C is: {optimal_C}")
    print(f"6. If your best model is SVM, the best kernel is: '{best_kernel}'")

    # Prediction for [30, 25, 35] using the best SVM model (remember to scale)
    new_data_svm = pd.DataFrame([[30, 25, 35]], columns=feature_names)
    new_data_svm_scaled = scaler.transform(new_data_svm)
    prediction_svm = best_svm_model.predict(new_data_svm_scaled)[0]
    print(f"7. Your prediction for the data [30, 25, 35] (0 or 1) using SVM is: {prediction_svm}")

elif best_model_name == 'DT':
    # DT specific answers
    feature_importances_dt = dt_model.feature_importances_
    features = X.columns
    sorted_features_dt = sorted(zip(feature_importances_dt, features), reverse=True)

    # Node 0 of the fitted tree is the root; tree_.feature[0] is the column
    # index it splits on. The most important feature is not guaranteed to be
    # the root, so the root is read from the tree itself. A negative index
    # means the tree is a single leaf with no split at all.
    root_feature_index = int(dt_model.tree_.feature[0])
    root_node_dt = features[root_feature_index] if root_feature_index >= 0 else None
    least_contributor_dt = sorted_features_dt[-1][1] # Least important feature

    print(f"5. If your best model is DT, the variable that is the root node is: '{root_node_dt}'")
    print(f"6. If your best model is DT, the variable that is the least contributor to prediction is: '{least_contributor_dt}'")

    # Prediction for [35, 30, 25] using the DT model
    new_data_dt = pd.DataFrame([[35, 30, 25]], columns=feature_names)
    prediction_dt = dt_model.predict(new_data_dt)[0]
    print(f"7. Your prediction for the data [35, 30, 25] (0 or 1) using DT is: {prediction_dt}")

elif best_model_name == 'RF':
    # RF specific answers
    feature_importances_rf = best_rf_model.feature_importances_
    features = X.columns
    sorted_features_rf = sorted(zip(feature_importances_rf, features), reverse=True)

    # Count which column each tree in the forest splits on at its root
    # (node 0) and report the most frequent one. Trees that are a single leaf
    # have a negative root feature index and are skipped.
    root_feature_indices = np.array(
        [int(tree.tree_.feature[0]) for tree in best_rf_model.estimators_]
    )
    root_feature_indices = root_feature_indices[root_feature_indices >= 0]
    if root_feature_indices.size:
        root_counts = np.bincount(root_feature_indices, minlength=len(features))
        mostly_root_node_rf = features[int(root_counts.argmax())]
    else:
        # No tree has a split; fall back to the importance ranking.
        mostly_root_node_rf = sorted_features_rf[0][1]
    least_contributor_rf = sorted_features_rf[-1][1] # Least important feature

    print(f"5. If your best model is RF, the variable that is [mostly] the root node is: '{mostly_root_node_rf}'")
    print(f"6. If your best model is RF, the variable that is the least contributor to prediction is: '{least_contributor_rf}'")

    # Prediction for [35, 25, 30] using the best RF model
    new_data_rf = pd.DataFrame([[35, 25, 30]], columns=feature_names)
    prediction_rf = best_rf_model.predict(new_data_rf)[0]
    print(f"7. Your prediction for the data [35, 25, 30] (0 or 1) using RF is: {prediction_rf}")



--- Answers to Questions ---
4. The best model based on accuracy results is: SVM
5. If your best model is SVM, the optimized value of C is: 4.5
6. If your best model is SVM, the best kernel is: '{'C': 4.5, 'kernel': 'rbf'}'
7. Your prediction for the data [30, 25, 35] (0 or 1) using SVM is: 1
