# Classification decision tree model

In [7]:
import numpy as np
import pandas as pd
from tqdm import tqdm

# Decision Tree Classifier (with Gini impurity)
class DecisionTree:
    def __init__(self, min_samples_split=2, max_depth=5):
        self.min_samples_split = min_samples_split
        self.max_depth = max_depth
        self.tree = None

    def fit(self, X, y):
        self.classes = np.unique(y)
        y = np.array([np.where(self.classes == label)[0][0] for label in y])
        self.tree = self._grow_tree(X, y)
    
    def predict(self, X):
        predictions = [self._predict(inputs, self.tree) for inputs in X]
        return np.array([self.classes[pred] for pred in predictions])

    def _gini(self, y):
        m = len(y)
        return 1.0 - sum((np.sum(y == c) / m) ** 2 for c in np.unique(y))

    def _split(self, X, y, idx, thresh):
        left_mask = X[:, idx] <= thresh
        right_mask = X[:, idx] > thresh
        return X[left_mask], X[right_mask], y[left_mask], y[right_mask]

    def _best_split(self, X, y):
        m, n = X.shape
        if m <= 1:
            return None, None

        best_gini = 1.0
        best_idx, best_thresh = None, None
        unique_classes = np.unique(y)
        class_count = len(unique_classes)

        for idx in range(n):
            thresholds, classes = zip(*sorted(zip(X[:, idx], y)))
            thresholds = np.array(thresholds, dtype=np.float64)
            num_left = [0] * class_count
            num_right = [np.sum(classes == c) for c in unique_classes]

            for i in range(1, m):
                class_idx = np.where(unique_classes == classes[i - 1])[0][0]
                num_left[class_idx] += 1
                num_right[class_idx] -= 1

                gini_left = 1.0 - sum((num_left[x] / i) ** 2 for x in range(class_count))
                gini_right = 1.0 - sum((num_right[x] / (m - i)) ** 2 for x in range(class_count))
                gini = (i * gini_left + (m - i) * gini_right) / m

                if thresholds[i] == thresholds[i - 1]:
                    continue
                if gini < best_gini:
                    best_gini = gini
                    best_idx = idx
                    best_thresh = (thresholds[i] + thresholds[i - 1]) / 2
        
        return best_idx, best_thresh

    def _grow_tree(self, X, y, depth=0):
        num_samples_per_class = [np.sum(y == i) for i in np.unique(y)]
        predicted_class = np.argmax(num_samples_per_class)
        node = {'predicted_class': predicted_class}

        if depth < self.max_depth:
            idx, thresh = self._best_split(X, y)
            if idx is not None:
                indices_left = X[:, idx] <= thresh
                X_left, y_left = X[indices_left], y[indices_left]
                X_right, y_right = X[~indices_left], y[~indices_left]
                node['feature_index'] = idx
                node['threshold'] = thresh
                node['left'] = self._grow_tree(X_left, y_left, depth + 1)
                node['right'] = self._grow_tree(X_right, y_right, depth + 1)
        return node

    def _predict(self, inputs, tree):
        if 'threshold' in tree:
            feature_index = tree['feature_index']
            if inputs[feature_index] <= tree['threshold']:
                return self._predict(inputs, tree['left'])
            else:
                return self._predict(inputs, tree['right'])
        else:
            return tree['predicted_class']




# Regressor decision tree model

In [8]:
import numpy as np

# from Models.ClassifierDT import DecisionTree

# Decision Tree Regressor (with SSR)
class DecisionTreeRegressor(DecisionTree):
    # Initialize the Decision Tree Regressor that inherits from the Decision Tree Classifier
    def _ssr(self, y):
        if len(y) == 0:
            return 0
        mean_y = np.mean(y)
        # Calculate the sum of the squared residuals and return it
        return np.sum((y - mean_y) ** 2)

    # Find the best split for the data based on the sum of the squared residuals
    def _best_split(self, X, y):

        # Number of samples (m) and number of features (n)
        m, n = X.shape

        # If the number of samples is less than or equal to the minimum number of samples required to split, return None
        if m <= 1:
            return None, None

        # Initialize the sum of the squared residuals of the best split to infinity and the best feature and threshold to None
        best_ssr = np.inf
        best_idx, best_thresh = None, None

        # For each feature
        for idx in range(n):
            # Get the thresholds and values
            thresholds, values = zip(*sorted(zip(X[:, idx], y)))
            # For each sample
            for i in range(1, m):
                # Get the left and right subsets of the target variable
                y_left, y_right = values[:i], values[i:]
                ssr_left, ssr_right = self._ssr(y_left), self._ssr(y_right)

                # Calculate the sum of the squared residuals of the left and right subsets
                ssr = ssr_left + ssr_right

                # Update the best split if the current split has a lower sum of the squared residuals
                if thresholds[i] == thresholds[i - 1]:
                    continue
                if ssr < best_ssr:
                    best_ssr = ssr
                    best_idx = idx
                    best_thresh = (thresholds[i] + thresholds[i - 1]) / 2

        # Return the best feature and threshold
        return best_idx, best_thresh

# Random forest model

In [10]:
import numpy as np
# from Models.ClassifierDT import DecisionTree


class RandomForest:
    def __init__(self, n_trees=100, max_depth=5, min_samples_split=2, n_features=None):
        self.n_trees = n_trees
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.n_features = n_features
        self.trees = []

    def fit(self, X, y):
        self.trees = []
        n_samples, n_features = X.shape
        self.n_features = self.n_features or n_features
        for _ in range(self.n_trees):
            idxs = np.random.choice(n_samples, n_samples, replace=True)
            tree = DecisionTree(max_depth=self.max_depth, min_samples_split=self.min_samples_split)
            tree.fit(X[idxs], y[idxs])
            self.trees.append(tree)

    def predict(self, X):
        tree_preds = np.array([tree.predict(X) for tree in self.trees])
        return np.array([np.bincount(tree_preds[:, i]).argmax() for i in range(tree_preds.shape[1])])

# Random forest imputation for missing values in the dataset

In [14]:
import pandas as pd
from tqdm import tqdm

# Imputation function
def random_forest_impute(data, categorical_columns, n_trees=100, max_depth=5, min_samples_split=2, n_features=None, max_iter=10, tol=1e-3):
    data_imputed = data.copy()
    missing_mask = data.isnull()
    n_features = n_features or data.shape[1]
    
    # Temporarily fill NaNs with a placeholder value
    placeholder = "missing"
    data_imputed[categorical_columns] = data_imputed[categorical_columns].fillna(placeholder)
    
    # One-hot encode categorical variables, without creating separate column for NaN
    data_imputed = pd.get_dummies(data_imputed, columns=categorical_columns, dummy_na=False)
    
    print("Starting imputation process...")
    
    for iteration in tqdm(range(max_iter)):
        print(f"Iteration {iteration + 1}/{max_iter}")
        prev_data = data_imputed.copy()
        for column in categorical_columns:
            print(f"  Imputing column: {column}")
            # Identify the one-hot encoded columns for the current categorical column
            cat_columns = [col for col in data_imputed.columns if col.startswith(column + '_')]
            
            for cat_column in cat_columns:
                # Extract the original column name (without the one-hot suffix)
                original_column = cat_column.split('_')[0]
                
                # Handle the placeholder column separately
                if cat_column.endswith('_' + placeholder):
                    cat_column_name = original_column + '_missing'
                    missing_idx = data_imputed[cat_column_name]
                else:
                    cat_column_name = cat_column
                    missing_idx = data_imputed[cat_column_name].isnull()
                
                if missing_idx.any():
                    print(f"    Training RandomForest for {cat_column_name}...")
                    # Prepare training data
                    X_train = data_imputed[~missing_idx].drop(columns=cat_columns)
                    y_train = data_imputed[~missing_idx][cat_column_name]
                    
                    # Prepare data for imputation
                    X_impute = data_imputed[missing_idx].drop(columns=cat_columns)
                    
                    # Train Random Forest
                    rf = RandomForest(n_trees=n_trees, max_depth=max_depth, min_samples_split=min_samples_split, n_features=n_features)
                    rf.fit(X_train.values, y_train.values)
                    
                    # Predict missing values
                    data_imputed.loc[missing_idx, cat_column_name] = rf.predict(X_impute.values)
        
        # Check for convergence (i.e., no change in imputed values)
        change = np.linalg.norm(data_imputed.values - prev_data.values)
        print(f"  Change in data: {change}")
        if change < tol:
            print("Convergence reached.")
            break

    print("Reversing one-hot encoding...")
    # Reverse One-Hot Encoding
    for column in categorical_columns:
        cat_columns = [col for col in data_imputed.columns if col.startswith(column + '_')]
        data_imputed[column] = data_imputed[cat_columns].idxmax(axis=1).apply(lambda x: x.split('_')[1])
        data_imputed.drop(columns=cat_columns, inplace=True)
    
    print("Imputation completed.")
    return data_imputed



# Random forest regressor model

In [12]:
import numpy as np
# from Models.ClassifierDT import DecisionTree
# from Models.RandomForest import RandomForest

# Random Forest Regressor (with averaging)
class RandomForestRegressor(RandomForest):
    def predict(self, X):
        tree_preds = np.array([tree.predict(X) for tree in self.trees])
        return np.mean(tree_preds, axis=0)

# Data Loading

In [15]:
# Import the necessary libraries and modules

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# from Models.ClassifierDT import DecisionTree
# from Models.RegressionDT import DecisionTreeRegressor
# from Models.RandomForest import RandomForest
# from Models.RandomForestReg import RandomForestRegressor


import warnings
warnings.filterwarnings('ignore')

######### Load the dataset
 
# Load the dataset
data = pd.read_csv('/Users/edeneldar/Library/Mobile Documents/com~apple~CloudDocs/ML learn.worktrees/origin/mainEden/assignment1/assignment-1-data.csv')

# Rename the columns for easier access
data = data[['Brand', 'Screen_Size', 'RAM', 'Processor', 'GPU', 'GPU_Type', 'Resolution', 'Condition', 'Price']]

# Rename the columns for easier access
data.columns = ['Brand', 'Screen-Size', 'RAM', 'Processor', 'GPU', 'GPU-Type', 'Resolution', 'Condition', 'Price']

# Specify the categorical columns
categorical_columns = ['Brand', 'Processor', 'GPU', 'GPU-Type', 'Resolution', 'Condition']

print("Data loaded successfully, starting imputation")

# Impute missing values using Random Forest
imputed_data = random_forest_impute(data, categorical_columns, n_trees=100, max_depth=5, min_samples_split=2, n_features=None, max_iter=10, tol=1e-3)

print("Data imputed successfully")

imputed_data.to_csv('/tmp/imputed_data.csv', index=False)

import smtplib
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText
from email.mime.base import MIMEBase

from email import encoders

# send the csv file to the email
fromaddr = "edenstream988@gmail.com"
password = "xvwd qwqs ngev sbmd"
toaddr = "edenede2@gmail.com"

msg = MIMEMultipart()

msg['From'] = fromaddr
msg['To'] = toaddr
msg['Subject'] = "Imputed data"

body = "Imputed data"

msg.attach(MIMEText(body, 'plain'))

filename = "imputed_data.csv"
attachment = open("/tmp/imputed_data.csv", "rb")

p = MIMEBase('application', 'octet-stream')
p.set_payload((attachment).read())
encoders.encode_base64(p)
p.add_header('Content-Disposition', "attachment; filename= %s" % filename)

msg.attach(p)

s = smtplib.SMTP('smtp.gmail.com', 587)
s.starttls()
s.login(fromaddr, password)
text = msg.as_string()
s.sendmail(fromaddr, toaddr, text)
s.quit()

Data loaded successfully, starting imputation
Starting imputation process...


  0%|          | 0/10 [00:00<?, ?it/s]

Iteration 1/10
  Imputing column: Brand
  Imputing column: Processor
  Imputing column: GPU
    Training RandomForest for GPU_missing...
  Imputing column: GPU-Type
    Training RandomForest for GPU-Type_missing...
  Imputing column: Resolution
    Training RandomForest for Resolution_missing...


 10%|█         | 1/10 [30:02<4:30:23, 1802.64s/it]

  Imputing column: Condition
  Change in data: 31.11269837220809
Iteration 2/10
  Imputing column: Brand
  Imputing column: Processor
  Imputing column: GPU
  Imputing column: GPU-Type
  Imputing column: Resolution
  Imputing column: Condition
  Change in data: 0.0
Convergence reached.
Reversing one-hot encoding...
Imputation completed.
Data imputed successfully





(221,
 b'2.0.0 closing connection 5b1f17b1804b1-422874de68asm92949075e9.29 - gsmtp')

# Missing values imputation with random forest

# Data Preprocessing

In [17]:

# Correct the values of the 'Condition' column to 'New' and 'Refurbished'
imputed_data['Condition'] = data['Condition'].apply(lambda x: 'New' if x == 'New' or x == 'Open box' else 'Refurbished')

# Convert categorical features to numerical values using one-hot encoding
imputed_data = pd.get_dummies(imputed_data, columns=['Brand', 'Processor', 'GPU', 'GPU-Type', 'Resolution'])

reg_data = pd.get_dummies(imputed_data, columns=['Condition'])


# Split the data into training and validation sets

In [18]:
######### Split the data

# Split the data
train_data = imputed_data.iloc[0:2058]
val_data = imputed_data.iloc[2058:2499]
test_data = imputed_data.iloc[2499:2939]

# Split the data for regression
train_data_reg = reg_data.iloc[0:2058]
val_data_reg = reg_data.iloc[2058:2499]
test_data_reg = reg_data.iloc[2499:2939]

# Extract features and target variables

In [19]:
# Extract features and target variables
X_train_clas = train_data.drop(columns=['Condition'])
X_train_reg = train_data_reg.drop(columns=['Price'])
y_train_clas = train_data['Condition']
y_train_reg = train_data_reg['Price']

X_val_clas = val_data.drop(columns=['Condition'])
X_val_reg = val_data_reg.drop(columns=['Price'])
y_val_clas = val_data['Condition']
y_val_reg = val_data_reg['Price']

X_test_clas = test_data.drop(columns=['Condition'])
X_test_reg = test_data_reg.drop(columns=['Price'])
y_test_clas = test_data['Condition']
y_test_reg = test_data_reg['Price']

# Convert string labels to numerical indices for classification

In [20]:
# Convert string labels to numerical indices
class_map = {label: idx for idx, label in enumerate(np.unique(y_train_clas))}
y_train_clas_numeric = np.array([class_map[label] for label in y_train_clas])
y_val_clas_numeric = np.array([class_map[label] for label in y_val_clas])
y_test_clas_numeric = np.array([class_map[label] for label in y_test_clas])


# Decision tree training and prediction

In [21]:
# Train and evaluate Decision Tree Classifier with max_depth=5
dt_classifier = DecisionTree(max_depth=5)
dt_classifier.fit(X_train_clas.values, y_train_clas_numeric)
predictions = dt_classifier.predict(X_val_clas.values)
accuracy = np.mean(predictions == y_val_clas_numeric)
print(f'Validation Accuracy: {accuracy}')

Validation Accuracy: 0.7959183673469388


# Decision tree regressor training and prediction

In [22]:
# Train and evaluate Decision Tree Regressor with max_depth=5
dt_regressor = DecisionTreeRegressor(max_depth=5)

# Fit the Decision Tree Regressor to the training data
dt_regressor.fit(X_train_reg.values, y_train_reg.values)

# Make predictions on the validation data
predictions = dt_regressor.predict(X_val_reg.values)

# Calculate the mean squared error of the model
mse = np.mean((predictions - y_val_reg.values) ** 2)
print(f'Validation MSE: {mse}')

Validation MSE: 167506.4379521542


# Random forest training and prediction

In [None]:
# Train and evaluate Random Forest Classifier
rf_classifier = RandomForest(n_trees=100, max_depth=5)
rf_classifier.fit(X_train_clas.values, y_train_clas_numeric)
predictions = rf_classifier.predict(X_val_clas.values)
accuracy = np.mean(predictions == y_val_clas_numeric)
print(f'Validation Accuracy: {accuracy}')

# Random forest regressor training and prediction

In [None]:
# Train and evaluate Random Forest Regressor
rf_regressor = RandomForestRegressor(n_trees=100, max_depth=5)
rf_regressor.fit(X_train_reg.values, y_train_reg.values)
predictions = rf_regressor.predict(X_val_reg.values)
mse = np.mean((predictions - y_val_reg.values) ** 2)
print(f'Validation MSE: {mse}')