In [None]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import (
    GradientBoostingClassifier,
    GradientBoostingRegressor,
    RandomForestClassifier,
)
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OrdinalEncoder
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor

# Load the dataset into a pandas DataFrame.
# NOTE(review): the workbook is assumed to be cleaned already (no missing
# values in the feature columns or in 'price') -- confirm against the
# preprocessing step that produced it.
df = pd.read_excel('D:/Summer2023/APT3025A/autodataset.xlsx')

# Drop the columns that are not used as predictors.
irrelevant_columns = ['dateCrawled', 'name', 'seller', 'offerType', 'abtest', 'monthOfRegistration',
                      'dateCreated', 'nrOfPictures', 'lastSeen']
df = df.drop(columns=irrelevant_columns)

# Features are every remaining column except 'price'; 'price' is the target.
X = df.drop(columns=['price'])
y = df['price']

# Bucket the numeric price into three ordered classes. pd.cut requires exactly
# one label per bin (len(labels) == len(bin edges) - 1), so three labels take
# two inner thresholds: low <= 5000 < medium <= 15000 < high.
# (Three thresholds with three labels made pd.cut raise ValueError.)
price_thresholds = [5000, 15000]
price_labels = ['low', 'medium', 'high']
y_class = pd.cut(
    y,
    bins=[-float('inf')] + price_thresholds + [float('inf')],
    labels=price_labels,
)

# Split the data into training and testing sets (80% / 20%, fixed seed).
X_train, X_test, y_train, y_test = train_test_split(X, y_class, test_size=0.2, random_state=42)

# Text-valued columns cannot be fed to the tree estimators directly, so every
# non-numeric column is ordinal-encoded inside each model's pipeline.
categorical_columns = X.select_dtypes(exclude=['number', 'bool']).columns.tolist()


def _with_encoding(classifier):
    """Return a pipeline that encodes the text columns and then runs *classifier*.

    Each call builds its own encoder so the three models do not share fitted
    state. Categories that were not seen during fit are mapped to -1 instead
    of raising, which lets the pipeline score rows with new category values.
    """
    encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
    preprocess = ColumnTransformer(
        [('categorical', encoder, categorical_columns)],
        remainder='passthrough',  # numeric columns go through unchanged
    )
    return make_pipeline(preprocess, classifier)


# The target is a class label, so all three models must be classifiers; the
# regressor variants cannot be fitted on 'low' / 'medium' / 'high'.
# The variable names are kept so later code that refers to them still works.

# Model 1: Decision Tree Classifier
reg_tree_model = _with_encoding(DecisionTreeClassifier(random_state=42))
reg_tree_model.fit(X_train, y_train)

# Model 2: Random Forest Classifier
random_forest_model = _with_encoding(RandomForestClassifier(n_estimators=100, random_state=42))
random_forest_model.fit(X_train, y_train)

# Model 3: Gradient Boosting Classifier
gradient_boosted_model = _with_encoding(GradientBoostingClassifier(n_estimators=100, random_state=42))
gradient_boosted_model.fit(X_train, y_train)

# Make predictions on the test set; each result is an array of class labels.
y_pred_reg_tree = reg_tree_model.predict(X_test)
y_pred_random_forest = random_forest_model.predict(X_test)
y_pred_gradient_boosted = gradient_boosted_model.predict(X_test)

# Accuracy report for a classifier's predictions
def evaluate_classification(y_true, y_pred, model_name):
    """Print the share of predictions that equal the true labels.

    Args:
        y_true: True labels; must support element-wise ``==`` against
            ``y_pred`` with a result that has a ``.mean()`` method
            (for example a pandas Series or a NumPy array).
        y_pred: Predicted labels, positionally matching ``y_true``.
        model_name: Text printed in front of the accuracy value.

    Returns:
        None. The result is only written to standard output.
    """
    is_correct = y_true == y_pred
    hit_rate = is_correct.mean()
    print(f"{model_name} - Accuracy: {hit_rate}")

# Report the test-set accuracy of every trained model.
print("Evaluation Results (Classification):")
for predictions, label in (
    (y_pred_reg_tree, "Decision Tree"),
    (y_pred_random_forest, "Random Forest"),
    (y_pred_gradient_boosted, "Gradient Boosted"),
):
    evaluate_classification(y_test, predictions, label)

# Predict the price category for one hand-written vehicle record.
# NOTE(review): the category spellings below are assumed to match the values
# present in the training data -- verify against the dataset.
sample_record = {
    'vehicleType': 'sedan',
    'yearOfRegistration': 2010,
    'gearbox': 'manual',
    'powerPS': 150,
    'model': 'Audi A4',
    'odometer': 100000,
    'fuelType': 'petrol',
    'brand': 'Audi',
    'notRepairedDamage': 'no',
    'postalCode': 12345,
}
# A one-element list of records yields a single-row frame with these columns.
new_sample = pd.DataFrame([sample_record])

predicted_price_category = random_forest_model.predict(new_sample)
first_prediction = predicted_price_category[0]
print("Predicted Price Category for the New Sample:", first_prediction)
