In [42]:
!git clone https://github.com/dheovanwa/DeepPlan-MachineLearning-Model.git

fatal: destination path 'DeepPlan-MachineLearning-Model' already exists and is not an empty directory.


# Task
Create machine learning models for the labels 'biaya_akhir_riil_miliar_rp', 'durasi_akhir_riil_hari', 'profit_margin_riil_persen', 'terjadi_pembengkakan_biaya_signifikan', and 'terjadi_keterlambatan_signifikan' using the data from "/content/DeepPlan-MachineLearning-Model/dataset_proyek_konstruksi.csv".

## Load the data

### Subtask:
Load the dataset from the provided CSV file into a pandas DataFrame.


**Reasoning**:
The first step is to load the data into a pandas DataFrame for further processing. This involves importing the pandas library and reading the CSV file.



In [43]:
import pandas as pd

# Read the construction-project dataset from the cloned repository, then
# preview the first rows and report the (rows, columns) shape.
dataset_path = '/content/DeepPlan-MachineLearning-Model/dataset_proyek_konstruksi.csv'
df = pd.read_csv(dataset_path)

preview_rows = df.head()
display(preview_rows)
print(df.shape)

Unnamed: 0,project_id,project_type,client_type,contract_type,is_design_and_build,nilai_kontrak_awal_miliar_rp,total_jam_kerja_estimasi,volume_pekerjaan_tanah_m3,volume_beton_m3,berat_baja_struktural_ton,...,indeks_harga_komoditas_saat_mulai,jumlah_kompetitor_saat_tender,pengalaman_pm_tahun,jumlah_sdm_inti,persentase_subkontraktor,biaya_akhir_riil_miliar_rp,durasi_akhir_riil_hari,profit_margin_riil_persen,terjadi_pembengkakan_biaya_signifikan,terjadi_keterlambatan_signifikan
0,PJ-100,Perumahan,BUMN,Lump Sum,1,67.0,268005,33500,16750,0,...,100.54,4,21,14,50,67.0,3263,0.0,1,0
1,PJ-101,Jalan Tol,Swasta Asing,Lump Sum,0,2322.37,4644734,6967101,464473,23223,...,109.77,8,22,234,38,2364.39,3383,-1.81,1,0
2,PJ-102,Jalan Tol,BUMN,Cost Plus,0,2089.79,4179571,6269356,417957,20897,...,104.01,9,4,211,40,1865.88,4611,10.71,0,1
3,PJ-103,Jaringan IT,Swasta Asing,Cost Plus,0,38.56,192819,0,0,0,...,110.14,6,11,12,62,40.83,2738,-5.87,1,0
4,PJ-104,Perumahan,Swasta Nasional,Cost Plus,0,114.27,457083,57135,28567,0,...,101.13,5,18,26,38,102.03,2996,10.71,0,0


(1000, 27)


## Data preprocessing

### Subtask:
Identify and handle missing values, encode categorical features, split the data into training and testing sets, and scale numerical features.


**Reasoning**:
Check for missing values and display the columns with missing data and their counts.



In [44]:
# Per-column count of null entries, then only the columns where that count
# is non-zero (an empty Series means the dataset has no missing values).
missing_values = df.isna().sum()
columns_with_missing_values = missing_values.loc[missing_values.gt(0)]
print("Columns with missing values and their counts:", columns_with_missing_values, sep="\n")

Columns with missing values and their counts:
Series([], dtype: int64)


**Reasoning**:
Since there are no missing values, proceed to identify and encode categorical features. Then, define features and targets, split the data, and scale numerical features for all five target variables.



In [45]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Build the feature matrix, split it into train/test once, fit the
# preprocessing on the training rows only, and expose the result through
# per-target dictionaries (X_train_dict, X_test_dict, y_train_dict, y_test_dict).

# Columns to predict; they must never be used as input features.
target_variables = ['biaya_akhir_riil_miliar_rp', 'durasi_akhir_riil_hari', 'profit_margin_riil_persen', 'terjadi_pembengkakan_biaya_signifikan', 'terjadi_keterlambatan_signifikan']

# Categorical inputs: every object/category column except the identifier.
categorical_features = df.select_dtypes(include=['object', 'category']).columns.tolist()
categorical_features.remove('project_id')

# Numerical inputs: int64/float64 columns that are neither targets nor the identifier.
numerical_features = [
    col
    for col in df.select_dtypes(include=['int64', 'float64']).columns
    if col not in target_variables and col != 'project_id'
]

# Scale numerical columns and one-hot encode categorical ones; categories
# unseen during fit are encoded as all zeros instead of raising.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)])

# Define features (X)
X = df.drop(columns=target_variables + ['project_id'])

# Split ONCE for all targets. The row partition depends only on the number of
# rows and random_state, so this yields exactly the rows that a separate
# per-target split with the same random_state would produce.
X_train, X_test, targets_train, targets_test = train_test_split(
    X, df[target_variables], test_size=0.2, random_state=42)

# Fit the preprocessing on the training rows only, then apply it to the test
# rows. The features are identical for every target, so this is done once
# instead of being refitted inside the loop.
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

# Prepare data for each target variable.
# NOTE: every target shares the same processed matrices; treat them as read-only.
X_train_dict = {}
X_test_dict = {}
y_train_dict = {}
y_test_dict = {}

for target in target_variables:
    X_train_dict[target] = X_train_processed
    X_test_dict[target] = X_test_processed
    y_train_dict[target] = targets_train[target]
    y_test_dict[target] = targets_test[target]

print("Data preprocessing and splitting complete for all target variables.")

Data preprocessing and splitting complete for all target variables.


## Model selection

### Subtask:
Choose appropriate machine learning models for the given labels (which are a mix of regression and classification tasks).


**Reasoning**:
Identify the data type of each target variable to determine if it's a regression or classification task.



In [46]:
# Record the pandas dtype of every target column.
target_variables_types = {
    name: df[name].dtype
    for name in (
        'biaya_akhir_riil_miliar_rp',
        'durasi_akhir_riil_hari',
        'profit_margin_riil_persen',
        'terjadi_pembengkakan_biaya_signifikan',
        'terjadi_keterlambatan_signifikan',
    )
}

print("Data types of target variables:")
print(target_variables_types)

# Decide the task type per target:
#   - non-int64/float64 dtype          -> 'Unknown'
#   - numeric with at most 20 distinct -> 'Classification' (heuristic for discrete labels)
#   - numeric with more than 20        -> 'Regression'
selected_models = {}

for target, dtype in target_variables_types.items():
    if dtype not in ('float64', 'int64'):
        selected_models[target] = 'Unknown'
        continue

    distinct_count = df[target].unique().size
    selected_models[target] = 'Classification' if distinct_count <= 20 else 'Regression'

print("\nSelected model types for each target variable:")
print(selected_models)

Data types of target variables:
{'biaya_akhir_riil_miliar_rp': dtype('float64'), 'durasi_akhir_riil_hari': dtype('int64'), 'profit_margin_riil_persen': dtype('float64'), 'terjadi_pembengkakan_biaya_signifikan': dtype('int64'), 'terjadi_keterlambatan_signifikan': dtype('int64')}

Selected model types for each target variable:
{'biaya_akhir_riil_miliar_rp': 'Regression', 'durasi_akhir_riil_hari': 'Regression', 'profit_margin_riil_persen': 'Regression', 'terjadi_pembengkakan_biaya_signifikan': 'Classification', 'terjadi_keterlambatan_signifikan': 'Classification'}


## Model training

### Subtask:
Train the selected models on the training data.


**Reasoning**:
Iterate through the selected models, instantiate the appropriate model based on the type, train it using the processed training data, and store the trained model.



In [47]:
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, GradientBoostingRegressor

# Fit one estimator per target and collect the fitted estimators in
# `trained_models`, keyed by target name.
#   - regression targets listed in `boosted_targets` -> GradientBoostingRegressor
#   - every other regression target                  -> RandomForestRegressor
#   - classification targets                         -> LogisticRegression
#   - anything else is reported and skipped
boosted_targets = ['durasi_akhir_riil_hari', 'profit_margin_riil_persen']

trained_models = {}

for target, model_type in selected_models.items():
    is_regression = 'Regression' in model_type

    # Guard clause: neither a regression nor a classification task.
    if not is_regression and 'Classification' not in model_type:
        print(f"Unknown model type for target {target}. Skipping.")
        continue

    if not is_regression:
        # max_iter is raised above the default to give the solver room to converge.
        model = LogisticRegression(random_state=42, max_iter=1000)
    elif target in boosted_targets:
        model = GradientBoostingRegressor(random_state=42)
    else:
        model = RandomForestRegressor(random_state=42)

    print(f"Training model for target: {target} with model type: {model_type}")
    model.fit(X_train_dict[target], y_train_dict[target])
    trained_models[target] = model
    print(f"Training complete for {target}")

print("\nAll selected models have been trained.")

Training model for target: biaya_akhir_riil_miliar_rp with model type: Regression
Training complete for biaya_akhir_riil_miliar_rp
Training model for target: durasi_akhir_riil_hari with model type: Regression
Training complete for durasi_akhir_riil_hari
Training model for target: profit_margin_riil_persen with model type: Regression
Training complete for profit_margin_riil_persen
Training model for target: terjadi_pembengkakan_biaya_signifikan with model type: Classification
Training complete for terjadi_pembengkakan_biaya_signifikan
Training model for target: terjadi_keterlambatan_signifikan with model type: Classification
Training complete for terjadi_keterlambatan_signifikan

All selected models have been trained.


## Model evaluation

### Subtask:
Evaluate the trained models using appropriate metrics for regression and classification tasks on the testing data.


**Reasoning**:
Import the necessary evaluation metrics for both regression and classification tasks.



In [48]:
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, precision_score, recall_score, f1_score

**Reasoning**:
Initialize a dictionary to store evaluation results and then iterate through the trained models to make predictions and evaluate them using appropriate metrics based on the model type.



In [49]:
# Score every trained model on its held-out test split and collect the
# metrics in `evaluation_results`, keyed by target name:
#   - regression targets     -> MSE and R-squared
#   - classification targets -> accuracy, precision, recall and F1-score
evaluation_results = {}

for target, model in trained_models.items():
    print(f"Evaluating model for target: {target}")
    X_test = X_test_dict[target]
    y_test = y_test_dict[target]
    y_pred = model.predict(X_test)

    task_type = selected_models[target]
    print(task_type)

    if 'Regression' in task_type:
        mse = mean_squared_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)
        evaluation_results[target] = {'MSE': mse, 'R-squared': r2}
        print(f"  MSE: {mse:.4f}, R-squared: {r2:.4f}")
    elif 'Classification' in task_type:
        # predict() on a classifier already returns class labels, so they are
        # scored directly. Thresholding them at 0.5 would be a no-op for 0/1
        # labels and would silently corrupt any other label encoding.
        # zero_division=0 reports an undefined precision/recall/F1 (no
        # predicted or no actual positives) as 0 instead of emitting a warning.
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, zero_division=0)
        recall = recall_score(y_test, y_pred, zero_division=0)
        f1 = f1_score(y_test, y_pred, zero_division=0)
        evaluation_results[target] = {'Accuracy': accuracy, 'Precision': precision, 'Recall': recall, 'F1-score': f1}
        print(f"  Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1-score: {f1:.4f}")
    else:
        print(f"  Unknown model type for target {target}. Skipping evaluation.")

print("\nEvaluation complete for all models.")

Evaluating model for target: biaya_akhir_riil_miliar_rp
Regression
  MSE: 10900.9310, R-squared: 0.9811
Evaluating model for target: durasi_akhir_riil_hari
Regression
  MSE: 193702.4669, R-squared: 0.1956
Evaluating model for target: profit_margin_riil_persen
Regression
  MSE: 89.6714, R-squared: 0.0708
Evaluating model for target: terjadi_pembengkakan_biaya_signifikan
Classification
  Accuracy: 0.7100, Precision: 0.3571, Recall: 0.0926, F1-score: 0.1471
Evaluating model for target: terjadi_keterlambatan_signifikan
Classification
  Accuracy: 0.6650, Precision: 0.4545, Recall: 0.2344, F1-score: 0.3093

Evaluation complete for all models.


## Predict

### Subtask:
Use the trained models to make predictions on the testing data.


**Reasoning**:
Iterate through the trained models, retrieve the corresponding testing data, make predictions, and store them in a dictionary.



In [50]:
# Run every trained model on its preprocessed test features and keep the
# raw predictions in `predictions_dict`, keyed by target name.
predictions_dict = {}

for target in trained_models:
    print(f"Making predictions for target: {target}")
    model = trained_models[target]
    X_test = X_test_dict[target]
    predictions_dict[target] = y_pred = model.predict(X_test)
    print(f"Predictions made for {target}")

print("\nPredictions have been made for all models.")

Making predictions for target: biaya_akhir_riil_miliar_rp
Predictions made for biaya_akhir_riil_miliar_rp
Making predictions for target: durasi_akhir_riil_hari
Predictions made for durasi_akhir_riil_hari
Making predictions for target: profit_margin_riil_persen
Predictions made for profit_margin_riil_persen
Making predictions for target: terjadi_pembengkakan_biaya_signifikan
Predictions made for terjadi_pembengkakan_biaya_signifikan
Making predictions for target: terjadi_keterlambatan_signifikan
Predictions made for terjadi_keterlambatan_signifikan

Predictions have been made for all models.


## Summary:

### Data Analysis Key Findings

*   The dataset contains 1000 rows and 27 columns and has no missing values.
*   The target variables include three for regression ('biaya\_akhir\_riil\_miliar\_rp', 'durasi\_akhir\_riil\_hari', 'profit\_margin\_riil\_persen') and two for classification ('terjadi\_pembengkakan\_biaya\_signifikan', 'terjadi\_keterlambatan\_signifikan').
*   The data was successfully split into training and testing sets (80/20 split) for each target variable.
*   Categorical features were one-hot encoded and numerical features were scaled using StandardScaler.
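*   RandomForestRegressor was used for 'biaya\_akhir\_riil\_miliar\_rp', GradientBoostingRegressor for 'durasi\_akhir\_riil\_hari' and 'profit\_margin\_riil\_persen', and LogisticRegression for the two classification tasks.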
*   Models for all five target variables were successfully trained.
*   Model evaluation was performed on the test set: MSE and R-squared were calculated for the regression tasks (R-squared of about 0.98, 0.20 and 0.07, in the order listed above), and accuracy, precision, recall and F1-score for the classification tasks (accuracy of about 0.71 and 0.67, but recall of only about 0.09 and 0.23).
*   Predictions were successfully generated for all target variables on the testing data.

### Insights or Next Steps

*   The classification models are already evaluated with accuracy, precision, recall and F1-score; their low recall shows that they rarely predict the positive class, so investigate this next (e.g., class weighting, decision-threshold tuning, or a different classifier).
*   Analyze the evaluation results to understand the performance of each model and identify potential areas for improvement, such as hyperparameter tuning or exploring different model architectures.


In [51]:
import pickle
import os

# Persist every trained model, plus the fitted preprocessor, as pickle files
# under ./trained_models.

# exist_ok=True makes this safe to re-run and avoids the check-then-create race.
os.makedirs('trained_models', exist_ok=True)

for target, model in trained_models.items():
    filename = f'trained_models/{target}_model.pkl'
    with open(filename, 'wb') as f:
        pickle.dump(model, f)
    print(f"Model for {target} saved to {filename}")

# The models were fitted on the output of `preprocessor`, so raw rows must go
# through the same fitted transformer before prediction. Save it alongside the
# models; without it the pickled models cannot be applied to new raw data.
preprocessor_filename = 'trained_models/preprocessor.pkl'
with open(preprocessor_filename, 'wb') as f:
    pickle.dump(preprocessor, f)
print(f"Preprocessor saved to {preprocessor_filename}")

# NOTE: only unpickle these files from a trusted location; pickle.load can
# execute arbitrary code.
print("\nAll trained models have been saved.")

Model for biaya_akhir_riil_miliar_rp saved to trained_models/biaya_akhir_riil_miliar_rp_model.pkl
Model for durasi_akhir_riil_hari saved to trained_models/durasi_akhir_riil_hari_model.pkl
Model for profit_margin_riil_persen saved to trained_models/profit_margin_riil_persen_model.pkl
Model for terjadi_pembengkakan_biaya_signifikan saved to trained_models/terjadi_pembengkakan_biaya_signifikan_model.pkl
Model for terjadi_keterlambatan_signifikan saved to trained_models/terjadi_keterlambatan_signifikan_model.pkl

All trained models have been saved.
