# Final Case Study
Predict the critical temperature for materials based on extracted features

# Imports

In [28]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import itertools
import heapq
import joblib
import random
import xgboost as xgb
from IPython.display import display
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, OneHotEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score, root_mean_squared_error, mean_squared_error, r2_score, mean_absolute_error, median_absolute_error, root_mean_squared_log_error
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from datetime import datetime

# Dataset

In [29]:
df_train = pd.read_csv('./train.csv')

In [30]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21263 entries, 0 to 21262
Data columns (total 82 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   number_of_elements               21263 non-null  int64  
 1   mean_atomic_mass                 21263 non-null  float64
 2   wtd_mean_atomic_mass             21263 non-null  float64
 3   gmean_atomic_mass                21263 non-null  float64
 4   wtd_gmean_atomic_mass            21263 non-null  float64
 5   entropy_atomic_mass              21263 non-null  float64
 6   wtd_entropy_atomic_mass          21263 non-null  float64
 7   range_atomic_mass                21263 non-null  float64
 8   wtd_range_atomic_mass            21263 non-null  float64
 9   std_atomic_mass                  21263 non-null  float64
 10  wtd_std_atomic_mass              21263 non-null  float64
 11  mean_fie                         21263 non-null  float64
 12  wtd_mean_fie      

In [31]:
X = df_train.drop(columns=['critical_temp'])
y = df_train['critical_temp']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Model

In [32]:
model = xgb.XGBRegressor(device="cuda", random_state=42)
model_GSCV = xgb.XGBRegressor(device="cuda", random_state=42)
model.fit(X_train, y_train)

In [None]:
xgb.plot_importance(model, importance_type='weight', title='Feat. Import. - Weight')

In [None]:
xgb.plot_importance(model, importance_type='gain', title='Feat. Import. - Gain')

In [33]:
param_grid = {
    'n_estimators': [50, 100, 200],           # Number of boosting rounds (trees)
    'learning_rate': [0.01, 0.1, 0.2],        # Step size at each iteration while moving towards a minimum of a loss function
    'max_depth': [3, 5, 7, 9],                    # Maximum depth of a tree
    'subsample': [0.8, 0.9, 1.0],              # Fraction of samples used for fitting trees
    'colsample_bytree': [0.8, 0.9, 1.0],      # Fraction of features used for each tree
    'gamma': [0, 0.1, 0.2]                    # Minimum loss reduction required to make a further partition
}

In [35]:
grid_search = GridSearchCV(estimator=model_GSCV, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2, scoring='neg_mean_squared_error')

In [None]:
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 972 candidates, totalling 4860 fits


In [None]:
print("Best Parameters:", grid_search.best_params_)

In [None]:
# Evaluate the model with the best parameters
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f"Test Mean Squared Error: {mse:.4f}")

# Predictions

In [21]:
pred = model.predict(X_test)
rmse = root_mean_squared_error(y_test, pred)
mse = mean_squared_error(y_test, pred)
meanae = mean_absolute_error(y_test, pred)
medae = median_absolute_error(y_test, pred)
r2 = r2_score(y_test, pred)

In [22]:
print(f"Root Mean Squared Error (RMSE): {rmse:.5f}")
print(f"Mean Squared Error (MSE): {mse:.5f}")
print(f"Mean Absolute Error (MAE): {meanae:.5f}")
print(f"Median Absolute Error (MAE): {medae:.5f}")
print(f"R^2 Error (R^2): {r2:.5f}")

Root Mean Squared Error (RMSE): 9.30605
Mean Squared Error (MSE): 86.60260
Mean Absolute Error (MAE): 5.57911
Median Absolute Error (MAE): 2.84568
R^2 Error (R^2): 0.92476


# Random Numbers

In [23]:
rdm_mdls = []
rdm_combis = []
rdm_nmbrs = []
rdm_stats = pd.DataFrame(columns=['Random Split', 'Random Model', 'Mean_Abs', 'Med_Abs', 'R2'])

# DO NOT LET THE RANGE GO HIGH, REMEMBER n!, n over 150 is undefined because it is too big, limit to at most 100
# 
for i in range(100):
    ran_int = random.randint(1, 10000)
    rdm_nmbrs.append(ran_int)

for combi in itertools.combinations(rdm_nmbrs, 2):
    rdm_combis.append(list(combi))

In [24]:
X_rdm = df_train.copy().drop(columns=['critical_temp'])
y_rdm = df_train['critical_temp']

for i in range(0, len(rdm_combis)):
    combi_model = xgb.XGBRFRegressor(device="cuda", random_state=rdm_combis[i][1])
    X_tmp_trn, X_tmp_tst, y_tmp_trn, y_tmp_tst = train_test_split(X_rdm, y_rdm, test_size=0.2, random_state=rdm_combis[i][0])

    combi_model.fit(X_tmp_trn, y_tmp_trn)
    rdm_mdls.append(combi_model)
    pred = combi_model.predict(X_tmp_tst)
    mean_er = mean_absolute_error(y_tmp_tst, pred)
    med_er = median_absolute_error(y_tmp_tst, pred)
    r2_er = r2_score(y_tmp_tst, pred)
    new_data = pd.DataFrame([[rdm_combis[i][0], rdm_combis[i][1], mean_er, med_er, r2_er]], columns=['Random Split', 'Random Model', 'Mean_Abs', 'Med_Abs', 'R2'])
    rdm_stats = pd.concat([rdm_stats, new_data], ignore_index=True)

  rdm_stats = pd.concat([rdm_stats, new_data], ignore_index=True)


In [25]:
rdm_combi_sort_R2 = rdm_stats.sort_values(by='R2', ascending=False)

In [26]:
rdm_combi_sort_R2.head(5)

Unnamed: 0,Random Split,Random Model,Mean_Abs,Med_Abs,R2
4606,9575,7986,9.426405,5.636288,0.847745
4615,9575,4871,9.42108,5.661852,0.847412
4620,9575,6692,9.420504,5.642004,0.847178
3855,6219,5008,9.151491,5.488518,0.847108
4623,9575,1373,9.439789,5.600971,0.847085


In [27]:
rdm_combi_sort_R2.tail(5)

Unnamed: 0,Random Split,Random Model,Mean_Abs,Med_Abs,R2
2310,2905,686,9.736113,5.546232,0.815016
2250,2905,5266,9.705681,5.655167,0.814916
2308,2905,5008,9.727792,5.620316,0.814323
2301,2905,9925,9.741244,5.609824,0.81398
2284,2905,463,9.7177,5.524295,0.813896
