In [None]:

import numpy as np
import scipy.stats as stats
import seaborn as sns
import pandas as pd
import warnings

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn import neighbors, metrics
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, balanced_accuracy_score
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder, OrdinalEncoder
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.mixture import GaussianMixture
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from bayes_opt import BayesianOptimization
from sklearn.utils import resample



In [None]:
# Read the train and test CSVs (train first), then silence FutureWarning
# messages for the rest of the session.
# NOTE(review): both paths are absolute and machine-specific; consider a
# configurable data directory.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (r"\UAIS\datathon\CAW\train.csv", r"\UAIS\datathon\CAW\test.csv")
)
warnings.simplefilter(action='ignore', category=FutureWarning)


In [None]:
# Preview the first five rows of the raw training frame.
train.head(5)

In [None]:
# Shift the target down by one; the submission cell adds the 1 back.
# NOTE(review): not idempotent - re-running this cell shifts the labels again.
train['size_class'] = train['size_class'].sub(1)

In [None]:
def replace_values(val):
    """Normalise one raw ``fire_type`` entry.

    Fix: previously only the label "Surface" had its surrounding whitespace
    removed; any other padded label (e.g. " Crown ") was returned unstripped
    and would later be encoded as a separate category. Every string is now
    stripped.

    Args:
        val: A single cell value (string, NaN, or any other scalar).

    Returns:
        The stripped string; ``np.nan`` if the string is empty after
        stripping; ``val`` itself, unchanged, when it is not a string.
    """
    # Non-strings (typically an existing NaN) pass through untouched.
    if not isinstance(val, str):
        return val
    stripped_val = val.strip()
    return stripped_val if stripped_val else np.nan


# Clean the fire_type column of both frames, one element at a time.
train['fire_type'] = train['fire_type'].map(replace_values)
test['fire_type'] = test['fire_type'].map(replace_values)

In [None]:
# First character of the fuel type code, used as a coarser fuel category.
train['Letter_fuel'] = train['fuel_type'].str.get(0)
test['Letter_fuel'] = test['fuel_type'].str.get(0)

# Offset every temperature by a constant 33.
# NOTE(review): presumably chosen to keep the values non-negative - confirm.
train['temperature'] = train['temperature'].add(33)
test['temperature'] = test['temperature'].add(33)


In [None]:
def extract_region(fire_number):
    """Map a fire number to a forest area name using its first character.

    Fix: the original indexed ``fire_number[0]`` unconditionally, which raises
    on an empty string (IndexError) and on NaN/None (TypeError). Such inputs
    now yield "Unknown", the same value used for unrecognised prefixes.

    Args:
        fire_number: Fire identifier whose first character is the area code.

    Returns:
        The forest area name, or "Unknown" when the identifier is missing,
        empty, not a string, or starts with an unmapped letter.
    """
    forest_areas = {
        'C': 'Calgary',
        'E': 'Edson',
        'H': 'High Level',
        'G': 'Grande Prairie',
        'L': 'Lac La Biche',
        'M': 'Fort McMurray',
        'P': 'Peace River',
        'R': 'Rocky',
        'S': 'Slave Lake',
        'W': 'Whitecourt'
    }

    # Missing, non-string or empty identifiers carry no prefix letter.
    if not isinstance(fire_number, str) or not fire_number:
        return "Unknown"

    return forest_areas.get(fire_number[0], "Unknown")

# Derive the forest area of every fire from its fire number.
train['region'] = train['fire_number'].map(extract_region)
test['region'] = test['fire_number'].map(extract_region)


In [None]:
# 1 when the fire was given a name, 0 when fire_name is missing.
train['has_name'] = train['fire_name'].notna().astype(int)
test['has_name'] = test['fire_name'].notna().astype(int)

In [None]:
# Lower-case the bucketing flag so differently-cased spellings collapse together.
for frame in (train, test):
    frame['bucketing_on_fire'] = frame['bucketing_on_fire'].str.lower()

In [None]:

# Timestamps outside this window are discarded (replaced by NaT) below.
min_reasonable_date = pd.Timestamp('2000-01-01')
max_reasonable_date = pd.Timestamp('2023-12-31')

date_features = [
    'fire_start_date',
    'discovered_date',
    'reported_date',
    'dispatch_date',
    'start_for_fire_date',
    'assessment_datetime',
    'ia_arrival_at_fire_date',
    'fire_fighting_start_date',
    'first_bucket_drop_date',
    'ex_fs_date',
]

for frame in (train, test):
    for date_col in date_features:
        # Unparseable entries become NaT instead of raising.
        parsed = pd.to_datetime(frame[date_col], errors='coerce')
        # NaT compares False on both sides, so missing values stay missing.
        out_of_window = (parsed < min_reasonable_date) | (parsed > max_reasonable_date)
        frame[date_col] = parsed.mask(out_of_window)


In [None]:
# Integer code for each known weather-condition label, in the listed order.
weather_mapping = dict(
    zip(['Clear', 'CB dry', 'Cloudy', 'CB wet', 'Rain showers'], range(5))
)

# Labels absent from the mapping (including missing values) map to NaN.
for frame in (train, test):
    frame['weather_conditions_encoded'] = frame['weather_conditions_over_fire'].map(weather_mapping)

In [None]:
# Raw columns removed from both frames; fire_name, fire_number and
# weather_conditions_over_fire already have derived features above.
drop_list = [
    "fire_name",
    "fire_number",
    "discovered_size",
    "industry_identifier_desc",
    "weather_conditions_over_fire",
]

train = train.drop(drop_list, axis=1)
test = test.drop(drop_list, axis=1)


In [None]:
def generate_time_features(df):
    """Add elapsed-time features, in minutes, measured from ``fire_start_date``.

    Improvement: the original computed each feature with a row-wise
    ``df.apply(..., axis=1)``. The same values are now produced with
    vectorised datetime subtraction, which is much faster on large frames and
    also works on an empty frame.

    Args:
        df: Frame holding ``fire_start_date`` and the nine milestone timestamp
            columns listed below, all as datetime columns.

    Returns:
        The same ``df`` object, mutated in place, with nine new float columns.
        A value is NaN whenever either of the two timestamps is missing.
    """
    # new feature column -> milestone timestamp column
    milestone_columns = {
        'time_to_discovery': 'discovered_date',
        'time_to_report': 'reported_date',
        'time_to_dispatch': 'dispatch_date',
        'time_to_start_for_fire': 'start_for_fire_date',
        'time_to_assessment': 'assessment_datetime',
        'time_to_ia_arrival': 'ia_arrival_at_fire_date',
        'time_to_start_fighting': 'fire_fighting_start_date',
        'time_to_first_bucket_drop': 'first_bucket_drop_date',
        'total_time_to_extinguish': 'ex_fs_date',
    }

    start = df['fire_start_date']
    for feature_name, milestone in milestone_columns.items():
        # NaT on either side propagates to NaT, which total_seconds() turns into NaN.
        df[feature_name] = (df[milestone] - start).dt.total_seconds() / 60

    return df

# Add the elapsed-minutes columns to both frames (train first, then test).
train, test = (generate_time_features(frame) for frame in (train, test))

In [None]:
# The raw timestamps are dropped now that the elapsed-time features exist.
drop_datelist = [
    'fire_start_date',
    'discovered_date',
    'reported_date',
    'dispatch_date',
    'start_for_fire_date',
    'assessment_datetime',
    'ia_arrival_at_fire_date',
    'fire_fighting_start_date',
    'first_bucket_drop_date',
    'ex_fs_date',
]

train = train.drop(drop_datelist, axis=1)
test = test.drop(drop_datelist, axis=1)

In [None]:
train_feature_list = list(train.columns)
test_feature_list = list(test.columns)
label_size_class = LabelEncoder()

# Fix: the encoder used to be re-fitted separately on train and on test, so
# the same category could receive different integer codes in the two frames
# (codes depend on the sorted set of values seen at fit time). Each column is
# now fitted once on the union of train and test values and that single
# fitted mapping is applied to both frames.
# Values are cast to str so a column of mixed Python types can still be sorted.
encoded_in_test = set()

for feature in train_feature_list:
    if train[feature].dtype != 'object':
        continue

    train_mask = train[feature].notnull()
    shared_with_test = feature in test.columns and test[feature].dtype == 'object'

    observed = [train.loc[train_mask, feature].astype(str)]
    if shared_with_test:
        test_mask = test[feature].notnull()
        observed.append(test.loc[test_mask, feature].astype(str))

    label_size_class.fit(pd.concat(observed))

    # Missing values are skipped here and stay missing.
    train.loc[train_mask, feature] = label_size_class.transform(
        train.loc[train_mask, feature].astype(str)
    )
    if shared_with_test:
        test.loc[test_mask, feature] = label_size_class.transform(
            test.loc[test_mask, feature].astype(str)
        )
        encoded_in_test.add(feature)

# Object columns of test with no object-typed counterpart in train are
# encoded on their own, as before.
for feature in test_feature_list:
    if feature in encoded_in_test:
        continue
    if test[feature].dtype == 'object':
        not_null_mask = test[feature].notnull()
        test.loc[not_null_mask, feature] = label_size_class.fit_transform(
            test.loc[not_null_mask, feature].astype(str)
        )

In [None]:
# Replace every remaining missing value with the sentinel -999, in place.
for frame in (train, test):
    frame.fillna(value=-999, inplace=True)

In [None]:

def get_gmm_class_feature(feat, n):
    """Add a Gaussian-mixture cluster label for one column to train and test.

    An ``n``-component mixture (seed 42) is fitted on ``train[feat]`` only;
    the predicted component index of every row is written to the
    ``f'{feat}_class'`` column of the module-level ``train`` and ``test``
    frames. Nothing is returned.
    """
    train_column = train[feat].values.reshape(-1, 1)
    mixture = GaussianMixture(n_components=n, random_state=42)
    mixture.fit(train_column)
    train[f'{feat}_class'] = mixture.predict(train_column)

    test_column = test[feat].values.reshape(-1, 1)
    test[f'{feat}_class'] = mixture.predict(test_column)

# Number of mixture components used for each clustered column, applied in
# the order listed.
gmm_component_counts = {
    "fire_location_latitude": 2,
    "fire_location_longitude": 2,
    "assessment_hectares": 6,
    "fire_spread_rate": 8,
    "temperature": 10,
    "relative_humidity": 15,
    "wind_speed": 5,
    "fire_fighting_start_size": 19,
    "distance_from_water_source": 19,
    "time_to_discovery": 6,
    "time_to_report": 19,
    "time_to_dispatch": 7,
    "time_to_start_for_fire": 13,
    "time_to_assessment": 2,
    "time_to_ia_arrival": 7,
    "time_to_start_fighting": 6,
    "time_to_first_bucket_drop": 10,
    "total_time_to_extinguish": 17,
}

for gmm_feature, component_count in gmm_component_counts.items():
    get_gmm_class_feature(gmm_feature, component_count)



# NOTE(review): this re-declares a function that is already defined earlier in
# this cell with the same behaviour; one of the two definitions can be removed.
def get_gmm_class_feature(feat, n):
    """Add a Gaussian-mixture cluster label for one column to train and test.

    An ``n``-component mixture (seed 42) is fitted on ``train[feat]`` only;
    the predicted component index of every row is written to the
    ``f'{feat}_class'`` column of the module-level ``train`` and ``test``
    frames. Nothing is returned.
    """
    train_column = train[feat].values.reshape(-1, 1)
    mixture = GaussianMixture(n_components=n, random_state=42)
    mixture.fit(train_column)
    train[f'{feat}_class'] = mixture.predict(train_column)

    test_column = test[feat].values.reshape(-1, 1)
    test[f'{feat}_class'] = mixture.predict(test_column)

def evaluate_classifier():
    """Score the current engineered features with a default CatBoost model.

    Fix: the original fitted on all of ``train`` and scored against
    ``test['size_class']``. The prediction cell later feeds ``test`` (minus
    ``fire_id`` only) to a scaler fitted on features without ``size_class``,
    which indicates ``test`` has no ``size_class`` column, so the original
    would fail with a KeyError. The score is now computed on a fixed 20%
    hold-out taken from ``train``.

    Returns:
        Balanced accuracy of the model on the hold-out rows.
    """
    features = train.drop(columns='size_class')
    target = train['size_class']

    # Fixed seed so successive calls are compared on the same split.
    X_fit, X_val, y_fit, y_val = train_test_split(
        features, target, test_size=0.2, random_state=42
    )

    model1 = CatBoostClassifier(verbose=0)
    model1.fit(X_fit, y_fit)
    y_pred1 = model1.predict(X_val)
    return balanced_accuracy_score(y_val, y_pred1)

# Columns whose number of mixture components is searched below.
features_to_optimize = [
    "fire_location_latitude",
    "fire_location_longitude",
    "assessment_hectares",
    "fire_spread_rate",
    "temperature",
    "relative_humidity",
    "wind_speed",
    "fire_fighting_start_size",
    "distance_from_water_source",
    "time_to_discovery",
    "time_to_report",
    "time_to_dispatch",
    "time_to_start_for_fire",
    "time_to_assessment",
    "time_to_ia_arrival",
    "time_to_start_fighting",
    "time_to_first_bucket_drop",
    "total_time_to_extinguish"
]

optimal_components = {}

# NOTE: this trains one CatBoost model per (feature, n) pair, i.e.
# 18 features x 19 component counts = 342 fits; expect a long run time.
for feat in features_to_optimize:
    best_score = -np.inf
    best_components = None

    for n in range(2, 21):
        # Overwrites the f'{feat}_class' column with the n-component labels.
        get_gmm_class_feature(feat, n)
        score = evaluate_classifier()

        if score > best_score:
            best_score = score
            best_components = n

    # Fix: the inner loop leaves the column at the last value tried (n=20),
    # not at the winner, so every later step was using 20 components.
    # Re-fit with the best count before moving to the next feature.
    if best_components is not None:
        get_gmm_class_feature(feat, best_components)

    optimal_components[feat] = best_components
    print(f"Optimal components for {feat}: {best_components}, Balanced Accuracy: {best_score:.4f}")

print("All optimal components:", optimal_components)


In [None]:
# Identifier column excluded from the model inputs.
cols_to_drop = ["fire_id"]

X = train.drop(cols_to_drop + ["size_class"], axis=1)
y = train["size_class"]

# 80/20 hold-out split with a fixed seed.
X_train_original, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Oversampling is applied to the training part only, so the hold-out set
# contains no duplicated rows.
train_data = pd.concat([X_train_original, y_train], axis=1)

# Size of the largest class; every smaller class is topped up to this size.
max_size = train_data["size_class"].value_counts().max()

lst = [train_data]
for class_index, group in train_data.groupby("size_class"):
    if len(group) < max_size:
        # Fix: seeded so the oversampled training set is reproducible
        # (the original drew a different sample on every run).
        lst.append(
            resample(group, replace=True, n_samples=max_size - len(group), random_state=42)
        )

train_data_oversampled = pd.concat(lst)
# Fix: the shuffle is seeded as well, for the same reason.
train_data_oversampled = train_data_oversampled.sample(frac=1, random_state=42).reset_index(drop=True)

X_train_oversampled = train_data_oversampled.drop("size_class", axis=1)
y_train = train_data_oversampled["size_class"]

# The scaler is fitted on the oversampled training features and reused for
# the hold-out set here and for the real test set later.
sc = StandardScaler()
X_train = sc.fit_transform(X_train_oversampled)
X_test = sc.transform(X_test)


In [None]:
def optimize_model(model_eval, pbounds, init_points=5, n_iter=10):
    """Maximise ``model_eval`` with Bayesian optimisation and return the best parameters.

    Args:
        model_eval: Objective called with the parameters as keyword arguments;
            larger return values are better.
        pbounds: Mapping of parameter name to ``(low, high)`` bounds.
        init_points: Forwarded to ``BayesianOptimization.maximize``.
        n_iter: Forwarded to ``BayesianOptimization.maximize``.

    Returns:
        Dict of the best parameters found; float values that are whole
        numbers are converted to ``int``.
    """
    search = BayesianOptimization(f=model_eval, pbounds=pbounds, random_state=42)
    search.maximize(init_points=init_points, n_iter=n_iter)

    def _as_int_if_whole(value):
        # Only whole-number floats are converted; everything else is kept as is.
        is_whole_float = isinstance(value, float) and value.is_integer()
        return int(value) if is_whole_float else value

    return {
        name: _as_int_if_whole(value)
        for name, value in search.max['params'].items()
    }


In [None]:
def catboost_eval(learning_rate, depth, l2_leaf_reg):
    """Objective for the optimiser: hold-out balanced accuracy of one CatBoost fit.

    Trains on the module-level ``X_train``/``y_train`` and scores on
    ``X_test``/``y_test``.
    """
    candidate = CatBoostClassifier(
        learning_rate=learning_rate,
        depth=int(depth),  # the optimiser proposes floats; depth is truncated to int
        l2_leaf_reg=l2_leaf_reg,
        verbose=0,
    )
    candidate.fit(X_train, y_train)
    return balanced_accuracy_score(y_test, candidate.predict(X_test))

# Search ranges handed to the Bayesian optimiser.
bounds_catboost = dict(
    learning_rate=(0.01, 0.3),
    depth=(4, 8),
    l2_leaf_reg=(1, 5),
)
best_params_catboost = optimize_model(catboost_eval, bounds_catboost)
# depth must be an integer; the optimiser returns a float.
best_params_catboost.update(depth=int(best_params_catboost['depth']))

# Final CatBoost model trained with the best parameters found.
model1 = CatBoostClassifier(**best_params_catboost)
model1.fit(X_train, y_train)


In [None]:

def xgb_eval(learning_rate, max_depth, gamma, colsample_bytree):
    """Objective for the optimiser: hold-out balanced accuracy of one XGBoost fit.

    Trains on the module-level ``X_train``/``y_train`` and scores on
    ``X_test``/``y_test``.
    """
    candidate = XGBClassifier(
        learning_rate=learning_rate,
        max_depth=int(max_depth),  # the optimiser proposes floats
        gamma=gamma,
        colsample_bytree=colsample_bytree,
    )
    candidate.fit(X_train, y_train)
    return balanced_accuracy_score(y_test, candidate.predict(X_test))

# Search ranges handed to the Bayesian optimiser.
bounds_xgboost = dict(
    learning_rate=(0.01, 0.3),
    max_depth=(3, 10),
    gamma=(0, 1),
    colsample_bytree=(0.5, 1),
)
best_params_xgboost = optimize_model(xgb_eval, bounds_xgboost)
# max_depth must be an integer; the optimiser returns a float.
best_params_xgboost.update(max_depth=int(best_params_xgboost['max_depth']))

# Final XGBoost model trained with the best parameters found.
model2 = XGBClassifier(**best_params_xgboost)
model2.fit(X_train, y_train)


In [None]:
def rf_eval(n_estimators, max_depth, min_samples_split):
    """Objective for the optimiser: hold-out balanced accuracy of one random forest.

    Fix: the forest was unseeded, so the same parameters returned a different
    score on every call, which makes the optimiser chase noise and the search
    non-reproducible. A fixed ``random_state`` is now used.

    Trains on the module-level ``X_train``/``y_train`` and scores on
    ``X_test``/``y_test``. The optimiser proposes floats, so the three
    parameters are truncated to ``int``.
    """
    params = {
        'n_estimators': int(n_estimators),
        'max_depth': int(max_depth),
        'min_samples_split': int(min_samples_split),
        'random_state': 42,
    }
    clf = RandomForestClassifier(**params)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    score = balanced_accuracy_score(y_test, y_pred)
    return score

# Search ranges handed to the Bayesian optimiser.
bounds_rf = {
    'n_estimators': (10, 200),
    'max_depth': (5, 40),
    'min_samples_split': (2, 20)
}

best_params_rf = optimize_model(rf_eval, bounds_rf)
# All three parameters must be integers; the optimiser returns floats.
for int_param in ('n_estimators', 'max_depth', 'min_samples_split'):
    best_params_rf[int_param] = int(best_params_rf[int_param])

# Fix: the final forest is seeded so the fitted model and its reported score
# are reproducible across runs (it was unseeded before).
model3 = RandomForestClassifier(random_state=42, **best_params_rf)
model3.fit(X_train, y_train)


In [None]:

def lgbm_eval(num_leaves, feature_fraction, bagging_fraction, max_depth, lambda_l1, lambda_l2, min_split_gain, min_child_weight):
    """Objective for the optimiser: hold-out balanced accuracy of one LightGBM fit.

    Trains on the module-level ``X_train``/``y_train`` and scores on
    ``X_test``/``y_test``. ``num_leaves`` and ``max_depth`` are truncated to
    ``int`` because the optimiser proposes floats.
    """
    candidate = LGBMClassifier(
        num_leaves=int(num_leaves),
        feature_fraction=feature_fraction,
        bagging_fraction=bagging_fraction,
        max_depth=int(max_depth),
        lambda_l1=lambda_l1,
        lambda_l2=lambda_l2,
        min_split_gain=min_split_gain,
        min_child_weight=min_child_weight,
    )
    candidate.fit(X_train, y_train)
    return balanced_accuracy_score(y_test, candidate.predict(X_test))

# Search ranges handed to the Bayesian optimiser.
bounds_lgbm = dict(
    num_leaves=(31, 500),
    feature_fraction=(0.1, 0.9),
    bagging_fraction=(0.8, 1),
    max_depth=(5, 8.99),
    lambda_l1=(0, 5),
    lambda_l2=(0, 3),
    min_split_gain=(0.001, 0.1),
    min_child_weight=(5, 50),
)
best_params_lgbm = optimize_model(lgbm_eval, bounds_lgbm)
# Both tree-shape parameters must be integers; the optimiser returns floats.
for int_param in ('max_depth', 'num_leaves'):
    best_params_lgbm[int_param] = int(best_params_lgbm[int_param])

# Final LightGBM model trained with the best parameters found.
model4 = LGBMClassifier(**best_params_lgbm)
model4.fit(X_train, y_train)


In [None]:
# Hold-out predictions of each tuned model, scored individually.
y_pred1, y_pred2, y_pred3, y_pred4 = (
    fitted.predict(X_test) for fitted in (model1, model2, model3, model4)
)
for caption, predicted in (
    ("CatBoost Balanced Accuracy:", y_pred1),
    ("XGBoost Balanced Accuracy:", y_pred2),
    ("RandomForest Balanced Accuracy:", y_pred3),
    ("LightGBM Balanced Accuracy:", y_pred4),
):
    print(caption, balanced_accuracy_score(y_test, predicted))

# Soft-voting weights in model order (CatBoost, XGBoost, RandomForest,
# LightGBM); the random forest currently contributes nothing.
weights = [0.25, 0.25, 0.0, 0.5]

probs1, probs2, probs3, probs4 = (
    fitted.predict_proba(X_test) for fitted in (model1, model2, model3, model4)
)

weighted_parts = [w * p for w, p in zip(weights, (probs1, probs2, probs3, probs4))]
final_probs = weighted_parts[0] + weighted_parts[1] + weighted_parts[2] + weighted_parts[3]

# Manual adjustment hook: moves a fraction of column 2's probability to
# column 3. The fraction is 0.0, so this currently changes nothing.
class2_to_class3_fraction = 0.0
final_probs[:, 3] += class2_to_class3_fraction * final_probs[:, 2]
final_probs[:, 2] -= class2_to_class3_fraction * final_probs[:, 2]

# Re-normalise every row to sum to 1.
final_probs /= final_probs.sum(axis=1, keepdims=True)

# NOTE(review): the column index is used directly as the class label, which
# assumes the models' classes are 0..K-1 in column order - confirm.
final_pred = final_probs.argmax(axis=1)

ensemble_balanced_accuracy = balanced_accuracy_score(y_test, final_pred)
print("Ensemble Balanced Accuracy with Weighted Soft Voting after adjustment:", ensemble_balanced_accuracy)

In [None]:
#y_test_1d = np.ravel(y_test)
#final_pred_1d = np.ravel(final_pred) 

#mismatches = y_test_1d != final_pred_1d

#mismatched_probs = final_probs[mismatches]

#mismatched_data = pd.DataFrame({
#    'fire_id': test.loc[mismatches, 'fire_id'],
#    'Actual_size_class': y_test_1d[mismatches],
#    'Predicted': final_pred_1d[mismatches],
#    'Prob_Class_0': mismatched_probs[:, 0],
#    'Prob_Class_1': mismatched_probs[:, 1],
#    'Prob_Class_2': mismatched_probs[:, 2],
#    'Prob_Class_3': mismatched_probs[:, 3],
#    'Prob_Class_4': mismatched_probs[:, 4]
#})


#mismatched_data.to_csv('mismatched_data_with_probs.csv', index=False)


In [None]:
# Per-class report, confusion matrix and plain accuracy of the ensemble on
# the hold-out set, printed in that order.
ensemble_accuracy = accuracy_score(y_test, final_pred)
for report_item in (
    classification_report(y_test, final_pred),
    confusion_matrix(y_test, final_pred),
    ensemble_accuracy,
):
    print(report_item)

In [None]:
# Class probabilities of the CatBoost model on the hold-out set: printed,
# then saved to probabilities.csv with one column per class.
probabilities = model1.predict_proba(X_test)
print(probabilities)

df_probabilities = pd.DataFrame(data=probabilities)
df_probabilities.to_csv('probabilities.csv', index=False)

In [None]:
# Scale the real test features with the scaler fitted on the training data.
X_test_real = test.drop(cols_to_drop, axis=1)
X_test_real_transformed = sc.transform(X_test_real)

# Hard predictions of each individual model on the real test set.
y_pred1_real, y_pred2_real, y_pred3_real, y_pred4_real = (
    fitted.predict(X_test_real_transformed)
    for fitted in (model1, model2, model3, model4)
)

# Same soft-voting weights as in the hold-out evaluation (CatBoost, XGBoost,
# RandomForest, LightGBM).
weights = [0.25, 0.25, 0.0, 0.5]

probs1_real, probs2_real, probs3_real, probs4_real = (
    fitted.predict_proba(X_test_real_transformed)
    for fitted in (model1, model2, model3, model4)
)

weighted_parts_real = [
    w * p
    for w, p in zip(weights, (probs1_real, probs2_real, probs3_real, probs4_real))
]
final_probs_real = (weighted_parts_real[0] + weighted_parts_real[1]
                    + weighted_parts_real[2] + weighted_parts_real[3])

# Manual adjustment hook: moves a fraction of column 2's probability to
# column 3. The fraction is 0.0, so this currently changes nothing.
class2_to_class3_fraction = 0.0
final_probs_real[:, 3] += class2_to_class3_fraction * final_probs_real[:, 2]
final_probs_real[:, 2] -= class2_to_class3_fraction * final_probs_real[:, 2]

# Re-normalise every row to sum to 1, then pick the most probable column.
final_probs_real /= final_probs_real.sum(axis=1, keepdims=True)

final_pred_real = final_probs_real.argmax(axis=1)

In [None]:
#rfc.feature_importances_

In [None]:
# Build the submission; 1 is added back because size_class was shifted down
# by one before training.
# NOTE(review): fire_id is taken from the row index of test, not from its
# fire_id column - confirm this is what the submission format expects.
submission_columns = {'fire_id': test.index, 'size_class': final_pred_real + 1}
my_submission = pd.DataFrame(submission_columns)
my_submission.to_csv('submission.csv', index=False)