In [26]:
import pandas as pd
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split, StratifiedKFold
import optuna

In [27]:
from imblearn.over_sampling import SMOTE

In [28]:
from sklearn_genetic import GASearchCV
from sklearn_genetic.space import Integer

In [29]:
# Load data
train_data = pd.read_csv('train_AIC.csv')
test_data = pd.read_csv('test_AIC.csv')

# Data preprocessing
exclude_columns = ['Месяц3', 'Количество позиций']
X = train_data.drop(columns=exclude_columns + ['y'])
y = train_data['y']

In [30]:
y.value_counts()

0    192663
1     32337
Name: y, dtype: int64

In [31]:
sm = SMOTE(random_state=42, k_neighbors=5)
X_res, y_res = sm.fit_resample(X, y)

In [32]:
y_res.value_counts()

1    192663
0    192663
Name: y, dtype: int64

In [33]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.05, random_state=42)

In [34]:
# Define the objective function to optimize with Optuna
def objective(trial):
    # Define the hyperparameter search space
    param_space = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 300),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_uniform('learning_rate', 0.01, 0.5),
        'subsample': trial.suggest_uniform('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.6, 1.0),
        'gamma': trial.suggest_uniform('gamma', 0, 5),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 5),
    }
    
    # Create an XGBoost classifier with the suggested hyperparameters
    xgb_model = XGBClassifier(random_state=42, **param_space)
    
    # Fit the model on the training data
    xgb_model.fit(X_train, y_train)
    
    # Make predictions on the test data
    test_predictions = xgb_model.predict(X_test)
    
    # Calculate the F1 score as the objective to maximize
    f1_macro = f1_score(y_test, test_predictions, average='macro')
    
    return f1_macro

In [37]:
param_grid = {'n_estimators': Integer(50,500),
              'max_depth': Integer(1,20)}

In [38]:
# Create and save a submission file using the best hyperparameters
clf = XGBClassifier(random_state=42)
# Our cross-validation strategy (it could be just an int)
cv = StratifiedKFold(n_splits=3, shuffle=True)

# The main class from sklearn-genetic-opt
evolved_estimator = GASearchCV(estimator=clf,
                              cv=cv,
                              scoring='f1_macro',
                              param_grid=param_grid,
                              n_jobs=-1,
                              verbose=True,
                              population_size=10,
                              generations=30)

In [39]:
clf.fit(X_res, y_res)

In [40]:
evolved_estimator.fit(X_train, y_train)

gen	nevals	fitness	fitness_std	fitness_max	fitness_min
0  	10    	0.92144	0.0254382  	0.941857   	0.863513   
1  	20    	0.940332	0.00214665 	0.942568   	0.936795   
2  	20    	0.942101	0.000425093	0.942568   	0.941469   
3  	20    	0.942198	0.0004086  	0.942568   	0.941287   
4  	20    	0.942422	0.000263581	0.942671   	0.941806   
5  	20    	0.942513	0.000224511	0.942718   	0.941904   
6  	20    	0.942691	0.000195573	0.943041   	0.942269   
7  	20    	0.942564	0.00046273 	0.943041   	0.941708   
8  	20    	0.94253 	0.000328533	0.943041   	0.941909   
9  	20    	0.942456	0.000277111	0.942784   	0.941913   
10 	20    	0.94249 	0.000208668	0.942784   	0.942049   
11 	20    	0.942183	0.000432904	0.942784   	0.941394   
12 	20    	0.941981	0.000243268	0.942161   	0.941394   
13 	20    	0.9422  	0.000226607	0.942508   	0.941661   
14 	20    	0.942155	0.000667066	0.942545   	0.940206   
15 	20    	0.94196 	0.000704031	0.942508   	0.940206   
16 	20    	0.942284	0.00048441 	0.942727   	0.9411



29 	20    	0.942524	0.000419371	0.942961   	0.941829   
30 	20    	0.942485	0.000390484	0.942961   	0.941867   


In [41]:
f1_macro = evolved_estimator.predict(X_test)
print("Best F1-Score (Test):", f1_score(f1_macro, y_test, average='macro'))

Best F1-Score (Test): 0.8875612771851846


In [45]:
test_data = test_data.drop(columns=exclude_columns)
y_predict_ga = evolved_estimator.predict(test_data)

In [46]:
submission_df = pd.DataFrame({'id': range(len(y_predict_ga)), 'value': y_predict_ga})  # No 'ID' column
submission_df.to_csv('submission5.csv', index=False)