In [30]:
import pandas as pd
import numpy as np

In [31]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import VotingClassifier, RandomForestClassifier, GradientBoostingClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import f1_score
from sklearn.pipeline import Pipeline

In [32]:
train = pd.read_csv('train.csv')

In [33]:
train.shape

(104000, 13)

In [34]:
train.sample(3)

Unnamed: 0,id,machine_type,rot_speed_rpm,drive_moment,tool_usage_minutes,failure_flag,tool_wear_flag,heat_dissipation_flag,power_failure_flag,overstrain_flag,resonance_condition_flag,air_temperature,process_temperature
89903,40892,M,1401,60.01,149,0,0,0,0,0,0,27.26,36.45
45936,113422,M,1486,46.05,9,0,0,0,0,0,0,26.224,35.65
94235,37707,L,1446,39.08,109,0,0,0,0,0,0,25.002,35.35


In [35]:
train.isna().sum()

id                          0
machine_type                0
rot_speed_rpm               0
drive_moment                0
tool_usage_minutes          0
failure_flag                0
tool_wear_flag              0
heat_dissipation_flag       0
power_failure_flag          0
overstrain_flag             0
resonance_condition_flag    0
air_temperature             0
process_temperature         0
dtype: int64

In [36]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 104000 entries, 0 to 103999
Data columns (total 13 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   id                        104000 non-null  int64  
 1   machine_type              104000 non-null  object 
 2   rot_speed_rpm             104000 non-null  int64  
 3   drive_moment              104000 non-null  float64
 4   tool_usage_minutes        104000 non-null  int64  
 5   failure_flag              104000 non-null  int64  
 6   tool_wear_flag            104000 non-null  int64  
 7   heat_dissipation_flag     104000 non-null  int64  
 8   power_failure_flag        104000 non-null  int64  
 9   overstrain_flag           104000 non-null  int64  
 10  resonance_condition_flag  104000 non-null  int64  
 11  air_temperature           104000 non-null  float64
 12  process_temperature       104000 non-null  float64
dtypes: float64(3), int64(9), object(1)
memory us

In [37]:
train.describe()

Unnamed: 0,id,rot_speed_rpm,drive_moment,tool_usage_minutes,failure_flag,tool_wear_flag,heat_dissipation_flag,power_failure_flag,overstrain_flag,resonance_condition_flag,air_temperature,process_temperature
count,104000.0,104000.0,104000.0,104000.0,104000.0,104000.0,104000.0,104000.0,104000.0,104000.0,104000.0,104000.0
mean,65091.603865,1503.875875,42.680517,118.181933,0.231894,0.055471,0.080808,0.058385,0.064442,0.048875,26.890451,36.798837
std,37539.426139,194.752195,11.471648,69.803328,0.422044,0.228899,0.272541,0.23447,0.24554,0.215608,1.868379,1.315479
min,2.0,1183.0,3.8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,22.15,32.65
25%,32592.75,1389.0,35.91,59.0,0.0,0.0,0.0,0.0,0.0,0.0,25.228,35.65
50%,65123.5,1466.0,43.1,122.0,0.0,0.0,0.0,0.0,0.0,0.0,27.067,37.05
75%,97627.25,1577.0,50.65,183.0,0.0,0.0,0.0,0.0,0.0,0.0,28.117,37.65
max,130000.0,2595.0,76.6,246.0,1.0,1.0,1.0,1.0,1.0,1.0,31.25,40.65


In [38]:
test = pd.read_csv('test.csv')

In [39]:
test_ids = test['id'].copy()

In [40]:
encoder = LabelEncoder()
train['machine_type'] = encoder.fit_transform(train['machine_type'])
test['machine_type'] = encoder.fit_transform(test['machine_type'])

In [41]:
X = train.drop(columns=["id", "failure_flag"])
y = train["failure_flag"]
X_test = test.drop(columns=["id"])

In [42]:
X["temp_diff"] = X["process_temperature"] - X["air_temperature"]
X_test["temp_diff"] = X_test["process_temperature"] - X_test["air_temperature"]

In [43]:
X["log_tool_usage"] = np.log1p(X["tool_usage_minutes"])
X_test["log_tool_usage"] = np.log1p(X_test["tool_usage_minutes"])

X["log_drive_moment"] = np.log1p(X["drive_moment"])
X_test["log_drive_moment"] = np.log1p(X_test["drive_moment"])

X["overheat_risk"] = X["heat_dissipation_flag"] & X["power_failure_flag"]
X_test["overheat_risk"] = X_test["heat_dissipation_flag"] & X_test["power_failure_flag"]

X["stress_risk"] = X["overstrain_flag"] & X["resonance_condition_flag"]
X_test["stress_risk"] = X_test["overstrain_flag"] & X_test["resonance_condition_flag"]

In [44]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [45]:
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [54]:
models = {
    "RandomForest": RandomForestClassifier(random_state=42),
    "XGBoost": XGBClassifier(n_estimators=1000, use_label_encoder=False, eval_metric='logloss', random_state=42),
    "LightGBM": LGBMClassifier(random_state=42),
    "LogisticRegression": LogisticRegression(max_iter=1000, random_state=42),
    "GradientBoosting": GradientBoostingClassifier(random_state=42)
}

In [55]:
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    score = f1_score(y_val, y_pred)
    print(f"{name} F1-score: {score:.4f}")

RandomForest F1-score: 0.9997
XGBoost F1-score: 0.9998
[LightGBM] [Info] Number of positive: 19294, number of negative: 63906
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002409 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1864
[LightGBM] [Info] Number of data points in the train set: 83200, number of used features: 15
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.231899 -> initscore=-1.197619
[LightGBM] [Info] Start training from score -1.197619
LightGBM F1-score: 0.9997
LogisticRegression F1-score: 0.9997
GradientBoosting F1-score: 0.9997


In [73]:
final_model = XGBClassifier(n_estimators=500, use_label_encoder=False, eval_metric='logloss', random_state=42)
final_model.fit(X, y)
final_predictions = final_model.predict(X_test)

# Создание файла submission.csv
submission = pd.DataFrame({"id": test_ids, "failure_flag": final_predictions})
submission.to_csv("submission.csv", index=False)
print("Файл submission.csv создан!")


Файл submission.csv создан!
