In [1]:
# Imports
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.ensemble import RandomForestClassifier
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler
from pathlib import Path

In [2]:
# Load the training CSV files into DataFrames: X_train (features) and y_train (targets)
X_train_folder = Path("/kaggle/input/competition/data_hackaton/train/X")
y_train_folder = Path("/kaggle/input/competition/data_hackaton/train/y")
X_test_folder = Path("/kaggle/input/competition/data_hackaton/test/X")


def _read_csv_folder(folder):
    """Read every file in ``folder`` whose name ends with 'csv' into one DataFrame.

    Files are read in sorted-by-name order. ``Path.iterdir()`` yields entries
    in arbitrary order, so without sorting the X rows and the y rows could be
    stacked in different file orders and end up misaligned.

    Parameters
    ----------
    folder : pathlib.Path
        Directory to scan (non-recursively).

    Returns
    -------
    pandas.DataFrame
        The row-wise concatenation of all CSV files, each keeping its own
        index; an empty DataFrame if the folder holds no CSV files.
    """
    csv_paths = sorted(
        (path for path in folder.iterdir() if path.name.endswith('csv')),
        key=lambda path: path.name,
    )
    if not csv_paths:
        return pd.DataFrame()
    # A single concat over a list avoids re-copying the accumulated frame per file.
    return pd.concat([pd.read_csv(path) for path in csv_paths])


# NOTE(review): alignment relies on the X and y folders using matching file
# names for the same well - confirm against the dataset layout.
X_train = _read_csv_folder(X_train_folder)
y_train = _read_csv_folder(y_train_folder)

# Fail early with a clear message instead of a shape error inside fit().
if len(X_train) != len(y_train):
    raise ValueError(
        f"X_train has {len(X_train)} rows but y_train has {len(y_train)} rows"
    )

# Optional hold-out split for local validation (disabled):
#X_train, X_test_forus = train_test_split(X_train, test_size = 0.2, shuffle = False)
#y_train, y_test_forus = train_test_split(y_train, test_size = 0.2, shuffle = False)

y_train_lithology = y_train['lithology']
#y_test_lithology = y_test_forus["lithology"]

y_train_stratigraphy = y_train['stratigraphy']
#y_test_stratigraphy = y_test_forus['stratigraphy']

X_train.head()

Unnamed: 0,DEPT,GKIN,TH,U,K,KMV,IK,SM
0,3.5,6.839,5.364,7.364,80.364,97.425,26.438,61939.063
1,3.6,6.839,6.0,8.0,81.0,103.524,27.102,61961.078
2,3.7,6.761,8.909,10.182,62.091,119.533,29.099,61826.379
3,3.8,6.583,9.474,11.0,59.474,130.611,29.884,61762.465
4,3.9,6.583,8.0,11.0,72.0,141.463,29.783,61761.414


In [3]:
# Feature selection and the normalisation step used by the model pipelines
features = ["DEPT", "GKIN", "TH", "U", "K", "KMV", "IK", "SM"]

# Every selected column goes through a StandardScaler.
numeric_transformer = make_pipeline(StandardScaler())
preprocessor = make_column_transformer((numeric_transformer, features))

In [4]:
# Train the random-forest model that predicts lithology
pipe_rf_lith = make_pipeline(
    preprocessor,
    RandomForestClassifier(
        n_estimators=10,
        max_depth=10,
        min_samples_leaf=4,
        max_samples=23000,
        random_state=123,
    ),
)
# Hold-out evaluation (disabled):
#pipe_rf_lith.fit(X_train, y_train_lithology).score(X_test_forus, y_test_lithology)
pipe_rf_lith.fit(X_train, y_train_lithology)

In [5]:
# Train the random-forest model that predicts stratigraphy
pipe_rf_strat = make_pipeline(
    preprocessor,
    RandomForestClassifier(
        n_estimators=10,
        max_depth=10,
        min_samples_leaf=15,
        max_samples=25000,
        random_state=123,
    ),
)
# Hold-out evaluation (disabled):
#pipe_rf_strat.fit(X_train, y_train_stratigraphy).score(X_test_forus, y_test_stratigraphy)
pipe_rf_strat.fit(X_train, y_train_stratigraphy)

In [6]:
# Feature importances of the lithology model, most important first
lith_forest = pipe_rf_lith.named_steps["randomforestclassifier"]
data = {"Importance": lith_forest.feature_importances_}

# Rows are labelled with the feature names, in the order they were passed to the preprocessor.
rf_imp_df = pd.DataFrame(data=data, index=features)
rf_imp_df = rf_imp_df.sort_values(by="Importance", ascending=False)
rf_imp_df

Unnamed: 0,Importance
SM,0.210868
K,0.199846
KMV,0.142385
GKIN,0.111663
DEPT,0.100505
IK,0.089794
TH,0.088012
U,0.056927


In [7]:
# Feature importances of the stratigraphy model, most important first
strat_forest = pipe_rf_strat.named_steps["randomforestclassifier"]
data = {"Importance": strat_forest.feature_importances_}

# Rows are labelled with the feature names, in the order they were passed to the preprocessor.
rf_imp_df = pd.DataFrame(data=data, index=features)
rf_imp_df = rf_imp_df.sort_values(by="Importance", ascending=False)
rf_imp_df

Unnamed: 0,Importance
SM,0.222979
K,0.2101
KMV,0.147965
GKIN,0.107461
DEPT,0.098076
TH,0.086563
IK,0.077998
U,0.048856


In [8]:
# Predict lithology and stratigraphy for every test well and collect the results
well_predictions = []

# Sorted so the row order of the result does not depend on the (arbitrary)
# order in which the directory is listed.
for X_test_file_path in sorted(X_test_folder.iterdir(), key=lambda path: path.name):
    well_name = X_test_file_path.name
    # Skip anything that is not a CSV file (e.g. hidden folders)
    if not well_name.endswith("csv"):
        continue
    # Well number: the text between the first "_" and the next "_" or "."
    # in the file name, i.e. names shaped like "<prefix>_<number>.csv"
    well_id = int(well_name.split(".")[0].split("_")[1])
    # Read the well-log data
    X_test = pd.read_csv(X_test_file_path)
    well_features = X_test[features]

    # Lithology model inference
    y_pred_lithology = pipe_rf_lith.predict(well_features)
    # Stratigraphy model inference
    y_pred_stratigraphy = pipe_rf_strat.predict(well_features)

    # Per-well prediction table, ordered by depth.
    # y_pred is deliberately a top-level name: it keeps the last well's table
    # available for inspection after the loop.
    y_pred = pd.DataFrame({
        'DEPT': X_test['DEPT'],
        'well_id': well_id,
        'lithology': y_pred_lithology,
        'stratigraphy': y_pred_stratigraphy,
    }).sort_values(by='DEPT')
    well_predictions.append(y_pred)

# One concat at the end instead of re-copying the growing table for every well;
# falls back to an empty DataFrame when no CSV file was found.
prediction = pd.concat(well_predictions) if well_predictions else pd.DataFrame()

In [9]:
# Show the predictions of the last well handled by the inference loop.
# y_pred is reassigned on every loop iteration, so this is NOT the combined
# table for all wells (that one is `prediction`).
print(y_pred)

      DEPT  well_id           lithology stratigraphy
0      3.3       26            Песчаник         J1uk
1      3.4       26            Песчаник         J1uk
2      3.5       26            Песчаник         J1uk
3      3.6       26            Песчаник         J1uk
4      3.7       26            Песчаник         J1uk
..     ...      ...                 ...          ...
989  102.5       26  Карбонатные породы         G3hl
990  102.6       26  Карбонатные породы         G3hl
991  102.7       26  Карбонатные породы         G3hl
992  102.8       26  Карбонатные породы         G3hl
993  102.9       26  Карбонатные породы         G3hl

[994 rows x 4 columns]


In [10]:
# Directory for the submission file; created together with any missing
# parents, and left alone if it already exists.
folder_results = Path("data/results_example")
folder_results.mkdir(exist_ok=True, parents=True)

In [11]:
# Convert depth to centimetres
# NOTE(review): astype(int) truncates toward zero, so a product that lands just
# below an integer because of floating-point error is rounded down (e.g.
# 0.29 * 100 evaluates to 28.999999999999996). Left unchanged because the
# resulting IDs presumably have to match the competition's solution file -
# confirm how that file was generated before switching to rounding.
prediction['DEPT'] = (prediction['DEPT'] * 100).astype(int)

# Requirement on the ID column:
#   Your solution's first column will be the ID Column, used to correlate rows
#   between your solution and the participant's submission.
#   Your solution must have unique values in this column.

def get_id(well_id, dept):
    """
    Build the row id used in the submission file.

    The id is the well number and the depth in centimetres, joined by an
    underscore, e.g. ``get_id(26, 330)`` gives ``"26_330"``.

    Required by kaggle.
    """
    return f"{well_id}_{dept}"

# Build the submission ID of every row from its well number and depth.
# Iterating over just the two needed columns gives the same strings as a
# row-wise DataFrame.apply(axis=1) without materialising a Series per row.
prediction['ID'] = [
    get_id(well, depth)
    for well, depth in zip(prediction['well_id'], prediction['DEPT'])
]

# The columns must be saved in exactly this order
prediction = prediction[['ID', 'well_id', 'lithology', 'stratigraphy']]

# index=False is required so the DataFrame index is not written as an extra column
prediction.to_csv(folder_results / "prediction_example.csv", index=False)