In [None]:
# Загрузка библиотек
import pandas as pd
import sqlalchemy as sa
from sqlalchemy import text
import pickle
import os
import joblib
from lime import lime_tabular
from sklearn.preprocessing import StandardScaler
from datetime import datetime as dt
from sklearn.preprocessing import PolynomialFeatures
from datetime import datetime, timedelta


# Read the well reference sheet and look up the metadata of the target well.
wells_df = pd.read_excel('tags_2025.xlsx', sheet_name='WELLS')

# Strip surrounding whitespace from the column names before addressing them.
wells_df.columns = wells_df.columns.str.strip()

# Compare well ids as strings, matching the type of `target_wellid` below.
wells_df['wellid'] = wells_df['wellid'].astype(str)

target_wellid = "2"

row = wells_df.loc[wells_df['wellid'] == target_wellid]
wellid = target_wellid

if row.empty:
    # Fail fast: later cells read `well` and `mid`; if the lookup merely
    # printed a message they would be undefined and the notebook would stop
    # further down with an unrelated NameError.
    raise ValueError(f"WellID {target_wellid} not found in the data.")

# Use the first matching row if the sheet contains several.
unit = row['unit'].values[0]
site = row['site'].values[0]
wellpad = row['wellpad'].values[0]
well = str(row['well'].values[0])
mid = str(row['mid'].values[0])

print(f"Цех: {unit}, Участок: {site}, Сборный пункт: {wellpad}, скважина: {well}, MID: {mid}, wellid: {wellid}" )


In [2]:
import os

import sqlalchemy as sa

# SECURITY: connection credentials should not live in the notebook.
# The URL is taken from the SARAH_DB_URL environment variable when it is set.
# The literal below is kept only as a backward-compatible fallback; the
# password it exposes should be rotated and the fallback removed.
engine = sa.create_engine(
    os.environ.get(
        "SARAH_DB_URL",
        "postgresql://sarah_user:allineedisosh@SRV-IMMO0101UZ/SARAH",
    )
)

In [3]:
# Fetch the input data from the database.
# The well id is a bound parameter taken from `wellid`, so the query always
# matches the well selected earlier instead of a hard-coded literal.
wells_features_query = text("""
    SELECT * FROM wells.well_features
    WHERE wellid = :wellid AND mid = :mid AND datetime >= '2025-03-08' AND datetime <= '2025-03-13'
    ORDER BY datetime ASC
""")

well_general_query = text("""
    SELECT * FROM wells.general_features
    WHERE mid = :mid AND datetime >= '2025-03-08' AND datetime <= '2025-03-13'
    ORDER BY datetime ASC
""")


with engine.connect() as conn:
    # `wellid` is held as a string in the notebook; the original query compared
    # the column against the integer literal 2, so bind it as an int.
    wells_features = pd.read_sql(
        wells_features_query,
        conn,
        params={'wellid': int(wellid), 'mid': mid},
    )
    well_general = pd.read_sql(well_general_query, conn, params={'mid': mid})


In [1]:
# Data format preparation
def convert_to_float(df):
    """Coerce every column except 'datetime' to a numeric dtype, in place.

    Values that cannot be parsed as numbers become NaN (errors='coerce').
    The same DataFrame object that was passed in is modified and returned.
    """
    numeric_candidates = [name for name in df.columns if name != 'datetime']
    for name in numeric_candidates:
        df[name] = pd.to_numeric(df[name], errors='coerce')
    return df

In [5]:
# Data format preparation: coerce the non-datetime columns of both query
# results to numeric values (wells_features first, then well_general).
wells_features, well_general = map(convert_to_float, (wells_features, well_general))

In [7]:
# Combine the two query results into a single DataFrame.
# Both 'datetime' columns are parsed first so the join keys have the same dtype.
wells_features['datetime'] = pd.to_datetime(wells_features['datetime'])

well_general['datetime'] = pd.to_datetime(well_general['datetime'])

# Inner join: keep only timestamps present in both tables for the same `mid`.
combined_1 = pd.merge(wells_features, well_general, on=['datetime', 'mid'], how='inner')
df = combined_1

# The second, byte-identical definition of convert_to_float that used to sit
# here was removed; the function is already defined in an earlier cell.

In [8]:
# Drop columns with a large share of missing values: a column is kept only
# if it has at least `threshold_df` (half the row count) non-null entries.
threshold_df = len(df) / 2
df = df.dropna(axis='columns', thresh=threshold_df)

In [None]:
# Derived feature: delta_t = (first available of t1, t2, t3) - t9.
# Computed as one vectorised subtraction instead of a row-by-row iterrows()
# loop, which re-checked the same column membership for every row and wrote
# float values one at a time into an int-initialised column.
delta_t_source = next(
    (name for name in ('t1', 't2', 't3') if name in df.columns),
    None,
)
if delta_t_source is None:
    # None of the candidate columns is present: keep the default of 0.
    df['delta_t'] = 0
else:
    df['delta_t'] = df[delta_t_source] - df['t9']

In [None]:
# Derived feature: delta_p, using the first pair whose left-hand column exists:
#   p1 - p3, else p2 - p3, else p3 - p4.
# Computed as one vectorised subtraction instead of a row-by-row iterrows()
# loop, which re-checked the same column membership for every row and wrote
# float values one at a time into an int-initialised column.
# NOTE(review): the last pair (p3 - p4) breaks the pattern of the first two;
# it is preserved as-is from the original logic — confirm it is intended.
delta_p_pairs = (('p1', 'p3'), ('p2', 'p3'), ('p3', 'p4'))
delta_p_pair = next(
    (pair for pair in delta_p_pairs if pair[0] in df.columns),
    None,
)
if delta_p_pair is None:
    # None of the candidate columns is present: keep the default of 0.
    df['delta_p'] = 0
else:
    minuend, subtrahend = delta_p_pair
    df['delta_p'] = df[minuend] - df[subtrahend]


In [12]:
# Data cleaning to improve prediction quality and exclude anomalies.
# Each entry maps a column name to its (lower, upper) bounds; both bounds are
# exclusive, so rows with a value on a bound or with NaN are removed.
thresholds = {
    'd1': (0, 1000),
    'h1': (0, 100),
    'p1': (3, 15), 'p2': (3, 15), 'p3': (3, 15), 'p4': (3, 12),
    'p5': (0, 120), 'p6': (0, 120),
    'p7': (3, 6),
    't1': (0, 75), 't2': (0, 100), 't3': (0, 75), 't4': (0, 40),
    't7': (10, 75), 't8': (0, 50), 't9': (0, 75),
    'v1': (0, 100),
}

# Accumulate one row mask over every thresholded column present in the frame,
# then filter once; a row survives only if all of its checked values are in range.
rows_in_range = pd.Series(True, index=df.index, dtype=bool)
for name, bounds in thresholds.items():
    if name not in df.columns:
        continue
    lower, upper = bounds
    rows_in_range &= (df[name] > lower) & (df[name] < upper)
df = df[rows_in_range]

In [14]:
# Hour-of-day feature used by the one-hot encoding of the time coefficient.
def _midnight_as_24(hour_value):
    """Return 24 for hour 0 and the hour itself otherwise."""
    if hour_value == 0:
        return 24
    return hour_value


df['datetime'] = pd.to_datetime(df['datetime'])
df['hour'] = df['datetime'].dt.hour.apply(_midnight_as_24)
# Hour of the first row of the frame.
current_hour = df['hour'].values[0]

In [15]:
# One-hot encode the hour of every row into the columns h_1 .. h_24.
# Previously the hour of the first row alone was written as 1 for all rows,
# so a frame spanning several hours got the same (wrong) hour flag everywhere.
# Each row now gets a 1 only in the column matching its own 'hour' value.
one_hot = pd.DataFrame({
    f'h_{i}': (df['hour'] == i).astype(int)
    for i in range(1, 25)
})

df = pd.concat([df, one_hot], axis=1)

In [16]:
# Keep a reference to the frame while it still has the 'datetime' column;
# it is read again when the output frame is assembled.
rw=df

In [17]:
# Remove the columns that are not passed to the model before predicting.
non_feature_columns = ['datetime', 'wellid', 'hour', 'verification_x', 'verification_y']
df = df.drop(columns=non_feature_columns)

In [19]:
# Load the previously trained model from disk.
# NOTE(review): pickle.load executes arbitrary code from the file — only load
# model files from a trusted location.
with open('CatBoost_degree_2.pkl', mode='rb') as model_file:
    loaded_lasso_degree_2 = pickle.load(model_file)

In [20]:
# Run the prediction and scale the model output by 24.
# NOTE(review): presumably converts an hourly rate to a daily one — confirm.
predictions = 24 * loaded_lasso_degree_2.predict(df)


In [None]:
# Build the DataFrame to be loaded into the database: one row per timestamp
# with the well id and the predicted gas rate.
output_columns = {
    'datetime': rw['datetime'],
    'wellid': wellid,
    'gasrate': predictions,
}
final_df = pd.DataFrame(output_columns)
print(final_df)