In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.preprocessing import StandardScaler

In [None]:
df_train = pd.read_csv('train_oil.csv')
df_test = pd.read_csv('oil_test.csv')

In [None]:
df_train.head()

Unnamed: 0,Field name,Reservoir unit,Country,Region,Basin name,Tectonic regime,Latitude,Longitude,Operator company,Onshore/Offshore,Hydrocarbon type,Reservoir status,Structural setting,Depth,Reservoir period,Lithology,Thickness (gross average ft),Thickness (net pay average ft),Porosity,Permeability
0,ZHIRNOV,MELEKESKIAN,RUSSIA,FORMER SOVIET UNION,VOLGA-URAL,COMPRESSION/EVAPORITE,51.0,44.8042,NIZHNEVOLZHSKNET,ONSHORE,OIL,DECLINING PRODUCTION,FORELAND,1870,CARBONIFEROUS,SANDSTONE,262.0,33.0,24.0,30.0
1,LAGOA PARDA,LAGOA PARDA (URUCUTUCA),BRAZIL,LATIN AMERICA,ESPIRITO SANTO,EXTENSION,-19.6017,-39.8332,PETROBRAS,ONSHORE,OIL,NEARLY DEPLETED,PASSIVE MARGIN,4843,PALEOGENE,SANDSTONE,2133.0,72.0,23.0,350.0
2,ABQAIQ,ARAB D,SAUDI ARABIA,MIDDLE EAST,THE GULF,COMPRESSION/EVAPORITE,26.08,49.81,SAUDI ARAMCO,ONSHORE,OIL,REJUVENATING,FORELAND,6050,JURASSIC,LIMESTONE,250.0,184.0,21.0,410.0
3,MURCHISON,BRENT,UK /NORWAY,EUROPE,NORTH SEA NORTHERN,EXTENSION,61.3833,1.75,CNR,OFFSHORE,OIL,NEARLY DEPLETED,RIFT,8988,JURASSIC,SANDSTONE,425.0,300.0,22.0,750.0
4,WEST PEMBINA,NISKU (PEMBINA L POOL),CANADA,NORTH AMERICA,WESTERN CANADA,COMPRESSION,53.2287,-115.8008,NUMEROUS,ONSHORE,OIL,UNKNOWN,FORELAND,9306,DEVONIAN,DOLOMITE,233.0,167.0,11.8,1407.0


In [None]:
unique_counts = df_train.nunique()
print("Количество уникальных значений по столбцам:")
print(unique_counts)

Количество уникальных значений по столбцам:
Field name                        285
Reservoir unit                    258
Country                            42
Region                              7
Basin name                         93
Tectonic regime                    55
Latitude                          262
Longitude                         260
Operator company                  138
Onshore/Offshore                    3
Hydrocarbon type                    4
Reservoir status                   13
Structural setting                 43
Depth                             279
Reservoir period                   22
Lithology                          13
Thickness (gross average ft)      166
Thickness (net pay average ft)    166
Porosity                           68
Permeability                      151
dtype: int64


In [None]:
symbol = '/'
result = []

for col in df_train.columns:
    if df_train[col].astype(str).str.contains(symbol, na=False).any():
        result.append(col)

print(result)

find_cols_with_symbol = lambda df_train, sym: [col for col in df_train.columns
                                        if df_train[col].astype(str).str.contains(sym, na=False).any()]

['Reservoir unit', 'Country', 'Basin name', 'Tectonic regime', 'Operator company', 'Structural setting']


In [None]:
for col in df_train.select_dtypes(include=['object']).columns:
    df_train[col] = df_train[col].fillna('Unknown')

for col in df_test.select_dtypes(include=['object']).columns:
    df_test[col] = df_test[col].fillna('Unknown')

df_test.shape

(133, 19)

In [None]:
mapping = {
    'OFFSHORE': 0,
    'ONSHORE': 1,
    'ONSHORE-OFFSHORE': 2
}

df_train['Onshore/Offshore num'] = df_train['Onshore/Offshore'].map(mapping)

In [None]:
df_train['Onshore/Offshore'].value_counts()

Unnamed: 0_level_0,count
Onshore/Offshore,Unnamed: 1_level_1
ONSHORE,218
OFFSHORE,86
ONSHORE-OFFSHORE,5


In [None]:
df_train['Onshore/Offshore num'].value_counts()

Unnamed: 0_level_0,count
Onshore/Offshore num,Unnamed: 1_level_1
1,218
0,86
2,5


In [None]:
df_train['Hydrocarbon type num'] = pd.factorize(df_train['Hydrocarbon type'])[0]
df_train['Reservoir status num'] = pd.factorize(df_train['Reservoir status'])[0]

df_train['Lithology num'] = pd.factorize(df_train['Lithology'])[0]
df_train['Reservoir period num'] = pd.factorize(df_train['Reservoir period'])[0]

df_test['Hydrocarbon type num'] = pd.factorize(df_test['Hydrocarbon type'])[0]
df_test['Reservoir status num'] = pd.factorize(df_test['Reservoir status'])[0]

df_test['Lithology num'] = pd.factorize(df_test['Lithology'])[0]
df_test['Reservoir period num'] = pd.factorize(df_test['Reservoir period'])[0]

In [None]:
for col in ['Tectonic regime', 'Basin name', 'Structural setting']:
  df_train[col] = df_train[col].fillna('Unknown')
  df_train[col] = df_train[col].fillna('Unknown')

tmp_test_t = df_test['Tectonic regime'].str.get_dummies(sep='/')
tmp_test_t = tmp_test_t.add_prefix('Tectonic regime_').reset_index(drop=True)

tmp_test_b = df_test['Basin name'].str.get_dummies(sep='/')
tmp_test_b = tmp_test_b.add_prefix('Basin name_').reset_index(drop=True)

tmp_test_s = df_test['Structural setting'].str.get_dummies(sep='/')
tmp_test_s = tmp_test_s.add_prefix('Structural setting_').reset_index(drop=True)

In [None]:
tmp_t = df_train['Tectonic regime'].str.get_dummies(sep='/')
tmp_t = tmp_t.add_prefix('Tectonic regime_').reset_index(drop=True)

cols_t = [col for col in tmp_t if col.startswith('Tectonic regime_')]
print(cols_t)
print(len(cols_t))

for col in tmp_t[cols_t]:
  if col not in tmp_test_t:
    cols_t.remove(col)

print(cols_t)
print(len(cols_t))

df_train = pd.concat([df_train, tmp_t[cols_t]], axis=1)
df_test = pd.concat([df_test, tmp_test_t[cols_t]], axis=1)

['Tectonic regime_BASEMENT-I', 'Tectonic regime_COMPRESSION', 'Tectonic regime_DIAPIR', 'Tectonic regime_EROSION', 'Tectonic regime_EVAPORITE', 'Tectonic regime_EXTENSION', 'Tectonic regime_GRAVITY', 'Tectonic regime_INVERSION', 'Tectonic regime_LINKED', 'Tectonic regime_REACTIVATION', 'Tectonic regime_SHALE', 'Tectonic regime_STRIKE-SLIP', 'Tectonic regime_SYNSEDIMENTATION', 'Tectonic regime_TRANSPRESSION', 'Tectonic regime_TRANSTENSION', 'Tectonic regime_UPLIFT']
16
['Tectonic regime_BASEMENT-I', 'Tectonic regime_COMPRESSION', 'Tectonic regime_DIAPIR', 'Tectonic regime_EROSION', 'Tectonic regime_EVAPORITE', 'Tectonic regime_EXTENSION', 'Tectonic regime_GRAVITY', 'Tectonic regime_INVERSION', 'Tectonic regime_LINKED', 'Tectonic regime_SHALE', 'Tectonic regime_STRIKE-SLIP', 'Tectonic regime_SYNSEDIMENTATION', 'Tectonic regime_TRANSPRESSION', 'Tectonic regime_TRANSTENSION', 'Tectonic regime_UPLIFT']
15


In [None]:
tmp_b = df_train['Basin name'].str.get_dummies(sep='/')
tmp_b = tmp_b.add_prefix('Basin name_').reset_index(drop=True)

cols_b = [col for col in tmp_t if col.startswith('Basin name_')]
print(cols_b)
print(len(cols_b))

for col in tmp_b[cols_b]:
  if col not in tmp_test_b:
    cols_b.remove(col)

print(cols_b)
print(len(cols_b))

df_train = pd.concat([df_train, tmp_b[cols_b]], axis=1)
df_test = pd.concat([df_test, tmp_test_b[cols_b]], axis=1)

[]
0
[]
0


In [None]:
tmp_s = df_train['Structural setting'].str.get_dummies(sep='/')
tmp_s = tmp_s.add_prefix('Structural setting_').reset_index(drop=True)

cols_s = [col for col in tmp_s if col.startswith('Structural setting_')]
print(cols_s)
print(len(cols_s))

for col in tmp_s[cols_s]:
  if col not in tmp_test_s:
    cols_s.remove(col)

print(cols_s)
print(len(cols_s))

df_train = pd.concat([df_train, tmp_s[cols_s]], axis=1)
df_test = pd.concat([df_test, tmp_test_s[cols_s]], axis=1)

['Structural setting_BACKARC', 'Structural setting_DELTA', 'Structural setting_FOREARC', 'Structural setting_FORELAND', 'Structural setting_INTRACRATONIC', 'Structural setting_INVERSION', 'Structural setting_PASSIVE MARGIN', 'Structural setting_RIFT', 'Structural setting_SALT', 'Structural setting_SUB-SALT', 'Structural setting_SUB-THRUST', 'Structural setting_THRUST', 'Structural setting_WRENCH']
13
['Structural setting_BACKARC', 'Structural setting_DELTA', 'Structural setting_FOREARC', 'Structural setting_FORELAND', 'Structural setting_INTRACRATONIC', 'Structural setting_INVERSION', 'Structural setting_PASSIVE MARGIN', 'Structural setting_RIFT', 'Structural setting_SALT', 'Structural setting_SUB-SALT', 'Structural setting_SUB-THRUST', 'Structural setting_THRUST', 'Structural setting_WRENCH']
13


In [None]:
tect_reg = [col for col in df_train.columns if col.startswith('Tectonic regime_')]
str_set = [col for col in df_train.columns if col.startswith('Structural setting_')]
b_name = [col for col in df_train.columns if col.startswith('Basin name_')]

In [None]:
features = ['Thickness (net pay average ft)', 'Longitude', 'Permeability'] + tect_reg + b_name + str_set #убрать стр сет?

X = df_train[features]
y = df_train['Onshore/Offshore num']

X_filled = X.fillna(X.median())

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_filled)

X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y,
    test_size=0.3,
    random_state=42,
    stratify=y
)
model = LogisticRegression(
    max_iter=2000,
    solver='newton-cg',
    multi_class='multinomial',
    random_state=42
)

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

f1 = f1_score(y_test, y_pred, average='weighted')
print(f'F1-score: {f1:.4f}')

F1-score: 0.7643




In [None]:
X_train = df_train[features]
X_test = df_test[features]
X_train_filled = X_train.fillna(X_train.median())
X_test_filled = X_test.fillna(X_test.median())

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_train_filled)
X_test_scaled = scaler.transform(X_test_filled)

model = LogisticRegression(
    max_iter=2000,
    solver='newton-cg',
    random_state=42
)

model.fit(X_scaled, y)

y_test_pred = model.predict(X_test_scaled)

In [None]:
df_vanga = pd.DataFrame({
    'index': df_test.index,
    'Onshore/Offshore': y_test_pred})
df_vanga['Onshore/Offshore'].value_counts()

Unnamed: 0_level_0,count
Onshore/Offshore,Unnamed: 1_level_1
1,91
0,41
2,1


In [None]:
import zipfile
from IPython.display import FileLink
df_vanga.to_csv('prediction_dt_5.csv', index=False)
with zipfile.ZipFile('prediction_dz5.zip', 'w') as zipf:
    zipf.write('prediction_dt_5.csv')