In [10]:
import numpy as np
import pandas as pd
from sklearn.neighbors import KDTree
from sklearn.feature_selection import SelectKBest, mutual_info_regression

Предположим, что мы исследуем объекты на территории России. Тогда логично, что чем ближе объект находится к центру (в данном случае за центр я взяла Москву), тем выше его score. Сгенерируем с помощью библиотеки numpy широту и долготу объектов и рассчитаем успешность как экспоненциальную функцию.

In [11]:
moscow_lat, moscow_lon = 55.7558, 37.6173


# координаты в пределах России
def generate_coordinates(n, lat_range=(41.2, 81.8), lon_range=(19.6, 169.6)):
    lats = np.random.uniform(lat_range[0], lat_range[1], n)
    lons = np.random.uniform(lon_range[0], lon_range[1], n)
    return lats, lons


def calculate_score(lat, lon, center_lat, center_lon, scale=1):
    distance = np.sqrt((lat - center_lat)**2 + (lon - center_lon)**2)
    score = np.exp(-distance / scale) * 10  # Масштабирование оценки
    return score

In [12]:
n_train, n_test = 6000, 2000
train_lat, train_lon = generate_coordinates(n_train)
test_lat, test_lon = generate_coordinates(n_test)

train_scores = calculate_score(train_lat, train_lon, moscow_lat, moscow_lon, scale=20)

train_data = pd.DataFrame({
    'id': range(1, n_train + 1),
    'lat': train_lat,
    'lon': train_lon,
    'score': train_scores
})

test_data = pd.DataFrame({
    'id': range(n_train + 1, n_train + n_test + 1),
    'lat': test_lat,
    'lon': test_lon
})

Чтобы данные были реалистичны, я решила взять за основу признаков крупные российские города, они имеют разные климатические условия, уровень населенности и тд. Эти факторы могут влиять на score объекта

In [13]:
cities = {
    "Москва": (55.7558, 37.6173),
    "Санкт-Петербург": (59.9343, 30.3351),
    "Новосибирск": (55.0084, 82.9357),
    "Екатеринбург": (56.8389, 60.6057),
    "Владивосток": (43.1155, 131.8855)
}

n_features = 1500
feature_columns = ['lat', 'lon'] + [f'feature_{i}' for i in range(1, 363)]

features_data = []
for city, (lat, lon) in cities.items():
    for _ in range(n_features // len(cities)):
        features = [lat + np.random.uniform(-0.1, 0.1), lon + np.random.uniform(-0.1, 0.1)] + list(np.random.rand(362))
        features_data.append(features)

features_df = pd.DataFrame(features_data, columns=feature_columns)

features_df.head()

Unnamed: 0,lat,lon,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,...,feature_353,feature_354,feature_355,feature_356,feature_357,feature_358,feature_359,feature_360,feature_361,feature_362
0,55.75097,37.528433,0.748764,0.534412,0.713952,0.413468,0.055982,0.341215,0.625171,0.993615,...,0.827515,0.932463,0.120518,0.474003,0.55052,0.982528,0.067943,0.005939,0.384342,0.742342
1,55.733319,37.527573,0.965449,0.929301,0.648328,0.515751,0.544999,0.240769,0.43242,0.334691,...,0.139803,0.035895,0.53938,0.448616,0.835377,0.517999,0.743552,0.351059,0.297493,0.170514
2,55.735547,37.560298,0.312489,0.637914,0.188771,0.017101,0.752332,0.756429,0.319997,0.108982,...,0.35419,0.723348,0.738096,0.026545,0.545133,0.968199,0.142701,0.949149,0.121588,0.072102
3,55.833085,37.593933,0.70199,0.418745,0.802447,0.400539,0.339226,0.52461,0.640888,0.431793,...,0.38455,0.831233,0.652049,0.69946,0.926062,0.187992,0.056849,0.693538,0.428142,0.822155
4,55.720236,37.588949,0.562642,0.828641,0.574713,0.948201,0.095328,0.367455,0.955967,0.307025,...,0.371109,0.515216,0.261017,0.595625,0.641022,0.388709,0.236992,0.662758,0.656691,0.567994


Для объединения признаков c тренировочной и тестовой выборками используем KD-дерево для поиска ближайших соседей, чтобы привязать наиболее релевантные признаки на основе географического положения

In [14]:
features_tree = KDTree(features_df[['lat', 'lon']])

# поиск ближайших соседей для train и test
distances, indices = features_tree.query(train_data[['lat', 'lon']], k=1)
train_features = features_df.iloc[indices.flatten()].drop(['lat', 'lon'], axis=1).reset_index(drop=True)
train_data_with_features = pd.concat([train_data.reset_index(drop=True), train_features], axis=1)

distances, indices = features_tree.query(test_data[['lat', 'lon']], k=1)
test_features = features_df.iloc[indices.flatten()].drop(['lat', 'lon'], axis=1).reset_index(drop=True)
test_data_with_features = pd.concat([test_data.reset_index(drop=True), test_features], axis=1)

print(train_data_with_features.head())

   id        lat         lon     score  feature_1  feature_2  feature_3  \
0   1  47.363091   94.791319  0.556131   0.551721   0.643978   0.094646   
1   2  41.247622  157.990729  0.023292   0.046942   0.749053   0.163850   
2   3  50.372403   82.332874  1.051967   0.155577   0.551471   0.903566   
3   4  47.378004   96.401106  0.513603   0.551721   0.643978   0.094646   
4   5  54.406730   34.885669  8.587047   0.729498   0.092639   0.254549   

   feature_4  feature_5  feature_6  ...  feature_353  feature_354  \
0   0.189212   0.340704   0.782056  ...     0.126446     0.904542   
1   0.465390   0.809968   0.639483  ...     0.282136     0.583593   
2   0.188718   0.750478   0.920645  ...     0.703121     0.475467   
3   0.189212   0.340704   0.782056  ...     0.126446     0.904542   
4   0.767611   0.416228   0.962926  ...     0.157616     0.586673   

   feature_355  feature_356  feature_357  feature_358  feature_359  \
0     0.065760     0.729403     0.246554     0.423575     0.5093

Сразу сделаем отбор признаков с помощью SelectKBest из библиотеки scikit-learn, посколько использование всех признаков может только ухудшить обобщаемость модели

In [15]:
# выберем 25 наиболее важных признаков на основе взаимной информации
selector = SelectKBest(mutual_info_regression, k=25).fit(train_data_with_features.drop(labels=["lat", "lon", "score", "id"], 
                                                                                       axis=1), train_data_with_features["score"])

train_selected = selector.transform(train_data_with_features.drop(labels=["lat", "lon", "score", "id"], axis=1))
test_selected = selector.transform(test_data_with_features.drop(labels=["lat", "lon", "id"], axis=1)) 

selected_features_indices = selector.get_support(indices=True)
selected_features_names = train_data_with_features.columns[selected_features_indices]

In [16]:
train_selected_df = pd.DataFrame(train_selected, columns=selected_features_names)
test_selected_df = pd.DataFrame(test_selected, columns=selected_features_names)

train_selected_df["id"] = train_data_with_features["id"].values
test_selected_df["id"] = test_data_with_features["id"].values
train_selected_df["lat"] = train_data_with_features["lat"].values
test_selected_df["lat"] = test_data_with_features["lat"].values
train_selected_df["lon"] = train_data_with_features["lon"].values
test_selected_df["lon"] = test_data_with_features["lon"].values
train_selected_df["score"] = train_data_with_features["score"].values

In [17]:
train_selected_df, test_selected_df

(            lat  feature_9  feature_32  feature_41  feature_48  feature_65  \
 0     47.363091   0.645621    0.684492    0.725729    0.532188    0.748397   
 1     41.247622   0.725057    0.156720    0.571495    0.510533    0.339930   
 2     50.372403   0.666771    0.428295    0.837618    0.292926    0.969829   
 3     47.378004   0.645621    0.684492    0.725729    0.532188    0.748397   
 4     54.406730   0.794982    0.485213    0.329114    0.926661    0.791215   
 ...         ...        ...         ...         ...         ...         ...   
 5995  56.812481   0.994350    0.647559    0.401666    0.368913    0.362023   
 5996  52.231624   0.784893    0.625429    0.007308    0.065225    0.123571   
 5997  67.751646   0.662270    0.894095    0.597434    0.375843    0.993707   
 5998  43.673305   0.645621    0.684492    0.725729    0.532188    0.748397   
 5999  81.478674   0.072911    0.919788    0.950792    0.493826    0.941707   
 
       feature_69  feature_72  feature_89  feature

In [21]:
df_population = pd.read_excel('dem.xlsx')
df_population = pd.DataFrame(df_population, columns=['Субъект', 'Все население', 'Широта', 'Долгота'])
print(df_population)

                         Субъект  Все население   Широта   Долгота
0           Белгородская область        1514527  50.5900   36.5900
1               Брянская область        1152505  52.9600   32.4000
2           Владимирская область        1325510  55.9670   40.6250
3            Воронежская область        2285282  51.6800   39.2300
4             Ивановская область         914725  57.0100   41.3100
..                           ...            ...      ...       ...
81              Амурская область         756198  50.2907  127.5272
82           Магаданская область         134315  59.5682  150.8085
83           Сахалинская область         460535  46.9591  142.7380
84  Еврейская автономная область         147458  48.7946  132.9218
85    Чукотский автономный округ          47840  64.7337  177.5110

[86 rows x 4 columns]


https://rosstat.gov.ru/compendium/document/13282
Добавим данные населения для областей. 

In [22]:
df_population.columns = ['subject', 'population', 'lat', 'lon']
df_population

Unnamed: 0,subject,population,lat,lon
0,Белгородская область,1514527,50.5900,36.5900
1,Брянская область,1152505,52.9600,32.4000
2,Владимирская область,1325510,55.9670,40.6250
3,Воронежская область,2285282,51.6800,39.2300
4,Ивановская область,914725,57.0100,41.3100
...,...,...,...,...
81,Амурская область,756198,50.2907,127.5272
82,Магаданская область,134315,59.5682,150.8085
83,Сахалинская область,460535,46.9591,142.7380
84,Еврейская автономная область,147458,48.7946,132.9218


In [26]:
tree_pop = KDTree(df_population[['lat', 'lon']])

distances, indices = tree_pop.query(train_selected_df[['lat', 'lon']], k=1)
train_pop = df_population.iloc[indices.flatten()].drop(['lat', 'lon'], axis=1).reset_index(drop=True)
train_data_with_population = pd.concat([train_selected_df.reset_index(drop=True), train_pop], axis=1)
train_data_with_population.drop(labels=['subject'], axis=1, inplace=True)

distances, indices = tree_pop.query(test_selected_df[['lat', 'lon']], k=1)
test_pop = df_population.iloc[indices.flatten()].drop(['lat', 'lon'], axis=1).reset_index(drop=True)
test_data_with_population = pd.concat([test_selected_df.reset_index(drop=True), test_pop], axis=1)
test_data_with_population.drop(labels=['subject'], axis=1, inplace=True)

print(train_data_with_population.head())

         lat  feature_9  feature_32  feature_41  feature_48  feature_65  \
0  47.363091   0.645621    0.684492    0.725729    0.532188    0.748397   
1  41.247622   0.725057    0.156720    0.571495    0.510533    0.339930   
2  50.372403   0.666771    0.428295    0.837618    0.292926    0.969829   
3  47.378004   0.645621    0.684492    0.725729    0.532188    0.748397   
4  54.406730   0.794982    0.485213    0.329114    0.926661    0.791215   

   feature_69  feature_72  feature_89  feature_90  ...  feature_197  \
0    0.895157    0.061549    0.316922    0.212827  ...     0.739310   
1    0.663991    0.855951    0.588857    0.452358  ...     0.550212   
2    0.446264    0.947559    0.322451    0.288618  ...     0.562044   
3    0.895157    0.061549    0.316922    0.212827  ...     0.739310   
4    0.328090    0.896639    0.740668    0.362391  ...     0.592237   

   feature_224  feature_238  feature_248  feature_263  feature_312  id  \
0     0.439354     0.283354     0.063036     0.4

In [27]:
train_data_with_population.to_csv('train.csv', index=False)
test_data_with_population.to_csv('test.csv', index=False)
features_df.to_csv('features.csv', index=False)