In [1]:
from functools import reduce
import re
import pandas as pd
import numpy as np
from scipy.spatial.distance import mahalanobis
from scipy.optimize import linear_sum_assignment
import warnings

warnings.filterwarnings('ignore')

In [2]:
# Raw inputs: the protest-event list (events1) and the municipality-by-year
# statistics table (events2).
events2 = pd.read_csv('росстат_общий_new1.csv')
events1 = pd.read_csv('новые_протесты_рус3.csv')

# Unparseable dates become NaT (and therefore a missing year) instead of raising.
event_dates = pd.to_datetime(events1['Date'], errors='coerce')
events1 = events1.assign(**{
    'Date': event_dates,
    'year': event_dates.dt.year,
    # Every row of the event list is an event: replace the column with a constant 1 marker.
    'Event text': 1,
})

# Manual fixes for rows of the event table, keyed by the municipality name as
# it is spelled there. Each value is (region_name, municipality, oktmo) and is
# written onto every row whose 'municipality' equals the key, so that the row
# lines up with the statistics table on the merge keys
# (municipality, region_name, year).
# Note: the lookup uses the municipality name only, not the region.
corrections = {
    "Кемерово":        ("Кемеровская",     "Кемеровский",   32701000),
    "Пермь":           ("Пермский",        "Пермский",      57701000),
    "Петрозаводск":    ("Карелия",         "Петрозаводский",86701000),
    "Казань":          ("Татарстан",       "Казань",        92701000),
    "Владивосток":     ("Приморский",      "Владивостокский",5701000),
    "Ростов":          ("Ярославская",     "Ростовский",    78637000),
    "Нижний Новгород": ("Нижегородская",   "Нижний Новгород",22701000),
    "Орёл":            ("Орловская",       "Орёл",          54701000),
    "Якутск":          ("Саха",            "Якутск",        98701000),
    "Барнаул":         ("Алтайский",       "Барнаул",       1701000),
    "Челябинск":       ("Челябинская",     "Челябинский",   75701000),
    "Магас":           ("Ингушетия",       "Магас",         26701000),
    "Черкесск":        ("Карачаево-Черкесская","Черкесский",91701000),
    "Элиста":          ("Калмыкия",        "Элиста",        85701000),
    "Тула":            ("Тульская",        "Тула",          70701000),
    "Кострома":        ("Костромская",     "Кострома",      34701000),
    "Грозный":         ("Чеченская",       "Грозный",       96701000),
    "Саранск":         ("Мордовия",        "Саранск",       89701000),
    "Волчанск":        ("Свердловская",    "Волчанский",    65735000),
    "Сочи":            ("Краснодарский",   "Сочи",          3726000),
    "Сыктывкар":       ("Коми",            "Сыктывкар",     87701000),
    "Магнитогорск":    ("Челябинская",     "Магнитогорский",75738000),
    "Чита":            ("Забайкальский",   "Чита",          76701000),
    "Невьянск":        ("Свердловская",    "Невьянский",    65714000),
    "Кормиловка":      ("Омская",          "Кормиловский",  52623000),
    "Златоуст":        ("Челябинская",     "Златоустовский",75712000),
    "Вологда":         ("Вологодская",     "Вологда",       19701000),
    "Киров-Чепецк":    ("Кировская",       "Кирово-Чепецк", 33707000),
    "Ростов-на-Дону":  ("Ростовская",      "Ростов-на-Дону",60701000),
    "Реж":             ("Свердловская",    "Режевской",     65720000),
    "Качканар":        ("Свердловская",    "Качканарский",  65743000),
    "Нижний Тагил":    ("Свердловская",    "Нижний Тагил",  65751000),
    "Бакал":           ("Башкортостан",    "Бакалинский",   80607000),
}

# 3) Apply the corrections: every event row whose municipality equals a key
#    receives the corrected region name, municipality name and OKTMO code.
for wrong_name, (fixed_region, fixed_municipality, fixed_oktmo) in corrections.items():
    # The row selection is taken before 'municipality' itself is overwritten.
    rows_to_fix = events1['municipality'].eq(wrong_name)
    replacements = {
        'region_name': fixed_region,
        'municipality': fixed_municipality,
        'oktmo': fixed_oktmo,
    }
    for column, value in replacements.items():
        events1.loc[rows_to_fix, column] = value





# Attach protest events to the statistics table. The left join keeps every
# municipality-year row of events2: rows without an event get NaN in the event
# columns, and a municipality-year with several events is repeated per event.
merge_keys = ['municipality', 'region_name', 'year']
event_columns = ['region_name', 'municipality', 'year', '# protesters', 'Event text']
df = pd.merge(events2, events1[event_columns], on=merge_keys, how='left')

# Drop the rows of 'Москва' and 'Санкт-Петербург'.
# (The equivalent filter for 'Московская' is intentionally disabled.)
excluded_regions = ['Москва', 'Санкт-Петербург']
df = df[~df['region_name'].isin(excluded_regions)].reset_index(drop=True)

df.to_csv('общие_данные_2010t3+.csv', index=False)
df

Unnamed: 0,year,region_name,municipality,oktmo,Жилье (1000 м2),Дороги (%),Канализация (1 м),Водопровод (1 метр),Население,Прирост,...,Рождаемость,Смертность,Родившихся,Инвестиции,СреднегодН,Квартиры,Зарплата,Земля,# protesters,Event text
0,2006,Тверская,,28602100,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,
1,2006,Тверская,,28602400,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,
2,2006,Тверская,,28604100,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,
3,2006,Тверская,,28604400,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,
4,2006,Тверская,,28606100,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
625532,2024,Калмыкия,Яшкульский,85654000,0.0,0.0,0.0,0.0,15057.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,
625533,2024,Курская,им,38621153,0.0,0.0,0.0,0.0,7684.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,
625534,2024,Хабаровский,имени Лазо,8624000,0.0,0.0,0.0,0.0,37861.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,
625535,2024,Хабаровский,имени Полины Осипенко,8637000,0.0,0.0,0.0,0.0,3532.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,


In [3]:
events2 = pd.read_csv('общие_данные_2010t3+.csv')

# Rows that carry a protest event (the merge step marks them with 'Event text' == 1).
protests = events2[events2['Event text'] == 1]

# Protest period used to define the treated municipalities.
protest_years = [2011, 2012, 2013]

# A unit is identified by (region_name, municipality): this is the key the
# events were merged on and the key the matching step de-duplicates on.
# Grouping by the bare municipality name would merge same-named municipalities
# from different regions into a single unit.
unit_key = ['region_name', 'municipality']

# Stable sort by year so that the row kept by drop_duplicates() below is the
# row of the FIRST protest year, whatever the row order of the input file.
protests_0910 = (
    protests[protests['year'].isin(protest_years)]
    .sort_values('year', kind='mergesort')
    .copy()
)

# First protest year of each unit within the protest period.
protests_0910['fprotets_year'] = protests_0910.groupby(unit_key)['year'].transform('min')

# One row per unit (fprotets_year is already filled on every row of the unit).
protests1 = protests_0910.drop_duplicates(subset=unit_key).reset_index(drop=True)

protests.to_csv('протесты_2010t3+.csv', index=False)
protests1.to_csv('протесты2009_2013_2010t3+.csv', index=False)
protests1

Unnamed: 0,year,region_name,municipality,oktmo,Жилье (1000 м2),Дороги (%),Канализация (1 м),Водопровод (1 метр),Население,Прирост,...,Смертность,Родившихся,Инвестиции,СреднегодН,Квартиры,Зарплата,Земля,# protesters,Event text,fprotets_year
0,2011,Архангельская,Архангельск,11701000,538.2,2.94,2400.0,5900.0,355623.0,-0.7,...,11.8,3957.0,29587.00,356073.0,417.00,27871.7,29445.0,200.0,1.0,2011
1,2011,Астраханская,Астрахань,12701000,1655.9,38.00,2000.0,4000.0,520399.0,0.0,...,12.6,6832.0,36580.90,522893.0,401.81,20591.8,20869.0,0.0,1.0,2011
2,2011,Алтайский,Барнаул,1701000,86.0,0.00,100.0,5400.0,671201.0,0.0,...,11.5,8007.0,24189.00,676332.0,397.10,18573.8,93950.0,0.0,1.0,2011
3,2011,Белгородская,Белгород,14701000,139.1,0.00,0.0,100.0,357655.0,0.0,...,11.0,4131.0,50976.60,361882.0,403.00,21462.1,15310.0,0.0,1.0,2011
4,2011,Амурская,Благовещенск,10701000,96.4,0.00,600.0,500.0,219818.0,0.0,...,11.0,2662.0,114303.00,220477.0,391.20,27241.4,32097.0,1.0,1.0,2011
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
58,2012,Смоленская,Смоленск,66701000,48.5,0.00,600.0,2300.0,329944.0,-2.7,...,13.6,3591.0,16827.00,330457.0,0.00,22546.1,16635.0,40.0,1.0,2012
59,2013,Липецкая,Липецк,42701000,68.2,48.29,100.0,1000.0,509098.0,-1.6,...,12.7,5645.0,44511.00,509409.0,0.00,27935.5,33015.0,0.0,1.0,2013
60,2013,Ингушетия,Магас,26701000,0.0,0.00,0.0,500.0,4106.0,2.9,...,0.5,15.0,0.00,4431.0,0.00,0.0,1262.6,1000.0,1.0,2013
61,2013,Адыгея,Майкоп,79701000,2.5,0.00,0.0,2740.0,167559.0,-0.3,...,12.9,2120.0,10137.49,167590.0,0.00,22334.4,28220.0,1.0,1.0,2013


In [4]:
# Reload the full panel and the protest rows written by the earlier cells.
sobytia = pd.read_csv("общие_данные_2010t3+.csv")
lol = pd.read_csv("протесты_2010t3+.csv")

# Control pool: every panel row whose OKTMO code never occurs among the
# protest rows (in any year).
has_protest = sobytia['oktmo'].isin(lol['oktmo'].unique())
noprotestALL = sobytia.loc[~has_protest].reset_index(drop=True)

noprotestALL.to_csv('единицы_без_протестов_2010t3+.csv', index=False)
noprotestALL

Unnamed: 0,year,region_name,municipality,oktmo,Жилье (1000 м2),Дороги (%),Канализация (1 м),Водопровод (1 метр),Население,Прирост,...,Рождаемость,Смертность,Родившихся,Инвестиции,СреднегодН,Квартиры,Зарплата,Земля,# protesters,Event text
0,2006,Тверская,,28602100,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,
1,2006,Тверская,,28602400,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,
2,2006,Тверская,,28604100,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,
3,2006,Тверская,,28604400,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,
4,2006,Тверская,,28606100,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
619604,2024,Калмыкия,Яшкульский,85654000,0.0,0.0,0.0,0.0,15057.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,
619605,2024,Курская,им,38621153,0.0,0.0,0.0,0.0,7684.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,
619606,2024,Хабаровский,имени Лазо,8624000,0.0,0.0,0.0,0.0,37861.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,
619607,2024,Хабаровский,имени Полины Осипенко,8637000,0.0,0.0,0.0,0.0,3532.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,


In [5]:
# df = pd.read_csv('протесты_2010t3+.csv')


# # 3. Преобразуем количество протестующих в числовой формат
# df['# protesters'] = pd.to_numeric(df['# protesters'], errors='coerce').fillna(0)

# # 4. Фильтруем данные по годам 2009-2010 и 2007
# df_2009_2010 = df[df['year'].isin([2011, 2012, 2013])]
# df_2007 = df[df['year'] == 2010]

# # 5. Вычисляем среднее количество протестующих для периода 2009-2010
# mean_2009_2010 = df_2009_2010.groupby('municipality', as_index=False)['# protesters'].mean()
# mean_2009_2010.rename(columns={'# protesters': 'mean_protesters_2009_2010'}, inplace=True)

# # 6. Вычисляем среднее количество протестующих за 2007 год
# mean_2007 = df_2007.groupby('municipality', as_index=False)['# protesters'].mean()
# mean_2007.rename(columns={'# protesters': 'mean_protesters_2007'}, inplace=True)

# # 7. Объединяем результаты по municipal (oktmo)
# merged = pd.merge(mean_2009_2010, mean_2007, on='municipality', how='inner')

# # 8. Фильтруем муниципалитеты, где среднее количество протестующих за 2009-2010 больше, чем за 2007
# result = merged[merged['mean_protesters_2009_2010'] > merged['mean_protesters_2007']]

# # 9. Получаем исходные данные для выбранных муниципалитетов и удаляем дубликаты
# final_df = pd.merge(result['municipality'], df_2009_2010, on='municipality', how='inner').drop_duplicates('municipality').reset_index(drop=True)

# # 10. Сохраняем результаты в новый файл
# final_df.to_csv('без_протестов2009_2010t3+.csv', index=False)
# final_df



In [None]:
# --- Scoring configuration ---
PRE_YEARS = [2010]                  # pre-protest baseline period
POST_YEARS = [2011, 2012, 2013]     # protest period
MIN_POST_EVENTS = 1                 # minimum number of events in POST_YEARS to be kept
TARGET = 100                        # number of top-scored municipalities considered

df = pd.read_csv("протесты_2010t3+.csv")

# Coerce to numbers: non-numeric years become <NA>, non-numeric or missing
# participant counts become 0.
df['year'] = pd.to_numeric(df['year'], errors='coerce').astype('Int64')
df['# protesters'] = pd.to_numeric(df['# protesters'], errors='coerce').fillna(0)

# A row counts as an event when it is flagged in 'Event text' or reports at
# least one participant.
flagged = df.get('Event text', np.nan) == 1
attended = df['# protesters'] > 0
df['event_flag'] = (flagged | attended).astype(int)

#  Агрегаты 
def aggregate_for_years(sub_years, prefix, data=None):
    """Summarise protest activity per municipality over a set of years.

    Parameters
    ----------
    sub_years : list-like
        Years to keep; matched against the ``year`` column.
    prefix : str
        Prefix of the output column names (e.g. ``'pre'`` or ``'post'``).
    data : pandas.DataFrame, optional
        Frame with the columns ``municipality``, ``year``, ``event_flag`` and
        ``# protesters``. When omitted, the notebook-level ``df`` is used, as
        before. Passing the frame explicitly avoids depending on whichever
        frame the global name ``df`` currently points to.

    Returns
    -------
    pandas.DataFrame
        One row per municipality with the columns ``municipality``,
        ``{prefix}_events`` (sum of ``event_flag``), ``{prefix}_sum`` and
        ``{prefix}_mean`` (sum and mean of ``# protesters``).

    Notes
    -----
    Rows are grouped by the municipality name only, so municipalities that
    share a name are aggregated together.
    """
    source = df if data is None else data
    in_period = source[source['year'].isin(sub_years)]
    return (
        in_period.groupby('municipality', as_index=False)
        .agg(**{
            f'{prefix}_events': ('event_flag', 'sum'),
            f'{prefix}_sum':    ('# protesters', 'sum'),
            f'{prefix}_mean':   ('# protesters', 'mean'),
        })
    )

pre = aggregate_for_years(PRE_YEARS, 'pre')
post = aggregate_for_years(POST_YEARS, 'post')

# Outer join: a municipality missing from one period gets zeros for that period.
agg = pre.merge(post, on='municipality', how='outer').fillna(0)

# Score: rewards growth of the mean turnout and the total post-period turnout,
# penalises the total pre-period turnout.
growth_term = 0.50 * np.log((agg['post_mean'] + 1) / (agg['pre_mean'] + 1))
volume_term = 0.30 * np.log1p(agg['post_sum'])
baseline_term = 0.20 * np.log1p(agg['pre_sum'])
agg['score'] = growth_term + volume_term - baseline_term

# Keep only municipalities with enough events in the protest period.
agg = agg.loc[agg['post_events'].ge(MIN_POST_EVENTS)]

# Take the TARGET best-scored municipalities, then drop negative scores.
ranked = agg.sort_values('score', ascending=False).head(TARGET)
top_munis = ranked.loc[ranked['score'] >= 0, 'municipality'].unique()

# One representative protest-period row per selected municipality.
df_post = df[df['year'].isin(POST_YEARS)]
is_selected = df_post['municipality'].isin(top_munis)
final_df = (
    df_post.loc[is_selected]
    .drop_duplicates(subset='municipality')
    .reset_index(drop=True)
)

print(f"Выбрано {final_df['municipality'].nunique()} муниципалитетов.")
final_df.to_csv('без_протестов2009_2010t3+.csv', index=False)
final_df


Выбрано 42 муниципалитетов.


Unnamed: 0,year,region_name,municipality,oktmo,Жилье (1000 м2),Дороги (%),Канализация (1 м),Водопровод (1 метр),Население,Прирост,...,Смертность,Родившихся,Инвестиции,СреднегодН,Квартиры,Зарплата,Земля,# protesters,Event text,event_flag
0,2011,Астраханская,Астрахань,12701000,1655.9,38.0,2000.0,4000.0,520399.0,0.0,...,12.6,6832.0,36580.9,522893.0,401.81,20591.8,20869.0,0.0,1.0,1
1,2011,Белгородская,Белгород,14701000,139.1,0.0,0.0,100.0,357655.0,0.0,...,11.0,4131.0,50976.6,361882.0,403.0,21462.1,15310.0,0.0,1.0,1
2,2011,Брянская,Брянск,15701000,98.2,0.0,1200.0,2500.0,434560.0,0.0,...,13.8,4531.0,10866.6,433297.0,439.0,18135.8,18643.0,0.0,1.0,1
3,2011,Приморский,Владивостокский,5701000,99.4,0.0,0.0,2568.5,616009.0,0.0,...,11.7,6498.0,47239.6,619351.0,430.8,31013.2,56154.0,100.0,1.0,1
4,2011,Волгоградская,Волгоград,18701000,352.1,30.01,545.1,19324.3,1020862.0,0.0,...,13.0,10444.0,25175.5,1019777.0,386.4,20260.5,85935.3,20.0,1.0,1
5,2011,Вологодская,Вологда,19701000,266.0,0.0,67.0,1070.0,310033.0,0.0,...,13.0,3994.0,35953.0,311226.0,417.0,23378.3,11573.0,0.0,1.0,1
6,2011,Воронежская,Воронеж,20701000,182.6,0.0,500.0,3500.0,979511.0,0.0,...,13.0,10191.0,31089.7,985390.0,446.23,20660.4,59651.0,0.0,1.0,1
7,2011,Свердловская,Екатеринбург,65701000,258.0,0.0,500.0,4800.0,1386242.0,0.0,...,11.5,18317.0,85296.0,1398689.0,391.77,29580.9,114289.0,0.0,1.0,1
8,2011,Удмуртская,Ижевск,94701000,44.3,64.61,600.0,2000.0,627917.0,0.0,...,11.6,0.0,31522.3,628495.0,378.2,19553.3,31515.0,1.0,1.0,1
9,2011,Татарстан,Казань,92701000,413.4,0.0,600.0,8300.0,1145435.0,0.0,...,11.9,16397.0,51890.0,1153366.0,407.79,21905.3,61416.0,100.0,1.0,1


ВТОРОЙ МАТЧИНГ

In [7]:
def check_years_completeness(oktmo, df_events_full, required_years):
    """Tell whether a unit has panel rows for every required year.

    Parameters
    ----------
    oktmo
        Code compared against the ``oktmo`` column.
    df_events_full : pandas.DataFrame
        Frame with ``oktmo`` and ``year`` columns.
    required_years : iterable
        Years that must all occur among the unit's rows.

    Returns
    -------
    bool
        True when every required year occurs for ``oktmo``.
    """
    rows_for_unit = df_events_full['oktmo'] == oktmo
    years_present = set(df_events_full.loc[rows_for_unit, 'year'].unique())
    return years_present >= set(required_years)


# Inputs of the matching step: the full panel, the protest-free pool, and the
# protest municipalities (one row per region/municipality).
df_events_full = pd.read_csv("общие_данные_2010t3+.csv")
df_events = pd.read_csv("единицы_без_протестов_2010t3+.csv")
df_protests = (
    pd.read_csv("протесты2009_2013_2010t3+.csv")
    .drop_duplicates(subset=['region_name', 'municipality'])
)

# Years for which a candidate must have rows in the full panel.
required_years = {2010.0, 2014.0, 2015.0, 2016.0}

# Covariates used for the Mahalanobis distance.
features = [
    'ПотреблениеЭ',
    'ПотреблениеВ',
    'Бюджет',
    'БезДорог',
    'Земля',
    # Currently disabled covariates:
    # 'Квартиры', 'Население', 'Прибыль', 'Зарплата', 'Расходы',
    # 'СреднегодН', 'ПотреблениеТ', 'ЭконАктив', 'Прирост',
    # 'Рождаемость', 'Смертность', 'Родившихся', 'Инвестиции',
]

# Identifier columns and the columns carried through to the matched output.
id_columns = ['region_name', 'municipality', 'oktmo']
carried_columns = ["Жилье (1000 м2)", "Дороги (%)", "Канализация (1 м)", "Водопровод (1 метр)"]

# Candidates are taken from the protest-period rows (one row per year).
in_protest_period = df_events['year'].isin([2011, 2012, 2013])
df_events = (
    df_events.loc[in_protest_period, id_columns + carried_columns + features]
    .reset_index(drop=True)
)
df_protests = (
    df_protests[id_columns + ['fprotets_year'] + carried_columns + features]
    .reset_index(drop=True)
)


# Maps a protest municipality name to the names of municipalities that must
# not be used as its match. The mapping is flattened into (protest, candidate)
# pairs and checked by exact string equality during matching, so a name that
# is spelled differently from the data never excludes anything.
# Presumably these are units adjacent to the protest municipality — TODO confirm.
exclude_dict = {
        'Брянск': ['Брянский', 'Сельцо', 'Фокино', 'Титовское'],
    'Вологда': ['Вологодский'],
    'Воронеж': ['Хохольский', 'Семилукский', 'Рамонский', 'Новоусманский', 'Каширский'],
    'Казань': ['Лаишевский', 'Пестречинский', 'Высокогорский', 'Зеленодольский', 'Верхнеуслонский'],
    'Краснодар': ['Динский', 'Красноармейский', 'Тахтамукайский'],
    'Омск': ['Омский'],
    'Тамбов': ['Тамбовский', 'Котовск'],
    'Тула': ['Щекинский', 'Дубенский', 'Алексин', 'Ясногорский', 'Веневский', 'Киреевский', 'Ленинский'],
    'Ярославль': ['Ярославский'],
    'Владивостокский': ['Хасанский', 'Надеждинский', 'Артемовский', 'Шкотовский', 'Большой Камень', 'Фокино'],
    'Южно-Сахалинск': ['Корсаковский', 'Анивский', 'Холмский', 'Долинский'],
    'Магадан': ['Ольский', 'Хасынский'],
    'Саранск': ['Рузаевский', 'Лямбирский', 'Кочкуровский'],
    'Тюмень': ['Тюменский', 'Нижнетавдинский'],
    'Липецк': ['Грязинский', 'Липецкий', 'Добровский'],
    'Астрахань': ['Приволжский', 'Икрянинский', 'Наримановский', 'Харабали', 'Верхний'],
    'Владимир': ['Собинский', 'Судогодский', 'Камешковский', 'Суздальский'],
    'Екатеринбург': ['Первоуральск', 'Дегтярск', 'Полевской', 'Сысертский', 'Белоярский', 'Березовский', 'Верхняя Пышма'],
    'Ижевск': ['Завьяловский', 'Камское'],
    # NOTE(review): 'Оничевский' looks like a typo for 'Оричевский' — verify against the municipality names in the data.
    'Киров': ['Оничевский', 'Орловский', 'Юрьянский', 'Слободской', 'Кирово-Чепецкий'],
    'Курган': ['Кетовский', 'Варгашинский', 'Каргапольский'],
    'Нальчик': ['Чегемский', 'Черекский'],
    'Самара': ['Волжский', 'Новокуйбышевск'],
    'Ставрополь': ['Шпаковский'],
    'Уфа': ['Уфимский', 'Кармаскалинский', 'Иглинский', 'Чишминский'],
    'Махачкала': ['Карабудахкентский', 'Каспийск', 'Буйнакский', 'Кумторкалинский', 'Новолакский'],
    'Саратов': ['Саратовский', 'Энгельсский', 'Марксовский', 'Энгельс'],
    'Томск': ['Томский'],
    'Ульяновск': ['Ульяновский', 'Новоульяновск', 'Чердаклинский'],
    'Волгоград': ['Городищенский', 'Дубовский', 'Среднеахтубинский', 'Светлоярский', 'Калачевский'],
    'Магас': ['Назрановский', 'Пригородный'],
    'Мурманск': ['Североморск', 'Кольский'],
    'Новосибирск': ['Новосибирский', 'Кольцово', 'Бердск', 'Обь'],
    'Петрозаводский': ['Прионежский'],
    # NOTE(review): 'Переяславь-Залесский' may be a misspelling of 'Переславль-Залесский' — verify against the data.
    'Ростовский': ['Борисоглебский', 'Переяславь-Залесский', 'Юрьев-Польский', 'Ильинский', 'Гаврилов-Ямский'],
    'Рязань': ['Рязанский'],
    'Смоленск': ['Смоленский'],
    'Чебоксары': ['Чебоксарский', 'Новочебоксарск', 'Звениговский'],
    'Челябинский': ['Сосновский', 'Копейский', 'Красноармейский'],
    'Грозный': ['Урус-Мартановский', 'Грозненский', 'Аргун'],
    'Белгород': ['Белгородский'],
    'Иваново': ['Ивановский', 'Кохма'],
    'Пенза': ['Заречный', 'Пензенский', 'Бессоновский'],


}


# Flatten the exclusion mapping into a set of
# (protest municipality, forbidden candidate) name pairs for O(1) lookups.
excluded_pairs = {
    (protest_name, candidate_name)
    for protest_name, candidate_names in exclude_dict.items()
    for candidate_name in candidate_names
}

# Degenerate-feature check: a covariate whose pooled variance is (near) zero
# is removed from the feature list before the covariance matrix is inverted.
threshold = 1e-8
combined_all = pd.concat([df_protests[features], df_events[features]], axis=0)
variances = combined_all.var()
degenerate = [name for name, value in variances.items() if value <= threshold]
if not degenerate:
    print("Вырожденных признаков нет.")
else:
    print(f"Удаляем вырожденные признаки: {degenerate}")
    features = [name for name in features if name not in degenerate]

# Pooled covariance of the remaining features over protest and candidate rows;
# its inverse defines the Mahalanobis metric used for matching.
combined_all = pd.concat([df_protests[features], df_events[features]], axis=0)
cov_matrix = np.cov(combined_all.to_numpy().T)
cov_inv = np.linalg.inv(cov_matrix)

# Matching inside each region: every protest municipality is paired with a
# DISTINCT protest-free municipality of the same region by solving the
# assignment problem (Hungarian algorithm) on Mahalanobis distances.

# Units that have a panel row for every required year. Computed once for the
# whole panel instead of re-scanning it for every candidate row; the criterion
# is the same as in check_years_completeness().
years_seen = (
    df_events_full.loc[df_events_full['year'].isin(required_years), ['oktmo', 'year']]
    .groupby('oktmo')['year']
    .nunique()
)
complete_oktmo = set(years_seen.index[years_seen == len(set(required_years))])

matches_list = []

for region in df_protests['region_name'].unique():
    sub_protests = df_protests[df_protests['region_name'] == region].reset_index(drop=True)
    sub_events = df_events[df_events['region_name'] == region].reset_index(drop=True)

    # A candidate row is dropped when its unit lacks one of the required years,
    # or when its name is on the exclusion list of ANY protest municipality of
    # this region.
    region_protest_names = set(sub_protests['municipality'])
    forbidden_names = {
        candidate_name
        for protest_name, candidate_name in excluded_pairs
        if protest_name in region_protest_names
    }
    usable = (
        sub_events['oktmo'].isin(complete_oktmo)
        & ~sub_events['municipality'].isin(forbidden_names)
    )
    sub_events_valid = sub_events[usable].reset_index(drop=True)

    if sub_events_valid.empty:
        print(f"В регионе {region} нет подходящих муниципалитетов событий")
        continue

    # Distance from every protest row to every candidate row. The candidate
    # pool holds one row per municipality and year, so a municipality can
    # appear in several columns here.
    n_p = sub_protests.shape[0]
    x_protest = sub_protests[features].to_numpy(dtype=float)
    x_candidates = sub_events_valid[features].to_numpy(dtype=float)
    row_dist = np.array([
        [mahalanobis(x, y, cov_inv) for y in x_candidates]
        for x in x_protest
    ]).reshape(n_p, x_candidates.shape[0])

    # Collapse the year-rows of each candidate municipality into one column
    # (its closest row per protest). Without this the assignment is one-to-one
    # over ROWS only and may give the same municipality to two protests.
    # When the row-level optimum already uses distinct municipalities the
    # result is unchanged.
    unit_codes, unit_ids = pd.factorize(sub_events_valid['oktmo'])
    n_units = len(unit_ids)
    dist_mat = np.empty((n_p, n_units))
    best_row = np.empty((n_p, n_units), dtype=int)
    protest_idx = np.arange(n_p)
    for unit in range(n_units):
        unit_rows = np.flatnonzero(unit_codes == unit)
        closest = row_dist[:, unit_rows].argmin(axis=1)
        best_row[:, unit] = unit_rows[closest]
        dist_mat[:, unit] = row_dist[protest_idx, unit_rows[closest]]

    # Hungarian algorithm: minimum total distance, each candidate used at most once.
    row_ind, col_ind = linear_sum_assignment(dist_mat)

    # Collect the matched pairs side by side (overlapping column names get the
    # _protest / _event suffixes).
    matched_p = sub_protests.iloc[row_ind].reset_index(drop=True)
    matched_e = sub_events_valid.iloc[best_row[row_ind, col_ind]].reset_index(drop=True)

    matched = matched_p.join(
        matched_e,
        lsuffix="_protest",
        rsuffix="_event"
    )
    matched["mahalanobis_distance"] = dist_mat[row_ind, col_ind]
    matches_list.append(matched)


df_matches = pd.concat(matches_list, axis=0).reset_index(drop=True)
df_matches.to_csv('прототип_данных_2010t3+.csv', index=False)
df_matches

Вырожденных признаков нет.
В регионе Чеченская нет подходящих муниципалитетов событий
В регионе Магаданская нет подходящих муниципалитетов событий


Unnamed: 0,region_name_protest,municipality_protest,oktmo_protest,fprotets_year,Жилье (1000 м2)_protest,Дороги (%)_protest,Канализация (1 м)_protest,Водопровод (1 метр)_protest,ПотреблениеЭ_protest,ПотреблениеВ_protest,...,Жилье (1000 м2)_event,Дороги (%)_event,Канализация (1 м)_event,Водопровод (1 метр)_event,ПотреблениеЭ_event,ПотреблениеВ_event,Бюджет_event,БезДорог_event,Земля_event,mahalanobis_distance
0,Архангельская,Архангельск,11701000,2011,538.2,2.94,2400.0,5900.0,746.50,40.32,...,149.7,15.00,129.9,536.2,699.57,29.04,-497130.0,0.00,119349.0,1.525916
1,Астраханская,Астрахань,12701000,2011,1655.9,38.00,2000.0,4000.0,778.21,24.03,...,14.0,1.01,300.0,5670.0,736.77,13.78,-2964.0,0.90,781134.0,4.010645
2,Алтайский,Барнаул,1701000,2011,86.0,0.00,100.0,5400.0,949.00,24.00,...,15.2,39.98,0.0,100.0,830.00,22.50,46432.0,0.00,7916.0,0.311654
3,Белгородская,Белгород,14701000,2011,139.1,0.00,0.0,100.0,882.27,22.77,...,47.5,98.10,0.0,560.0,1069.75,23.90,-129784.0,0.24,169345.0,0.529165
4,Амурская,Благовещенск,10701000,2011,96.4,0.00,600.0,500.0,1319.25,40.00,...,197.8,0.00,0.0,3200.0,1356.80,41.97,-129674.0,0.00,22482.0,11.551934
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56,Смоленская,Смоленск,66701000,2012,48.5,0.00,600.0,2300.0,786.69,37.10,...,49.2,74.66,2300.0,3650.0,642.32,33.10,-90416.9,6.40,333790.0,6.730255
57,Липецкая,Липецк,42701000,2013,68.2,48.29,100.0,1000.0,973.87,17.03,...,14.9,0.00,0.0,0.0,0.00,0.00,-154148.8,0.00,1798.0,3.554236
58,Ингушетия,Магас,26701000,2013,0.0,0.00,0.0,500.0,0.00,0.00,...,10.8,0.00,0.0,0.0,0.00,0.00,6690.5,0.00,5474.0,0.011254
59,Адыгея,Майкоп,79701000,2013,2.5,0.00,0.0,2740.0,651.00,20.00,...,5.1,0.00,0.0,600.0,699.00,15.50,-42193.0,0.01,366743.0,2.882972


In [8]:
# df_2012: matched protest/candidate pairs; df_2011: the protest municipalities
# selected by the scoring cell.
df_2012 = pd.read_csv('прототип_данных_2010t3+.csv')
df_2011 = pd.read_csv('без_протестов2009_2010t3+.csv')

# Discard pairs whose Mahalanobis distance exceeds 10.
df_2012 = df_2012.loc[df_2012["mahalanobis_distance"].le(10)].reset_index(drop=True)

# Inner join: keep only pairs whose protest municipality was also selected by
# the scoring step (same name and same OKTMO code).
merged_df = df_2012.merge(
    df_2011,
    left_on=['municipality_protest', 'oktmo_protest'],
    right_on=['municipality', 'oktmo'],
    how='inner'
)

# Columns of the link table: identifiers and carried indicators of both sides,
# then both region names and the first protest year.
side_columns = [
    'municipality', 'oktmo',
    'Жилье (1000 м2)', 'Дороги (%)',
    'Канализация (1 м)', 'Водопровод (1 метр)',
]
link_columns = (
    [f'{column}_protest' for column in side_columns]
    + [f'{column}_event' for column in side_columns]
    + ['region_name_protest', 'region_name_event', 'fprotets_year']
)
from2012 = merged_df[link_columns]

from2012.to_csv('связка_махаланобис_2010t3+.csv', index=False)
from2012

Unnamed: 0,municipality_protest,oktmo_protest,Жилье (1000 м2)_protest,Дороги (%)_protest,Канализация (1 м)_protest,Водопровод (1 метр)_protest,municipality_event,oktmo_event,Жилье (1000 м2)_event,Дороги (%)_event,Канализация (1 м)_event,Водопровод (1 метр)_event,region_name_protest,region_name_event,fprotets_year
0,Астрахань,12701000,1655.9,38.0,2000.0,4000.0,Ахтубинский,12605000,14.0,1.01,300.0,5670.0,Астраханская,Астраханская,2011
1,Белгород,14701000,139.1,0.0,0.0,100.0,Старооскольский,14740000,47.5,98.1,0.0,560.0,Белгородская,Белгородская,2011
2,Брянск,15701000,98.2,0.0,1200.0,2500.0,Клинцы,15715000,28.7,48.7,77.0,271.0,Брянская,Брянская,2011
3,Волгоград,18701000,352.1,30.01,545.1,19324.3,Котовский,18626000,2.8,21.76,0.0,2665.0,Волгоградская,Волгоградская,2011
4,Вологда,19701000,266.0,0.0,67.0,1070.0,Череповец,19730000,6.3,0.0,400.0,0.0,Вологодская,Вологодская,2011
5,Екатеринбург,65701000,258.0,0.0,500.0,4800.0,Талицкий,65724000,28.7,32.7,0.0,1099.0,Свердловская,Свердловская,2011
6,Ижевск,94701000,44.3,64.61,600.0,2000.0,Воткинск,94710000,31.8,8.28,118.0,890.0,Удмуртская,Удмуртская,2011
7,Киров,33701000,116.8,10.68,2600.0,5500.0,Котельничский,33619000,79.1,25.54,115.0,5516.0,Кировская,Кировская,2011
8,Краснодар,3701000,197.8,0.0,582.0,7405.0,Славянский,3645000,9.5,0.0,530.0,15506.0,Краснодарский,Краснодарский,2011
9,Курган,37701000,89.2,0.0,520.0,7100.0,Далматовский,37608000,134.5,47.76,0.0,3300.0,Курганская,Курганская,2011


In [9]:
df = pd.read_csv('общие_данные_2010t3+.csv')

df = df[['year', 'municipality', 'oktmo', 'Жилье (1000 м2)', 'Дороги (%)', 'Канализация (1 м)', 'Водопровод (1 метр)',  'region_name',
        ]]
# The panel repeats a municipality-year once per attached event; keep one row each.
df = df.drop_duplicates(subset=['year', 'municipality', 'oktmo'])

# Observation window of the final panel.
df = df[df['year'].isin([2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016])]
events = df
from2012 = pd.read_csv('связка_махаланобис_2010t3+.csv')

# Build one yearly panel per matched pair and concatenate them once at the
# end (growing a DataFrame with concat inside the loop is quadratic and starts
# from an empty frame, whose dtype handling is deprecated in recent pandas).
pair_panels = []
for _, link_row in from2012.iterrows():

    # Yearly rows of the protest municipality of this pair.
    protest_filter = (
        (events['municipality'] == link_row['municipality_protest']) &
        (events['oktmo'] == link_row['oktmo_protest'])
    )
    protest_data = events[protest_filter].add_suffix('_protest')

    # Yearly rows of its matched protest-free municipality.
    event_filter = (
        (events['municipality'] == link_row['municipality_event']) &
        (events['oktmo'] == link_row['oktmo_event'])
    )
    event_data = events[event_filter].add_suffix('_event')

    # Align the two series on the year; every protest-side year is kept.
    merged = protest_data.merge(
        event_data,
        left_on='year_protest',
        right_on='year_event',
        how='left'
    ).rename(columns={'year_protest': 'year'})

    merged = merged.drop('year_event', axis=1)

    # The first protest year is reported only on the 2011-2013 rows and left
    # missing on all other years.
    merged['fprotets_year'] = link_row.get('fprotets_year', pd.NA)
    outside_protest_period = ~pd.to_numeric(merged['year'], errors='coerce').astype('Int64').isin([2011, 2012, 2013])
    merged.loc[outside_protest_period, 'fprotets_year'] = pd.NA
    pair_panels.append(merged)

# An empty link table yields an empty frame, as before.
final = pd.concat(pair_panels, ignore_index=True) if pair_panels else pd.DataFrame()

final.to_csv('2010t3+.csv', index=False)
final


Unnamed: 0,year,municipality_protest,oktmo_protest,Жилье (1000 м2)_protest,Дороги (%)_protest,Канализация (1 м)_protest,Водопровод (1 метр)_protest,region_name_protest,municipality_event,oktmo_event,Жилье (1000 м2)_event,Дороги (%)_event,Канализация (1 м)_event,Водопровод (1 метр)_event,region_name_event,fprotets_year
0,2008,Астрахань,12701000,1762.1,6.39,420.0,6906.0,Астраханская,Ахтубинский,12605000,27.40,9.30,1010.0,1080.0,Астраханская,
1,2009,Астрахань,12701000,1739.0,1.08,8230.0,1200.0,Астраханская,Ахтубинский,12605000,27.75,3.85,0.0,2775.0,Астраханская,
2,2010,Астрахань,12701000,1672.9,91.15,6600.0,8800.0,Астраханская,Ахтубинский,12605000,14.00,0.00,1530.0,14640.0,Астраханская,
3,2011,Астрахань,12701000,1655.9,38.00,2000.0,4000.0,Астраханская,Ахтубинский,12605000,14.00,1.01,300.0,5670.0,Астраханская,2011.0
4,2012,Астрахань,12701000,1647.4,38.00,6290.0,18033.0,Астраханская,Ахтубинский,12605000,16.00,3.20,0.0,3895.0,Астраханская,2011.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
308,2012,Нальчик,83701000,62.4,63.81,466.0,17615.0,Кабардино-Балкарская,Эльбрусский,83648000,61.90,30.27,200.0,2059.0,Кабардино-Балкарская,2013.0
309,2013,Нальчик,83701000,62.4,63.81,0.0,785.0,Кабардино-Балкарская,Эльбрусский,83648000,77.20,30.27,0.0,26149.0,Кабардино-Балкарская,2013.0
310,2014,Нальчик,83701000,62.4,66.75,913.0,1688.0,Кабардино-Балкарская,Эльбрусский,83648000,72.90,32.94,700.0,649.0,Кабардино-Балкарская,
311,2015,Нальчик,83701000,0.0,66.94,1000.0,900.0,Кабардино-Балкарская,Эльбрусский,83648000,0.00,53.01,1200.0,12000.0,Кабардино-Балкарская,
