In [None]:
import pandas as pd
import geopandas as gpd
import re  
import fiona


In [None]:
# Read the source layer through fiona so the attribute encoding can be forced
# to UTF-8.
with fiona.open(r"Z:\landuse.shp", encoding="utf-8") as shapefile:
    # from_features() does not pick up the layer's CRS on its own, so it is
    # passed explicitly; otherwise the frame (and the shapefile exported at
    # the end of the notebook) carries no coordinate reference system.
    # An empty WKT string (layer without CRS information) maps to None.
    gdf = gpd.GeoDataFrame.from_features(shapefile, crs=shapefile.crs_wkt or None)
    

In [None]:
# Create the (initially empty) text columns that the classification step fills in.
for new_column in ('group1', 'id_group1', 'match', 'position', 'group2', 'id_group2'):
    gdf[new_column] = ''

gdf.head(5)


In [None]:
# Summary of the frame: column names, dtypes and non-null counts.
gdf.info()


In [None]:
# Frequency of every distinct `tsn_ru` value.
gdf['tsn_ru'].value_counts()


In [None]:
# Frequency of every distinct `kad_num` value.
gdf['kad_num'].value_counts()


In [None]:
# Numeric code for each category name. The keys are the same category names
# that `category_patterns` uses, and the classification loop looks codes up
# here by category name.
# NOTE(review): the name `id` shadows Python's built-in `id()` for the rest of
# the notebook; renaming it requires a coordinated change in every cell that
# reads it.
id = {
    'Сельскохозяйственные': 1,
    'Жилые': 2,
    'Общественно-деловые': 3,
    'Промышленные': 4,
    'Охраняемые и памятники': 5,
    'Рекреационные': 6,
    'Режимные объекты': 7,
    'Специального назначения': 8,
    'Транспорт': 9,
    'Инженерные и коммунально-складские': 10,
    'Образование': 11,
    'Прочие': 12,
}


#### Фильтрация по ключевым словам

In [None]:
# One compiled regular expression per category, keyed by category name.
# The classification loop runs every pattern against a row's `tsn_ru` text.
#
# NOTE(review): every capture group `()` below is empty, so as written the
# patterns contain no keywords: `\b()\w*\b` matches the first word of any text
# and `\b()\b` matches an empty string at the first word boundary. Presumably
# the keyword alternatives are meant to be filled into the groups — confirm.
# NOTE(review): the patterns are not uniform — 'Рекреационные' has no `\w*`
# suffix, and 'Транспорт', 'Инженерные и коммунально-складские' and 'Прочие'
# are compiled without re.IGNORECASE. Verify whether that is intentional.
category_patterns = {
    'Сельскохозяйственные': re.compile(r'\b()\w*\b', re.IGNORECASE),
    'Жилые': re.compile(r'\b()\w*\b', re.IGNORECASE),
    'Общественно-деловые': re.compile(r'\b()\w*\b', re.IGNORECASE),
    'Промышленные': re.compile(r'\b()\w*\b', re.IGNORECASE),
    'Охраняемые и памятники': re.compile(r'\b()\w*\b', re.IGNORECASE),
    'Рекреационные': re.compile(r'\b()\b', re.IGNORECASE),
    'Режимные объекты': re.compile(r'\b()\w*\b', re.IGNORECASE),
    'Специального назначения': re.compile(r'\b()\w*\b', re.IGNORECASE),
    'Транспорт': re.compile(r'\b()\w*\b'),
    'Инженерные и коммунально-складские': re.compile(r'\b()\w*\b'),
    'Образование': re.compile(r'\b()\w*\b', re.IGNORECASE),
    'Прочие': re.compile(r'\b()\w*\b'),
}

In [None]:
# Convert every attribute column to text; only the geometry column keeps its type.
for column in gdf.columns:
    if column != 'geometry':
        gdf[column] = gdf[column].astype(str)

#### Первое слово дороже второго

In [None]:
# Classify every row by running each category pattern against its `tsn_ru` text.
#   group1 / id_group1 / match / position : every matching category, its code,
#       the matched text and the match offset, comma-separated, in the
#       declaration order of `category_patterns`.
#   group2 / id_group2 : the single category whose match starts earliest in
#       the text (ties go to the category declared first).
# Rows without any match are left untouched.
for idx, line in gdf.iterrows():
    # One (category, matched text, start offset) triple per matching category.
    hits = []
    for category, pattern in category_patterns.items():
        found = pattern.search(line['tsn_ru'])
        if found:
            hits.append((category, found.group(), found.start()))

    if not hits:
        continue

    # Build each comma-separated cell once instead of appending to the cell
    # value match by match.
    gdf.at[idx, 'group1'] = ', '.join(category for category, _, _ in hits)
    gdf.at[idx, 'id_group1'] = ', '.join(str(id[category]) for category, _, _ in hits)
    gdf.at[idx, 'match'] = ', '.join(text for _, text, _ in hits)
    gdf.at[idx, 'position'] = ', '.join(str(start) for _, _, start in hits)

    # min() returns the first of several equally small items, which gives the
    # same winner as a stable sort by offset followed by taking element 0.
    earliest_category = min(hits, key=lambda hit: hit[2])[0]
    gdf.at[idx, 'group2'] = earliest_category
    gdf.at[idx, 'id_group2'] = str(id[earliest_category])


#### Отладка

In [None]:
from fuzzywuzzy import fuzz
import matplotlib.pyplot as plt

In [None]:
# Number of rows per assigned `id_group2` value.
gdf.loc[:, 'id_group2'].value_counts()

In [None]:
# Pie chart of the row count per `id_group2` value; the legend lists the values.
counts = gdf['id_group2'].value_counts()

fig, ax = plt.subplots(figsize=(4, 4))
ax.pie(counts, startangle=90, labeldistance=1.1)
ax.legend(counts.index, loc='right', bbox_to_anchor=(0, 0.5, 1.3, 0))
plt.show()

##### 1. Сельскохозяйственные

In [None]:
# Number of rows whose id_group2 is '1'.
int(gdf['id_group2'].eq('1').sum())

In [None]:
# Sorted distinct `tsn_ru` texts of the rows with id_group2 == '1',
# printed with a running index for manual review.
agricult = sorted(gdf.loc[gdf['id_group2'] == '1', 'tsn_ru'].unique())

for number, text in enumerate(agricult):
    print(number, text)

In [None]:
# Greedy near-duplicate filter: a text is kept only when its fuzz.ratio score
# against every text kept so far does not exceed 90.
agricult_simplified = []

for candidate in agricult:
    if all(fuzz.ratio(candidate, kept) <= 90 for kept in agricult_simplified):
        agricult_simplified.append(candidate)

for number, text in enumerate(agricult_simplified):
    print(number, text)

##### 2. Жилые

In [None]:
# Number of rows whose id_group2 is '2'.
int(gdf['id_group2'].eq('2').sum())

In [None]:
# Sorted distinct `tsn_ru` texts of the rows with id_group2 == '2',
# printed with a running index for manual review.
jil = sorted(gdf.loc[gdf['id_group2'] == '2', 'tsn_ru'].unique())

for number, text in enumerate(jil):
    print(number, text)

In [None]:
# Greedy near-duplicate filter: a text is kept only when its fuzz.ratio score
# against every text kept so far does not exceed 60.
jil_simplified = []

for candidate in jil:
    if all(fuzz.ratio(candidate, kept) <= 60 for kept in jil_simplified):
        jil_simplified.append(candidate)

for number, text in enumerate(jil_simplified):
    print(number, text)


##### 3. Общественно-деловые

In [None]:
# Number of rows whose id_group2 is '3'.
int(gdf['id_group2'].eq('3').sum())

In [None]:
# Sorted distinct `tsn_ru` texts of the rows with id_group2 == '3',
# printed with a running index for manual review.
odz = sorted(gdf.loc[gdf['id_group2'] == '3', 'tsn_ru'].unique())

for number, text in enumerate(odz):
    print(number, text)

In [None]:
# Greedy near-duplicate filter: a text is kept only when its fuzz.ratio score
# against every text kept so far does not exceed 45.
odz_simplified = []

for candidate in odz:
    if all(fuzz.ratio(candidate, kept) <= 45 for kept in odz_simplified):
        odz_simplified.append(candidate)

for number, text in enumerate(odz_simplified):
    print(number, text)

#7

##### 4. Промышленные

In [None]:
# Number of rows whose id_group2 is '4'.
int(gdf['id_group2'].eq('4').sum())

In [None]:
# Sorted distinct `tsn_ru` texts of the rows with id_group2 == '4',
# printed with a running index for manual review.
prom = sorted(gdf.loc[gdf['id_group2'] == '4', 'tsn_ru'].unique())

for number, text in enumerate(prom):
    print(number, text)


In [None]:
# Greedy near-duplicate filter: a text is kept only when its fuzz.ratio score
# against every text kept so far does not exceed 65.
prom_simplified = []

for candidate in prom:
    if all(fuzz.ratio(candidate, kept) <= 65 for kept in prom_simplified):
        prom_simplified.append(candidate)

for number, text in enumerate(prom_simplified):
    print(number, text)
    

##### 5. Охраняемые и памятники

In [None]:
# Number of rows whose id_group2 is '5'.
int(gdf['id_group2'].eq('5').sum())

In [None]:
# Sorted distinct `tsn_ru` texts of the rows with id_group2 == '5',
# printed with a running index for manual review.
protect = sorted(gdf.loc[gdf['id_group2'] == '5', 'tsn_ru'].unique())

for number, text in enumerate(protect):
    print(number, text)


##### 6. Рекреационные

In [None]:
# Number of rows whose id_group2 is '6'.
int(gdf['id_group2'].eq('6').sum())

In [None]:
# Sorted distinct `tsn_ru` texts of the rows with id_group2 == '6',
# printed with a running index for manual review.
rec = sorted(gdf.loc[gdf['id_group2'] == '6', 'tsn_ru'].unique())

for number, text in enumerate(rec):
    print(number, text)


# 108
# 109

##### 7. Режимные объекты

In [None]:
# Number of rows whose id_group2 is '7'.
int(gdf['id_group2'].eq('7').sum())

In [None]:
# Sorted distinct `tsn_ru` texts of the rows with id_group2 == '7',
# printed with a running index for manual review.
restrict = sorted(gdf.loc[gdf['id_group2'] == '7', 'tsn_ru'].unique())

for number, text in enumerate(restrict):
    print(number, text)


##### 8. Специального назначения

In [None]:
# Number of rows whose id_group2 is '8'.
int(gdf['id_group2'].eq('8').sum())

In [None]:
# Sorted distinct `tsn_ru` texts of the rows with id_group2 == '8',
# printed with a running index for manual review.
spec = sorted(gdf.loc[gdf['id_group2'] == '8', 'tsn_ru'].unique())

for number, text in enumerate(spec):
    print(number, text)


##### 9. Транспорт

In [None]:
# Number of rows whose id_group2 is '9'.
int(gdf['id_group2'].eq('9').sum())

In [None]:
# Sorted distinct `tsn_ru` texts of the rows with id_group2 == '9',
# printed with a running index for manual review.
transport = sorted(gdf.loc[gdf['id_group2'] == '9', 'tsn_ru'].unique())

for number, text in enumerate(transport):
    print(number, text)


In [None]:
# Greedy near-duplicate filter: a text is kept only when its fuzz.ratio score
# against every text kept so far does not exceed 60.
transport_simplified = []

for candidate in transport:
    if all(fuzz.ratio(candidate, kept) <= 60 for kept in transport_simplified):
        transport_simplified.append(candidate)

for number, text in enumerate(transport_simplified):
    print(number, text)
    

##### 10. Инженерные и коммунально-складские

In [None]:
# Number of rows whose id_group2 is '10'.
int(gdf['id_group2'].eq('10').sum())

In [None]:
# Sorted distinct `tsn_ru` texts of the rows with id_group2 == '10',
# printed with a running index for manual review.
komm = sorted(gdf.loc[gdf['id_group2'] == '10', 'tsn_ru'].unique())

for number, text in enumerate(komm):
    print(number, text)


In [None]:
# Greedy near-duplicate filter: a text is kept only when its fuzz.ratio score
# against every text kept so far does not exceed 60.
komm_simplified = []

for candidate in komm:
    if all(fuzz.ratio(candidate, kept) <= 60 for kept in komm_simplified):
        komm_simplified.append(candidate)

for number, text in enumerate(komm_simplified):
    print(number, text)
    

##### 11. Образование

In [None]:
# Number of rows whose id_group2 is '11'.
int(gdf['id_group2'].eq('11').sum())

In [None]:
# Sorted distinct `tsn_ru` texts of the rows with id_group2 == '11',
# printed with a running index for manual review.
edu = sorted(gdf.loc[gdf['id_group2'] == '11', 'tsn_ru'].unique())

for number, text in enumerate(edu):
    print(number, text)


In [None]:
# Greedy near-duplicate filter: a text is kept only when its fuzz.ratio score
# against every text kept so far does not exceed 70.
edu_simplified = []

for candidate in edu:
    if all(fuzz.ratio(candidate, kept) <= 70 for kept in edu_simplified):
        edu_simplified.append(candidate)

for number, text in enumerate(edu_simplified):
    print(number, text)
    

##### 12. Прочие

In [None]:
# Number of rows whose id_group2 is '12'.
int(gdf['id_group2'].eq('12').sum())

In [None]:
# Sorted distinct `tsn_ru` texts of the rows with id_group2 == '12',
# printed with a running index for manual review.
other = sorted(gdf.loc[gdf['id_group2'] == '12', 'tsn_ru'].unique())

for number, text in enumerate(other):
    print(number, text)


In [None]:
# Greedy near-duplicate filter: a text is kept only when its fuzz.ratio score
# against every text kept so far does not exceed 70.
other_simplified = []

for candidate in other:
    if all(fuzz.ratio(candidate, kept) <= 70 for kept in other_simplified):
        other_simplified.append(candidate)

for number, text in enumerate(other_simplified):
    print(number, text)
    

##### Пустые

In [None]:
# Number of rows that received no id_group2 value (still the empty string).
int(gdf['id_group2'].eq("").sum())

In [None]:
# Sorted distinct `tsn_ru` texts of the rows whose id_group2 is still empty,
# printed with a running index for manual review.
empty = sorted(gdf.loc[gdf['id_group2'] == '', 'tsn_ru'].unique())

for number, text in enumerate(empty):
    print(number, text)


In [None]:
# Write the whole frame, including the classification columns, to an Excel
# workbook without the index column.
# NOTE(review): the target is a hard-coded absolute user path, and the
# geometry column is part of the exported frame — confirm both are intended.
gdf.to_excel(r"C:\Users\Dmitrii.L\Desktop\output.xlsx", index=False)


In [None]:
# Export the classified layer as a shapefile. The encoding is stated
# explicitly so the attribute text is written as UTF-8, matching the encoding
# that was used when the source layer was read.
# NOTE(review): the target is a hard-coded absolute user path.
gdf.to_file(r"C:\Users\Dmitrii.L\Desktop\output.shp", encoding="utf-8")
