In [1]:
import pandas as pd
import re
import numpy as np
import random
import lightgbm as lgb
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
import time

In [4]:
df = pd.read_csv('pii_data_large.csv', low_memory=False)

In [5]:
df.shape

(100000, 31)

In [6]:
df.head(3)

Unnamed: 0,family_name,name,surname,age,birthday,subject,region,city,street,house,...,job,salary,dept,direct,sect,spec,highshool,mil_rank,mil_spec,comment
0,Калинина,Гедеон,Тарасовна,23,2021-05-12,Тульская область,Якутский район,г. Пермь,пер. Пионерская,3,...,Аналитик,60000,Департамент внезапных озарений,Управление критики практического разума,Отдел маскирования персональных данных,Парапсихология,Морская техническая академия,лейтенант,матрос подводной лодки,['Отдел дорогой руководитель полюбить коммуниз...
1,Комиссаров,Добромысл,Адамович,22,1965-06-05,Республика Карелия,Красноярский район,г. Нарьян-Мар,бул. Слободская,18/13,...,Медицинская сестра,110000,Департамент бескомпромиссной борьбы,Управление № 1,Отдел позитивной критики,Контрразведывательная деятельность,Крымский федеральный университет,старший сержант,артиллерист,['Магазин мусор о мрачно. Премьера означать пр...
2,Калашников,Филипп,Абрамович,55,2014-04-27,Кабардино-Балкарская Республика,Йошкар-олинский район,г. Тамбов,ш. Осенняя,116,...,Кредитный специалист,90000,Департамент глубокомысленных размышлений,Управление № 2,Отдел изучения эфирных оболочек,Прикладная математика,Таймырская сельскохозяйственная академия,капитан 2-го ранга,фельдшер,['Порт социалистический отражение применяться....


### Извлечение признаков

In [7]:
def extract_features_from_column(column):
    if column.dtype == 'object' or np.issubdtype(column.dtype, np.datetime64):  # Тип объект или дата
        column = column.astype('string')
        avg_length = column.apply(len).mean()
        avg_num_words = column.apply(lambda x: len(x.split())).mean()
        most_freq_cnt = column.value_counts().iloc[0]
        unique_cnt = len(column.unique())
        avg_num_letters = column.apply(lambda x: sum(c.isalpha() for c in x)).mean()
        avg_num_digits = column.apply(lambda x: sum(c.isdigit() for c in x)).mean()
        avg_num_dashs = column.apply(lambda x: sum(c == '-' for c in x)).mean()
        avg_num_ats = column.apply(lambda x: sum(c == '@' for c in x)).mean()
        avg_num_dots = column.apply(lambda x: sum(c == '.' for c in x)).mean()
        avg_num_sp_chars = column.apply(lambda x: sum(not c.isalnum() for c in x)).mean()
        avg_num_upper = column.apply(lambda x: sum(c.isupper() for c in x)).mean()
        mean_value = std_value = min_value = max_value = quantile_0_2 = quantile_0_8 = 0
        
        return [avg_length, avg_num_words, most_freq_cnt, unique_cnt, avg_num_letters, avg_num_digits, avg_num_dashs, avg_num_ats, avg_num_dots, 
                avg_num_sp_chars, avg_num_upper, mean_value, std_value, min_value, max_value, quantile_0_2, quantile_0_8]
    elif np.issubdtype(column.dtype, np.number):  # Численный тип
        mean_value = column.mean()
        std_value = column.std()
        min_value = column.min()
        max_value = column.max()
        quantile_0_2 = column.quantile(0.2)
        quantile_0_8 = column.quantile(0.8)
        most_freq_cnt = column.value_counts().iloc[0]
        unique_cnt = len(column.unique())
        avg_length = avg_num_words = avg_min_word_len = avg_num_letters = avg_num_digits = avg_num_dashs = avg_num_ats = avg_num_dots = avg_num_sp_chars = avg_num_upper = 0
        return [avg_length, avg_num_words, most_freq_cnt, unique_cnt, avg_num_letters, avg_num_digits, avg_num_dashs, avg_num_ats, avg_num_dots, 
                avg_num_sp_chars, avg_num_upper, mean_value, std_value, min_value, max_value, quantile_0_2, quantile_0_8]
    else:
        return [None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None]

In [27]:
# Создание датфрейма из признаков с классами
feature_list = []
column_names = df.columns
num_iterations = 1000 # Количество итераций
num_samples = random.randint(500, 10000) # Количество сэмплов

for _ in range(num_iterations):
    for column_name in column_names:
        features = extract_features_from_column(df.sample(random.randint(500, 10000))[column_name])
        feature_list.append(features + [column_name,])

feature_df = pd.DataFrame(feature_list, columns=[
    'avg_length', 'avg_num_words', 'most_freq_cnt', 'unique_cnt', 'avg_num_letters', 'avg_num_digits', 'avg_num_dashs', 'avg_num_ats', 'avg_num_dots', 
    'avg_num_sp_chars', 'avg_num_upper', 'mean_value', 'std_value', 'min_value', 'max_value', 'quantile_0_2', 'quantile_0_8', 'column_name'
    ])

In [16]:
feature_df.to_csv('feature_df_large.csv', index=False)

In [17]:
feature_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31000 entries, 0 to 30999
Data columns (total 18 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   avg_length        31000 non-null  float64
 1   avg_num_words     31000 non-null  float64
 2   most_freq_cnt     31000 non-null  int64  
 3   unique_cnt        31000 non-null  int64  
 4   avg_num_letters   31000 non-null  float64
 5   avg_num_digits    31000 non-null  float64
 6   avg_num_dashs     31000 non-null  float64
 7   avg_num_ats       31000 non-null  float64
 8   avg_num_dots      31000 non-null  float64
 9   avg_num_sp_chars  31000 non-null  float64
 10  avg_num_upper     31000 non-null  float64
 11  mean_value        31000 non-null  float64
 12  std_value         31000 non-null  float64
 13  min_value         31000 non-null  int64  
 14  max_value         31000 non-null  int64  
 15  quantile_0_2      31000 non-null  float64
 16  quantile_0_8      31000 non-null  float6

### Обучение модели

In [32]:
label_encoder = LabelEncoder()
feature_df['target_encoded'] = label_encoder.fit_transform(feature_df['column_name'])

X = feature_df.drop(columns=['column_name', 'target_encoded'])
y = feature_df['target_encoded']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Выбор гиперпараметров
# param_grid = {
#     'device': 'gpu',
#     'n_estimators': [10, 50, 100],
#     'max_depth': [1, 3, 5],
#     'learning_rate': [0.01, 0.1, 0.2],
#     'subsample': [0.6, 0.8, 1.0],
#     'colsample_bytree': [0.6, 0.8, 1.0]
# }

# gbm = lgb.LGBMClassifier(device='gpu', random_state=42)

# Поиск гиперпараметров
# grid_search = GridSearchCV(estimator=gbm, param_grid=param_grid, cv=5, scoring='accuracy')#, n_jobs=-1)
# grid_search.fit(X_train, y_train)

# Выбор гиперпараметров
# best_params_gbm = grid_search.best_params_
# print("Best parameters for GBM:", best_params_gbm)

# Тренировка модели с выбранными гиперпараметрами
# best_gbm = lgb.LGBMClassifier(device='gpu', **best_params_gbm, random_state=42)
# best_gbm.fit(X_train, y_train)

params = {
#     'device': 'gpu',
    'colsample_bytree': 0.6,
    'learning_rate': 0.01,
    'max_depth': 3,
    'n_estimators': 50,
    'subsample': 0.6,
    'random_state': 42
    }

# Тренировка модели с выбранными гиперпараметрами
best_gbm = lgb.LGBMClassifier(**params)
best_gbm.fit(X_train, y_train)

In [19]:
# Точность предсказаний на тестовой выборке
y_pred = best_gbm.predict(X_test)
print('LightGBM accuracy score: {0:0.4f}'.format(accuracy_score(y_pred, y_test)))

LightGBM accuracy score: 0.9997


In [13]:
df_val = pd.read_csv('pii_data.csv', low_memory=False)

### Предсказания на валидационном датасете

In [28]:
feature_list_val = []
column_names = df.columns
for column_name in column_names:
    features = extract_features_from_column(df_val.sample(1000)[column_name])
    feature_list_val.append(features + [column_name,])

feature_df_val = pd.DataFrame(feature_list_val, columns=[
    'avg_length', 'avg_num_words', 'most_freq_cnt', 'unique_cnt', 'avg_num_letters', 'avg_num_digits', 'avg_num_dashs', 'avg_num_ats', 'avg_num_dots', 
    'avg_num_sp_chars', 'avg_num_upper', 'mean_value', 'std_value', 'min_value', 'max_value', 'quantile_0_2', 'quantile_0_8', 'column_name'
    ])

In [35]:
predictions = best_gbm.predict(feature_df_val.drop('column_name', axis=1))
le_mapping = dict(zip(label_encoder.transform( label_encoder.classes_), label_encoder.classes_))

print('Поля:   Классы')
print(f"{'-' * 14}")
for i in range(len(feature_df_val.columns)):
    print(f"{df_val.columns[i]}:   {le_mapping[predictions[i]]}")

Поля:   Классы
--------------
family_name:   family_name
name:   name
surname:   surname
age:   age
birthday:   birthday
subject:   subject
region:   region
city:   city
street:   street
house:   house
flat:   flat
diploma_seria:   diploma_seria
diploma_num:   diploma_num
mil_seria:   mil_seria
mil_num:   mil_num
snils:   snils
inn:   inn
passport:   passport
