In [1]:
import pandas as pd
# import statistics
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report

In [2]:
pii_df = pd.read_csv('pii_data.csv')

In [3]:
pii_df

Unnamed: 0,name,phone_number,email,address,credit_card,security_code,date
0,Денисов Эраст Харитонович,+7 (362) 868-30-78,rjurik45@oao.info,"д. Соловки, наб. Садовая, д. 1 стр. 5, 042438",4905833260703139,510,2000-01-12
1,Анжелика Александровна Устинова,+7 393 197 12 58,ignatevplaton@beljaeva.org,"с. Плесецк, наб. Льва Толстого, д. 5 к. 9, 376982",373162067657061,998,1971-09-05
2,Савин Ростислав Гурьевич,+7 (540) 892-63-54,pankrat1984@oao.net,"п. Токсово, ул. Серафимовича, д. 7/9, 598085",2705238075699439,267,2008-10-26
3,Афиноген Арсеньевич Лыткин,+79546655802,lukinonufri@kontsern.org,"г. Советский, пер. Выгонный, д. 9/1 к. 53, 584654",2201044605594218,918,1997-05-06
4,Лавр Геннадиевич Шашков,+70131872460,krjukovfeoktist@beljakov.info,"с. Серов, ул. Димитрова, д. 77 стр. 177, 597162",2203382385517001,351,1984-08-26
...,...,...,...,...,...,...,...
3995,Лев Сысоев,8 048 805 52 63,frolovereme@yahoo.com,"с. Кинешма, наб. Садовая, д. 5 стр. 66, 006791",3410 7445 1777 448,653,"October 16, 2011"
3996,Агата Савин,8 (174) 753-3442,qborisova@ip.com,"п. Минусинск, ул. Просторная, д. 4 к. 1, 418201",5749 9284 2622 2755,95,"January 21, 1980"
3997,Сила Соколова,+77665082628,kabanovevgeni@russkaja.net,"г. Арзамас, ул. Осипенко, д. 85 к. 972, 385384",5626 2342 5351 4150,333,"February 22, 1992"
3998,Геннадий Дорофеев,+7 (602) 963-52-51,varvara_2017@oao.net,"ст. Черкесск, наб. Молодежная, д. 5 к. 974, 70...",6219 7822 9247 7830,819,"May 14, 1970"


In [4]:
cols = pii_df.columns.to_list()
pii_df[cols] = pii_df[cols].astype('string')

In [5]:
def counter(s: str):
    length = len(''.join(s.split()))
    spaces = letters = digits = ats = others = 0
    for el in s:
        if el.isspace():
            spaces += 1
        elif el.isalpha():
            letters += 1
        elif el.isdigit():
            digits += 1
        elif el == '@':
            ats += 1
        else:
            others += 1
    return length, spaces, letters, digits, ats, others

In [6]:
classes = pii_df.columns.to_list()

stats_dict = {'length': [], 
              'space': [], 
              'letter': [], 
              'digit': [], 
              'at': [], 
              'other': [],
              'category': []
             }
n = pii_df.shape[0]

for col in classes:
    stats_dict['length'].extend([stat[0] for stat in [counter(el) for el in pii_df[col]]])
    stats_dict['space'].extend([stat[1] for stat in [counter(el) for el in pii_df[col]]])
    stats_dict['letter'].extend([stat[2] for stat in [counter(el) for el in pii_df[col]]])
    stats_dict['digit'].extend([stat[3] for stat in [counter(el) for el in pii_df[col]]])
    stats_dict['at'].extend([stat[4] for stat in [counter(el) for el in pii_df[col]]])
    stats_dict['other'].extend([stat[5] for stat in [counter(el) for el in pii_df[col]]])
    stats_dict['category'].extend([classes.index(col) for _ in range(n)])
    

In [7]:
stats_df = pd.DataFrame(stats_dict)

In [8]:
stats_df

Unnamed: 0,length,space,letter,digit,at,other,category
0,23,2,23,0,0,0,0
1,29,2,29,0,0,0,0
2,22,2,22,0,0,0,0
3,24,2,24,0,0,0,0
4,21,2,21,0,0,0,0
...,...,...,...,...,...,...,...
27995,14,2,7,6,0,1,6
27996,14,2,7,6,0,1,6
27997,15,2,8,6,0,1,6
27998,10,2,3,6,0,1,6


In [9]:
data_cols = stats_df.columns[:-1]

In [10]:
x, y = stats_df[data_cols], stats_df['category']

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=.2)

In [11]:
xgbc = XGBClassifier()
xgbc.fit(x_train, y_train)

score = xgbc.score(x_train, y_train)
print("Train score: ", score)

Train score:  1.0


In [12]:
cv_score = cross_val_score(xgbc, x_train, y_train, cv=10)
print("CV mean score: ", cv_score.mean())

y_pred = xgbc.predict(x_test)
cr = classification_report(y_test, y_pred)
print(cr)

CV mean score:  1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       841
           1       1.00      1.00      1.00       788
           2       1.00      1.00      1.00       776
           3       1.00      1.00      1.00       771
           4       1.00      1.00      1.00       822
           5       1.00      1.00      1.00       773
           6       1.00      1.00      1.00       829

    accuracy                           1.00      5600
   macro avg       1.00      1.00      1.00      5600
weighted avg       1.00      1.00      1.00      5600

