In [385]:
import difflib
import random
import re
import secrets
import string

import enchant
import pandas as pd

In [386]:
# Raise pandas' display limits so wide/long frames are rendered without truncation.
pd.options.display.max_rows = 500
pd.options.display.max_columns = 500
pd.options.display.width = 1000

In [387]:
# Read every column as text so values such as phone numbers keep their exact
# digits (type inference followed by a cast to str can drop leading zeros or
# append ".0"). Missing cells become the literal string 'nan', which the
# correction step treats as "no value".
df = pd.read_csv('task_file.txt', sep=',', index_col=False, dtype=str).fillna('nan')

  df = pd.read_csv('task_file.txt', sep=',', index_col=False).astype(str)


In [388]:
# Inspect the column labels that were parsed from the file.
df.columns

Index(['EMAIL', 'NAME', 'LAST_NAME', 'TEL', 'CITY'], dtype='object')

In [389]:
# Display the loaded frame before any correction is applied.
df

Unnamed: 0,EMAIL,NAME,LAST_NAME,TEL,CITY
0,,Ivan,Abramov,7776514,Moscow
1,,Alexey,Nikolaev,1348520,Moscow
2,,Marina,Shapar,11311110,Moscow
3,,Dmitriy,Vasilyev,0000000,Kazan
4,,Ekaterina,Ilyina,,St.Petersburg
5,,Anastasiya,Grigoryan,1928421,Ekaterinburg
6,,Andrey,Fedorov,85212384,Minsk
7,,Alexey,Lisitsyn,1239532,Tver
8,,Dariya,Abramova,7163908,Moscow
9,,Alexandr,Evdokimov,482,Volokolamsk


In [390]:
class Corrector:
    """Clean and validate the NAME, LAST_NAME, TEL and CITY fields of a record.

    Every public method returns a ``(flag, value)`` tuple. ``flag`` is ``0``
    when the input was accepted (``value`` is the cleaned or corrected text)
    and ``1`` when it was rejected (``value`` is the input, unchanged).
    """

    # Cleaned form of the text "nan", i.e. what a missing value looks like once
    # it has been cast to str and capitalised by the methods below.
    _MISSING = 'Nan'

    # Number of digits a phone number must contain to be accepted.
    PHONE_DIGITS = 7

    def __init__(self, cities_path='russian_cities.txt', names_path='russian_names.txt'):
        """Load the word lists used to suggest spellings.

        Args:
            cities_path: Path of the word list passed to ``enchant.PyPWL`` for cities.
            names_path: Path of the word list passed to ``enchant.PyPWL`` for first names.
        """
        self.cities_dict = enchant.PyPWL(cities_path)
        self.names_dict = enchant.PyPWL(names_path)

    def correct(self, woi, key, tresh_hold=0.6):
        """Validate ``woi`` according to the kind of field named by ``key``.

        For ``'NAME'`` and ``'CITY'`` the non-letter characters are stripped
        and the closest dictionary suggestion is returned when its
        ``difflib.SequenceMatcher`` ratio reaches ``tresh_hold``. ``'LAST_NAME'``
        and ``'TEL'`` are delegated to ``correct_last_name`` / ``correct_phone``.

        Args:
            woi: The raw value ("word of interest").
            key: One of ``'NAME'``, ``'CITY'``, ``'LAST_NAME'``, ``'TEL'``.
            tresh_hold: Minimum similarity ratio for a suggestion to be accepted.

        Returns:
            ``(0, corrected)`` on success, ``(1, woi)`` when no acceptable
            correction exists.

        Raises:
            TypeError: If ``key`` is not one of the supported field names.
        """
        if key == 'NAME':
            dictionary = self.names_dict
        elif key == 'CITY':
            dictionary = self.cities_dict
        elif key == 'LAST_NAME':
            return self.correct_last_name(woi)
        elif key == 'TEL':
            return self.correct_phone(woi)
        else:
            raise TypeError("wrong key name, expected one of NAME/LAST_NAME/TEL/CITY")

        init_word = woi
        woi = ''.join(filter(str.isalpha, str(woi))).capitalize()

        # Reject empty and missing values before touching the dictionary:
        # there is nothing meaningful to look up for them.
        if not woi or woi == self._MISSING:
            return 1, init_word

        # Sorting makes the choice between equally similar suggestions
        # deterministic (max() keeps the first of several maxima).
        suggestions = sorted(set(dictionary.suggest(woi)))
        if not suggestions:
            return 1, init_word

        scored = [
            (difflib.SequenceMatcher(None, woi, word).ratio(), word)
            for word in suggestions
        ]
        best_sim, best_word = max(scored, key=lambda pair: pair[0])

        if best_sim < tresh_hold:
            return 1, init_word
        # NOTE: str.capitalize lower-cases everything after the first
        # character, so multi-word entries come back as e.g. "Saint petersburg".
        return 0, best_word.capitalize()

    def correct_last_name(self, last_name):  # TODO
        """Strip non-letters from ``last_name`` and capitalise the result.

        Returns:
            ``(0, cleaned)`` when letters remain and the value is not the
            missing-value marker, otherwise ``(1, last_name)``.
        """
        f_last_name = ''.join(filter(str.isalpha, str(last_name))).capitalize()
        # An empty result means the input held no letters at all; accepting it
        # would produce a record with a blank surname.
        if not f_last_name or f_last_name == self._MISSING:
            return 1, last_name
        return 0, f_last_name

    def correct_phone(self, number):
        """Keep only the ASCII digits of ``number`` and check their count.

        Returns:
            ``(0, digits)`` when exactly ``PHONE_DIGITS`` digits remain,
            otherwise ``(1, number)``.
        """
        # Compare against string.digits instead of str.isnumeric, which also
        # accepts characters such as superscripts and vulgar fractions.
        f_number = ''.join(ch for ch in str(number) if ch in string.digits)
        if len(f_number) != self.PHONE_DIGITS:
            return 1, number
        return 0, f_number

In [391]:
# Build the corrector with its default word-list paths.
corrector = Corrector()

In [392]:
# Run the first-name check on every row; each result is a (flag, value) pair.
name_checks = df["NAME"].apply(lambda raw_name: corrector.correct(raw_name, "NAME"))
df["WRONG_NAME"] = name_checks

# Store the corrected value and start the per-row error counter.
df["NAME"] = name_checks.apply(lambda check: check[1])
df["WRONG"] = name_checks.apply(lambda check: check[0])

In [393]:
# Run the last-name check on every row; each result is a (flag, value) pair.
last_name_checks = df["LAST_NAME"].apply(
    lambda raw_last_name: corrector.correct(raw_last_name, "LAST_NAME")
)
df["WRONG_LAST_NAME"] = last_name_checks

# Store the cleaned value and add this check's flag to the error counter.
df["LAST_NAME"] = last_name_checks.apply(lambda check: check[1])
df["WRONG"] = df["WRONG"] + last_name_checks.apply(lambda check: check[0])

In [394]:
# Run the phone check on every row; each result is a (flag, value) pair.
tel_checks = df["TEL"].apply(lambda raw_tel: corrector.correct(raw_tel, "TEL"))
df["WRONG_TEL"] = tel_checks

# Store the cleaned value and add this check's flag to the error counter.
df["TEL"] = tel_checks.apply(lambda check: check[1])
df["WRONG"] = df["WRONG"] + tel_checks.apply(lambda check: check[0])

In [395]:
# Run the city check on every row; each result is a (flag, value) pair.
city_checks = df["CITY"].apply(lambda raw_city: corrector.correct(raw_city, "CITY"))
df["WRONG_CITY"] = city_checks

# Store the corrected value and add this check's flag to the error counter.
df["CITY"] = city_checks.apply(lambda check: check[1])
df["WRONG"] = df["WRONG"] + city_checks.apply(lambda check: check[0])

In [396]:
# Remove rows that are identical in every column; at this point that includes
# the WRONG / WRONG_* helper columns added by the correction cells.
df = df.drop_duplicates()

In [397]:
# Display the frame after correction, including the WRONG / WRONG_* helper columns.
df

Unnamed: 0,EMAIL,NAME,LAST_NAME,TEL,CITY,WRONG_NAME,WRONG,WRONG_LAST_NAME,WRONG_TEL,WRONG_CITY
0,,Ivan,Abramov,7776514,Moscow,"(0, Ivan)",0,"(0, Abramov)","(0, 7776514)","(0, Moscow)"
1,,Alexei,Nikolaev,1348520,Moscow,"(0, Alexei)",0,"(0, Nikolaev)","(0, 1348520)","(0, Moscow)"
2,,Marina,Shapar,11311110,Moscow,"(0, Marina)",1,"(0, Shapar)","(1, 11311110)","(0, Moscow)"
3,,Dmitri,Vasilyev,0000000,Kazan,"(0, Dmitri)",0,"(0, Vasilyev)","(0, 0000000)","(0, Kazan)"
4,,Yekaterina,Ilyina,,Saint petersburg,"(0, Yekaterina)",1,"(0, Ilyina)","(1, nan)","(0, Saint petersburg)"
5,,Anastasia,Grigoryan,1928421,Yekaterinburg,"(0, Anastasia)",0,"(0, Grigoryan)","(0, 1928421)","(0, Yekaterinburg)"
6,,Andrei,Fedorov,85212384,Buinsk,"(0, Andrei)",1,"(0, Fedorov)","(1, 85212384)","(0, Buinsk)"
7,,Alexei,Lisitsyn,1239532,Tver,"(0, Alexei)",0,"(0, Lisitsyn)","(0, 1239532)","(0, Tver)"
8,,Daria,Abramova,7163908,Moscow,"(0, Daria)",0,"(0, Abramova)","(0, 7163908)","(0, Moscow)"
9,,Alexander,Evdokimov,482,Volokolamsk,"(0, Alexander)",1,"(0, Evdokimov)","(1, 482)","(0, Volokolamsk)"


In [398]:
# Keep only the rows whose error counter is zero, restricted to the original
# data columns. Selecting with a single .loc call and taking an explicit copy
# gives an independent frame, so adding columns to it later does not raise
# pandas' SettingWithCopyWarning or write into a temporary slice of df.
correct_df = df.loc[df["WRONG"] == 0, ['EMAIL', 'NAME', 'LAST_NAME', 'TEL', 'CITY']].copy()

In [399]:
# Display the rows that passed every check.
correct_df

Unnamed: 0,EMAIL,NAME,LAST_NAME,TEL,CITY
0,,Ivan,Abramov,7776514,Moscow
1,,Alexei,Nikolaev,1348520,Moscow
3,,Dmitri,Vasilyev,0,Kazan
5,,Anastasia,Grigoryan,1928421,Yekaterinburg
7,,Alexei,Lisitsyn,1239532,Tver
8,,Daria,Abramova,7163908,Moscow
10,,Natalya,Kostina,9031433,Moscow
11,,Nikolai,Ermolin,8539233,Saint petersburg
13,,Vladimir,Solovovo,4758395,Saint petersburg
14,,Vladimir,Ivanov,4827594,Novosibirsk


In [400]:
class SignUpGenerator:
    """Generate unique e-mail addresses and random passwords for new accounts.

    The instance remembers every address it has handed out in ``emails`` so
    that later calls never return the same address twice.
    """

    # Characters a generated password may contain.
    PASSWORD_ALPHABET = string.ascii_letters + string.digits + "-_*;^/"

    def __init__(self, pass_len=8, domain="companyname.com"):
        """
        Args:
            pass_len: Number of characters in each generated password.
            domain: Domain part appended to every generated address.
        """
        self.emails = set()
        self.PASSWORD_LENGTH = pass_len
        self.domain = domain

    def generate_email(self, name, last_name):
        """Return an unused address of the form ``<prefix>.<last_name>@<domain>``.

        The prefix starts as the first letter of ``name`` and grows by one
        letter for each collision. Once the whole name is used up, a numeric
        suffix (2, 3, ...) is appended to the local part instead.

        Raises:
            IndexError: If ``name`` is empty.
        """
        if not name:
            raise IndexError("name must contain at least one character")

        for prefix_len in range(1, len(name) + 1):
            email = name[:prefix_len] + '.' + last_name + "@" + self.domain
            if email not in self.emails:
                break
        else:
            # Every prefix of the name is taken: disambiguate with a counter.
            suffix = 2
            while True:
                email = name + '.' + last_name + str(suffix) + "@" + self.domain
                if email not in self.emails:
                    break
                suffix += 1

        self.emails.add(email)
        return email

    def generate_password(self):
        """Return a random password of ``PASSWORD_LENGTH`` characters.

        Characters are drawn independently with ``secrets.choice`` (a
        cryptographically strong source), so they may repeat and the length is
        not limited by the size of the alphabet.
        """
        return "".join(
            secrets.choice(self.PASSWORD_ALPHABET) for _ in range(self.PASSWORD_LENGTH)
        )

In [401]:
def _credentials_for(row):
    """Return the generated e-mail and password for one row as a two-item Series."""
    email = su_gen.generate_email(row['NAME'], row['LAST_NAME'])
    password = su_gen.generate_password()
    return pd.Series([email, password])


su_gen = SignUpGenerator()

# One generated (e-mail, password) pair per row, written into the two target columns.
credentials = correct_df[['NAME', 'LAST_NAME']].apply(_credentials_for, axis=1)
correct_df[['EMAIL', 'PASSWORD']] = credentials

In [402]:
# Display the accepted rows together with the generated EMAIL and PASSWORD columns.
correct_df

Unnamed: 0,EMAIL,NAME,LAST_NAME,TEL,CITY,PASSWORD
0,I.Abramov@companyname.com,Ivan,Abramov,7776514,Moscow,TYrXMRDI
1,A.Nikolaev@companyname.com,Alexei,Nikolaev,1348520,Moscow,TF4V31g*
3,D.Vasilyev@companyname.com,Dmitri,Vasilyev,0,Kazan,b1tFT^sA
5,A.Grigoryan@companyname.com,Anastasia,Grigoryan,1928421,Yekaterinburg,7ERPZBmg
7,A.Lisitsyn@companyname.com,Alexei,Lisitsyn,1239532,Tver,yfo0iXnw
8,D.Abramova@companyname.com,Daria,Abramova,7163908,Moscow,bocIqWPT
10,N.Kostina@companyname.com,Natalya,Kostina,9031433,Moscow,enHlEw6G
11,N.Ermolin@companyname.com,Nikolai,Ermolin,8539233,Saint petersburg,URzuavte
13,V.Solovovo@companyname.com,Vladimir,Solovovo,4758395,Saint petersburg,Fqno8B;g
14,V.Ivanov@companyname.com,Vladimir,Ivanov,4827594,Novosibirsk,zQjJ^H;t


In [19]:
# Write the accepted rows to CSV without the index column.
# NOTE(review): the frame includes the generated PASSWORD column, so this file
# stores passwords in plain text — confirm that is intended before sharing it.
correct_df.to_csv('correct_df.csv', index=False)