## analysis points
* added header **is_buisness** as a flag and checking email domain is part of public email eg.gmail,hotmail,etc
* added header **nri** as a flag and checking based on mobile country code
* added header **dead_email** as a flag and checking based on email deliveribility status check

In [2]:
import pandas as pd

file_path = 'epn_cleaned.csv'

df = pd.read_csv(file_path)

print(df.head().to_markdown())

|    | FIRSTNAME    | LASTNAME                |      BIN | MOBILE         | EMAIL                      |   PRIMARYCARD | CARD_CREATION_DATE   |   ACCOUNTSERNO |   CARDSERNO |   RMNAME |
|---:|:-------------|:------------------------|---------:|:---------------|:---------------------------|--------------:|:---------------------|---------------:|------------:|---------:|
|  0 | unidentified | ANSH                    | 42113700 | (91)9398742700 | anshsinghvee@gmail.com     |             0 | 10Jul2023            |       27965111 |    37346654 |      nan |
|  1 | unidentified | ANUSH                   | 42113700 | (91)9626999111 | aswin@herofashion.com      |             0 | 26Nov2021            |       29337588 |    20615046 |      nan |
|  2 | unidentified | GOUTHAM                 | 42113700 | (91)9008839300 | samarath@prithvijewels.com |             0 | 12Jul2022            |       39259747 |    26753794 |      nan |
|  3 | unidentified | LAKSHMI PRABHA SELVARAJ | 42113700 | (91)9003711

In [3]:
df_copy = df.copy()

business_domains = ['hotmail.com', 'gmail.com', 'outlook.com','yahoo.com'] 
df_copy['is_business'] = df_copy['EMAIL'].apply(lambda x: 0 if any(domain in x.lower() for domain in business_domains) else 1)

print(df_copy.head().to_markdown())

|    | FIRSTNAME    | LASTNAME                |      BIN | MOBILE         | EMAIL                      |   PRIMARYCARD | CARD_CREATION_DATE   |   ACCOUNTSERNO |   CARDSERNO |   RMNAME |   is_business |
|---:|:-------------|:------------------------|---------:|:---------------|:---------------------------|--------------:|:---------------------|---------------:|------------:|---------:|--------------:|
|  0 | unidentified | ANSH                    | 42113700 | (91)9398742700 | anshsinghvee@gmail.com     |             0 | 10Jul2023            |       27965111 |    37346654 |      nan |             0 |
|  1 | unidentified | ANUSH                   | 42113700 | (91)9626999111 | aswin@herofashion.com      |             0 | 26Nov2021            |       29337588 |    20615046 |      nan |             1 |
|  2 | unidentified | GOUTHAM                 | 42113700 | (91)9008839300 | samarath@prithvijewels.com |             0 | 12Jul2022            |       39259747 |    26753794 |      nan |       

In [4]:
df_copy['nri'] = df_copy['MOBILE'].apply(lambda x: 0 if x[:2] != '91' else 1)

print(df_copy.head().to_markdown())

|    | FIRSTNAME    | LASTNAME                |      BIN | MOBILE         | EMAIL                      |   PRIMARYCARD | CARD_CREATION_DATE   |   ACCOUNTSERNO |   CARDSERNO |   RMNAME |   is_business |   nri |
|---:|:-------------|:------------------------|---------:|:---------------|:---------------------------|--------------:|:---------------------|---------------:|------------:|---------:|--------------:|------:|
|  0 | unidentified | ANSH                    | 42113700 | (91)9398742700 | anshsinghvee@gmail.com     |             0 | 10Jul2023            |       27965111 |    37346654 |      nan |             0 |     0 |
|  1 | unidentified | ANUSH                   | 42113700 | (91)9626999111 | aswin@herofashion.com      |             0 | 26Nov2021            |       29337588 |    20615046 |      nan |             1 |     0 |
|  2 | unidentified | GOUTHAM                 | 42113700 | (91)9008839300 | samarath@prithvijewels.com |             0 | 12Jul2022            |       39259747 |

In [6]:
from email_validator import validate_email

try:
    df_copy['dead_mail'] = df_copy['EMAIL'].apply(lambda x: 1 if not validate_email(x) else 0)
    df_copy['not_deliverable'] = df_copy['EMAIL'].apply(lambda x: 1 if validate_email(x, check_deliverability=True) == False else 0)
except Exception as e:
    print(f"An error occurred: {e}")

print(df_copy.head().to_markdown())


An error occurred: The email address contains invalid characters before the @-sign: ','.
|    | FIRSTNAME    | LASTNAME                |      BIN | MOBILE         | EMAIL                      |   PRIMARYCARD | CARD_CREATION_DATE   |   ACCOUNTSERNO |   CARDSERNO |   RMNAME |   is_business |   nri |
|---:|:-------------|:------------------------|---------:|:---------------|:---------------------------|--------------:|:---------------------|---------------:|------------:|---------:|--------------:|------:|
|  0 | unidentified | ANSH                    | 42113700 | (91)9398742700 | anshsinghvee@gmail.com     |             0 | 10Jul2023            |       27965111 |    37346654 |      nan |             0 |     0 |
|  1 | unidentified | ANUSH                   | 42113700 | (91)9626999111 | aswin@herofashion.com      |             0 | 26Nov2021            |       29337588 |    20615046 |      nan |             1 |     0 |
|  2 | unidentified | GOUTHAM                 | 42113700 | (91)90088393

In [62]:
import openai
import pandas as pd
import os
from dotenv import load_dotenv
import ast

load_dotenv()

client = OpenAI(api_key=os.getenv("OPEN_API_KEY"))

def get_keywords(row):
    try:
        response = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[
            {
              "role": "system",
              "content": """You will be provided with a buisness emails of customer and your task is to extract a list of keywords
              from it which describes the customer buisness domain,like fashion,jwellery,sports etc,
              also if email username contains designation like ceo@email.com or director@...then pickup designation also as keyword like eg. ceo,director,etc
              give output keywords as json dont add any thing in output.,give only 2 keywords max not more than 2,
              exclude keywords like proffessional,efficient,sophisticated,friendly, approachable etc"""
            },
            {
              "role": "user",
              "content": row['EMAIL']
            }
          ],
          temperature=0.9,
          max_tokens=64,
          top_p=1
        )

        keywords = ast.literal_eval(response.dict()['choices'][0]['message']['content'])['keywords']
        return ', '.join(keywords[:2]) if keywords else None
    except Exception as e:
        return None

df_copy['keywords'] = df_copy.apply(lambda row: get_keywords(row) if row['is_business'] == 1 and row.name < 30 else None, axis=1)


print(df_copy.head(30).to_markdown())



/tmp/ipykernel_39048/1492599228.py:34: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.5/migration/
  keywords = ast.literal_eval(response.dict()['choices'][0]['message']['content'])['keywords']


|    | FIRSTNAME        | LASTNAME                |      BIN | MOBILE         | EMAIL                        |   PRIMARYCARD | CARD_CREATION_DATE   |   ACCOUNTSERNO |   CARDSERNO |   RMNAME |   is_business |   nri | keywords                  |
|---:|:-----------------|:------------------------|---------:|:---------------|:-----------------------------|--------------:|:---------------------|---------------:|------------:|---------:|--------------:|------:|:--------------------------|
|  0 | unidentified     | ANSH                    | 42113700 | (91)9398742700 | anshsinghvee@gmail.com       |             0 | 10Jul2023            |       27965111 |    37346654 |      nan |             0 |     0 |                           |
|  1 | unidentified     | ANUSH                   | 42113700 | (91)9626999111 | aswin@herofashion.com        |             0 | 26Nov2021            |       29337588 |    20615046 |      nan |             1 |     0 | fashion, CEO              |
|  2 | unidentified     