Create the datasets with Faker

In [1]:
from faker import Faker
import random
import numpy as np
import uuid
import pandas as pd
from datetime import date, datetime

In [2]:
# datasets:
# 1. Customer-list // Support-set
# 2. Contact-list // Support-set
# 3. CRM-data, customer interaction (meetings etc.)
# 4. Offer-data / Sales-data


fake = Faker()

Create a function for the first dataset "Customer-list"

In [3]:
# 1. Customer-list
def create_customer(x): 
    industry = ['Automobile Manufacturers (OEMs)', 'Automotive R&D and Testing Centers',
                'Electric Vehicle (EV) Industry', 'Ridesharing & Mobility Services', 
                'Connected Car Solutions Providers', 'Fleet Management']
    size = ['Small', 'Medium', 'Large']
    
    # Generate unique company IDs
    unique_ids = random.sample(range(1, x * 10), x)  # Ensure enough range to avoid collisions
    
    customer = {}
    for i, company_id in enumerate(unique_ids):
        customer[i] = {}
        customer[i]['companyId'] = company_id
        customer[i]['company'] = fake.company()
        customer[i]['size'] = fake.random_element(size)
        customer[i]['industry'] = fake.random_element(industry)
        customer[i]['city'] = fake.city()
        customer[i]['state'] = fake.state()
        customer[i]['zipcode'] = fake.postcode()
    
    return customer

Create a function for the second dataset "Contact-list"

In [4]:
def create_contacts(customers, num_contacts):
    roles = ['Sales Manager', 'Technical Manager', 'Sales Representative', 
             'Technical Representative', 'CEO', 'CCO', 'CFO']
    
    contacts = []
    existing_contact_ids = set()  # Keep track of existing contact IDs to avoid duplicates
    
    for company in customers.values():
        contact = {
            'companyId': company['companyId'],
            'company': company['company'],
            'contactId': str(uuid.uuid4()),  # Unique contactId using UUID
            'first_name': fake.first_name(),
            'last_name': fake.last_name(),
            'roles': random.choice(roles),
            'contactPerson': f"{fake.first_name()} {fake.last_name()}",
            'email': f"{fake.first_name().lower()}.{fake.last_name().lower()}@{company['company'].lower().replace(' ', '')}.com",
            'phone_number': fake.phone_number()
        }
        
        # Ensure no duplicate contact ID
        while contact['contactId'] in existing_contact_ids:
            contact['contactId'] = str(uuid.uuid4())
        
        existing_contact_ids.add(contact['contactId'])
        contacts.append(contact)
    
    # Add more random contacts if needed
    while len(contacts) < num_contacts:
        customer = random.choice(list(customers.values()))
        contact = {
            'companyId': customer['companyId'],
            'company': customer['company'],
            'contactId': str(uuid.uuid4()),  # Unique contactId using UUID
            'first_name': fake.first_name(),
            'last_name': fake.last_name(),
            'roles': random.choice(roles),
            'contactPerson': f"{fake.first_name()} {fake.last_name()}",
            'email': f"{fake.first_name().lower()}.{fake.last_name().lower()}@{customer['company'].lower().replace(' ', '')}.com",
            'phone_number': fake.phone_number()
        }
        
        # Ensure no duplicate contact ID
        while contact['contactId'] in existing_contact_ids:
            contact['contactId'] = str(uuid.uuid4())
        
        existing_contact_ids.add(contact['contactId'])
        contacts.append(contact)
    
    return contacts

Create a function for the third dataset, "CRM-data"

In [5]:
def create_crm_interaction(customers, contacts, num_interactions):

    # Define possible interaction types and subjects
    interaction_types = ['Digital meeting', 'Physical meeting', 'Email', 'Phone Call']
    interaction_subjects = [
        'Discuss project details',
        'Follow-up on order',
        'Follow-up on proposal',
        'Customer inquiry',
        'Contract negotiation',
        'Product feedback',
        'Product demostration'
    ]
    
    start_date = date(2021,10,30)
    end_date = date(2024,10,30)
    

    # Generate random CRM interactions
    interactions = []
    for _ in range(num_interactions):  # Generate based on num_interactions
        interaction = {}

        # Randomly select a customer
        customer = random.choice(list(customers.values()))
        companyId = customer['companyId']

        # Filter contacts for the selected company
        company_contacts = [c for c in contacts if c['companyId'] == companyId]
        if not company_contacts:  # Skip if no contacts exist for this company
            continue

        # Randomly select a contact from the company contacts
        contact = random.choice(company_contacts)
        
        interaction['company'] = customer['company']
        interaction['companyId'] = customer['companyId']
        interaction['contactId'] = contact['contactId']
        interaction['contactPerson'] = f"{contact['first_name']} {contact['last_name']}"
        interaction['interactionType'] = random.choice(interaction_types)
        interaction['interactionSubject'] = random.choice(interaction_subjects)
        interaction['interactionDate'] = fake.date_between_dates(start_date, end_date)

        interactions.append(interaction)

    return interactions

Create a function for the fourth dataset "Sales-data"

In [6]:
# 4. Sales-data    
def create_sales_data(customers, contacts, num_offers):
    offers = []
    for _ in range(num_offers):
        offer = {}
        offer_status = ['Open', 'Closed']
        offer_rate = [0.55, 0.45]
        start_date = date(2021,10,30)
        end_date = date(2024,10,30)
    
        
        # Randomly select a customer
        customer = random.choice(list(customers.values()))
        companyId = customer['companyId']

        # Filter contacts for the selected company
        company_contacts = [c for c in contacts if c['companyId'] == companyId]
        if not company_contacts:  # Skip if no contacts exist for this company
            continue

        # Randomly select a contact from the company contacts
        contact = random.choice(company_contacts)
        
        offer['offerNo'] = random.randint(1, 5000)
        offer['company'] = customer['company']
        offer['companyId'] = customer['companyId']
        offer['contactId'] = contact['contactId']
        offer['contactPerson'] = f"{contact['first_name']} {contact['last_name']}"
        offer['offerValue'] = round(np.random.normal(10000,1000),2)
        offer['offerDate'] = fake.date_between_dates(start_date, end_date)
        offer['offerStatus'] = np.random.choice(offer_status, p=offer_rate)
        
        offers.append(offer)
        

    return offers

Apply the functions, and create the datasets

In [7]:
customers = create_customer(2000) # How many customers
contacts = create_contacts(customers, 4000) # How many contacts
interaction = create_crm_interaction(customers, contacts, 32000) # How many interactions
offer = create_sales_data(customers, contacts, 40000) # How many offers

df_customers = pd.DataFrame(customers).transpose() 
df_contacts = pd.DataFrame(contacts)
df_interaction = pd.DataFrame(interaction)
df_offer = pd.DataFrame(offer)

Check for duplicates in customer and contacts dfs

In [8]:
print(f"Duplicate companyId in df_customers: {df_customers['companyId'].duplicated().sum()}")
print(f"Duplicate contactId in df_contacts: {df_contacts['contactId'].duplicated().sum()}")

Duplicate companyId in df_customers: 0
Duplicate contactId in df_contacts: 0


Check the created dataframes


In [9]:
df_customers

Unnamed: 0,companyId,company,size,industry,city,state,zipcode
0,7888,"Ramirez, Miller and Reed",Medium,Electric Vehicle (EV) Industry,Jeffreybury,Virginia,87273
1,6486,Evans-Thompson,Medium,Automobile Manufacturers (OEMs),Lake Jeanne,Pennsylvania,02208
2,15559,Collins LLC,Small,Connected Car Solutions Providers,New Amyshire,Connecticut,78135
3,6004,"Lam, Bailey and Fernandez",Large,Automotive R&D and Testing Centers,Robertside,New York,53513
4,2488,Martin Inc,Medium,Connected Car Solutions Providers,Freemanbury,North Carolina,63715
...,...,...,...,...,...,...,...
1995,5156,Jones PLC,Large,Connected Car Solutions Providers,Sawyerton,California,81922
1996,10656,"Becker, Lewis and Zamora",Small,Electric Vehicle (EV) Industry,North Caleb,Kentucky,66423
1997,3974,Love-Jones,Small,Electric Vehicle (EV) Industry,Bondtown,New York,61923
1998,18778,"Murphy, Jordan and Velasquez",Large,Ridesharing & Mobility Services,North Kellyton,North Dakota,97406


In [10]:
df_contacts

Unnamed: 0,companyId,company,contactId,first_name,last_name,roles,contactPerson,email,phone_number
0,7888,"Ramirez, Miller and Reed",7a2cf7e2-1c73-4b78-8b66-b951d251abc8,Tiffany,Wang,Sales Representative,Jodi Stewart,"jessica.brown@ramirez,millerandreed.com",464.440.9455
1,6486,Evans-Thompson,e835c78a-32b1-4a0e-b7e0-b98d27a3a732,Karen,Sanders,Sales Manager,Jeremy Martin,michael.pruitt@evans-thompson.com,001-840-696-3710
2,15559,Collins LLC,1db8096d-0909-4917-bd29-4998b6258828,Stephanie,Browning,CFO,Omar Meadows,michael.brown@collinsllc.com,+1-920-262-6802x8872
3,6004,"Lam, Bailey and Fernandez",67dcb80c-fe73-46b9-921f-fc960d3f80cd,Keith,Boyd,Sales Representative,Luke Smith,"maria.duncan@lam,baileyandfernandez.com",+1-782-698-3575x8878
4,2488,Martin Inc,dcc5e209-838b-4b1e-9cf4-fd272905bc4c,Henry,Jones,Sales Representative,Todd Sawyer,victoria.morrison@martininc.com,(383)732-8018
...,...,...,...,...,...,...,...,...,...
3995,624,"Brown, White and Martinez",60d9c228-0fdb-4bff-bbf9-13ee9cf558a1,Michael,Kent,Sales Representative,Devon Dixon,"melissa.grant@brown,whiteandmartinez.com",(710)916-2355x49499
3996,7308,"Jones, Hardy and Harvey",e8eb22df-a417-4b38-a518-e8a05e3e4395,Felicia,Bailey,Technical Representative,Richard Richardson,"lindsay.jackson@jones,hardyandharvey.com",+1-306-882-9026x5605
3997,5111,Brown Ltd,f30ff650-d761-4055-bd44-a526d117cede,Stephanie,Mckinney,CCO,Jake Lee,mark.sweeney@brownltd.com,822-413-1220x788
3998,1023,Tanner LLC,404768a5-3aa5-46de-a020-0ec418f784a3,Antonio,Curtis,Technical Manager,Sandra Aguilar,jennifer.collins@tannerllc.com,6498790550


In [11]:
df_interaction

Unnamed: 0,company,companyId,contactId,contactPerson,interactionType,interactionSubject,interactionDate
0,Walker Group,19882,bf67622d-e30a-487b-8916-e2948450f101,Shawn Green,Digital meeting,Product feedback,2021-11-06
1,Henderson Inc,11700,4e087457-362d-4cf4-a78b-32cadc6aeccc,Jacob Estrada,Phone Call,Discuss project details,2022-02-02
2,"Allen, Taylor and Kerr",12932,2f85292f-ce28-42d7-89bd-10f13b30f0da,Lee Bryant,Email,Contract negotiation,2022-05-25
3,"Johnson, Cross and Henry",18281,99ecb82d-8463-4f4a-8bd1-0b182c8afd3c,Cody Griffith,Physical meeting,Contract negotiation,2022-07-27
4,Bailey LLC,9514,0c390f89-2c22-44ee-8f6d-7c2217d66346,Derek Woods,Physical meeting,Contract negotiation,2022-11-22
...,...,...,...,...,...,...,...
31995,Walters-Reid,2628,ebaa0d5d-85b2-4510-9c64-ef3dbdf44971,Shelly Coleman,Email,Product feedback,2024-02-17
31996,Walker Ltd,17622,0c00e416-d0de-43fb-8bae-db7c0af0a72a,Alison Williams,Email,Customer inquiry,2024-05-29
31997,Meadows Group,15558,aff2c1e9-e532-4f7d-9f0e-d879d411fa21,Theodore White,Physical meeting,Customer inquiry,2023-05-12
31998,Huffman-Jackson,5029,3f04f05f-001f-4b3f-bea6-285d9c2fc077,Rachel Barnes,Physical meeting,Contract negotiation,2023-07-21


In [12]:
df_offer

Unnamed: 0,offerNo,company,companyId,contactId,contactPerson,offerValue,offerDate,offerStatus
0,4883,Alexander and Sons,13351,4f837552-42d0-4a60-b6db-9e1e9002c6b0,Kristine Rose,12497.74,2024-01-08,Open
1,3459,Fuller and Sons,18486,0f01c50f-542c-4849-8d75-1bf43c08a739,Stephen Calderon,10736.43,2024-07-01,Closed
2,2856,Obrien Inc,592,90dad980-7a87-450b-9408-4dc81041dccd,Sherry Ellis,9566.43,2022-01-04,Open
3,3850,Williams-Garcia,13887,ed2ab6a1-eeb0-4ab6-94fc-253fd4f9c69c,Tammy Singh,10655.95,2022-03-31,Closed
4,1035,Bryant LLC,4896,61d95f82-9c67-41c9-a879-9e3ce950ebf4,Debra Anderson,9584.21,2021-12-18,Open
...,...,...,...,...,...,...,...,...
39995,1515,"Nelson, Harrison and Miller",7526,f38816c6-515a-4adc-9e21-683d7b7bf08f,Jesse Bond,10726.05,2021-11-01,Open
39996,821,"Scott, Smith and Thompson",2910,58e1f45f-97ca-43fd-9089-00a05ef07104,Christopher Grant,10666.87,2022-07-21,Open
39997,2617,Oconnor-Acevedo,7933,4c540cf4-5851-4c3d-9f4e-2494f7728c61,Nicole Mitchell,10913.83,2023-02-26,Open
39998,3279,Potter and Sons,864,f3e50efe-371e-4ab6-869c-ccb3c68ec570,Emily Bell,8837.48,2022-08-19,Closed


Merge customers with contacts


In [14]:
df_merge = pd.merge(df_customers, df_contacts, on=['companyId', 'company'], how='left')
df_merge

Unnamed: 0,companyId,company,size,industry,city,state,zipcode,contactId,first_name,last_name,roles,contactPerson,email,phone_number
0,7888,"Ramirez, Miller and Reed",Medium,Electric Vehicle (EV) Industry,Jeffreybury,Virginia,87273,7a2cf7e2-1c73-4b78-8b66-b951d251abc8,Tiffany,Wang,Sales Representative,Jodi Stewart,"jessica.brown@ramirez,millerandreed.com",464.440.9455
1,7888,"Ramirez, Miller and Reed",Medium,Electric Vehicle (EV) Industry,Jeffreybury,Virginia,87273,1b779442-ee63-4623-a075-0312c0d84b6c,Nancy,Small,CEO,Pamela Ramirez,"scott.lewis@ramirez,millerandreed.com",2265472025
2,6486,Evans-Thompson,Medium,Automobile Manufacturers (OEMs),Lake Jeanne,Pennsylvania,02208,e835c78a-32b1-4a0e-b7e0-b98d27a3a732,Karen,Sanders,Sales Manager,Jeremy Martin,michael.pruitt@evans-thompson.com,001-840-696-3710
3,6486,Evans-Thompson,Medium,Automobile Manufacturers (OEMs),Lake Jeanne,Pennsylvania,02208,efc56df5-05b1-4d61-a3ae-3aee2c618c10,Denise,Gill,Technical Manager,Stephanie Wilson,george.davidson@evans-thompson.com,(562)267-0854
4,6486,Evans-Thompson,Medium,Automobile Manufacturers (OEMs),Lake Jeanne,Pennsylvania,02208,65ab2e0e-8abc-4bd0-88db-922e05c4acc1,Hannah,Becker,Technical Manager,Ian Woods,chris.robles@evans-thompson.com,5869692354
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3995,3974,Love-Jones,Small,Electric Vehicle (EV) Industry,Bondtown,New York,61923,29733022-0ae9-4fe5-93ac-7cbea84b23ea,Carly,Gonzalez,Sales Manager,Ronald Little,dennis.price@love-jones.com,(668)915-6409x9420
3996,18778,"Murphy, Jordan and Velasquez",Large,Ridesharing & Mobility Services,North Kellyton,North Dakota,97406,5c869ebc-66f1-44cd-a54b-fe66e736f363,Thomas,Mccormick,Technical Representative,Eric Johnson,"elizabeth.jimenez@murphy,jordanandvelasquez.com",(829)608-4067
3997,18778,"Murphy, Jordan and Velasquez",Large,Ridesharing & Mobility Services,North Kellyton,North Dakota,97406,869000d7-f99f-4aee-ad1b-639da4c867f3,Samantha,Valencia,Technical Representative,Sarah Ochoa,"nicole.allen@murphy,jordanandvelasquez.com",895-943-0184x0684
3998,12806,"Perez, Rodriguez and Medina",Small,Automotive R&D and Testing Centers,Rossbury,North Dakota,60894,38817d95-7bf1-4623-8f43-9455a855b878,Matthew,Hoffman,CFO,Amanda Mathis,"jason.ibarra@perez,rodriguezandmedina.com",(346)895-0398


In [15]:
df_merge.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4000 entries, 0 to 3999
Data columns (total 14 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   companyId      4000 non-null   object
 1   company        4000 non-null   object
 2   size           4000 non-null   object
 3   industry       4000 non-null   object
 4   city           4000 non-null   object
 5   state          4000 non-null   object
 6   zipcode        4000 non-null   object
 7   contactId      4000 non-null   object
 8   first_name     4000 non-null   object
 9   last_name      4000 non-null   object
 10  roles          4000 non-null   object
 11  contactPerson  4000 non-null   object
 12  email          4000 non-null   object
 13  phone_number   4000 non-null   object
dtypes: object(14)
memory usage: 437.6+ KB


Map certain of columns and agg. the data to get a better strucuted data

In [None]:
df_contacts2 = df_merge.groupby(['companyId', 'company', 'size', 'industry', 'city', 'state']).agg(
    contacts=('contactPerson', 'count'),
    roles=('roles', lambda x: ', '.join(x.dropna().astype(str).unique()))
).reset_index()

In [20]:
df_contacts2

Unnamed: 0,companyId,company,size,industry,city,state,contacts,roles
0,13,Stephens-Washington,Large,Ridesharing & Mobility Services,New Yvonneborough,Delaware,2,"Sales Manager, CEO"
1,16,Moore Ltd,Large,Connected Car Solutions Providers,Brianstad,Tennessee,2,"CFO, Sales Representative"
2,23,Payne Group,Large,Automobile Manufacturers (OEMs),North Sylvia,Texas,1,Sales Manager
3,31,"Martinez, Benson and Bowman",Large,Fleet Management,Ginamouth,North Carolina,2,"CCO, CFO"
4,39,Meyer-Gill,Medium,Automotive R&D and Testing Centers,Loganville,Arizona,4,"CEO, Sales Representative, Sales Manager"
...,...,...,...,...,...,...,...,...
1995,19961,Stanley Inc,Small,Fleet Management,Donburgh,Indiana,2,"Sales Manager, CFO"
1996,19975,Hayes PLC,Small,Connected Car Solutions Providers,Thomasmouth,Vermont,4,"Sales Manager, CEO, Technical Manager"
1997,19978,"Parks, Moss and Howell",Large,Fleet Management,Lake Johnview,Colorado,2,Sales Manager
1998,19980,"Nixon, Zimmerman and Washington",Medium,Ridesharing & Mobility Services,New Jasonberg,Vermont,1,Technical Representative


In [21]:
df_interactions2 = df_interaction.groupby(['company', 'companyId']).agg(
    interactions=('interactionType', 'count'),
    lastInteractionDate=('interactionDate', 'max'),
    interactionSubjects=('interactionSubject', lambda x: ', '.join(x.dropna().astype(str).unique()))
).reset_index()

In [22]:
df_interactions2

Unnamed: 0,company,companyId,interactions,lastInteractionDate,interactionSubjects
0,Acevedo Group,15380,14,2024-06-15,"Product demostration, Contract negotiation, Fo..."
1,Acosta LLC,3381,11,2024-10-14,"Contract negotiation, Customer inquiry, Produc..."
2,Acosta LLC,11049,16,2024-10-04,"Product feedback, Follow-up on order, Product ..."
3,Acosta-Osborne,2274,20,2024-10-18,"Follow-up on proposal, Follow-up on order, Dis..."
4,Adams PLC,1919,12,2024-09-13,"Customer inquiry, Discuss project details, Pro..."
...,...,...,...,...,...
1995,Young-Rangel,8510,12,2024-10-11,"Customer inquiry, Contract negotiation, Discus..."
1996,Young-Walter,9988,19,2024-09-13,"Contract negotiation, Customer inquiry, Produc..."
1997,Young-Wilson,2360,12,2024-09-15,"Customer inquiry, Product demostration, Discus..."
1998,Yu Inc,18658,19,2024-08-24,"Follow-up on proposal, Product feedback, Produ..."


In [23]:
df_offer2 = df_offer.groupby(['company', 'companyId']).agg(
    offers=('offerNo', 'count'),
    averageOfferValue=('offerValue', 'mean'),
    offerClosed=('offerStatus', lambda x: ((x =='Closed').sum())),
    lastOfferDate=('offerDate', 'max')
).reset_index().round(2)

In [24]:
df_offer2

Unnamed: 0,company,companyId,offers,averageOfferValue,offerClosed,lastOfferDate
0,Acevedo Group,15380,17,10183.15,8,2024-10-18
1,Acosta LLC,3381,18,9939.37,11,2024-09-30
2,Acosta LLC,11049,19,10162.91,7,2024-10-25
3,Acosta-Osborne,2274,13,10130.35,8,2024-09-25
4,Adams PLC,1919,19,10117.22,13,2024-10-12
...,...,...,...,...,...,...
1995,Young-Rangel,8510,27,10050.65,16,2024-09-06
1996,Young-Walter,9988,17,10336.17,7,2024-07-15
1997,Young-Wilson,2360,21,9941.90,11,2024-07-12
1998,Yu Inc,18658,18,10194.10,5,2024-07-30


Create new columns/features for interaction and offer dataset.
Based on lastInteractionDate and lastOfferDate

In [25]:
df_interactions2['lastInteractionDate'] = pd.to_datetime(df_interactions2['lastInteractionDate'], errors='coerce')
df_offer2['lastOfferDate'] = pd.to_datetime(df_offer2['lastOfferDate'], errors='coerce')

Define a date_measure

In [26]:
date_measure = pd.to_datetime('2024-12-01')

Create the first feature based on lastInteractionDate

In [27]:
df_interactions2['daysSinceLastInteraction'] = (date_measure - df_interactions2['lastInteractionDate']).dt.days.astype(int)

Merge df contacts2 with df interactions2

In [28]:
merge = pd.merge(df_contacts2, df_interactions2, on=['company', 'companyId'], how='left')

Create the final merge on new df "merge" and offer2

In [29]:
merge_full = pd.merge(merge, df_offer2, on=['company', 'companyId'], how='left')

Create the second feature based on lastOfferDate

In [30]:
merge_full['daysSinceLastOfferClosed'] = (date_measure - merge_full['lastOfferDate']).dt.days.astype(int)

Create additional new feature to define the offerHitRate

In [31]:
merge_full['daysSinceLastOfferClosed'] = (date_measure - merge_full['lastOfferDate']).dt.days.astype(int)

Checking the dataset

In [32]:
merge_full.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 17 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   companyId                 2000 non-null   int64         
 1   company                   2000 non-null   object        
 2   size                      2000 non-null   object        
 3   industry                  2000 non-null   object        
 4   city                      2000 non-null   object        
 5   state                     2000 non-null   object        
 6   contacts                  2000 non-null   int64         
 7   roles                     2000 non-null   object        
 8   interactions              2000 non-null   int64         
 9   lastInteractionDate       2000 non-null   datetime64[ns]
 10  interactionSubjects       2000 non-null   object        
 11  daysSinceLastInteraction  2000 non-null   int32         
 12  offers              

Do additional checking based on a certain of criteria

In [33]:
check = merge[(merge['contacts'] == 0) & (merge['interactions'] > 0)]
check

Unnamed: 0,companyId,company,size,industry,city,state,contacts,roles,interactions,lastInteractionDate,interactionSubjects,daysSinceLastInteraction


In [35]:
check2 = merge_full[(merge_full['contacts'] ==0) & (merge_full['offers'] > 0)]
check2

Unnamed: 0,companyId,company,size,industry,city,state,contacts,roles,interactions,lastInteractionDate,interactionSubjects,daysSinceLastInteraction,offers,averageOfferValue,offerClosed,lastOfferDate,daysSinceLastOfferClosed


In [36]:
check3 = merge_full[(merge_full['contacts'] == 0)]
check3

Unnamed: 0,companyId,company,size,industry,city,state,contacts,roles,interactions,lastInteractionDate,interactionSubjects,daysSinceLastInteraction,offers,averageOfferValue,offerClosed,lastOfferDate,daysSinceLastOfferClosed


Get more insights about the dataset

In [37]:
merge_full.describe()

Unnamed: 0,companyId,contacts,interactions,lastInteractionDate,daysSinceLastInteraction,offers,averageOfferValue,offerClosed,lastOfferDate,daysSinceLastOfferClosed
count,2000.0,2000.0,2000.0,2000,2000.0,2000.0,2000.0,2000.0,2000,2000.0
mean,9940.728,2.0,16.0,2024-08-23 02:21:07.200000256,99.902,20.0,10000.49348,8.9925,2024-09-02 11:49:12,89.5075
min,13.0,1.0,4.0,2023-07-04 00:00:00,33.0,7.0,9348.4,1.0,2023-06-04 00:00:00,33.0
25%,4867.75,1.0,13.0,2024-07-30 00:00:00,53.0,17.0,9850.8675,7.0,2024-08-12 00:00:00,49.0
50%,9958.5,2.0,16.0,2024-09-11 12:00:00,80.5,20.0,10003.88,9.0,2024-09-18 12:00:00,73.5
75%,14886.75,3.0,19.0,2024-10-09 00:00:00,124.0,23.0,10151.05,11.0,2024-10-13 00:00:00,111.0
max,19985.0,7.0,29.0,2024-10-29 00:00:00,516.0,36.0,10871.68,19.0,2024-10-29 00:00:00,546.0
std,5753.653107,0.995236,4.02394,,66.294048,4.456673,223.555698,2.986453,,58.080983


Create a function for feature_target

In [38]:
def categorize_customer(i):
    if i <= 7:
        return 'C'
    elif 7 <= i <= 10:
        return 'B'
    else:
        return 'A'

merge_full['customerClassification'] = merge_full['offerClosed'].apply(categorize_customer)

Check the dataset

In [39]:
merge_full

Unnamed: 0,companyId,company,size,industry,city,state,contacts,roles,interactions,lastInteractionDate,interactionSubjects,daysSinceLastInteraction,offers,averageOfferValue,offerClosed,lastOfferDate,daysSinceLastOfferClosed,customerClassification
0,13,Stephens-Washington,Large,Ridesharing & Mobility Services,New Yvonneborough,Delaware,2,"Sales Manager, CEO",20,2024-08-22,"Follow-up on proposal, Follow-up on order, Pro...",101,20,10169.80,10,2024-10-14,48,B
1,16,Moore Ltd,Large,Connected Car Solutions Providers,Brianstad,Tennessee,2,"CFO, Sales Representative",13,2024-08-04,"Follow-up on order, Discuss project details, P...",119,12,9903.13,7,2024-09-13,79,C
2,23,Payne Group,Large,Automobile Manufacturers (OEMs),North Sylvia,Texas,1,Sales Manager,13,2024-10-19,"Contract negotiation, Product feedback, Produc...",43,20,10208.89,9,2024-09-06,86,B
3,31,"Martinez, Benson and Bowman",Large,Fleet Management,Ginamouth,North Carolina,2,"CCO, CFO",12,2024-05-26,"Follow-up on order, Contract negotiation, Foll...",189,17,9955.58,8,2024-07-16,138,B
4,39,Meyer-Gill,Medium,Automotive R&D and Testing Centers,Loganville,Arizona,4,"CEO, Sales Representative, Sales Manager",16,2024-06-10,"Follow-up on order, Discuss project details, C...",174,20,9847.54,12,2024-10-26,36,A
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,19961,Stanley Inc,Small,Fleet Management,Donburgh,Indiana,2,"Sales Manager, CFO",26,2024-08-08,"Discuss project details, Follow-up on proposal...",115,18,9980.28,6,2024-10-28,34,C
1996,19975,Hayes PLC,Small,Connected Car Solutions Providers,Thomasmouth,Vermont,4,"Sales Manager, CEO, Technical Manager",22,2024-09-10,"Product feedback, Customer inquiry, Contract n...",82,12,10056.84,3,2024-10-18,44,C
1997,19978,"Parks, Moss and Howell",Large,Fleet Management,Lake Johnview,Colorado,2,Sales Manager,19,2024-10-18,"Discuss project details, Customer inquiry, Pro...",44,17,10185.14,5,2024-10-04,58,C
1998,19980,"Nixon, Zimmerman and Washington",Medium,Ridesharing & Mobility Services,New Jasonberg,Vermont,1,Technical Representative,14,2024-09-23,"Product feedback, Discuss project details, Fol...",69,26,10116.77,9,2024-10-09,53,B


Check the classification split for the feature target

In [40]:
customer_C = (merge_full['customerClassification'] == 'C').sum()
customer_B = (merge_full['customerClassification'] == 'B').sum()
customer_A = (merge_full['customerClassification'] == 'A').sum()
print(customer_C, customer_B, customer_A)

653 764 583


Check the datatypes

In [41]:
merge_full.dtypes

companyId                            int64
company                             object
size                                object
industry                            object
city                                object
state                               object
contacts                             int64
roles                               object
interactions                         int64
lastInteractionDate         datetime64[ns]
interactionSubjects                 object
daysSinceLastInteraction             int32
offers                               int64
averageOfferValue                  float64
offerClosed                          int64
lastOfferDate               datetime64[ns]
daysSinceLastOfferClosed             int32
customerClassification              object
dtype: object

In [44]:
merge_full = merge_full.astype({'interactions' : 'int', 'offers' : 'int', 'offerClosed' : 'int', 'daysSinceLastInteraction' : 'int'})
merge_full.dtypes

companyId                            int64
company                             object
size                                object
industry                            object
city                                object
state                               object
contacts                             int64
roles                               object
interactions                         int32
lastInteractionDate         datetime64[ns]
interactionSubjects                 object
daysSinceLastInteraction             int32
offers                               int32
averageOfferValue                  float64
offerClosed                          int32
lastOfferDate               datetime64[ns]
daysSinceLastOfferClosed             int32
customerClassification              object
dtype: object

Check the dataset again

In [45]:
merge_full.info()
merge_full.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 18 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   companyId                 2000 non-null   int64         
 1   company                   2000 non-null   object        
 2   size                      2000 non-null   object        
 3   industry                  2000 non-null   object        
 4   city                      2000 non-null   object        
 5   state                     2000 non-null   object        
 6   contacts                  2000 non-null   int64         
 7   roles                     2000 non-null   object        
 8   interactions              2000 non-null   int32         
 9   lastInteractionDate       2000 non-null   datetime64[ns]
 10  interactionSubjects       2000 non-null   object        
 11  daysSinceLastInteraction  2000 non-null   int32         
 12  offers              

Unnamed: 0,companyId,contacts,interactions,lastInteractionDate,daysSinceLastInteraction,offers,averageOfferValue,offerClosed,lastOfferDate,daysSinceLastOfferClosed
count,2000.0,2000.0,2000.0,2000,2000.0,2000.0,2000.0,2000.0,2000,2000.0
mean,9940.728,2.0,16.0,2024-08-23 02:21:07.200000256,99.902,20.0,10000.49348,8.9925,2024-09-02 11:49:12,89.5075
min,13.0,1.0,4.0,2023-07-04 00:00:00,33.0,7.0,9348.4,1.0,2023-06-04 00:00:00,33.0
25%,4867.75,1.0,13.0,2024-07-30 00:00:00,53.0,17.0,9850.8675,7.0,2024-08-12 00:00:00,49.0
50%,9958.5,2.0,16.0,2024-09-11 12:00:00,80.5,20.0,10003.88,9.0,2024-09-18 12:00:00,73.5
75%,14886.75,3.0,19.0,2024-10-09 00:00:00,124.0,23.0,10151.05,11.0,2024-10-13 00:00:00,111.0
max,19985.0,7.0,29.0,2024-10-29 00:00:00,516.0,36.0,10871.68,19.0,2024-10-29 00:00:00,546.0
std,5753.653107,0.995236,4.02394,,66.294048,4.456673,223.555698,2.986453,,58.080983


Drop redundant features

In [46]:
merge_full = merge_full.drop(columns=['companyId', 'offers', 'lastInteractionDate', 'lastOfferDate'])

Generate a CSV.file

In [47]:
merge_full.to_csv('sum_customer_data.csv', index=False)