#### Importing required libraries

In [1]:
import numpy as  np 
import pandas as pd
from faker import Faker
from os import path
import os

#### Using faker to generate the sample data for Customer

In [2]:
# Initialize Faker
fake = Faker()

# Set seed for reproducibility
np.random.seed(42)
Faker.seed(42)

#### Generating Sample data with 500 rows

In [3]:
# Generate 500 rows of data
num_rows = 500

#### Customer Demographics

In [4]:
# Sheet 1: Customer Demographics
customer_data = []
for i in range(1, num_rows + 1):
    customer_data.append({
        'customer_id': i,
        'name': fake.name(),
        'age': np.random.randint(18, 65),
        'gender': np.random.choice(['Male', 'Female', 'Non-Binary']),
        'email': fake.email(),
        'city': fake.city(),
        'education': np.random.choice(['High School', 'Bachelor\'s Degree', 'Master\'s Degree', 'PhD']),
        'occupation': fake.job(),
        'income_level': np.random.choice(['Low', 'Medium', 'High']),
        'address': fake.address(),
        'timestamp': pd.Timestamp.now()
    })

df_customer = pd.DataFrame(customer_data)

In [5]:
df_customer

Unnamed: 0,customer_id,name,age,gender,email,city,education,occupation,income_level,address,timestamp
0,1,Allison Hill,56,Male,donaldgarcia@example.net,New Roberttown,Master's Degree,Chief Financial Officer,High,"386 Shane Harbors\nPort Lindachester, MA 36922",2025-03-24 18:03:52.141150
1,2,Tyler Rogers,25,Male,jamesmichael@example.com,Lindsaymouth,High School,Software engineer,High,"84959 Janet Cape Apt. 413\nSouth Joshuastad, G...",2025-03-24 18:03:52.143156
2,3,Michael Miles,36,Non-Binary,lynchgeorge@example.net,East Steven,Master's Degree,Geophysicist/field seismologist,High,Unit 8350 Box 3056\nDPO AA 09176,2025-03-24 18:03:52.143156
3,4,Tommy Walter,41,Male,jason76@example.net,Thomasberg,PhD,Fine artist,High,"969 Cox Dam Suite 101\nLake Ernest, TX 55834",2025-03-24 18:03:52.144150
4,5,Janice Carlson,39,Male,jrice@example.org,Lake Nicoleview,Bachelor's Degree,"Librarian, public",Medium,"70482 Monica Hills Apt. 252\nNew Mariotown, DE...",2025-03-24 18:03:52.144150
...,...,...,...,...,...,...,...,...,...,...,...
495,496,Kristine Schmidt,24,Male,mcbridemichael@example.org,West Erik,Bachelor's Degree,Diagnostic radiographer,Low,"289 Garrison Harbors\nSouth Kennethhaven, MT 6...",2025-03-24 18:03:52.452190
496,497,Chad Hurley,55,Male,jonesjeanette@example.net,Jasonland,PhD,Barrister's clerk,High,"72362 Myers Fields\nPort Michael, IN 36705",2025-03-24 18:03:52.453175
497,498,James Barber,39,Female,jamiethomas@example.com,Alexville,High School,"Scientist, research (life sciences)",Low,"973 Evans Crossing\nHernandezmouth, FM 36166",2025-03-24 18:03:52.453175
498,499,Amanda Massey,19,Non-Binary,daniel76@example.net,Lake Heather,PhD,"Radiographer, therapeutic",Low,"119 Allen Vista\nCopelandchester, MT 07926",2025-03-24 18:03:52.454156


#### Customer Transactions

In [6]:
# Sheet 2: Transaction History
transaction_data = []
for i in range(1, num_rows + 1):
    for _ in range(np.random.randint(1, 10)):  # Each customer has 1-10 transactions
        transaction_data.append({
            'transaction_id': fake.unique.random_number(digits=5),
            'customer_id': i,
            'transaction_type': np.random.choice(['Online', 'In-store']),
            'category': np.random.choice(['Electronics', 'Fitness', 'Books', 'Fashion', 'Groceries']),
            'amount': np.random.randint(10, 1000),
            'purchase_mode': np.random.choice(['Credit Card', 'Debit Card', 'PayPal', 'Cash']),
            'purchase_date': fake.date_between(start_date='-1y', end_date='today'),
            'timestamp': pd.Timestamp.now()
        })

df_transaction = pd.DataFrame(transaction_data)

In [7]:
df_transaction

Unnamed: 0,transaction_id,customer_id,transaction_type,category,amount,purchase_mode,purchase_date,timestamp
0,14129,1,In-store,Books,310,Credit Card,2024-11-07,2025-03-24 18:03:52.507960
1,84237,1,Online,Fashion,111,Credit Card,2024-04-30,2025-03-24 18:03:52.508960
2,75448,2,Online,Books,97,Credit Card,2024-04-28,2025-03-24 18:03:52.508960
3,15397,2,Online,Fashion,898,Debit Card,2025-01-13,2025-03-24 18:03:52.508960
4,31275,2,In-store,Fashion,669,Credit Card,2024-08-10,2025-03-24 18:03:52.508960
...,...,...,...,...,...,...,...,...
2518,37316,499,Online,Fitness,96,Debit Card,2025-02-07,2025-03-24 18:03:52.827360
2519,45178,500,In-store,Fashion,842,Debit Card,2024-08-28,2025-03-24 18:03:52.827360
2520,86977,500,Online,Books,124,Debit Card,2024-05-07,2025-03-24 18:03:52.827360
2521,18672,500,Online,Fitness,700,Cash,2025-03-08,2025-03-24 18:03:52.827360


#### Customer  Social Media Sentiments and Intent

In [8]:
# Sheet 3: Social Media Sentiments and Intent
social_media_data = []
for i in range(1, num_rows + 1):
    social_media_data.append({
        'customer_id': i,
        'platform': np.random.choice(['Twitter', 'Facebook', 'Instagram', 'LinkedIn']),
        'post_text': fake.sentence(),
        #'timestamp': fake.datetime_between(start_date='-1y', end_date='today'),
        'sentiment_score': round(np.random.uniform(0, 1), 2),
        'intent': np.random.choice(['Purchase Intent', 'Engagement', 'Brand Awareness'])
    })

df_social_media = pd.DataFrame(social_media_data)

In [9]:
df_social_media

Unnamed: 0,customer_id,platform,post_text,sentiment_score,intent
0,1,Instagram,Its air source six image several name chair.,0.72,Engagement
1,2,LinkedIn,Behavior nearly of general piece.,0.85,Engagement
2,3,Instagram,Address loss international public rate.,0.27,Purchase Intent
3,4,Instagram,Short southern finish front.,0.84,Engagement
4,5,Instagram,Edge court environment must.,0.46,Purchase Intent
...,...,...,...,...,...
495,496,Twitter,Some between be without.,0.54,Brand Awareness
496,497,Instagram,Most whole among recent.,0.15,Engagement
497,498,LinkedIn,Indeed truth pass prove through together lawyer.,0.60,Purchase Intent
498,499,Instagram,Television machine really.,0.01,Brand Awareness


#### Customer Organizational Info

In [10]:

# Sheet 4: Organizational Info
org_data = []
for i in range(1, num_rows + 1):
    org_data.append({
        'customer_id': i,
        'organization_name': fake.company(),
        'industry': np.random.choice(['Technology', 'Healthcare', 'Education', 'Retail', 'Energy']),
        'revenue': np.random.randint(100000, 10000000),
        'no_of_employees': np.random.randint(10, 1000),
        'customer_role': fake.job(),
        'timestamp': pd.Timestamp.now()
    })

df_org = pd.DataFrame(org_data)

In [11]:
df_org 

Unnamed: 0,customer_id,organization_name,industry,revenue,no_of_employees,customer_role,timestamp
0,1,"Blackwell, Duran and Snyder",Technology,5921934,803,Drilling engineer,2025-03-24 18:03:52.926122
1,2,Thomas Group,Energy,8076726,39,Network engineer,2025-03-24 18:03:52.926122
2,3,"Scott, Robinson and Gibson",Healthcare,2207719,501,"Development worker, community",2025-03-24 18:03:52.926122
3,4,Ward LLC,Technology,7702495,927,Dealer,2025-03-24 18:03:52.926122
4,5,Rodriguez and Sons,Education,799438,608,Mudlogger,2025-03-24 18:03:52.926122
...,...,...,...,...,...,...,...
495,496,"Macias, Porter and Morgan",Technology,3350133,506,Forest/woodland manager,2025-03-24 18:03:53.063634
496,497,Graham Inc,Healthcare,2860620,53,Theatre director,2025-03-24 18:03:53.063634
497,498,"Huffman, Allen and Harrington",Education,3499584,837,Futures trader,2025-03-24 18:03:53.064634
498,499,Bruce Group,Energy,9520626,463,"Engineer, materials",2025-03-24 18:03:53.064634


#### Customer Preferences Data

In [12]:
# Sheet 5: Customer Preferences
preference_data = []
for i in range(1, num_rows + 1):
    preference_data.append({
        'customer_id': i,
        'preference_category': np.random.choice(['Electronics', 'Books', 'Fitness', 'Fashion', 'Groceries']),
        'preferred_brands': ', '.join(fake.words(nb=3)),
        'preferred_price_range': f"{np.random.randint(10, 500)}-{np.random.randint(500, 1000)}",
        'timestamp': pd.Timestamp.now()
    })

df_preference = pd.DataFrame(preference_data)

In [13]:
df_preference

Unnamed: 0,customer_id,preference_category,preferred_brands,preferred_price_range,timestamp
0,1,Fitness,"Congress, relationship, compare",291-969,2025-03-24 18:03:53.098483
1,2,Groceries,"poor, lot, blood",237-875,2025-03-24 18:03:53.098483
2,3,Books,"avoid, let, he",184-607,2025-03-24 18:03:53.098483
3,4,Fitness,"thought, international, building",80-903,2025-03-24 18:03:53.098483
4,5,Books,"man, institution, arm",104-778,2025-03-24 18:03:53.098483
...,...,...,...,...,...
495,496,Groceries,"right, character, campaign",47-854,2025-03-24 18:03:53.126481
496,497,Fashion,"last, level, data",240-977,2025-03-24 18:03:53.126481
497,498,Fashion,"away, though, media",188-856,2025-03-24 18:03:53.126481
498,499,Fitness,"whether, western, I",195-940,2025-03-24 18:03:53.126481


#### Save data to CSV file

In [15]:
# Save to CSV
file_path='SourceData'
if path.exists(file_path):
    if not os.path.exists(file_path):
        df_customer.to_csv('SourceData/customer_demographics.csv', index=False)
    else:
        print('customer_demographics.csv exists')
    if not os.path.exists(file_path):
        df_transaction.to_csv('SourceData/transaction_history.csv', index=False)
    else:
        print('transaction_history.csv exists')
    if not os.path.exists(file_path):
        df_social_media.to_csv('SourceData/social_media_sentiments.csv', index=False)
    else:
        print('social_media_sentiments.csv exists')
    if not os.path.exists(file_path):
        df_org.to_csv('SourceData/organizational_info.csv', index=False)
    else:
        print('organizational_info.csv exists')
    if not os.path.exists(file_path):
        df_preference.to_csv('SourceData/customer_preferences.csv', index=False)
    else:
        print('customer_preferences.csv exists')

customer_demographics.csv exists
transaction_history.csv exists
social_media_sentiments.csv exists
organizational_info.csv exists
customer_preferences.csv exists
