#### Data Preprocessing & Creating a Common Data Source

##### Importing necessary libraries

In [1]:
import pandas as pd
import numpy as np
import os

#### 1. Loading Dataset

In [2]:
# Load datasets
if os.path.isfile('SourceData/customer_demographics.csv'):   
    df_customer = pd.read_csv('SourceData/customer_demographics.csv')
else:
    print("Error: customer_demographics.csv file not found")
    exit(1)
if os.path.isfile('SourceData/transaction_history.csv'): 
    df_transaction = pd.read_csv('SourceData/transaction_history.csv')
else:
    print("Error: transaction_history.csv file not found")
    exit(1)
if os.path.isfile('SourceData/social_media_sentiments.csv'): 
    df_social = pd.read_csv('SourceData/social_media_sentiments.csv')
else:
    print("Error: social_media_sentiments.csv file not found")
    exit(1)
if os.path.isfile('SourceData/organizational_info.csv'): 
    df_org = pd.read_csv('SourceData/organizational_info.csv')
else:
    print("Error: organizational_info.csv file not found")
    exit(1)
if os.path.isfile('SourceData/customer_preferences.csv'): 
    df_pref = pd.read_csv('SourceData/customer_preferences.csv')
else:
    print("Error: customer_preferences.csv file not found")
    exit(1)

##### Checking  missing values

In [3]:
# Check missing values
print("Missing Values:")
print(df_customer.isnull().sum())
print(df_transaction.isnull().sum())
print(df_social.isnull().sum())
print(df_org.isnull().sum())
print(df_pref.isnull().sum())

Missing Values:
customer_id     0
name            0
age             0
gender          0
email           0
city            0
education       0
occupation      0
income_level    0
address         0
dtype: int64
transaction_id      0
customer_id         0
transaction_type    0
category            0
amount              0
purchase_mode       0
purchase_date       0
dtype: int64
customer_id        0
platform           0
post_text          0
sentiment_score    0
intent             0
dtype: int64
customer_id          0
organization_name    0
industry             0
revenue              0
no_of_employees      0
customer_role        0
timestamp            0
dtype: int64
customer_id              0
preference_category      0
preferred_brands         0
preferred_price_range    0
timestamp                0
dtype: int64


There are no null values in the input data

##### Check Duplicates

In [4]:
# Check duplicates
print("\nDuplicate Rows:")
print(df_customer.duplicated().sum())
print(df_transaction.duplicated().sum())
print(df_social.duplicated().sum())
print(df_org.duplicated().sum())
print(df_pref.duplicated().sum())


Duplicate Rows:
0
0
0
0
0


There are no duplicates in the source data

#### 2. Data Cleaning
Handling Missing Values
If missing values are found, we can:

Drop rows with missing critical data (df.dropna()).

Fill missing numerical values with mean/median (df.fillna()).

Fill missing categorical values with mode or "Unknown."

In [None]:
#df_transaction.drop_duplicates(inplace=True)

##### Standardizing Categorical Data
Convert gender to lowercase.

Standardize education levels.

In [5]:
df_customer

Unnamed: 0,customer_id,name,age,gender,email,city,education,occupation,income_level,address
0,1,Allison Hill,56,Male,donaldgarcia@example.net,New Roberttown,Master's Degree,Chief Financial Officer,High,"386 Shane Harbors\nPort Lindachester, MA 36922"
1,2,Tyler Rogers,25,Male,jamesmichael@example.com,Lindsaymouth,High School,Software engineer,High,"84959 Janet Cape Apt. 413\nSouth Joshuastad, G..."
2,3,Michael Miles,36,Non-Binary,lynchgeorge@example.net,East Steven,Master's Degree,Geophysicist/field seismologist,High,Unit 8350 Box 3056\nDPO AA 09176
3,4,Tommy Walter,41,Male,jason76@example.net,Thomasberg,PhD,Fine artist,High,"969 Cox Dam Suite 101\nLake Ernest, TX 55834"
4,5,Janice Carlson,39,Male,jrice@example.org,Lake Nicoleview,Bachelor's Degree,"Librarian, public",Medium,"70482 Monica Hills Apt. 252\nNew Mariotown, DE..."
...,...,...,...,...,...,...,...,...,...,...
495,496,Kristine Schmidt,24,Male,mcbridemichael@example.org,West Erik,Bachelor's Degree,Diagnostic radiographer,Low,"289 Garrison Harbors\nSouth Kennethhaven, MT 6..."
496,497,Chad Hurley,55,Male,jonesjeanette@example.net,Jasonland,PhD,Barrister's clerk,High,"72362 Myers Fields\nPort Michael, IN 36705"
497,498,James Barber,39,Female,jamiethomas@example.com,Alexville,High School,"Scientist, research (life sciences)",Low,"973 Evans Crossing\nHernandezmouth, FM 36166"
498,499,Amanda Massey,19,Non-Binary,daniel76@example.net,Lake Heather,PhD,"Radiographer, therapeutic",Low,"119 Allen Vista\nCopelandchester, MT 07926"


In [6]:
df_customer['gender'] = df_customer['gender'].str.lower()
df_customer['education'] = df_customer['education'].replace({
    "Bachelor's Degree": "Bachelors",
    "Master's Degree": "Masters",
    "High School": "HighSchool"
})

In [7]:
df_customer

Unnamed: 0,customer_id,name,age,gender,email,city,education,occupation,income_level,address
0,1,Allison Hill,56,male,donaldgarcia@example.net,New Roberttown,Masters,Chief Financial Officer,High,"386 Shane Harbors\nPort Lindachester, MA 36922"
1,2,Tyler Rogers,25,male,jamesmichael@example.com,Lindsaymouth,HighSchool,Software engineer,High,"84959 Janet Cape Apt. 413\nSouth Joshuastad, G..."
2,3,Michael Miles,36,non-binary,lynchgeorge@example.net,East Steven,Masters,Geophysicist/field seismologist,High,Unit 8350 Box 3056\nDPO AA 09176
3,4,Tommy Walter,41,male,jason76@example.net,Thomasberg,PhD,Fine artist,High,"969 Cox Dam Suite 101\nLake Ernest, TX 55834"
4,5,Janice Carlson,39,male,jrice@example.org,Lake Nicoleview,Bachelors,"Librarian, public",Medium,"70482 Monica Hills Apt. 252\nNew Mariotown, DE..."
...,...,...,...,...,...,...,...,...,...,...
495,496,Kristine Schmidt,24,male,mcbridemichael@example.org,West Erik,Bachelors,Diagnostic radiographer,Low,"289 Garrison Harbors\nSouth Kennethhaven, MT 6..."
496,497,Chad Hurley,55,male,jonesjeanette@example.net,Jasonland,PhD,Barrister's clerk,High,"72362 Myers Fields\nPort Michael, IN 36705"
497,498,James Barber,39,female,jamiethomas@example.com,Alexville,HighSchool,"Scientist, research (life sciences)",Low,"973 Evans Crossing\nHernandezmouth, FM 36166"
498,499,Amanda Massey,19,non-binary,daniel76@example.net,Lake Heather,PhD,"Radiographer, therapeutic",Low,"119 Allen Vista\nCopelandchester, MT 07926"


#### 3. Feature Engineering
Customer Lifetime Value (CLV) :
- Calculate total spend per customer

In [8]:
df_transaction

Unnamed: 0,transaction_id,customer_id,transaction_type,category,amount,purchase_mode,purchase_date
0,14129,1,In-store,Books,310,Credit Card,2024-11-07
1,84237,1,Online,Fashion,111,Credit Card,2024-04-30
2,75448,2,Online,Books,97,Credit Card,2024-04-28
3,15397,2,Online,Fashion,898,Debit Card,2025-01-13
4,31275,2,In-store,Fashion,669,Credit Card,2024-08-10
...,...,...,...,...,...,...,...
2518,37316,499,Online,Fitness,96,Debit Card,2025-02-07
2519,45178,500,In-store,Fashion,842,Debit Card,2024-08-28
2520,86977,500,Online,Books,124,Debit Card,2024-05-07
2521,18672,500,Online,Fitness,700,Cash,2025-03-08


In [9]:
clv = df_transaction.groupby('customer_id')['amount'].sum().reset_index()
clv.columns = ['customer_id', 'total_spend']
df_customer = pd.merge(df_customer, clv, on='customer_id', how='left')

In [8]:
df_customer 

Unnamed: 0,customer_id,name,age,gender,email,city,education,occupation,income_level,address,total_spend
0,1,Allison Hill,56,male,donaldgarcia@example.net,New Roberttown,Masters,Chief Financial Officer,High,"386 Shane Harbors\nPort Lindachester, MA 36922",421
1,2,Tyler Rogers,25,male,jamesmichael@example.com,Lindsaymouth,HighSchool,Software engineer,High,"84959 Janet Cape Apt. 413\nSouth Joshuastad, G...",4217
2,3,Michael Miles,36,non-binary,lynchgeorge@example.net,East Steven,Masters,Geophysicist/field seismologist,High,Unit 8350 Box 3056\nDPO AA 09176,3750
3,4,Tommy Walter,41,male,jason76@example.net,Thomasberg,PhD,Fine artist,High,"969 Cox Dam Suite 101\nLake Ernest, TX 55834",4343
4,5,Janice Carlson,39,male,jrice@example.org,Lake Nicoleview,Bachelors,"Librarian, public",Medium,"70482 Monica Hills Apt. 252\nNew Mariotown, DE...",4424
...,...,...,...,...,...,...,...,...,...,...,...
495,496,Kristine Schmidt,24,male,mcbridemichael@example.org,West Erik,Bachelors,Diagnostic radiographer,Low,"289 Garrison Harbors\nSouth Kennethhaven, MT 6...",2215
496,497,Chad Hurley,55,male,jonesjeanette@example.net,Jasonland,PhD,Barrister's clerk,High,"72362 Myers Fields\nPort Michael, IN 36705",1376
497,498,James Barber,39,female,jamiethomas@example.com,Alexville,HighSchool,"Scientist, research (life sciences)",Low,"973 Evans Crossing\nHernandezmouth, FM 36166",5190
498,499,Amanda Massey,19,non-binary,daniel76@example.net,Lake Heather,PhD,"Radiographer, therapeutic",Low,"119 Allen Vista\nCopelandchester, MT 07926",4190


Purchase Frequency:
- Count transactions per customer.

In [10]:
purchase_freq = df_transaction.groupby('customer_id')['transaction_id'].count().reset_index()
purchase_freq.columns = ['customer_id', 'purchase_frequency']
df_customer = pd.merge(df_customer, purchase_freq, on='customer_id', how='left')

In [11]:
df_customer

Unnamed: 0,customer_id,name,age,gender,email,city,education,occupation,income_level,address,total_spend,purchase_frequency
0,1,Allison Hill,56,male,donaldgarcia@example.net,New Roberttown,Masters,Chief Financial Officer,High,"386 Shane Harbors\nPort Lindachester, MA 36922",421,2
1,2,Tyler Rogers,25,male,jamesmichael@example.com,Lindsaymouth,HighSchool,Software engineer,High,"84959 Janet Cape Apt. 413\nSouth Joshuastad, G...",4217,8
2,3,Michael Miles,36,non-binary,lynchgeorge@example.net,East Steven,Masters,Geophysicist/field seismologist,High,Unit 8350 Box 3056\nDPO AA 09176,3750,7
3,4,Tommy Walter,41,male,jason76@example.net,Thomasberg,PhD,Fine artist,High,"969 Cox Dam Suite 101\nLake Ernest, TX 55834",4343,6
4,5,Janice Carlson,39,male,jrice@example.org,Lake Nicoleview,Bachelors,"Librarian, public",Medium,"70482 Monica Hills Apt. 252\nNew Mariotown, DE...",4424,7
...,...,...,...,...,...,...,...,...,...,...,...,...
495,496,Kristine Schmidt,24,male,mcbridemichael@example.org,West Erik,Bachelors,Diagnostic radiographer,Low,"289 Garrison Harbors\nSouth Kennethhaven, MT 6...",2215,4
496,497,Chad Hurley,55,male,jonesjeanette@example.net,Jasonland,PhD,Barrister's clerk,High,"72362 Myers Fields\nPort Michael, IN 36705",1376,3
497,498,James Barber,39,female,jamiethomas@example.com,Alexville,HighSchool,"Scientist, research (life sciences)",Low,"973 Evans Crossing\nHernandezmouth, FM 36166",5190,8
498,499,Amanda Massey,19,non-binary,daniel76@example.net,Lake Heather,PhD,"Radiographer, therapeutic",Low,"119 Allen Vista\nCopelandchester, MT 07926",4190,8


In [12]:
avg_sentiment = df_social.groupby('customer_id')['sentiment_score'].mean().reset_index()
avg_sentiment.columns = ['customer_id', 'avg_sentiment']
df_customer = pd.merge(df_customer, avg_sentiment, on='customer_id', how='left')

In [13]:
df_customer

Unnamed: 0,customer_id,name,age,gender,email,city,education,occupation,income_level,address,total_spend,purchase_frequency,avg_sentiment
0,1,Allison Hill,56,male,donaldgarcia@example.net,New Roberttown,Masters,Chief Financial Officer,High,"386 Shane Harbors\nPort Lindachester, MA 36922",421,2,0.66
1,2,Tyler Rogers,25,male,jamesmichael@example.com,Lindsaymouth,HighSchool,Software engineer,High,"84959 Janet Cape Apt. 413\nSouth Joshuastad, G...",4217,8,0.71
2,3,Michael Miles,36,non-binary,lynchgeorge@example.net,East Steven,Masters,Geophysicist/field seismologist,High,Unit 8350 Box 3056\nDPO AA 09176,3750,7,0.15
3,4,Tommy Walter,41,male,jason76@example.net,Thomasberg,PhD,Fine artist,High,"969 Cox Dam Suite 101\nLake Ernest, TX 55834",4343,6,0.63
4,5,Janice Carlson,39,male,jrice@example.org,Lake Nicoleview,Bachelors,"Librarian, public",Medium,"70482 Monica Hills Apt. 252\nNew Mariotown, DE...",4424,7,0.49
...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,496,Kristine Schmidt,24,male,mcbridemichael@example.org,West Erik,Bachelors,Diagnostic radiographer,Low,"289 Garrison Harbors\nSouth Kennethhaven, MT 6...",2215,4,0.69
496,497,Chad Hurley,55,male,jonesjeanette@example.net,Jasonland,PhD,Barrister's clerk,High,"72362 Myers Fields\nPort Michael, IN 36705",1376,3,0.42
497,498,James Barber,39,female,jamiethomas@example.com,Alexville,HighSchool,"Scientist, research (life sciences)",Low,"973 Evans Crossing\nHernandezmouth, FM 36166",5190,8,0.51
498,499,Amanda Massey,19,non-binary,daniel76@example.net,Lake Heather,PhD,"Radiographer, therapeutic",Low,"119 Allen Vista\nCopelandchester, MT 07926",4190,8,0.48


In [14]:
df_social

Unnamed: 0,customer_id,platform,post_text,sentiment_score,intent
0,1,Twitter,Security economic newspaper consider again tax.,0.66,Engagement
1,2,Twitter,Certain heart heavy fly.,0.71,Brand Awareness
2,3,Facebook,Opportunity position sure court crime.,0.15,Purchase Intent
3,4,LinkedIn,Realize staff dark.,0.63,Brand Awareness
4,5,Twitter,Family kind toward adult maybe.,0.49,Engagement
...,...,...,...,...,...
495,496,Twitter,Her benefit Mr network night garden.,0.69,Brand Awareness
496,497,Facebook,Drive probably born prevent allow yes.,0.42,Engagement
497,498,Instagram,Chair finish over like.,0.51,Brand Awareness
498,499,Instagram,Live weight guy.,0.48,Brand Awareness


#### Enhanced Social Media Data Processing

In [14]:
df_social

Unnamed: 0,customer_id,platform,post_text,sentiment_score,intent
0,1,Twitter,Security economic newspaper consider again tax.,0.66,Engagement
1,2,Twitter,Certain heart heavy fly.,0.71,Brand Awareness
2,3,Facebook,Opportunity position sure court crime.,0.15,Purchase Intent
3,4,LinkedIn,Realize staff dark.,0.63,Brand Awareness
4,5,Twitter,Family kind toward adult maybe.,0.49,Engagement
...,...,...,...,...,...
495,496,Twitter,Her benefit Mr network night garden.,0.69,Brand Awareness
496,497,Facebook,Drive probably born prevent allow yes.,0.42,Engagement
497,498,Instagram,Chair finish over like.,0.51,Brand Awareness
498,499,Instagram,Live weight guy.,0.48,Brand Awareness


In [15]:
# Calculate sentiment polarity (Positive/Negative/Neutral)
df_social['sentiment_polarity'] = np.where(
    df_social['sentiment_score'] > 0.7, 'Positive',
    np.where(df_social['sentiment_score'] < 0.3, 'Negative', 'Neutral'))
    
# Extract keywords from posts (simplified example)
df_social['keywords'] = df_social['post_text'].str.extract(r'#(\w+)')[0]

# Count social media activity per customer
social_activity = df_social.groupby('customer_id').agg(
    total_posts=('post_text', 'count'),
    avg_sentiment=('sentiment_score', 'mean'),
    dominant_intent=('intent', lambda x: x.mode()[0])
).reset_index()

#### Sentiment-Based Interest Mapping

In [17]:

# Define our interest mapping more robustly
interest_map = {
    'tech': 'Electronics',
    'food': 'Groceries',
    'fit': 'Fitness',
    'fashion': 'Fashion',
    'book': 'Books',
    'garden': 'Gardening',
    'travel': 'Travel'
}

# Function to categorize posts
def categorize_post(text):
    text = str(text).lower()
    for keyword, category in interest_map.items():
        if keyword in text:
            return category
    return 'Other'

# Apply categorization to social media data
df_social['inferred_category'] = df_social['post_text'].apply(categorize_post)

In [18]:
df_social['inferred_category'].unique

<bound method Series.unique of 0        Other
1        Other
2        Other
3        Other
4        Other
        ...   
495    Fitness
496      Other
497      Other
498      Other
499      Other
Name: inferred_category, Length: 500, dtype: object>

Preferred Category (One-Hot Encoding)
- Convert preference_category into binary columns.

In [19]:
pref_dummies = pd.get_dummies(df_pref['preference_category'], prefix='pref', dtype=int)
df_pref = pd.concat([df_pref, pref_dummies], axis=1)

In [21]:
# Create one-hot encoded columns for each category
social_interests = pd.get_dummies(df_social['inferred_category'], prefix='inferred_category', dtype=int)

In [22]:
social_interests

Unnamed: 0,inferred_category_Books,inferred_category_Electronics,inferred_category_Fitness,inferred_category_Gardening,inferred_category_Groceries,inferred_category_Other,inferred_category_Travel
0,0,0,0,0,0,1,0
1,0,0,0,0,0,1,0
2,0,0,0,0,0,1,0
3,0,0,0,0,0,1,0
4,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...
495,0,0,1,0,0,0,0
496,0,0,0,0,0,1,0
497,0,0,0,0,0,1,0
498,0,0,0,0,0,1,0


In [23]:
# Sum by customer
social_interests = df_social.join(social_interests).groupby('customer_id').sum()


In [24]:
social_interests

Unnamed: 0_level_0,platform,post_text,sentiment_score,intent,sentiment_polarity,keywords,inferred_category,inferred_category_Books,inferred_category_Electronics,inferred_category_Fitness,inferred_category_Gardening,inferred_category_Groceries,inferred_category_Other,inferred_category_Travel
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1,Twitter,Security economic newspaper consider again tax.,0.66,Engagement,Neutral,0,Other,0,0,0,0,0,1,0
2,Twitter,Certain heart heavy fly.,0.71,Brand Awareness,Positive,0,Other,0,0,0,0,0,1,0
3,Facebook,Opportunity position sure court crime.,0.15,Purchase Intent,Negative,0,Other,0,0,0,0,0,1,0
4,LinkedIn,Realize staff dark.,0.63,Brand Awareness,Neutral,0,Other,0,0,0,0,0,1,0
5,Twitter,Family kind toward adult maybe.,0.49,Engagement,Neutral,0,Other,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
496,Twitter,Her benefit Mr network night garden.,0.69,Brand Awareness,Neutral,0,Fitness,0,0,1,0,0,0,0
497,Facebook,Drive probably born prevent allow yes.,0.42,Engagement,Neutral,0,Other,0,0,0,0,0,1,0
498,Instagram,Chair finish over like.,0.51,Brand Awareness,Neutral,0,Other,0,0,0,0,0,1,0
499,Instagram,Live weight guy.,0.48,Brand Awareness,Neutral,0,Other,0,0,0,0,0,1,0


#### 4. Merging Social Data with Main Dataset

In [25]:
# Merge social activity metrics
df_merged = pd.merge(df_customer, social_activity, on='customer_id', how='left')


In [29]:
# Merge inferred interests from social media
#social_interests = df_social.groupby(['customer_id', 'inferred_interest']).size().unstack(fill_value=0)
#social_interests = df_social.groupby(['customer_id', 'inferred_interest'])
df_merged = pd.merge(df_merged, social_interests, on='customer_id', how='left')

In [32]:

# Fill NA values for social columns
#social_cols = ['total_posts', 'avg_sentiment', 'dominant_intent'] + list(interest_map.values())
#df_merged[social_cols] = df_merged[social_cols].fillna(0)
df_merged.columns

Index(['customer_id', 'name', 'age', 'gender', 'email', 'city', 'education',
       'occupation', 'income_level', 'address', 'total_spend',
       'purchase_frequency', 'avg_sentiment_x', 'total_posts',
       'avg_sentiment_y', 'dominant_intent', 'platform', 'post_text',
       'sentiment_score', 'intent', 'sentiment_polarity', 'keywords',
       'inferred_category', 'inferred_category_Books',
       'inferred_category_Electronics', 'inferred_category_Fitness',
       'inferred_category_Gardening', 'inferred_category_Groceries',
       'inferred_category_Other', 'inferred_category_Travel'],
      dtype='object')

#### 5. Merging All Data into a Single Dataset

In [33]:
# Merge customer + org data
df_merged = pd.merge(df_merged, df_org, on='customer_id', how='left')

# Merge with preferences
df_merged = pd.merge(df_merged, df_pref, on='customer_id', how='left')

# Add transaction aggregations (already merged earlier)
df_merged.head()

Unnamed: 0,customer_id,name,age,gender,email,city,education,occupation,income_level,address,...,timestamp_x,preference_category,preferred_brands,preferred_price_range,timestamp_y,pref_Books,pref_Electronics,pref_Fashion,pref_Fitness,pref_Groceries
0,1,Allison Hill,56,male,donaldgarcia@example.net,New Roberttown,Masters,Chief Financial Officer,High,"386 Shane Harbors\nPort Lindachester, MA 36922",...,2025-03-24 17:36:21.764912,Books,"majority, maintain, page",139-558,2025-03-24 17:37:52.181363,1,0,0,0,0
1,2,Tyler Rogers,25,male,jamesmichael@example.com,Lindsaymouth,HighSchool,Software engineer,High,"84959 Janet Cape Apt. 413\nSouth Joshuastad, G...",...,2025-03-24 17:36:21.802048,Electronics,"able, position, contain",249-873,2025-03-24 17:37:52.181363,0,1,0,0,0
2,3,Michael Miles,36,non-binary,lynchgeorge@example.net,East Steven,Masters,Geophysicist/field seismologist,High,Unit 8350 Box 3056\nDPO AA 09176,...,2025-03-24 17:36:21.804136,Groceries,"ok, game, between",302-786,2025-03-24 17:37:52.181363,0,0,0,0,1
3,4,Tommy Walter,41,male,jason76@example.net,Thomasberg,PhD,Fine artist,High,"969 Cox Dam Suite 101\nLake Ernest, TX 55834",...,2025-03-24 17:36:21.805081,Fashion,"certainly, success, heavy",108-849,2025-03-24 17:37:52.181363,0,0,1,0,0
4,5,Janice Carlson,39,male,jrice@example.org,Lake Nicoleview,Bachelors,"Librarian, public",Medium,"70482 Monica Hills Apt. 252\nNew Mariotown, DE...",...,2025-03-24 17:36:21.805081,Groceries,"responsibility, exactly, man",281-519,2025-03-24 17:37:52.182325,0,0,0,0,1


#### 6. Exporting the Final Dataset

In [34]:
from os import path
import os

file_path='SourceData'
if path.exists(file_path):
    df_merged.to_csv('SourceData/final_customer_dataset.csv', index=False)
else:
    print('final_customer_dataset.csv exists')

In [35]:
df_merged

Unnamed: 0,customer_id,name,age,gender,email,city,education,occupation,income_level,address,...,timestamp_x,preference_category,preferred_brands,preferred_price_range,timestamp_y,pref_Books,pref_Electronics,pref_Fashion,pref_Fitness,pref_Groceries
0,1,Allison Hill,56,male,donaldgarcia@example.net,New Roberttown,Masters,Chief Financial Officer,High,"386 Shane Harbors\nPort Lindachester, MA 36922",...,2025-03-24 17:36:21.764912,Books,"majority, maintain, page",139-558,2025-03-24 17:37:52.181363,1,0,0,0,0
1,2,Tyler Rogers,25,male,jamesmichael@example.com,Lindsaymouth,HighSchool,Software engineer,High,"84959 Janet Cape Apt. 413\nSouth Joshuastad, G...",...,2025-03-24 17:36:21.802048,Electronics,"able, position, contain",249-873,2025-03-24 17:37:52.181363,0,1,0,0,0
2,3,Michael Miles,36,non-binary,lynchgeorge@example.net,East Steven,Masters,Geophysicist/field seismologist,High,Unit 8350 Box 3056\nDPO AA 09176,...,2025-03-24 17:36:21.804136,Groceries,"ok, game, between",302-786,2025-03-24 17:37:52.181363,0,0,0,0,1
3,4,Tommy Walter,41,male,jason76@example.net,Thomasberg,PhD,Fine artist,High,"969 Cox Dam Suite 101\nLake Ernest, TX 55834",...,2025-03-24 17:36:21.805081,Fashion,"certainly, success, heavy",108-849,2025-03-24 17:37:52.181363,0,0,1,0,0
4,5,Janice Carlson,39,male,jrice@example.org,Lake Nicoleview,Bachelors,"Librarian, public",Medium,"70482 Monica Hills Apt. 252\nNew Mariotown, DE...",...,2025-03-24 17:36:21.805081,Groceries,"responsibility, exactly, man",281-519,2025-03-24 17:37:52.182325,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,496,Kristine Schmidt,24,male,mcbridemichael@example.org,West Erik,Bachelors,Diagnostic radiographer,Low,"289 Garrison Harbors\nSouth Kennethhaven, MT 6...",...,2025-03-24 17:36:21.956271,Books,"star, doctor, our",49-588,2025-03-24 17:37:52.208343,1,0,0,0,0
496,497,Chad Hurley,55,male,jonesjeanette@example.net,Jasonland,PhD,Barrister's clerk,High,"72362 Myers Fields\nPort Michael, IN 36705",...,2025-03-24 17:36:21.956271,Groceries,"only, serve, buy",459-917,2025-03-24 17:37:52.208343,0,0,0,0,1
497,498,James Barber,39,female,jamiethomas@example.com,Alexville,HighSchool,"Scientist, research (life sciences)",Low,"973 Evans Crossing\nHernandezmouth, FM 36166",...,2025-03-24 17:36:21.957266,Books,"loss, everything, run",360-508,2025-03-24 17:37:52.208343,1,0,0,0,0
498,499,Amanda Massey,19,non-binary,daniel76@example.net,Lake Heather,PhD,"Radiographer, therapeutic",Low,"119 Allen Vista\nCopelandchester, MT 07926",...,2025-03-24 17:36:21.957266,Fashion,"page, child, treat",30-976,2025-03-24 17:37:52.208343,0,0,1,0,0


In [36]:
df_merged.columns

Index(['customer_id', 'name', 'age', 'gender', 'email', 'city', 'education',
       'occupation', 'income_level', 'address', 'total_spend',
       'purchase_frequency', 'avg_sentiment_x', 'total_posts',
       'avg_sentiment_y', 'dominant_intent', 'platform', 'post_text',
       'sentiment_score', 'intent', 'sentiment_polarity', 'keywords',
       'inferred_category', 'inferred_category_Books',
       'inferred_category_Electronics', 'inferred_category_Fitness',
       'inferred_category_Gardening', 'inferred_category_Groceries',
       'inferred_category_Other', 'inferred_category_Travel',
       'organization_name', 'industry', 'revenue', 'no_of_employees',
       'customer_role', 'timestamp_x', 'preference_category',
       'preferred_brands', 'preferred_price_range', 'timestamp_y',
       'pref_Books', 'pref_Electronics', 'pref_Fashion', 'pref_Fitness',
       'pref_Groceries'],
      dtype='object')

In [1]:
df_merged.shape

NameError: name 'df_merged' is not defined