# Integrated Data Analysis Demo

This notebook demonstrates a complete workflow using pandas, matplotlib, and seaborn.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

# Create the output directories for saved plots and data.
# exist_ok=True makes the call a no-op when the directory already exists, so
# the cell is safe to re-run and avoids the check-then-create race of
# os.path.exists() followed by os.makedirs().
for output_dir in ('plots', 'data'):
    os.makedirs(output_dir, exist_ok=True)

# Set the seaborn style for all plots
sns.set_theme(style="whitegrid")

## Step 1: Generate a synthetic dataset (simulating real-world data)

In [None]:
# Seed NumPy's global generator so every run draws the same synthetic customers
np.random.seed(42)

# Size of the synthetic customer base
n_samples = 200

# Allowed values for the two categorical columns
gender_options = ['Male', 'Female']
category_options = ['A', 'B', 'C', 'D', 'E']

# Build the columns one at a time. The order of the random draws matters:
# each call consumes the seeded global stream, so reordering these lines
# would change the generated values.
data = {}

# Customer demographics
data['customer_id'] = range(1, n_samples + 1)
data['age'] = np.random.randint(low=18, high=80, size=n_samples)  # high is exclusive -> 18..79
data['gender'] = np.random.choice(gender_options, size=n_samples)
data['income'] = np.random.normal(loc=50000, scale=15000, size=n_samples)

# Purchase behavior
data['purchase_frequency'] = np.random.randint(low=1, high=30, size=n_samples)
data['avg_purchase_value'] = np.random.normal(loc=100, scale=50, size=n_samples)

# Product preferences (categories A through E)
data['product_category'] = np.random.choice(category_options, size=n_samples)

# Customer satisfaction on a 1-10 scale (high=11 is exclusive)
data['satisfaction_score'] = np.random.randint(low=1, high=11, size=n_samples)

# Customer tenure (in months)
data['tenure_months'] = np.random.randint(low=1, high=60, size=n_samples)

# Assemble the DataFrame; column order follows insertion order above
df = pd.DataFrame(data)

In [None]:
# Inject relationships between columns so the later plots have structure.
# NOTE: this cell modifies df in place and draws from the global random
# stream, so running it twice stacks the adjustments a second time.

# Purchase value: add a small income-proportional term plus Gaussian noise
income_effect = df['income'] * 0.0003
purchase_noise = np.random.normal(0, 20, n_samples)
df['avg_purchase_value'] = df['avg_purchase_value'] + income_effect + purchase_noise

# Satisfaction: add a tenure-proportional term plus noise, then force the
# result back onto the integer 1-10 scale
tenure_effect = df['tenure_months'] * 0.03
satisfaction_noise = np.random.normal(0, 1, n_samples)
raw_satisfaction = df['satisfaction_score'] + tenure_effect + satisfaction_noise
df['satisfaction_score'] = raw_satisfaction.clip(1, 10).round().astype(int)

# Derived field: total spend = number of purchases x average purchase value
df['total_spend'] = df['purchase_frequency'] * df['avg_purchase_value']

# Blank out a random subset of rows (sampled without replacement) in each of
# these columns to simulate real-world gaps: 20 incomes, then 15 scores
for column, n_missing in (('income', 20), ('satisfaction_score', 15)):
    indices = np.random.choice(n_samples, n_missing, replace=False)
    df.loc[indices, column] = np.nan

In [None]:
# Save the raw data (still containing the missing values) before any cleaning
df.to_csv('data/customer_data_raw.csv', index=False)
print(f"Raw data saved to 'data/customer_data_raw.csv' ({df.shape[0]} rows, {df.shape[1]} columns)")

# Display the first few rows.
# The cell previously called df.tail(3), which shows the LAST three rows and
# contradicted this comment; head() gives the preview the comment describes.
df.head()

## Step 2: Data Exploration and Cleaning

In [None]:
# Print the frame's dimensions, then the dtype of every column
print("Dataset Overview:", f"Shape: {df.shape}", sep="\n")
print("\nData Types:", df.dtypes, sep="\n")

In [None]:
# Label the output, then let the notebook render the describe() table
# (count, mean, std, min, quartiles, max for the numeric columns)
print("Summary Statistics:")
summary_stats = df.describe()
summary_stats

In [None]:
# Count the nulls in each column and show only the columns that have any
print("Missing Values:")
missing_values = df.isna().sum()
missing_values.loc[missing_values.gt(0)]

In [None]:
# Impute the missing values with each column's median
print("Cleaning the data...")

# Medians are computed from the non-null values (pandas skips NaN by default)
median_income = df['income'].median()
median_satisfaction = df['satisfaction_score'].median()

# Income: plain median fill
df['income'] = df['income'].fillna(median_income)

# Satisfaction: median fill, then round and cast back to int, since the
# column holds floats once it contains NaN
df['satisfaction_score'] = (
    df['satisfaction_score']
    .fillna(median_satisfaction)
    .round()
    .astype(int)
)

# Confirm that nothing is left unfilled
print("\nRemaining Missing Values:")
missing_values = df.isna().sum()
still_missing = missing_values[missing_values > 0]
if still_missing.empty:
    print("No missing values remaining")
else:
    print(still_missing)

In [None]:
# Persist the imputed frame next to the raw export (no index column written)
cleaned_csv_path = 'data/customer_data_cleaned.csv'
df.to_csv(cleaned_csv_path, index=False)
print("Cleaned data saved to 'data/customer_data_cleaned.csv'")

## Step 3: Exploratory Data Analysis with Visualizations

### 1. Distribution of Age

In [None]:
# Histogram of customer ages (20 bins) with a kernel density estimate overlaid
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(df['age'], kde=True, bins=20, ax=ax)
ax.set_title('Distribution of Customer Age', fontsize=14)
ax.set_xlabel('Age', fontsize=12)
ax.set_ylabel('Count', fontsize=12)
fig.savefig('plots/age_distribution.png', dpi=300, bbox_inches='tight')
plt.show()

### 2. Gender Distribution

In [None]:
# Bar chart of how many customers fall into each gender
gender_counts = df['gender'].value_counts()

fig, ax = plt.subplots(figsize=(8, 6))
sns.barplot(x=gender_counts.index, y=gender_counts.values, ax=ax)
ax.set_title('Gender Distribution', fontsize=14)
ax.set_xlabel('Gender', fontsize=12)
ax.set_ylabel('Count', fontsize=12)
fig.savefig('plots/gender_distribution.png', dpi=300, bbox_inches='tight')
plt.show()

### 3. Income Distribution by Gender

In [None]:
# Box plot comparing the income distribution of each gender
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=df, x='gender', y='income', ax=ax)
ax.set_title('Income Distribution by Gender', fontsize=14)
ax.set_xlabel('Gender', fontsize=12)
ax.set_ylabel('Income', fontsize=12)
fig.savefig('plots/income_by_gender.png', dpi=300, bbox_inches='tight')
plt.show()

### 4. Satisfaction Score Distribution

In [None]:
# Count of customers at each satisfaction score; the explicit order keeps all
# ten scores on the axis even when a score has no customers
score_levels = range(1, 11)

fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(data=df, x='satisfaction_score', order=score_levels, ax=ax)
ax.set_title('Distribution of Satisfaction Scores', fontsize=14)
ax.set_xlabel('Satisfaction Score', fontsize=12)
ax.set_ylabel('Count', fontsize=12)
fig.savefig('plots/satisfaction_distribution.png', dpi=300, bbox_inches='tight')
plt.show()

### 5. Relationship between Income and Purchase Value

In [None]:
# Scatter of income against average purchase value, coloured by gender
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=df, x='income', y='avg_purchase_value', hue='gender', ax=ax)
ax.set_title('Relationship between Income and Average Purchase Value', fontsize=14)
ax.set_xlabel('Income', fontsize=12)
ax.set_ylabel('Average Purchase Value', fontsize=12)
fig.savefig('plots/income_vs_purchase.png', dpi=300, bbox_inches='tight')
plt.show()

### 6. Product Category Distribution

In [None]:
# Customers per product category, sorted by category label (A..E)
category_counts = df['product_category'].value_counts().sort_index()

fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=category_counts.index, y=category_counts.values, ax=ax)
ax.set_title('Product Category Distribution', fontsize=14)
ax.set_xlabel('Product Category', fontsize=12)
ax.set_ylabel('Count', fontsize=12)
fig.savefig('plots/product_category_distribution.png', dpi=300, bbox_inches='tight')
plt.show()

### 7. Product Category by Gender

In [None]:
# Grouped count plot: one bar per gender within each product category
fig, ax = plt.subplots(figsize=(12, 6))
sns.countplot(data=df, x='product_category', hue='gender', ax=ax)
ax.set_title('Product Category Preference by Gender', fontsize=14)
ax.set_xlabel('Product Category', fontsize=12)
ax.set_ylabel('Count', fontsize=12)
ax.legend(title='Gender')
fig.savefig('plots/product_by_gender.png', dpi=300, bbox_inches='tight')
plt.show()

### 8. Correlation Heatmap

In [None]:
# Select every numeric column for the correlation matrix.
# 'number' matches all integer and float widths. The previous explicit
# ['int64', 'float64'] list silently dropped int32 columns, which is what
# np.random.randint and .astype(int) produce on platforms whose default
# integer is 32-bit.
# NOTE(review): this still includes customer_id, which is an identifier
# rather than a measurement — consider dropping it from the matrix.
numeric_df = df.select_dtypes(include='number')

# Pairwise correlation matrix (DataFrame.corr uses Pearson by default)
corr_matrix = numeric_df.corr()

# Annotated heatmap of the coefficients, rounded to two decimals
plt.figure(figsize=(12, 10))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5)
plt.title('Correlation Matrix of Numeric Variables', fontsize=14)
plt.savefig('plots/correlation_heatmap.png', dpi=300, bbox_inches='tight')
plt.show()

### 9. Relationship between Tenure and Satisfaction

In [None]:
# Scatter of tenure against satisfaction with a fitted regression line;
# points are semi-transparent so overlapping integer scores stay visible
point_style = {'alpha': 0.5}
trend_style = {'color': 'red'}

fig, ax = plt.subplots(figsize=(10, 6))
sns.regplot(
    data=df,
    x='tenure_months',
    y='satisfaction_score',
    scatter_kws=point_style,
    line_kws=trend_style,
    ax=ax,
)
ax.set_title('Relationship between Customer Tenure and Satisfaction', fontsize=14)
ax.set_xlabel('Tenure (months)', fontsize=12)
ax.set_ylabel('Satisfaction Score', fontsize=12)
fig.savefig('plots/tenure_vs_satisfaction.png', dpi=300, bbox_inches='tight')
plt.show()

### 10. Age Distribution by Gender

In [None]:
# Step histograms of age per gender. common_norm=False normalises each
# gender's density separately, so the shapes are comparable even when the
# group sizes differ.
fig, ax = plt.subplots(figsize=(12, 6))
sns.histplot(
    data=df,
    x='age',
    hue='gender',
    element='step',
    stat='density',
    common_norm=False,
    ax=ax,
)
ax.set_title('Age Distribution by Gender', fontsize=14)
ax.set_xlabel('Age', fontsize=12)
ax.set_ylabel('Density', fontsize=12)
fig.savefig('plots/age_by_gender.png', dpi=300, bbox_inches='tight')
plt.show()

## Step 4: Advanced Analysis

### 1. Customer Segmentation by Purchase Behavior

In [None]:
# Split customers into terciles on each purchase dimension
tercile_labels = ['Low', 'Medium', 'High']
df['frequency_segment'] = pd.qcut(df['purchase_frequency'], q=3, labels=tercile_labels)
df['value_segment'] = pd.qcut(df['avg_purchase_value'], q=3, labels=tercile_labels)

# Join the two labels into one "<frequency>-<value>" segment name
frequency_label = df['frequency_segment'].astype(str)
value_label = df['value_segment'].astype(str)
df['customer_segment'] = frequency_label + '-' + value_label

# Bar chart of segment sizes, in value_counts() order (largest first)
segment_counts = df['customer_segment'].value_counts()

fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(x=segment_counts.index, y=segment_counts.values, ax=ax)
ax.set_title('Customer Segments by Purchase Frequency and Value', fontsize=14)
ax.set_xlabel('Segment (Frequency-Value)', fontsize=12)
ax.set_ylabel('Count', fontsize=12)
# Rotate the segment names so the labels do not overlap
plt.setp(ax.get_xticklabels(), rotation=45)
fig.savefig('plots/customer_segments.png', dpi=300, bbox_inches='tight')
plt.show()

### 2. Average Satisfaction by Product Category and Gender

In [None]:
# Grouped bar chart of satisfaction by product category, one bar per gender.
# seaborn's barplot aggregates with the mean by default, which is what the
# title and y-label describe.
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(data=df, x='product_category', y='satisfaction_score', hue='gender', ax=ax)
ax.set_title('Average Satisfaction Score by Product Category and Gender', fontsize=14)
ax.set_xlabel('Product Category', fontsize=12)
ax.set_ylabel('Average Satisfaction Score', fontsize=12)
ax.legend(title='Gender')
fig.savefig('plots/satisfaction_by_category_gender.png', dpi=300, bbox_inches='tight')
plt.show()

### 3. Total Spend Analysis

In [None]:
# Bucket customers into quartiles of total spend
spend_labels = ['Low', 'Medium-Low', 'Medium-High', 'High']
df['spend_category'] = pd.qcut(df['total_spend'], q=4, labels=spend_labels)

# Mean total spend per gender, as a two-column frame for plotting
spend_by_gender = df.groupby('gender', as_index=False)['total_spend'].mean()

fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=spend_by_gender, x='gender', y='total_spend', ax=ax)
ax.set_title('Average Total Spend by Gender', fontsize=14)
ax.set_xlabel('Gender', fontsize=12)
ax.set_ylabel('Average Total Spend', fontsize=12)
fig.savefig('plots/spend_by_gender.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Analyze spend by age group.
# pd.cut builds right-closed bins by default: (18, 30], (30, 45], (45, 60],
# (60, 80]. Without include_lowest=True the left edge is open, so customers
# aged exactly 18 fell outside every bin, got a NaN age_group and were
# dropped from the chart. include_lowest=True closes the first bin on the
# left so 18 lands in '18-30'.
df['age_group'] = pd.cut(
    df['age'],
    bins=[18, 30, 45, 60, 80],
    labels=['18-30', '31-45', '46-60', '61+'],
    include_lowest=True,
)

plt.figure(figsize=(10, 6))
# age_group is categorical; passing observed explicitly keeps every age band
# on the axis (the existing behaviour) and avoids the pandas FutureWarning
# about the changing default.
spend_by_age = df.groupby('age_group', observed=False)['total_spend'].mean().reset_index()
sns.barplot(x='age_group', y='total_spend', data=spend_by_age)
plt.title('Average Total Spend by Age Group', fontsize=14)
plt.xlabel('Age Group', fontsize=12)
plt.ylabel('Average Total Spend', fontsize=12)
plt.savefig('plots/spend_by_age.png', dpi=300, bbox_inches='tight')
plt.show()

## Step 5: Insights and Conclusions

Based on our analysis, we can draw the following insights:

1. **Customer Demographics**: Our customer base has a balanced gender distribution with a wide age range from 18 to 79 years old.

2. **Purchase Behavior**: There's a positive correlation between income and average purchase value, suggesting that higher-income customers tend to spend more per purchase.

3. **Customer Satisfaction**: Longer-tenured customers generally report higher satisfaction scores, indicating that customer loyalty may be linked to positive experiences.

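4. **Product Preferences**: Product category counts differ somewhat between genders in this sample, with certain categories showing stronger appeal to specific gender groups. Because category and gender are generated independently in this synthetic dataset, these differences should be read as sampling noise rather than a genuine preference.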

5. **Customer Segmentation**: The segmentation analysis reveals distinct customer groups based on purchase frequency and value, which can be targeted with different marketing strategies.

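6. **Spending Patterns**: Average total spend differs across age groups and between genders, with the highest average spend observed in the 46-60 age group. These differences have not been tested for statistical significance; age and gender are generated independently of spend in this synthetic dataset, so they may simply reflect sampling noise.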

These insights can inform marketing strategies, product development, and customer retention efforts.