In [3]:
# Mount Google Drive inside the Colab runtime. The CSV inputs read by
# load_data() and the CSV written by main() all live under
# '/content/gdrive/My Drive', so this cell must succeed before the pipeline runs.
from google.colab import drive
drive.mount('/content/gdrive/')


ValueError: mount failed

In [2]:
import os
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.metrics import davies_bouldin_score
from sklearn.preprocessing import StandardScaler

def load_data(data_dir='/content/gdrive/My Drive'):
    """Read the customer and transaction tables from CSV files.

    Parameters
    ----------
    data_dir : str or os.PathLike, optional
        Directory containing ``Customers.csv`` and ``Transactions.csv``.
        Defaults to the mounted Google Drive folder, so calling
        ``load_data()`` with no arguments reads the same files as before.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(customers, transactions)`` exactly as parsed by ``pd.read_csv``.
    """
    customers = pd.read_csv(os.path.join(data_dir, 'Customers.csv'))
    transactions = pd.read_csv(os.path.join(data_dir, 'Transactions.csv'))
    return customers, transactions

def preprocess_data(customers, transactions):
    """Build one row of clustering features per customer.

    The input frames are not modified; all conversions happen on copies.

    Parameters
    ----------
    customers : pandas.DataFrame
        Must contain ``CustomerID``, ``SignupDate`` and ``Region``.
    transactions : pandas.DataFrame
        Must contain ``CustomerID``, ``TransactionID``, ``TotalValue``,
        ``Quantity`` and ``TransactionDate``.

    Returns
    -------
    pandas.DataFrame
        One row per row of ``customers`` (same order, fresh RangeIndex) with
        columns ``CustomerID``, ``days_since_signup``, ``transaction_count``,
        ``total_spend``, ``avg_transaction_value``, ``total_quantity``,
        ``avg_quantity``, ``customer_lifetime`` and one ``region_<value>``
        indicator column per distinct ``Region``. Customers with no
        transactions get 0 for every transaction-derived column.
    """
    # reset_index returns a new frame, so the caller's data stays untouched and
    # every per-customer piece built below shares the same 0..n-1 index. That
    # keeps the index-based join with the region dummies aligned even when the
    # caller passes a filtered or re-indexed `customers` frame.
    customers = customers.reset_index(drop=True)
    signup_dates = pd.to_datetime(customers['SignupDate'])
    transaction_dates = pd.to_datetime(transactions['TransactionDate'])

    # Named aggregation yields flat, explicitly named columns. First/last
    # purchase dates are collected so the lifetime can be computed vectorized
    # instead of through a per-group Python lambda.
    per_customer = (
        transactions.assign(TransactionDate=transaction_dates)
        .groupby('CustomerID')
        .agg(
            transaction_count=('TransactionID', 'count'),
            total_spend=('TotalValue', 'sum'),
            avg_transaction_value=('TotalValue', 'mean'),
            total_quantity=('Quantity', 'sum'),
            avg_quantity=('Quantity', 'mean'),
            first_purchase=('TransactionDate', 'min'),
            last_purchase=('TransactionDate', 'max'),
        )
    )
    # Days between a customer's first and last transaction.
    per_customer['customer_lifetime'] = (
        per_customer['last_purchase'] - per_customer['first_purchase']
    ).dt.days
    customer_metrics = (
        per_customer.drop(columns=['first_purchase', 'last_purchase']).reset_index()
    )

    # Account age is measured relative to the most recent signup in the data.
    reference_date = signup_dates.max()
    base = pd.DataFrame({
        'CustomerID': customers['CustomerID'],
        'days_since_signup': (reference_date - signup_dates).dt.days,
    })

    region_dummies = pd.get_dummies(customers['Region'], prefix='region')

    # CustomerID is unique in customer_metrics (it is the group key), so the
    # left merge keeps exactly one row per customer in the original order.
    features = base.merge(customer_metrics, on='CustomerID', how='left')
    features = features.merge(region_dummies, left_index=True, right_index=True)

    # Customers without any transaction have NaN metrics after the left merge.
    return features.fillna(0)

def find_optimal_clusters(features, max_clusters=10):
    """Pick the KMeans cluster count with the lowest Davies-Bouldin index.

    Every column except ``CustomerID`` is standardized, then KMeans
    (``random_state=42``) is fitted for each candidate ``k`` starting at 2.

    Parameters
    ----------
    features : pandas.DataFrame
        Feature table containing a ``CustomerID`` column plus numeric columns.
    max_clusters : int, optional
        Largest cluster count to try. It is capped at ``n_samples - 1``
        because the Davies-Bouldin index is only defined for
        ``2 <= n_labels <= n_samples - 1``.

    Returns
    -------
    tuple of (int, list of float)
        The best ``k`` and the score for every ``k`` tried; ``db_scores[i]``
        belongs to ``k = i + 2``.

    Raises
    ------
    ValueError
        If no cluster count in ``[2, max_clusters]`` is valid for the data.
    """
    scaled_features = StandardScaler().fit_transform(features.drop('CustomerID', axis=1))

    # Never request more clusters than the score (or KMeans itself) can handle.
    n_samples = scaled_features.shape[0]
    largest_k = min(max_clusters, n_samples - 1)
    if largest_k < 2:
        raise ValueError(
            "Need max_clusters >= 2 and at least 3 samples to compare cluster "
            f"counts (got max_clusters={max_clusters}, n_samples={n_samples})"
        )

    db_scores = []
    for n_clusters in range(2, largest_k + 1):
        kmeans = KMeans(n_clusters=n_clusters, random_state=42)
        clusters = kmeans.fit_predict(scaled_features)
        db_scores.append(davies_bouldin_score(scaled_features, clusters))

    # Scores start at k=2, hence the +2 offset; lower scores are selected.
    optimal_clusters = int(np.argmin(db_scores)) + 2
    return optimal_clusters, db_scores

def perform_clustering(features, n_clusters):
    """Standardize the features and assign every row to a KMeans cluster.

    Parameters
    ----------
    features : pandas.DataFrame
        Feature table containing a ``CustomerID`` column, which is excluded
        from scaling and clustering.
    n_clusters : int
        Number of clusters to fit.

    Returns
    -------
    tuple
        ``(clustered_data, kmeans, scaler)``: a copy of ``features`` with an
        added ``Cluster`` label column, the fitted ``KMeans`` model and the
        fitted ``StandardScaler``.
    """
    numeric_part = features.drop('CustomerID', axis=1)

    scaler = StandardScaler()
    standardized = scaler.fit_transform(numeric_part)

    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    labels = kmeans.fit_predict(standardized)

    # assign() returns a new frame, leaving the caller's `features` unchanged.
    clustered_data = features.assign(Cluster=labels)
    return clustered_data, kmeans, scaler
def visualize_clusters(clustered_data, features_to_plot=('total_spend', 'transaction_count')):
    """Show a scatter plot of two feature columns coloured by cluster label.

    Parameters
    ----------
    clustered_data : pandas.DataFrame
        Must contain a ``Cluster`` column and both columns to plot.
    features_to_plot : sequence of str, optional
        The first entry is drawn on the x axis and the second on the y axis.
        The default is an immutable tuple so it cannot be altered between
        calls; lists are still accepted.
    """
    x_column, y_column = features_to_plot[0], features_to_plot[1]

    # Draw on an explicit Axes rather than relying on pyplot's global state.
    _, ax = plt.subplots(figsize=(12, 8))
    sns.scatterplot(
        data=clustered_data,
        x=x_column,
        y=y_column,
        hue='Cluster',
        palette='deep',
        ax=ax,
    )
    ax.set_title('Customer Segments')
    # Turn column names such as 'total_spend' into 'Total Spend' for labels.
    ax.set_xlabel(x_column.replace('_', ' ').title())
    ax.set_ylabel(y_column.replace('_', ' ').title())
    plt.show()
def analyze_clusters(clustered_data):
    """Summarize each cluster with per-column statistics rounded to 2 decimals.

    Parameters
    ----------
    clustered_data : pandas.DataFrame
        Must contain ``Cluster`` plus the feature columns listed below.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``Cluster`` with two-level columns: mean and row count of
        ``total_spend``, and the mean of ``transaction_count``,
        ``avg_transaction_value``, ``customer_lifetime`` and
        ``days_since_signup``.
    """
    summary_spec = {
        'total_spend': ['mean', 'count'],
        'transaction_count': 'mean',
        'avg_transaction_value': 'mean',
        'customer_lifetime': 'mean',
        'days_since_signup': 'mean',
    }
    by_cluster = clustered_data.groupby('Cluster')
    return by_cluster.agg(summary_spec).round(2)
def main():
    """Run the segmentation pipeline from raw CSVs to a saved result file.

    Steps: load the data, build per-customer features, choose the cluster
    count by Davies-Bouldin score, cluster, plot, print a per-cluster summary
    and write the labelled table to ``customer_segments.csv`` on the mounted
    Google Drive.
    """
    print("Loading data...")
    customer_table, transaction_table = load_data()

    print("Preprocessing data...")
    feature_table = preprocess_data(customer_table, transaction_table)

    print("Finding optimal number of clusters...")
    optimal_clusters, db_scores = find_optimal_clusters(feature_table)
    print(f"Optimal number of clusters: {optimal_clusters}")
    print(f"DB Index scores: {[round(score, 3) for score in db_scores]}")

    print("\nPerforming clustering...")
    # The fitted model and scaler are returned too but are not needed here.
    segments, _, _ = perform_clustering(feature_table, optimal_clusters)

    print("\nGenerating visualizations...")
    visualize_clusters(segments)

    print("\nAnalyzing clusters...")
    summary = analyze_clusters(segments)
    print("\nCluster Analysis:")
    print(summary)

    segments.to_csv('/content/gdrive/My Drive/customer_segments.csv', index=False)
    print("\nResults saved to 'customer_segments.csv'")
# Entry point: run the whole pipeline only when this code is executed directly
# rather than imported as a module.
if __name__ == "__main__":
    main()

KeyboardInterrupt: 