In [7]:
import numpy as np
import pandas as pd

# Build a reproducible table of synthetic user/travel profiles.
np.random.seed(42)  # fixed seed so every rerun produces the same profiles
num_users = 100

# Pools that each categorical attribute is sampled from.
names = ['John', 'Jane', 'Bob', 'Alice', 'Charlie', 'David', 'Emily', 'Frank', 'Grace', 'Henry']
last_names = ['Smith', 'Johnson', 'Williams', 'Jones', 'Brown', 'Davis', 'Miller', 'Wilson', 'Moore', 'Taylor']
jobs = ['software engineer', 'data analyst', 'marketing manager', 'sales representative', 'teacher', 'doctor', 'chef', 'journalist', 'artist', 'athlete']
cities = ['New York', 'San Francisco', 'Los Angeles', 'Chicago', 'Seattle', 'Boston', 'Austin', 'Miami', 'Denver', 'Atlanta']
purposes = ['vacation', 'business', 'family visit', 'adventure', 'cultural', 'religious', 'honeymoon', 'wellness', 'sports', 'education']
companions = ['alone', 'spouse/partner', 'family', 'friends', 'colleagues', 'other']


def _per_user(draw):
    """Call the zero-argument sampler `draw` once per user and collect the results."""
    return [draw() for _ in range(num_users)]


# NOTE: all draws share the global NumPy random state, so the order of the
# statements below decides which values land in which column. Do not reorder.

# Full name = random first name + space + random last name (duplicates are possible).
names_list = _per_user(lambda: np.random.choice(names) + ' ' + np.random.choice(last_names))

# E-mail is derived from the name, so users with identical names share an address.
mails = [full_name.replace(' ', '.').lower() + '@example.com' for full_name in names_list]

# Integers in [10, 70). Despite the variable/column name these look like ages,
# not dates -- TODO confirm the intended meaning.
birth_dates = _per_user(lambda: np.random.randint(10, 70))
job_list = _per_user(lambda: np.random.choice(jobs))
city_list = _per_user(lambda: np.random.choice(cities))
# Integers in [0, 14); the unit (e.g. trips per year) is not stated in this notebook.
freq_list = _per_user(lambda: np.random.randint(0, 14))
purpose_list = _per_user(lambda: np.random.choice(purposes))
companion_list = _per_user(lambda: np.random.choice(companions))

# Assemble the profiles into a DataFrame; dict order fixes the column order.
profile_columns = {
    'name': names_list,
    'mail': mails,
    'birth_date': birth_dates,
    'job': job_list,
    'city': city_list,
    'travel_frequency': freq_list,
    'travel_purpose': purpose_list,
    'travel_companions': companion_list,
}
df = pd.DataFrame(profile_columns)

# Display the first few rows of the DataFrame
df.head()


Unnamed: 0,name,mail,birth_date,job,city,travel_frequency,travel_purpose,travel_companions
0,Emily Jones,emily.jones@example.com,31,sales representative,Chicago,6,wellness,alone
1,Frank Brown,frank.brown@example.com,20,journalist,Boston,3,education,spouse/partner
2,Emily Taylor,emily.taylor@example.com,57,journalist,Chicago,9,sports,alone
3,Bob Miller,bob.miller@example.com,25,chef,Los Angeles,4,business,family
4,Frank Brown,frank.brown@example.com,42,marketing manager,Austin,1,education,colleagues


In [10]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

# Select the numeric columns to cluster on
columns_to_cluster = ['birth_date', 'travel_frequency']

# Standardize the data (zero mean, unit variance per column) so that neither
# feature dominates the Euclidean distances k-means relies on.
scaler = StandardScaler()
scaled_data = scaler.fit_transform(df[columns_to_cluster])

# Perform k-means clustering with 4 clusters.
# random_state pins the centroid initialisation so the labels are the same on
# every rerun; n_init is set explicitly because its default differs between
# scikit-learn releases.
num_clusters = 4
kmeans = KMeans(n_clusters=num_clusters, n_init=10, random_state=42)
kmeans.fit(scaled_data)

# Add the cluster labels to the DataFrame (overwrites any existing 'cluster' column)
df['cluster'] = kmeans.labels_

# Display the first few rows of the DataFrame with the cluster labels
df.head()



Unnamed: 0,name,mail,birth_date,job,city,travel_frequency,travel_purpose,travel_companions,cluster
0,Emily Jones,emily.jones@example.com,31,sales representative,Chicago,6,wellness,alone,2
1,Frank Brown,frank.brown@example.com,20,journalist,Boston,3,education,spouse/partner,2
2,Emily Taylor,emily.taylor@example.com,57,journalist,Chicago,9,sports,alone,1
3,Bob Miller,bob.miller@example.com,25,chef,Los Angeles,4,business,family,2
4,Frank Brown,frank.brown@example.com,42,marketing manager,Austin,1,education,colleagues,3


In [None]:
# Cluster the already standardized data using KMeans with 3 clusters.
# n_init is set explicitly because its default differs between scikit-learn
# releases; together with random_state this keeps the labels stable on rerun.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
kmeans.fit(scaled_data)

# Add cluster labels to the DataFrame. This overwrites any existing 'cluster'
# column, so previously stored labels are replaced by this 3-cluster result.
df['cluster'] = kmeans.labels_

# Display the first few rows of the DataFrame (bare expression -> rich table output)
df.head()