<a href="https://colab.research.google.com/github/glombardo/Research/blob/main/Hidden_Tribes_Audience_Tool.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [38]:
import pandas as pd, numpy as np
import plotly.express as px, plotly.graph_objects as go
from plotly.subplots import make_subplots
from sklearn.preprocessing import StandardScaler
import umap, hdbscan
from mlxtend.frequent_patterns import apriori, association_rules
import lightgbm as lgb
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [132]:
def clean_data_and_explore(path, audience, min_sample_per_q, max_index_per_q):
    df = pd.read_csv(path, encoding='latin-1')
    df.columns = df.columns.str.lower()
    df.columns = df.columns.str.replace('[%()]', '', regex=True)
    df.columns = df.columns.str.replace(' ', '_')

    req_index = 'index' + '_' + audience
    req_pct = 'vertical' + '_' + audience
    req_sample = 'sample' + '_' + audience

    required_columns = ['category_tier_1', 'category_tier_2', 'category_tier_3', 'question', 'answer', req_index, req_pct, req_sample]
    df_cleaned = df.dropna(subset=required_columns).copy()
    df_cleaned = df_cleaned[df_cleaned[req_sample] > min_sample_per_q]
    df_cleaned = df_cleaned[df_cleaned[req_index] < max_index_per_q]
    df_filtered = df_cleaned[
        (df_cleaned['weighted000_agc_and_start_a_business'].notna()) &
        (df_cleaned['weighted000_agc_and_start_a_business'] != 0) &
        (df_cleaned['index_agc_and_start_a_business'].notna()) &
        (df_cleaned['index_agc_and_start_a_business'] != 0)
    ].copy()


    print(f"Rows: {df_filtered.shape[0]:,}  |  Cols: {df_filtered.shape[1]}")
    df_filtered['cat_path'] = df_filtered['category_tier_1'] + ' / ' + df_filtered['category_tier_2'] + ' / ' + df_filtered['category_tier_3']
    df_filtered['ques_cat'] = df_cleaned['category_tier_1'] + ' / ' + df_filtered['category_tier_2'] + ' / ' + df_filtered['category_tier_3'] + ' / ' + df_filtered['question']
    df_filtered['ans_ques_cat'] = df_filtered['category_tier_1'] + ' / ' + df_filtered['category_tier_2'] + ' / ' + df_filtered['category_tier_3'] + ' / ' + df_filtered['question'] + ' / ' + df_filtered['answer']
    #display(df_filtered.head())
    #display(df_filtered.describe(include='all').T[['unique','top','freq']].head(10))

    # Plotly histogram
    fig = px.histogram(
        df_filtered,
        x=req_index,
        nbins=50,
        title='Index Distribution (100 = population average)',
        labels={req_index: 'Index'},
        histnorm='density'
    )

    fig.add_vline(x=100, line_dash="dash", line_color="red")

    fig.update_layout(
        xaxis_title='Index',
        yaxis_title='Density',
        font=dict(family='Trebuchet MS', size=14)
    )

    fig.show()

    return df_filtered


def plot_sunburst_chart(df_a, audience):
    req_index = 'index' + '_' + audience
    req_weight = 'weighted000' + '_' + audience

    sun = px.sunburst(
        df_a,
        path=['category_tier_1','category_tier_2','category_tier_3'],
        values=req_weight,
        color=req_index,
        color_continuous_scale='RdYlGn',
        color_continuous_midpoint=100,
        hover_data={
            req_index: ':.0f',
            req_weight: ':,.0f'
        }
    )

    sun.update_layout(
        title='Hidden Tribes: Over-Indexed Topics',
        height=750,
        font=dict(family='Trebuchet MS', size=14)
    )

    return sun.show()


In [133]:
path = '/content/drive/My Drive/dataprojects/audience_dataset_mass_survey.csv'
audience = 'agc_and_graduate_from_school'
data_works = clean_data_and_explore(path, audience,50,300)
plot_sunburst_chart(data_works, audience)

Rows: 14,004  |  Cols: 40


In [None]:
def get_segment_details(cat1):
    seg = (df_filtered.query('Category1 == @cat1')
             .sort_values('Index', ascending=False)
             .head(10)[['Question','Answer','Index','Weighted']])
    return seg

# Example
get_segment_details('Lifestyle & Values')

In [146]:
def get_clusters(data_b, audience, category_1 = 'all'):
  if category_1 == 'all':
    data_works_a = data_b
  else:
    data_works_a = data_b[data_b['category_tier_1'] == category_1]

  req_index = 'index' + '_' + audience

  #create pivot "questions and answers" with log_odds
  #build a matrix s.t rows -> questions, cols -> answers, and values -> log offs for that question/answer combination.
  #encodes behavioral preferences/intensities of question/answer groups. This matrix will be fed into UUMAP and HDBSCAN.

  data_works_a['log_odds'] = np.log((data_works_a[req_index]+1e-3) / 100)
  pivot = data_works_a.pivot_table(index='ques_cat',
                        columns='answer',
                        values='log_odds',
                        aggfunc='mean').fillna(0)
  #print(pivot.shape)

  scaler = StandardScaler()
  embed = umap.UMAP(n_neighbors=25, min_dist=0.3, random_state=42)
  X_umap = embed.fit_transform(scaler.fit_transform(pivot))
  #X_umap

  clusterer = hdbscan.HDBSCAN(min_cluster_size=15)
  labels = clusterer.fit_predict(X_umap)
  pivot['cluster'] = labels
  #print(np.bincount(labels + 1))   # cluster counts (-1 = noise)
  umap_df = pd.DataFrame(X_umap, columns=['u1','u2'],
                       index=pivot.index).reset_index()
  umap_df['cluster'] = labels
  fig = px.scatter(umap_df, x='u1', y='u2',
                 color='cluster',
                 hover_name='ques_cat',
                 title='UMAP clusters of survey items')
  fig.update_layout(height=600)
  return fig.show()


In [135]:
data_works.head()

Unnamed: 0,category_tier_1,category_tier_2,category_tier_3,question,answer,sample_base,weighted000_base,vertical_base,horizontal_base,index_base,...,horizontal_agc_and_graduate_from_school,index_agc_and_graduate_from_school,sample_agc_and_start_a_business,weighted000_agc_and_start_a_business,vertical_agc_and_start_a_business,horizontal_agc_and_start_a_business,index_agc_and_start_a_business,cat_path,ques_cat,ans_ques_cat
26,Apparel/Accessories,Athletic Shoes,Number Of Pairs Bought in last 12 months,Aerobic/Fitness shoes,1,9950,22994.37,0.090698,1,100,...,0.013466,158,187,378.41,0.132543,0.016457,146,Apparel/Accessories / Athletic Shoes / Number ...,Apparel/Accessories / Athletic Shoes / Number ...,Apparel/Accessories / Athletic Shoes / Number ...
27,Apparel/Accessories,Athletic Shoes,Number Of Pairs Bought in last 12 months,Aerobic/Fitness shoes,2,4406,10853.01,0.042808,1,100,...,0.01027,121,97,196.99,0.068998,0.018151,161,Apparel/Accessories / Athletic Shoes / Number ...,Apparel/Accessories / Athletic Shoes / Number ...,Apparel/Accessories / Athletic Shoes / Number ...
29,Apparel/Accessories,Athletic Shoes,Number Of Pairs Bought in last 12 months,Aerobic/Fitness shoes,Any,16779,39819.42,0.157061,1,100,...,0.013589,160,368,718.82,0.251776,0.018052,160,Apparel/Accessories / Athletic Shoes / Number ...,Apparel/Accessories / Athletic Shoes / Number ...,Apparel/Accessories / Athletic Shoes / Number ...
37,Apparel/Accessories,Athletic Shoes,Number Of Pairs Bought in last 12 months,Basketball shoes,Any,4057,9949.88,0.039246,1,100,...,0.014119,166,137,316.02,0.11069,0.031761,282,Apparel/Accessories / Athletic Shoes / Number ...,Apparel/Accessories / Athletic Shoes / Number ...,Apparel/Accessories / Athletic Shoes / Number ...
58,Apparel/Accessories,Athletic Shoes,Number Of Pairs Bought in last 12 months,Running/Jogging shoes,1,10628,24441.26,0.096405,1,100,...,0.01053,124,177,354.89,0.124305,0.01452,129,Apparel/Accessories / Athletic Shoes / Number ...,Apparel/Accessories / Athletic Shoes / Number ...,Apparel/Accessories / Athletic Shoes / Number ...


In [148]:
get_clusters(data_works, audience,'Household Products')



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.


n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.


'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.


'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.



In [142]:
data_works.head()

Unnamed: 0,category_tier_1,category_tier_2,category_tier_3,question,answer,sample_base,weighted000_base,vertical_base,horizontal_base,index_base,...,index_agc_and_graduate_from_school,sample_agc_and_start_a_business,weighted000_agc_and_start_a_business,vertical_agc_and_start_a_business,horizontal_agc_and_start_a_business,index_agc_and_start_a_business,cat_path,ques_cat,ans_ques_cat,log_odds
26,Apparel/Accessories,Athletic Shoes,Number Of Pairs Bought in last 12 months,Aerobic/Fitness shoes,1,9950,22994.37,0.090698,1,100,...,158,187,378.41,0.132543,0.016457,146,Apparel/Accessories / Athletic Shoes / Number ...,Apparel/Accessories / Athletic Shoes / Number ...,Apparel/Accessories / Athletic Shoes / Number ...,0.457431
27,Apparel/Accessories,Athletic Shoes,Number Of Pairs Bought in last 12 months,Aerobic/Fitness shoes,2,4406,10853.01,0.042808,1,100,...,121,97,196.99,0.068998,0.018151,161,Apparel/Accessories / Athletic Shoes / Number ...,Apparel/Accessories / Athletic Shoes / Number ...,Apparel/Accessories / Athletic Shoes / Number ...,0.190629
29,Apparel/Accessories,Athletic Shoes,Number Of Pairs Bought in last 12 months,Aerobic/Fitness shoes,Any,16779,39819.42,0.157061,1,100,...,160,368,718.82,0.251776,0.018052,160,Apparel/Accessories / Athletic Shoes / Number ...,Apparel/Accessories / Athletic Shoes / Number ...,Apparel/Accessories / Athletic Shoes / Number ...,0.47001
37,Apparel/Accessories,Athletic Shoes,Number Of Pairs Bought in last 12 months,Basketball shoes,Any,4057,9949.88,0.039246,1,100,...,166,137,316.02,0.11069,0.031761,282,Apparel/Accessories / Athletic Shoes / Number ...,Apparel/Accessories / Athletic Shoes / Number ...,Apparel/Accessories / Athletic Shoes / Number ...,0.506824
58,Apparel/Accessories,Athletic Shoes,Number Of Pairs Bought in last 12 months,Running/Jogging shoes,1,10628,24441.26,0.096405,1,100,...,124,177,354.89,0.124305,0.01452,129,Apparel/Accessories / Athletic Shoes / Number ...,Apparel/Accessories / Athletic Shoes / Number ...,Apparel/Accessories / Athletic Shoes / Number ...,0.215119


In [164]:

def assoc_rule_apriori(data_b, category_1):
  if category_1 == 'all':
    data_works_a = data_b
  else:
    data_works_a = data_b[data_b['category_tier_1'] == category_1]

  basket = (data_works[['question','answer']]
            .drop_duplicates()
            .assign(flag=1)
            .pivot_table(index='question',
                        columns='answer',
                        values='flag',
                        fill_value=0))
  freq = apriori(basket, min_support=0.04, use_colnames=True)
  rules = association_rules(freq, metric='lift', min_threshold=1.2)\
            .sort_values('confidence', ascending=False)
  return rules.head(10)

In [165]:
assoc_rule_apriori(data_works, "Household Products")


DataFrames with non-bool types result in worse computationalperformance and their support might be discontinued in the future.Please use a DataFrame with bool type



Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,representativity,leverage,conviction,zhangs_metric,jaccard,certainty,kulczynski
3,(Secondary),(Total Users),0.10346,0.549935,0.10346,1.0,1.818398,1.0,0.046564,inf,0.502002,0.188131,1.0,0.594065
20,"(Secondary, Sole/Primary)",(Total Users),0.082898,0.549935,0.082898,1.0,1.818398,1.0,0.03731,inf,0.490747,0.150742,1.0,0.575371
76,"(Volume, Sole Users, Sole/Primary)",(Total Users),0.164817,0.549935,0.164817,1.0,1.818398,1.0,0.074179,inf,0.538882,0.299703,1.0,0.649852
74,"(Volume, Total Users, Sole Users)",(Sole/Primary),0.164817,0.412206,0.164817,1.0,2.42597,1.0,0.096879,inf,0.703791,0.399842,1.0,0.699921
62,"(Volume, Secondary, Sole/Primary)",(Total Users),0.066253,0.549935,0.066253,1.0,1.818398,1.0,0.029818,inf,0.481999,0.120475,1.0,0.560237
36,"(Total Users, Sole Users)",(Sole/Primary),0.215078,0.412206,0.215078,1.0,2.42597,1.0,0.126422,inf,0.748857,0.521774,1.0,0.760887
38,"(Sole Users, Sole/Primary)",(Total Users),0.215078,0.549935,0.215078,1.0,1.818398,1.0,0.096799,inf,0.573389,0.391098,1.0,0.695549
31,"(Volume, Secondary)",(Total Users),0.082245,0.549935,0.082245,1.0,1.818398,1.0,0.037016,inf,0.490398,0.149555,1.0,0.574777
9,(Sole Users),(Total Users),0.215405,0.549935,0.215078,0.998485,1.815642,1.0,0.09662,297.043081,0.572564,0.390866,0.996633,0.694791
40,(Sole Users),"(Total Users, Sole/Primary)",0.215405,0.410574,0.215078,0.998485,2.431922,1.0,0.126639,389.020888,0.750454,0.523431,0.997429,0.761166
