In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.io as pio
import pyreadstat
import seaborn as sns
from sklearn.feature_selection import mutual_info_classif, mutual_info_regression

pd.options.plotting.backend = "plotly"
pio.templates.default = "plotly_white"

# Load data

In [None]:
# Folder holding the AO21 spec, data and dictionary files. Set the AO21_DATA_DIR
# environment variable to point somewhere else instead of editing this cell; the
# default is the original author's local checkout.
# os.path.join(..., '') guarantees the trailing separator that the
# `inputData_path + file_name` concatenations in the loading cells rely on.
out_git_repo = os.path.join(
    os.environ.get('AO21_DATA_DIR', '/Users/annie.quinn/git/ford-hackathon/personal/annie/AO21/'),
    '',
)
spec_file_name_w1 = 'AO21 SPSS Spec v8.5.xlsx'
spec_file_name_w2 = 'AO21_w2 SPSS Spec v1.xlsx'
data_file_name_w1 = 'AO21W1_US.csv'
data_file_name_w2 = 'AO21W2_US_V1_20220121_IDs-2.csv'
data_map_file_name = 'levels_Dictionary AO21_W1_UK_V1.xlsx'

In [None]:
# Focus on W1, largest car buying population with complete survey responses

In [None]:
inputData_path = out_git_repo #####

In [None]:
data_map = pd.read_excel(inputData_path + data_map_file_name, sheet_name='Variable Information')

In [None]:
spec_file_w1 = pd.read_excel(inputData_path + spec_file_name_w1, sheet_name='SPSS spec')
spec_file_w2 = pd.read_excel(inputData_path + spec_file_name_w2, sheet_name='SPSS spec')

In [None]:
labels_w1 = list(spec_file_w1['Label'])
labels_w2 = list(spec_file_w2['Label'])

In [None]:
data_w1 = pd.read_csv(inputData_path + data_file_name_w1)
data_w2 = pd.read_csv(inputData_path + data_file_name_w2)

In [None]:
# Blank answers arrive as a single space; turn them into missing values so they
# are not counted as a distinct answer.
data_w1 = data_w1.replace({" ": None})
data_w2 = data_w2.replace({" ": None})


def _to_numeric_where_possible(column):
    """Return `column` converted with pd.to_numeric, or unchanged if it cannot be converted.

    Explicit replacement for the deprecated `pd.to_numeric(..., errors='ignore')`.
    """
    try:
        return pd.to_numeric(column)
    except (ValueError, TypeError):
        return column


data_w1 = data_w1.apply(_to_numeric_where_possible)
data_w2 = data_w2.apply(_to_numeric_where_possible)

In [None]:
# Keep the wave-1 rows where PUR03_2_46 == 1 or PUR03_2_53 == 1, restricted to
# HV_Section == 2.
# NOTE(review): presumably _46 / _53 are the "new car" / "second hand car"
# purchase items and HV_Section == 2 selects one survey split — confirm against the spec.
data_w1_bought_car = data_w1.loc[((data_w1["PUR03_2_46"] == 1) | (data_w1["PUR03_2_53"] == 1)) & (data_w1["HV_Section"] == 2)]
data_w1_bought_car

# Clean labels

In [None]:
labels = data_map[['Variable', 'Label']]

In [None]:
# Split every '|'-separated label into its parts and store them in a new
# 'Cleaned label' COLUMN. (`labels.loc['Cleaned label'] = ...` would add a ROW
# with that index label instead, leaving later column lookups without a match.)
# `.assign` returns a new frame, so the slice taken from data_map is not written to.
labels = labels.assign(**{'Cleaned label': labels['Label'].str.split('|')})

# Create file of car variables

In [None]:
car_cols_w1 = spec_file_w1[spec_file_w1['Label'].str.contains("car", case=False, na=False)]
car_cols_w2 = spec_file_w2[spec_file_w2['Label'].str.contains("car", case=False, na=False)]

In [None]:
# Eliminate variables whose label matches "care" (e.g. skincare), "financials"
# or "petrol" — typically loans or insurance rather than the car itself.
_excluded_label_pattern = 'care|financials|petrol'
_keep_w1 = ~car_cols_w1['Label'].str.contains(_excluded_label_pattern, case=False, na=False)
_keep_w2 = ~car_cols_w2['Label'].str.contains(_excluded_label_pattern, case=False, na=False)
car_cols_w1 = car_cols_w1[_keep_w1]
car_cols_w2 = car_cols_w2[_keep_w2]
# Number of spec rows left per wave.
print(car_cols_w1.shape)
print(car_cols_w2.shape)

In [None]:
car_purchase_w1 = car_cols_w1[car_cols_w1['Label'].str.contains(('purchase|bought'), case=False, na=False)]
car_purchase_w2 = car_cols_w2[car_cols_w2['Label'].str.contains(('purchase|bought'), case=False, na=False)]

In [None]:
new_car_w1 = car_purchase_w1[car_purchase_w1['Label'].str.contains('new', case=False, na=False)]
new_car_w1

In [None]:
new_car_w2 = car_purchase_w2[car_purchase_w2['Label'].str.contains('new', case=False, na=False)]
new_car_w2

In [None]:
used_car_w1 = car_purchase_w1[car_purchase_w1['Label'].str.contains('second', case=False, na=False)]
used_car_w1

In [None]:
used_car_w2 = car_purchase_w2[car_purchase_w2['Label'].str.contains('second', case=False, na=False)]
used_car_w2

In [None]:
#details on age of car bought not specified
just_car_w1 = car_purchase_w1[~car_purchase_w1['Label'].str.contains(('new|second|insurance'), case=False, na=False)]
just_car_w1

In [None]:
#details on age of car bought not specified
just_car_w2 = car_purchase_w2[~car_purchase_w2['Label'].str.contains(('new|second|insurance'), case=False, na=False)]
just_car_w2

In [None]:
online_purchase_w1 = car_purchase_w1[car_purchase_w1['Label'].str.contains('online', case=False, na=False)]
online_purchase_w1

In [None]:
online_purchase_w2 = car_purchase_w2[car_purchase_w2['Label'].str.contains('online', case=False, na=False)]
online_purchase_w2

# Touchpoints

In [None]:
car_touchpoints_w1 = car_cols_w1[car_cols_w1['Label'].str.contains("Touchpoints")]
car_touchpoints_w2 = car_cols_w2[car_cols_w2['Label'].str.contains("Touchpoints")]

In [None]:
# Do both waves contain the same touchpoint variables? Iterating a DataFrame
# yields its column labels, so sorting the frames themselves would only compare
# the spec-sheet headers; compare the variable names instead.
sorted(car_touchpoints_w1['Variable new']) == sorted(car_touchpoints_w2['Variable new'])

In [None]:
car_touchpoints = car_touchpoints_w1

In [None]:
# Touchpoint variables whose label contains "helped".
_is_helped = car_touchpoints_w1['Label'].str.contains("helped")
touchpoints_helped = car_touchpoints[_is_helped]
touchpoints_helped

In [None]:
touchpoints_noticed = car_touchpoints[car_touchpoints['Label'].str.contains("noticed")]
touchpoints_noticed

In [None]:
touchpoints_helped_data_w1 = reduced_data_w1[list(touchpoints_helped['Variable new'])]
touchpoints_helped_data_w2 = reduced_data_w2[list(touchpoints_helped['Variable new'])]

In [None]:
touchpoints_noticed_data_w1 = reduced_data_w1[list(touchpoints_noticed['Variable new'])]
touchpoints_noticed_data_w2 = reduced_data_w2[list(touchpoints_noticed['Variable new'])]

In [None]:
touchpoints_noticed_data_w1

NaN density for touchpoints, taking the filters into account

In [None]:
car_touchpoints_data_w1 = reduced_data_w1[list(car_touchpoints['Variable new'])]
car_touchpoints_data_w2 = reduced_data_w2[list(car_touchpoints['Variable new'])]

In [None]:
# check the values of 
# each row for each column
car_touchpoints_data_w1.agg(['size', 'count', 'nunique'])

In [None]:
car_touchpoints_data_w2.agg(['size', 'count', 'nunique'])

In [None]:
touchpoints_noticed_data_w1.apply(pd.value_counts).sort_values(by = 1, axis = 1, ascending=False)

In [None]:
touchpoints_helped_data_w1.apply(pd.value_counts).sort_values(by = 1, axis = 1, ascending=False)

In [None]:
#(car_touchpoints_data_w2.apply(pd.value_counts)).sort_values(by = 1.0, axis = 1, ascending=False)

In [None]:
# Count the answers per touchpoint once. The first two rows of the count table
# are taken as "not selected" / "selected".
# NOTE(review): this assumes the answer codes sort as 0 then 1 — confirm against the spec.
noticed_counts = touchpoints_noticed_data_w1.apply(pd.Series.value_counts)
df = pd.DataFrame({'Not selected': list(noticed_counts.iloc[0]),
                   'Selected': list(noticed_counts.iloc[1])}, index=noticed_counts.columns)
# The pandas plotting backend is set to plotly at the top of the notebook, but the
# pyplot calls below only act on matplotlib figures, so request matplotlib here.
ax = df.plot.bar(stacked=True, backend="matplotlib")
plt.title("Touchpoints noticed")
plt.legend()

plt.savefig('touchpoints_noticed.pdf', bbox_inches='tight')
plt.show()

In [None]:
data_w1["dummy"] = 0 # For instances with no grouping variable

def pop_across_cols(data_w1, cols, col_multiplier="OA_Projection", cols_grp = ["Q02"]):
    """Weighted population per variable and group, in long format.

    Rows in which every column of `cols` is missing are ignored. For the
    remaining rows, each value in `cols` is multiplied by that row's
    `col_multiplier` weight and summed within each `cols_grp` group.

    Parameters
    ----------
    data_w1 : pandas.DataFrame
        Respondent-level data containing `cols`, `cols_grp` and
        `col_multiplier`. It is not modified and does not need a "dummy" column.
    cols : list-like of str
        Columns to weight and sum (a list or a Series of column names).
    col_multiplier : str
        Column holding the per-row weight.
    cols_grp : list of str
        Grouping columns; may be empty, in which case a single total is returned.

    Returns
    -------
    pandas.DataFrame
        One row per (group, variable) with the `cols_grp` columns, "pop" (summed
        weight of the group), "variable" and "pop_selected" (weighted sum).
    """
    cols = list(cols)
    # Constant key so groupby also works with no grouping variable. It is added
    # to the local copies, so any frame can be passed in as-is.
    grp_keys = ["dummy", *cols_grp]

    # Rows with at least one answer among `cols`
    idx = data_w1[cols].dropna(how="all").index
    subset = data_w1.loc[idx, [*cols_grp, *cols]].copy()
    subset["dummy"] = 0

    # Apply the population weight row by row (column vector broadcasts over cols)
    weights = data_w1.loc[idx, col_multiplier].to_numpy().reshape(-1, 1)
    subset[cols] = subset[cols] * weights
    pop_summed = subset.groupby(grp_keys)[cols].sum().reset_index()

    # Total weight per group, over the same rows
    pop = data_w1.loc[idx, [col_multiplier, *cols_grp]].copy()
    pop["dummy"] = 0
    pop = pop.groupby(grp_keys)[col_multiplier].sum().rename("pop").reset_index()

    # Join on the group keys rather than relying on both groupbys sharing row order
    pop_summed = pop_summed.merge(pop, on=grp_keys, how="left")
    output = pop_summed.melt(id_vars=[*grp_keys, "pop"], value_name="pop_selected")
    return output.drop(columns="dummy")

pop_across_cols(data_w1_bought_car, cols = touchpoints_noticed['Variable new'])
pop_across_cols(data_w1_bought_car, cols = touchpoints_helped['Variable new'])

In [None]:
# dropna() without arguments removes every row that has a missing value in ANY
# column, so dictionary entries with an incomplete row are gone from data_map
# for all cells executed after this one.
data_map = data_map.dropna()
# Show the entries whose label starts with the "last 12 months" purchase prefix
# for a new car or a second hand car.
data_map.loc[data_map["Label"].str.startswith("Category purchase | Last 12 months | New car") | data_map["Label"].str.startswith("Category purchase | Last 12 months | Second hand car")]

In [None]:
(data_map[data_map['Variable'] == 'TP02_046_001']).Label

In [None]:
# Count the answers per touchpoint once. The first two rows of the count table
# are taken as "not selected" / "selected".
# NOTE(review): this assumes the answer codes sort as 0 then 1 — confirm against the spec.
helped_counts = touchpoints_helped_data_w1.apply(pd.Series.value_counts)
df = pd.DataFrame({'Not selected': list(helped_counts.iloc[0]),
                   'Selected': list(helped_counts.iloc[1])}, index=helped_counts.columns)
# The pandas plotting backend is set to plotly at the top of the notebook, but the
# pyplot calls below only act on matplotlib figures, so request matplotlib here.
ax = df.plot.bar(stacked=True, backend="matplotlib")
plt.title("Touchpoints helped")

plt.savefig('touchpoints_helped.pdf', bbox_inches='tight')
plt.show()

In [None]:
(data_map[data_map['Variable'] == 'TP03_046_010']).Label

# Average spend

In [None]:
#Note: average spend in wave 1 is in TOOLS split
average_spend_w1 = car_cols_w1[car_cols_w1['Label'].str.contains("spend", case=False, na=False)]
average_spend_w1 = average_spend_w1[~average_spend_w1['Label'].str.contains('selection', case=False, na=False)]
#Note: average spend in wave 2 is in E-COMMERCE DEEP DIVE
average_spend_w2 = car_cols_w2[car_cols_w2['Label'].str.contains("spend", case=False, na=False)]
average_spend_w2 = average_spend_w2[~average_spend_w2['Label'].str.contains('selection', case=False, na=False)]

In [None]:
average_spend_data_w1 = reduced_data_w1[list(average_spend_w1['Variable new'])]
average_spend_data_w2 = reduced_data_w2[list(average_spend_w2['Variable new'])]

In [None]:
#Note: average spend is continuous
average_spend_data_w1.agg(['size', 'count', 'nunique'])

In [None]:
selected_rows = average_spend_data_w1.PUR04_46[~(average_spend_data_w1.PUR04_46).isnull()]
selected_rows

In [None]:
# Spend distribution for new vs second hand cars. sns.distplot is deprecated;
# histplot(kde=True, stat='density') is its documented replacement and draws the
# same histogram-plus-density view.
sns.histplot(average_spend_data_w1.PUR04_46, label='new car', color='darkblue', kde=True, stat='density')
sns.histplot(average_spend_data_w1.PUR04_53, label='second hand car', color='red', kde=True, stat='density')
plt.xlabel('Average spend')
plt.ylabel('Density')
plt.legend()
plt.tight_layout(h_pad=2.5)

In [None]:
average_spend_data_w2.agg(['size', 'count', 'nunique'])

# E-commerce

In [None]:
#Note: E-COMMERCE in wave 1 is in E-COMMERCE DEEP DIVE split
online_purchase_data_w1 = reduced_data_w1[list(online_purchase_w1['Variable new'])]
#Note: E-COMMERCE in wave 2 is in CORE split
online_purchase_data_w2 = reduced_data_w2[list(online_purchase_w2['Variable new'])]

In [None]:
online_purchase_data_w1.agg(['size', 'count', 'nunique'])

In [None]:
online_purchase_data_w1.apply(pd.value_counts)

In [None]:
online_purchase_data_w2.agg(['size', 'count', 'nunique'])

In [None]:
online_purchase_data_w2.apply(pd.value_counts)

# When was the car bought?

In [None]:
used_car_data_w1 = reduced_data_w1[list(used_car_w1['Variable new'])]
new_car_data_w1 = reduced_data_w1[list(new_car_w1['Variable new'])]
just_car_data_w1 = reduced_data_w1[list(just_car_w1['Variable new'])]
used_car_data_w2 = reduced_data_w2[list(used_car_w2['Variable new'])]
new_car_data_w2 = reduced_data_w2[list(new_car_w2['Variable new'])]
just_car_data_w2 = reduced_data_w2[list(just_car_w2['Variable new'])]

In [None]:
frames_w1 = [new_car_data_w1, used_car_data_w1, just_car_data_w1]
result_w1 = pd.concat(frames_w1, axis=1)
result_w1 = result_w1.loc[:,~result_w1.columns.duplicated()]
result_w1

In [None]:
frames_w2 = [new_car_data_w2, used_car_data_w2, just_car_data_w2]
result_w2 = pd.concat(frames_w2, axis=1)
result_w2 = result_w2.loc[:,~result_w2.columns.duplicated()]

In [None]:
# For every column of result_w1, collect the matching 'Label' entries of data_map.
Description = {
    var: data_map.loc[data_map['Variable'] == var, 'Label']
    for var in result_w1.columns
}
Description

In [None]:
new_car_data_w1.apply(pd.value_counts)

In [None]:
result_w1.sum(axis = 0, skipna = True)
#Expected:
#sum(PUR03_1_46, PUR03_2_46, PUR03_1_53, PUR03_2_53) = sum(result['PUR03_1_46'])

In [None]:
# Series.sum() skips missing values, matching the skipna=True column sums in the
# previous cell; the builtin sum() returns NaN as soon as one answer is missing.
_purchase_cols_w1 = ['PUR03_1_46', 'PUR03_2_46', 'PUR03_1_53', 'PUR03_2_53']
print(result_w1[_purchase_cols_w1].sum().sum())
print(result_w1['DEM12_03'].sum())

In [None]:
result_w2.sum(axis = 0, skipna = True)

In [None]:
# Series.sum() skips missing values, matching the skipna=True column sums in the
# previous cell; the builtin sum() returns NaN as soon as one answer is missing.
_purchase_cols_w2 = ['PUR03_1_46', 'PUR03_2_46', 'PUR03_1_53', 'PUR03_2_53']
print(result_w2[_purchase_cols_w2].sum().sum())
print(result_w2['DEM12_03'].sum())

# CORE variables that can be used as predictors
Note: Predictors must be CORE variables. Touchpoints are DEEP DIVE questions, and using CORE predictors ensures that the same people have been asked both types of questions. TOOLS variables cannot be used as predictors for DEEP DIVE targets because they were not asked of the same people.

In [None]:
core_variables_w1 = (spec_file_w1.loc[spec_file_w1['Split'] == 'CORE'])['Variable new']
core_variables_w2 = (spec_file_w2.loc[spec_file_w2['Split'] == 'CORE'])['Variable new']

In [None]:
# Drop variables related to how the questionnaire was administered by keeping
# only the CORE-split columns. Each wave is filtered with its OWN CORE list
# (wave 2 must not be filtered with the wave-1 list).
core_data_w1 = reduced_data_w1.loc[:, reduced_data_w1.columns.isin(core_variables_w1)]
core_data_w2 = reduced_data_w2.loc[:, reduced_data_w2.columns.isin(core_variables_w2)]
core_data_w1

In [None]:
reduced_data_w1['Lifestage_01'].agg(['size', 'count', 'nunique'])

In [None]:
# CORE variables related to personality, selected by column-name fragment:
# attitudes (QP04_, without QP04_52 and QP04_53), interests (QP05_), hobbies (HOB01_)
attitudes_variables_w1 = [x for x in core_data_w1.columns if ("QP04_" in x) and x not in ['QP04_52','QP04_53']]
interests_variables_w1 = [x for x in core_data_w1.columns if ("QP05_" in x)]
hobbies_variables_w1 = [x for x in core_data_w1.columns if ("HOB01_" in x)]
# CORE variables related to income: every DEM08_ column plus three fixed names
income_variables_w1 = ([x for x in core_data_w1.columns if ("DEM08_" in x)]
                      +['DEM06','DEM10_US','DEM10_ALL'])
# Demographic variables that are not in the spec file, possibly because they are
# calculated rather than asked directly. Taken from data_w1, not core_data_w1.
non_core_dems_w1 = ([x for x in data_w1.columns if ("Lifestage_" in x)]
 + ['AUD_Generations','Q01_NET','Q01_NET_1','Q03_NET'])

In [None]:
attitudes_data_w1 = core_data_w1.loc[:, core_data_w1.columns.isin(attitudes_variables_w1)]
interests_data_w1 = core_data_w1.loc[:, core_data_w1.columns.isin(interests_variables_w1)]
hobbies_data_w1 = core_data_w1.loc[:, core_data_w1.columns.isin(hobbies_variables_w1)]
income_data_w1 = core_data_w1.loc[:, core_data_w1.columns.isin(income_variables_w1)]
non_core_dems_data_w1 = reduced_data_w1.loc[:, reduced_data_w1.columns.isin(non_core_dems_w1)]

# Importance sampling - random forest

In [None]:
#importance sampling for the chosen target variable 'PUR03_2_46'
#using sklearn's RandomForestClassifier
#%run -i '/Users/tunrayo.adeleke-lar/OneDrive - insidemedia.net/Documents/ford_hackathon/importance_sampling.py'

# Importance sampling - mutual information criterion

In [None]:
#data_w1['TP02_046_001']

In [None]:
#org_data_w1 = org_data_w1.apply(pd.to_numeric, errors='ignore')
#threshold = 5  # the number of most relevant features
#high_score_features = []
#feature_scores = mutual_info_classif(non_core_dems_data_w1, org_data_w1['TP02_046_001'], random_state=42)
#for score, f_name in sorted(zip(feature_scores, non_core_dems_data_w1.columns), reverse=True)[:threshold]:
#        print(f_name, score)
#        high_score_features.append(f_name)
#non_core_dems_mis = non_core_dems_data_w1[high_score_features]
#print(non_core_dems_mis.columns)

In [None]:
#org_data_w1 = org_data_w1.apply(pd.to_numeric, errors='ignore')
#tps_noticed_top3_w1 = ['TP02_046_001','TP02_046_007','TP02_046_010','TP02_046_004']
#tps_helped_top3_w1 = ['TP03_046_010','TP03_046_011','TP03_046_012']
#threshold = 5  # the number of most relevant features

In [None]:
# Touchpoints with the highest count of answer 1, most selected first.
# NOTE(review): the *_top3_* names suggest three columns, but the slice has always
# kept four — confirm which is intended before changing TOP_N_TOUCHPOINTS.
TOP_N_TOUCHPOINTS = 4
# pd.Series.value_counts replaces the deprecated top-level pd.value_counts.
tps_noticed_top3_w1 = list(
    touchpoints_noticed_data_w1.apply(pd.Series.value_counts)
    .sort_values(by=1, axis=1, ascending=False)
    .columns[:TOP_N_TOUCHPOINTS]
)
tps_helped_top3_w1 = list(
    touchpoints_helped_data_w1.apply(pd.Series.value_counts)
    .sort_values(by=1, axis=1, ascending=False)
    .columns[:TOP_N_TOUCHPOINTS]
)
threshold = 5  # the number of most relevant features

In [None]:
# For each top "noticed" touchpoint, rank the non-core demographic columns by
# mutual information with that touchpoint and keep the `threshold` best ones.
high_score_features_tpn = []
for tpn in tps_noticed_top3_w1:
    feature_scores = mutual_info_classif(non_core_dems_data_w1, data_w1[tpn], random_state=42)
    ranked = sorted(zip(feature_scores, non_core_dems_data_w1.columns), reverse=True)[:threshold]
    for score, f_name in ranked:
        print(f_name, score)
        # collect into the list initialised above (not the "helped" list, which
        # does not exist yet when this cell runs)
        high_score_features_tpn.append(f_name)
# A feature that ranks highly for several touchpoints appears once per
# touchpoint, so this selection can contain repeated columns.
non_core_dems_mis = non_core_dems_data_w1[high_score_features_tpn]

In [None]:
# For each top "helped" touchpoint, score every non-core demographic column by
# mutual information with that touchpoint and keep the `threshold` best names.
# NOTE(review): org_data_w1 is not created in any cell visible here — confirm it
# exists when the notebook runs on a fresh kernel.
high_score_features_tph = []
for tph in tps_helped_top3_w1:
    feature_scores = mutual_info_classif(non_core_dems_data_w1, org_data_w1[tph], random_state=42)
    # highest score first, truncated to `threshold` entries
    for score, f_name in sorted(zip(feature_scores, non_core_dems_data_w1.columns), reverse=True)[:threshold]:
            print(f_name, score)
            high_score_features_tph.append(f_name)
    # re-selected on every pass; after the loop it covers all names collected so
    # far, including repeats across touchpoints
    non_core_dems_mis = non_core_dems_data_w1[high_score_features_tph]

In [None]:
type(org_data_w1[income_data_w1.columns])

In [None]:
# For each top "noticed" touchpoint, rank the income columns by mutual
# information with that touchpoint and keep the `threshold` best names.
# NOTE(review): org_data_w1 is not created in any cell visible here — confirm it
# exists when the notebook runs on a fresh kernel.
high_score_features_tpn = []
for tpn in tps_noticed_top3_w1:
    feature_scores = mutual_info_classif(income_data_w1, org_data_w1[tpn], random_state=42)
    for score, f_name in sorted(zip(feature_scores, income_data_w1.columns), reverse=True)[:threshold]:
        print(f_name, score)
        # use the list initialised above; `high_score_features` only exists in
        # commented-out code
        high_score_features_tpn.append(f_name)
    # NOTE(review): holds income columns despite the "personality" name — confirm naming.
    personality_mis = income_data_w1[high_score_features_tpn]

# Groups: definition

In [None]:
def categorise_age(row):
    """Return 0 when row['Q01'] is below 35, otherwise 1.

    The value 35 itself, and any value for which the `< 35` comparison is false
    (such as NaN), is coded as 1.
    """
    return 0 if row['Q01'] < 35 else 1

In [None]:
# Same coding as categorise_age (0 when Q01 < 35, otherwise 1 — including 35
# itself and missing ages), computed on the whole column at once instead of
# calling a Python function for every row of the frame.
data_w1['age > 35'] = (~(data_w1['Q01'] < 35)).astype(int)
data_w2['age > 35'] = (~(data_w2['Q01'] < 35)).astype(int)

In [None]:
# set valid mask
#nan_mask = np.isnan(data_w1)
#valid_mask = ~nan_mask

# create a result array
#result = np.full(data_w1.shape, np.nan)

# assign only valid cases to 
#result[valid_mask] = normalizer.fit_transform(data[valid_mask].reshape(-1,1)).reshape(data[valid_mask].shape)

In [None]:
#male:1
#female:2
#age< 35:0
#age>35:1
age_gender = data_w1[['age > 35', 'Q02']]
age_gender_w2 = data_w2[['age > 35', 'Q02']]
#(1,0): male & age< 35
#(1,1):  male & age> 35
#(2,0): female & age< 35
#(2,1):  female & age> 35

In [None]:
# Row POSITIONS (np.where(...)[0], not index labels) of each age/gender group in
# wave 1. Per the coding notes above: Q02 1 = male, 2 = female; 'age > 35' 1 = over, 0 = under.
# These lists are later passed to .iloc on other wave-1 frames.
# NOTE(review): that assumes those frames have exactly the same row order as data_w1 — confirm.
men_over = list(np.where((age_gender['age > 35'] == 1) & (age_gender['Q02'] == 1))[0])
men_under = list(np.where((age_gender['age > 35'] == 0) & (age_gender['Q02'] == 1))[0])
women_over = list(np.where((age_gender['age > 35'] == 1) & (age_gender['Q02'] == 2))[0])
women_under = list(np.where((age_gender['age > 35'] == 0) & (age_gender['Q02'] == 2))[0])

In [None]:
men_over_w2 = list(np.where((age_gender_w2['age > 35'] == 1) & (age_gender_w2['Q02'] == 1))[0])
men_under_w2 = list(np.where((age_gender_w2['age > 35'] == 0) & (age_gender_w2['Q02'] == 1))[0])
women_over_w2 = list(np.where((age_gender_w2['age > 35'] == 1) & (age_gender_w2['Q02'] == 2))[0])
women_under_w2 = list(np.where((age_gender_w2['age > 35'] == 0) & (age_gender_w2['Q02'] == 2))[0])

# Groups: touchpoints

In [None]:
tpn_labels = (data_map[data_map['Variable'].isin(list(touchpoints_noticed_data_w1.columns))])[['Variable', 'Label']]
tph_labels = (data_map[data_map['Variable'].isin(list(touchpoints_helped_data_w1.columns))])[['Variable', 'Label']]

In [None]:
tp_labels = pd.concat([tpn_labels, tph_labels], ignore_index=True)
#tp_labels

In [None]:
# Break each '|'-separated label into its parts, then keep the third part
# (position 2) of every label as the touchpoint name.
tp_labels['Cleaned label'] = tp_labels['Label'].str.split('|')
names_only = [label_parts[2] for label_parts in tp_labels['Cleaned label']]

In [None]:
tp_labels['Description'] = pd.DataFrame(names_only)

In [None]:
tp_labels = tp_labels.drop(columns='Cleaned label', axis=1)

In [None]:
# Rename each column through its own variable -> description entry rather than by
# position: tp_labels follows the row order of data_map, which is not guaranteed
# to match the column order of the data frame.
_noticed_descriptions = dict(zip(tp_labels['Variable'], tp_labels['Description']))
touchpoints_noticed_data_w1 = touchpoints_noticed_data_w1.rename(columns=_noticed_descriptions)

In [None]:
# Rename each column through its own variable -> description entry. tp_labels
# lists the "noticed" labels first and the "helped" labels after them, so taking
# the first N descriptions by position would give the helped columns the
# descriptions of the noticed variables.
_helped_descriptions = dict(zip(tp_labels['Variable'], tp_labels['Description']))
touchpoints_helped_data_w1 = touchpoints_helped_data_w1.rename(columns=_helped_descriptions)

In [None]:
def plot_selected_counts(touchpoint_data, row_positions, title, pdf_name):
    """Bar chart of how many respondents in a group selected each touchpoint.

    Parameters
    ----------
    touchpoint_data : pandas.DataFrame
        One column per touchpoint.
    row_positions : list of int
        Positional row numbers (used with .iloc) of the respondents in the group.
    title : str
        Figure title.
    pdf_name : str
        File the figure is saved to.

    Returns
    -------
    matplotlib.axes.Axes
    """
    counts = touchpoint_data.iloc[row_positions].apply(pd.Series.value_counts)
    # Second row of the count table is taken as "selected".
    # NOTE(review): assumes the answer codes sort as 0 then 1 — confirm against the spec.
    selected_counts = pd.DataFrame({'Selected': list(counts.iloc[1])}, index=counts.columns)
    # The pandas plotting backend is set to plotly at the top of the notebook;
    # title and savefig below need a matplotlib figure, so request it explicitly.
    ax = selected_counts.plot.bar(legend=False, backend="matplotlib")
    ax.set_title(title)
    ax.figure.savefig(pdf_name, bbox_inches='tight')
    plt.show()
    return ax


# (title suffix, file-name suffix, row positions) per age/gender group
_demographic_groups = [
    ('Men over 35', 'men_over', men_over),
    ('Men under 35', 'men_under', men_under),
    ('Women over 35', 'women_over', women_over),
    ('Women under 35', 'women_under', women_under),
]

for _group_title, _group_slug, _group_rows in _demographic_groups:
    plot_selected_counts(touchpoints_noticed_data_w1, _group_rows,
                         f"Touchpoints noticed: {_group_title}",
                         f'touchpoints_noticed_{_group_slug}.pdf')
    plot_selected_counts(touchpoints_helped_data_w1, _group_rows,
                         f"Touchpoints helped: {_group_title}",
                         f'touchpoints_helped_{_group_slug}.pdf')

In [None]:
def _top5_noticed(row_positions):
    """Five "noticed" touchpoint columns with the highest count of answer 1 among the given rows."""
    # pd.Series.value_counts replaces the deprecated top-level pd.value_counts
    counts = touchpoints_noticed_data_w1.iloc[row_positions].apply(pd.Series.value_counts)
    return list(counts.sort_values(by=1, axis=1, ascending=False).columns)[0:5]


top5_noticed_men_over = _top5_noticed(men_over)
top5_noticed_men_under = _top5_noticed(men_under)
top5_noticed_women_over = _top5_noticed(women_over)
top5_noticed_women_under = _top5_noticed(women_under)
pd.DataFrame([top5_noticed_men_over, top5_noticed_men_under, top5_noticed_women_over, top5_noticed_women_under], index = ['men over 35', 'men under 35', 'women over 35', 'women under 35'])

In [None]:
def _top5_helped(row_positions):
    """Five "helped" touchpoint columns with the highest count of answer 1 among the given rows."""
    # pd.Series.value_counts replaces the deprecated top-level pd.value_counts
    counts = touchpoints_helped_data_w1.iloc[row_positions].apply(pd.Series.value_counts)
    return list(counts.sort_values(by=1, axis=1, ascending=False).columns)[0:5]


top5_helped_men_over = _top5_helped(men_over)
top5_helped_men_under = _top5_helped(men_under)
top5_helped_women_over = _top5_helped(women_over)
top5_helped_women_under = _top5_helped(women_under)
pd.DataFrame([top5_helped_men_over, top5_helped_men_under, top5_helped_women_over, top5_helped_women_under], index = ['men over 35', 'men under 35', 'women over 35', 'women under 35'])

# Groups: diary questions

# Groups: average spend

The units of this variable are unknown, so the data is of limited use. This can be seen by plotting it against the prices of the cheapest and most expensive Ford models currently available on the Ford website (the two vertical lines).

In [None]:
# New-car spend for men vs women over 35. sns.distplot is deprecated;
# histplot(kde=True, stat='density') is its documented replacement.
sns.histplot(average_spend_data_w1.PUR04_46.iloc[men_over], label='men over 35', color='darkblue', kde=True, stat='density')
sns.histplot(average_spend_data_w1.PUR04_46.iloc[women_over], label='women over 35', color='red', kde=True, stat='density')
# Reference lines: prices of the cheapest and most expensive Ford models (see note above)
plt.axvline(x=19995)
plt.axvline(x=63425)
plt.xlabel('Average spend')
plt.ylabel('Density')
plt.legend()
plt.tight_layout(h_pad=2.5)

In [None]:
# New-car spend for men vs women under 35. sns.distplot is deprecated;
# histplot(kde=True, stat='density') is its documented replacement.
sns.histplot(average_spend_data_w1.PUR04_46.iloc[men_under], label='men under 35', color='darkblue', kde=True, stat='density')
sns.histplot(average_spend_data_w1.PUR04_46.iloc[women_under], label='women under 35', color='red', kde=True, stat='density')
# Reference lines: prices of the cheapest and most expensive Ford models (see note above)
plt.axvline(x=19995)
plt.axvline(x=63425)
plt.xlabel('Average spend')
plt.ylabel('Density')
plt.legend()
plt.tight_layout(h_pad=2.5)

In [None]:
len(reduced_data_w2['PUR04_57'])

In [None]:
reduced_data_w2['PUR04_42'].nlargest(10, keep='first')

In [None]:
# Smallest and largest reported spend in wave 2 for three purchase categories.
_spend_columns = {'Tablet/ipad': 'PUR04_57', 'Mobile': 'PUR04_42', 'New car': 'PUR04_46'}
pd.DataFrame(
    {
        'min': [reduced_data_w2[col].min(axis = 0) for col in _spend_columns.values()],
        'max': [reduced_data_w2[col].max(axis = 0) for col in _spend_columns.values()],
    },
    index=list(_spend_columns),
)

In [None]:
reduced_data_w2['PUR04_57'].nlargest(10, keep='first')

In [None]:
reduced_data_w2['PUR04_46'].nlargest(10, keep='first')

In [None]:
reduced_data_w2['PUR04_46'].nsmallest(10, keep='first')

In [None]:
reduced_data_w2['PUR04_46'].mean()
#333972.1212121212

In [None]:
reduced_data_w2['PUR04_46'].median()
#63920.0

In [None]:
reduced_data_w2['PUR04_46'].quantile(0.1) # 10th percentile
#20258.0

In [None]:
reduced_data_w2['PUR04_46'].quantile(0.6)
#120353.19999999994

In [None]:
reduced_data_w2['PUR04_46'].quantile(0.75)
#411789.0

In [None]:
reduced_data_w2['PUR04_46'].quantile(0.9) # 90th percentile
#786494.0000000001

# Groups: ecommerce

In [None]:
def norm_values(df):
    """Return the relative frequency of each distinct value in `df`.

    `df` is anything with a `value_counts(normalize=True)` method; the function
    is applied column by column via `DataFrame.apply` elsewhere in the notebook.
    """
    shares = df.value_counts(normalize=True)
    return shares

In [None]:
# Relative answer frequencies of EC02_035 in wave 2, one column per age/gender group.
_ecomm_groups = {
    'Men over 35': men_over_w2,
    'Men under 35': men_under_w2,
    'Women over 35': women_over_w2,
    'Women under 35': women_under_w2,
}
frame = {
    group_name: online_purchase_data_w2.iloc[group_rows].apply(norm_values).EC02_035
    for group_name, group_rows in _ecomm_groups.items()
}
ecomm_counts = pd.DataFrame(frame)

In [None]:
# First / second row of ecomm_counts are taken as "not selected" / "selected".
# NOTE(review): assumes the answer codes sort as 0 then 1 — confirm against the spec.
df = pd.DataFrame({'Not selected': list(ecomm_counts.iloc[0]),
                   'Selected': list(ecomm_counts.iloc[1])}, index=ecomm_counts.columns)
# The pandas plotting backend is set to plotly at the top of the notebook, but the
# pyplot calls below only act on matplotlib figures, so request matplotlib here.
ax = df.plot.bar(stacked=True, backend="matplotlib")
plt.title("")
plt.legend(bbox_to_anchor = (1.05, 0.6))

plt.savefig('ecomm_counts.pdf', bbox_inches='tight')
plt.show()

# Personality: attitudes

In [None]:
attitudes_data_w1 = attitudes_data_w1.fillna(0)
average_spend_data_w1['PUR04_46'] = average_spend_data_w1['PUR04_46'].fillna(0)

In [None]:
pd.DataFrame([attitudes_data_w1.min(axis = 0), attitudes_data_w1.max(axis = 0)], index = ['min', 'max'])
#1: Completely disgree
#5: Completely agree

In [None]:
# PUR04_46 is a spend amount (continuous target), so use the regression
# estimator; mutual_info_classif treats every distinct target value as a class.
pd.DataFrame(mutual_info_regression(attitudes_data_w1, average_spend_data_w1['PUR04_46'], random_state=42), index =attitudes_data_w1.columns, columns=['MI'])

# Personality: interests

In [None]:
interests_data_w1 = interests_data_w1.fillna(0)

In [None]:
pd.DataFrame([interests_data_w1.min(axis = 0), interests_data_w1.max(axis = 0)], index = ['min', 'max'])

In [None]:
# PUR04_46 is a spend amount (continuous target), so use the regression
# estimator; mutual_info_classif treats every distinct target value as a class.
pd.DataFrame(mutual_info_regression(interests_data_w1, average_spend_data_w1['PUR04_46'], random_state=42), index =interests_data_w1.columns, columns=['MI'])

# Personality: hobbies

In [None]:
hobbies_data_w1 = hobbies_data_w1.fillna(0)

In [None]:
pd.DataFrame([hobbies_data_w1.min(axis = 0), hobbies_data_w1.max(axis = 0)], index = ['min', 'max'])

In [None]:
# PUR04_46 is a spend amount (continuous target), so use the regression
# estimator; mutual_info_classif treats every distinct target value as a class.
pd.DataFrame(mutual_info_regression(hobbies_data_w1, average_spend_data_w1['PUR04_46'], random_state=42), index =hobbies_data_w1.columns, columns=['MI'])

# Income

In [None]:
income_data_w1 = income_data_w1.fillna(0)

In [None]:
pd.DataFrame([income_data_w1.min(axis = 0), income_data_w1.max(axis = 0)], index = ['min', 'max'])
#998:non of the above

In [None]:
# PUR04_46 is a spend amount (continuous target), so use the regression
# estimator; mutual_info_classif treats every distinct target value as a class.
pd.DataFrame(mutual_info_regression(income_data_w1, average_spend_data_w1['PUR04_46'], random_state=42), index =income_data_w1.columns, columns=['MI'])

# Non-core demographics

In [None]:
non_core_dems_data_w1 = non_core_dems_data_w1.fillna(0)

In [None]:
pd.DataFrame([non_core_dems_data_w1.min(axis = 0), non_core_dems_data_w1.max(axis = 0)], index = ['min', 'max'])
#998:non of the above

In [None]:
# PUR04_46 is a spend amount (continuous target), so use the regression
# estimator; mutual_info_classif treats every distinct target value as a class.
pd.DataFrame(mutual_info_regression(non_core_dems_data_w1, average_spend_data_w1['PUR04_46'], random_state=42), index =non_core_dems_data_w1.columns, columns=['MI'])

# Groups: environmental issues

In [None]:
#QP04_53 is only asked to people that already have a car

# When are people watching TV?

In [None]:
#diary questions on when people are watching TV
#diary_variables_w1 = ['DIA01_21','DIA02_21','DIA03_21','DIA04_21','DIA05_21']
diary_variables_w1 = ['DIA01_21','DIA02_21','DIA03_21','DIA04_21']
diary_data_w1 = reduced_data_w1[diary_variables_w1]

In [None]:
diary_labels = labels[labels['Variable'].isin(diary_variables_w1)].reset_index()

In [None]:
names_only = []
for l in diary_labels['Cleaned label']:
    names_only.append(l[1])

In [None]:
description = pd.DataFrame(names_only,columns=['Description'])

In [None]:
diary_labels['Description']=pd.DataFrame(names_only)

In [None]:
# Rename each diary column through its own variable -> description entry rather
# than by position: diary_labels follows the row order of the label table, which
# is not guaranteed to match the order of diary_variables_w1.
_diary_descriptions = dict(zip(diary_labels['Variable'], diary_labels['Description']))
diary_data_w1 = diary_data_w1.rename(columns=_diary_descriptions)

In [None]:
diary_data_w1.agg(['size', 'count', 'nunique'])

In [None]:
diary_data_w1.apply(pd.value_counts)

In [None]:
diary_norm_w1 = diary_data_w1.apply(norm_values)

In [None]:
# Second row of the normalised counts is taken as the share that selected each time slot.
# NOTE(review): assumes the answer codes sort as 0 then 1 — confirm against the spec.
df = pd.DataFrame({'Selected': list(diary_norm_w1.iloc[1])}, index=diary_norm_w1.columns)
# The pandas plotting backend is set to plotly at the top of the notebook, but the
# pyplot calls below only act on matplotlib figures, so request matplotlib here.
ax = df.plot.bar(legend=False, backend="matplotlib")
plt.title("TV times")

plt.savefig('diary_counts.pdf', bbox_inches='tight')
plt.show()

# Who bought a car in the last 12 months?

In [None]:
#1:13-17
#2:18-24
#3:25-34
#4:35-44
#5:45-54
#6:55+
(x, y) = (reduced_data_w1['Q01_NET'],reduced_data_w1['PUR03_2_46'])

In [None]:
x.agg(['size', 'count', 'nunique'])

In [None]:
y.agg(['size', 'count', 'nunique'])

In [None]:
x.value_counts()

In [None]:
y.value_counts()

In [None]:
df = reduced_data_w1[reduced_data_w1['PUR03_2_46']==1][['Q01_NET']]
df.value_counts()

In [None]:
#df = df.replace({1: '13-17',2: '18-24',3: '25-34',4: '35-44',5: '45-54',6: '55+'})

In [None]:
df

In [None]:
# Distribution of the Q01_NET age-band codes among new-car buyers.
# sns.distplot is deprecated; histplot(kde=True, stat='density') is its
# documented replacement. The column is passed as a Series so it is drawn as a
# single distribution.
g = sns.histplot(df['Q01_NET'], kde=True, stat='density')
plt.xlabel('Age')
plt.ylabel('')
plt.tight_layout(h_pad=2.5)

plt.title("Age of car buyers in the last 12 months")

plt.savefig('age_car_buyers.pdf', bbox_inches='tight')
plt.show()

In [None]:
reduced_data_w2['DEM152'].agg(['size', 'count', 'nunique'])