<a href="https://colab.research.google.com/github/gabrielborja/parc_de_montjuic/blob/main/quarterly_survey.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Survey data preparation

## Importing libraries

In [1]:
#Upgrading Plotly
!pip install plotly --upgrade

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
# Importing python libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [3]:
# Importing visualization libraries
import plotly.express as px
import seaborn as sns
from ipywidgets import interact

## Loading Data

In [4]:
# Loading data from local drive
from google.colab import files
uploaded1 = files.upload()

Saving survey_Q2.xlsx to survey_Q2.xlsx


In [5]:
# Storing loaded data to a pandas dataframe
import io
df1 = pd.read_excel((io.BytesIO(uploaded1['survey_Q2.xlsx'])))

In [6]:
# Asserting the loaded data has the correct columns
def check_columns():
  survey_col = ['answer_form_id', 'is_anonymous',  'cust_domain_id', 'mh_customer_id', 'mh_context', 'channel_name', 'broadcast_id', 'completed',
                'answered_timestamp', 'Q1', 'Q2_A1',  'Q2_A2', 'Q2_A3', 'Q2_A4', 'Q2_A5', 'Q2_A7', 'Q2_A8', 'Q2_A9', 'Q3', 'Q4_A1', 'Q4_A2', 'Q5']
  assert df1.columns.to_list() == survey_col, f'Columns should be: {[i for i in survey_col]}'
  print(f'The file contains all correct columns')

check_columns()

The file contains all correct columns


## Data cleaning

In [7]:
# Creating function to clean up the dataframe
def df1_clean_up(df):
  # Converting timestamps to datetime
  df = df.assign(mh_context = pd.to_datetime(df['mh_context']),
                 answered_timestamp = pd.to_datetime(df['answered_timestamp']))
  # Converting timestamps to datetime
  df = df.sort_values(by='mh_context').reset_index(drop=True)
  # Assigning categories from datetime
  df = df.assign(year = df['mh_context'].dt.year,
                 quarter = df['mh_context'].dt.quarter,
                 month = df['mh_context'].dt.month,
                 month_name = df['mh_context'].dt.month_name(),
                 weeknum = df['mh_context'].dt.isocalendar().week,
                 day = df['mh_context'].dt.day_name(),
                 hour = df['answered_timestamp'].dt.hour
                 )
  # Replacing quarter numbers to string
  quarter_dict = {1: 'Q1', 2: 'Q2', 3: 'Q3', 4: 'Q4'}
  df['quarter'].replace(quarter_dict, inplace=True)
  # Creating year-quarter category
  df = df.assign(period = df['year'].astype(str) + "-" + df['quarter'])
  # Replacing non unicode characters
  df.replace(to_replace=r'Ã¥', value='å', regex=True, inplace=True)
  df.replace(to_replace=r'Ã¦', value='Å', regex=True, inplace=True)
  df.replace(to_replace=r'Ã.', value='ø', regex=True, inplace=True)
  # Replacing X values with 1
  x_cols = ['Q2_A1', 'Q2_A2', 'Q2_A3', 'Q2_A4', 'Q2_A5', 'Q2_A7', 'Q2_A8', 'Q2_A9', 'Q4_A1', 'Q4_A2']
  df[x_cols] = df[x_cols].copy().replace('X', value=1)
  return df

In [8]:
# Cleaning up the dataframe
df1 = df1_clean_up(df1)

In [9]:
#df1['Q2_A1'].value_counts(normalize=True, dropna=False)

In [10]:
# Melting dataframe questions to long format
melt1_cols = ['answer_form_id', 'is_anonymous',  'cust_domain_id', 'mh_customer_id', 'mh_context', 'channel_name', 'completed',
               'year',	'quarter',	'period', 'month',	'month_name', 'weeknum',	'day',	'hour']
melt2_cols = ['Q1', 'Q2_A1',  'Q2_A2', 'Q2_A3', 'Q2_A4', 'Q2_A5', 'Q2_A7', 'Q2_A8', 'Q2_A9', 'Q3', 'Q4_A1', 'Q4_A2', 'Q5']
df1_melt = pd.melt(df1, id_vars=[i for i in melt1_cols],
        value_vars=[i for i in melt2_cols], var_name='question', value_name='answer').dropna(axis=0, subset=['answer'])

In [11]:
# Splitting melted dataframe into survey sections
df1_a = df1_melt[df1_melt['question'].isin(['Q2_A1', 'Q2_A2', 'Q2_A3', 'Q2_A4', 'Q2_A5', 'Q2_A7', 'Q2_A8', 'Q2_A9'])].reset_index(drop=True)
df1_b = df1_melt[df1_melt['question'].isin(['Q4_A1', 'Q4_A2'])].reset_index(drop=True)
df1_c = df1_melt[df1_melt['question'].isin(['Q1', 'Q3', 'Q5'])].reset_index(drop=True)

In [19]:
# Creating summary table for multiple choice questions
def summary_table_1a(df):
  pv = df.pivot_table(index='question', columns=['period'], values='answer', aggfunc='sum').reset_index()
  dict_1a = {'Q2_A1':'Bedre_tilbud', 'Q2_A2':'Tvangsavvikling', 'Q2_A3':'Privat_abo', 'Q2_A4':'Dekning',
                 'Q2_A5':'Data/surfekvalitet', 'Q2_A7':'Utlandstjenesten', 'Q2_A8':'Kundeservice', 'Q2_A9':'Annet'}
  pv['question'].replace(to_replace=dict_1a, inplace=True)
  return pv
  
tb_1a = summary_table_1a(df1_a)
tb_1a

period,question,2021-Q1,2021-Q2,2021-Q3,2021-Q4,2022-Q1,2022-Q2,2022-Q3
0,Bedre_tilbud,62.0,62.0,60.0,58.0,49.0,56.0,13.0
1,Tvangsavvikling,1.0,,4.0,1.0,2.0,1.0,
2,Privat_abo,11.0,16.0,13.0,8.0,9.0,7.0,2.0
3,Dekning,28.0,32.0,55.0,80.0,64.0,55.0,20.0
4,Data/surfekvalitet,23.0,43.0,46.0,47.0,41.0,39.0,11.0
5,Utlandstjenesten,2.0,3.0,6.0,7.0,7.0,13.0,2.0
6,Kundeservice,23.0,13.0,16.0,16.0,29.0,15.0,3.0
7,Annet,25.0,40.0,23.0,22.0,35.0,17.0,4.0


In [20]:
# Creating summary table for true/false question
def summary_table_1b(df):
  pv = df.pivot_table(index='question', columns=['period'], values='answer', aggfunc='sum').reset_index()
  dict_1b = {'Q4_A1':'Ble_kontaktet', 'Q4_A2':'Initiativ_selv'}
  pv['question'].replace(to_replace=dict_1b, inplace=True)
  return pv

tb_1b = summary_table_1b(df1_b)
tb_1b

period,question,2021-Q1,2021-Q2,2021-Q3,2021-Q4,2022-Q1,2022-Q2,2022-Q3
0,Ble_kontaktet,70.0,82.0,80.0,83.0,73.0,76.0,22.0
1,Initiativ_selv,49.0,71.0,77.0,75.0,83.0,60.0,10.0


In [21]:
# Creating function to compute totals
def compute_total_by_quarter(df):
  total_dict = {i:len(df[df['period']==i]['answer_form_id'].unique()) for i in df['period'].unique().tolist()}
  return pd.DataFrame(total_dict.items(), columns=['period', 'total']).set_index('period').T

tb_1t = compute_total_by_quarter(df1_a)
tb_1t

period,2021-Q1,2021-Q2,2021-Q3,2021-Q4,2022-Q1,2022-Q2,2022-Q3
total,113,149,154,156,155,132,32


In [22]:
# Creating melted table for true/false question
def melt_table_1t(df):
  df_m = df.T.reset_index()
  return df_m[df_m['period']!=df_m['period'].unique()[-1]] # Dropping current period
  
tb_1t_m = melt_table_1t(tb_1t)
tb_1t_m.tail()

Unnamed: 0,period,total
1,2021-Q2,149
2,2021-Q3,154
3,2021-Q4,156
4,2022-Q1,155
5,2022-Q2,132


In [23]:
# Creating melted table with totals for multiple choice questions
def melt_table_w_totals(df):
  df_m = pd.melt(df, id_vars='question', value_vars=[i for i in df.columns if i !='question'], value_name='answer')
  df_m = pd.merge(df_m, tb_1t_m, how='inner', on='period')
  df_m = df_m.assign(perc = round(df_m['answer']/df_m['total'], 2))
  return df_m
  
tb_1a_m = melt_table_w_totals(tb_1a)
tb_1a_m = tb_1a_m.assign(dekning = np.where(tb_1a_m['question'].isin(['Dekning', 'Data/surfekvalitet']), 'Dekning', 'Annet'))
tb_1a_m.tail()

Unnamed: 0,question,period,answer,total,perc,dekning
43,Dekning,2022-Q2,55.0,132,0.42,Dekning
44,Data/surfekvalitet,2022-Q2,39.0,132,0.3,Dekning
45,Utlandstjenesten,2022-Q2,13.0,132,0.1,Annet
46,Kundeservice,2022-Q2,15.0,132,0.11,Annet
47,Annet,2022-Q2,17.0,132,0.13,Annet


In [24]:
# Creating melted table with totals for true/false question
tb_1b_m = melt_table_w_totals(tb_1b)
tb_1b_m.tail()

Unnamed: 0,question,period,answer,total,perc
7,Initiativ_selv,2021-Q4,75.0,156,0.48
8,Ble_kontaktet,2022-Q1,73.0,155,0.47
9,Initiativ_selv,2022-Q1,83.0,155,0.54
10,Ble_kontaktet,2022-Q2,76.0,132,0.58
11,Initiativ_selv,2022-Q2,60.0,132,0.45


## Free text parsing

In [141]:
# Listing stop words
stop_words = ['annen','jeg','på','og','ikke','å','har','til','det','i','som','med','dere','for','fra','er','en','var','at','ice','om','ble',
              'vi','så','sa','hos','et','meg','da','de','fikk','enn','av','kan','men','mye','når','bare','etter','våre', 'ved','hatt','',
              'kunne', 'hadde', 'andre', 'også', 'sett', 'før', 'over', 'den']

In [134]:
# Creating function to parse free text for analysis
def parse_free_text(df, period, question, top):
  """ Parse free text from questions: Q1, Q3, Q5 """
  tx_list = df[(df['question']==question) & (df['period']=='2022-Q2')]['answer'].str.lower().replace(r'[!.,"]', '', regex=True).str.cat(sep='').split(' ')
  tx_list = pd.DataFrame(data=tx_list, columns=['word'])
  tx_list = tx_list[~tx_list['word'].isin(stop_words)]
  cond_list = [tx_list['word'].str.contains('dek'), tx_list['word'].str.contains('data'), tx_list['word'].str.contains('nett'), tx_list['word'].str.contains('signal'),
               tx_list['word'].str.contains('samtale'), tx_list['word'].str.contains('ring'), tx_list['word'].str.contains('gb'),
               tx_list['word'].str.contains('pris'), tx_list['word'].str.contains('tilbud'), tx_list['word'].str.contains('kr'), tx_list['word'].str.contains('billig'),
               tx_list['word'].str.contains('nok'), tx_list['word'].str.contains('fri'), tx_list['word'].str.contains('betale'),
               tx_list['word'].str.contains('kunde'), tx_list['word'].str.contains('servi')]
  choice_list = ['dekning','dekning','dekning','dekning','dekning','dekning','dekning','pris/tilbud','pris/tilbud','pris/tilbud','pris/tilbud',
                 'pris/tilbud','pris/tilbud','pris/tilbud','kundeservice','kundeservice']
  tx_col = tx_list.assign(category = np.select(cond_list, choice_list, 'annet')).reset_index(drop=True)
  
  tx_top = pd.DataFrame(tx_col.value_counts(), columns=['counter']).reset_index()[:top]
  tx_top = tx_top.assign(perc = round(tx_top['counter']/sum(tx_top['counter']),2))
  return (tx_col, tx_top)

In [175]:
import collections

In [178]:
# Testing new text parsin algorithm
x = df1_c[(df1_c['question']=='Q5')&(df1_c['period']=='2022-Q2')][['mh_context','answer']].reset_index(drop=True).copy()
#x = x.assign(x['answer'].str.replace())
ct = x['answer'].str.lower().str.strip().str.split().str.replace('billigere', 'pris/tilbud')[0] #=> First replace every word by its root, then clean, split and Counter
ct

nan

In [135]:
# Parsing free text from question Q1
(df1_d, df1_e) = parse_free_text(df1_c, '2022-Q2', 'Q1', 20)

In [136]:
# Text dataframe from Q1 in long format for word cloud
df1_d.tail()

Unnamed: 0,word,category
784,telenor-nettdårlig,dekning
785,ustabilt,annet
786,nett,dekning
787,dårlig,annet
788,dekning,dekning


In [None]:
# Text dataframe from Q1 aggregated by frequency
df1_e.head()

In [142]:
# Parsing free text from question Q5
(df1_f, df1_g) = parse_free_text(df1_c, '2022-Q2', 'Q5', 20)

In [None]:
# Text dataframe from Q5 in long format for word cloud
df1_f.tail()

In [None]:
# Text dataframe from Q5 in long format for word cloud
df1_g

## Exporting results to local drive

In [None]:
# Exporting files to excel
with pd.ExcelWriter('2022_Q2_cleaned_survey.xlsx', engine='openpyxl') as writer:
  df1.to_excel(writer, sheet_name='00_clean_survey', index=False)
  df1_a.to_excel(writer, sheet_name='01_num_data_8q', index=False)
  tb_1a.to_excel(writer, sheet_name='02_table_8q', index=False)
  df1_b.to_excel(writer, sheet_name='03_num_data_2q', index=False)
  tb_1b.to_excel(writer, sheet_name='04_table_2q', index=False)
  tb_1t.to_excel(writer, sheet_name='05_period_totals', index=False)
  df1_c.to_excel(writer, sheet_name='06_text_data', index=False)
  df1_d.to_excel(writer, sheet_name='07_text_ind', index=False)
  
files.download('2022_Q2_cleaned_survey.xlsx')

## Data Visualization

In [34]:
#@interact(Columns = survey_col)
#def view_columns_proportion(Columns):
#    return df1[Columns].value_counts(normalize=True, dropna=False)

In [35]:
# List of questions for interactive plot
year_list = df1['year'].unique().tolist()
period_list = df1['period'].unique().tolist()

In [None]:
# Plotting figure 1
@interact(Period = period_list)
def plot_fig_1(Period):
  df = tb_1a[['question', Period]].copy()
  df = df.assign(perc = round(df[Period]/tb_1t[Period][0], 2))
  df = df.sort_values(by='perc', ascending=False)
  fig_1 = px.bar(df, x='perc', y='question', title=f'{Period}',
                       text_auto='1%', width=400, height=500)
  fig_1.update_layout({'plot_bgcolor': '#ffffff'}, title_font_size=16, yaxis=dict(autorange="reversed"), title_x=0.5, xaxis_tickformat='1%')
  fig_1.update_traces(marker_color='#ecc541')
  fig_1.show()

In [None]:
# Plotting figure 2
@interact(Period = period_list)
def plot_fig_2(Period):
  df = tb_1b[['question', Period]].copy()
  df = df.assign(perc = round(df[Period]/tb_1t[Period][0], 2),
                 kontaktmodus = Period)
  df = df.sort_values(by='perc', ascending=False)
  colors_fig_2 = {'Ble_kontaktet': '#ecc541', 'Initiativ_selv': '#929291'}
  fig_2 = px.bar(df, x='kontaktmodus', y='perc', color='question', title=f'{Period}', barmode='stack', 
                       text_auto='1%', width=400, height=500, color_discrete_map=colors_fig_2)
  fig_2.update_layout({'plot_bgcolor': '#ffffff'}, title_font_size=16, title_x=0.5, yaxis_tickformat='1%')
  fig_2.show()

In [None]:
# Plotting figure 3
def plot_fig_3():
  cat_ord_3 = {'question': ['Dekning', 'Bedre_tilbud', 'Data/surfekvalitet', 'Annet', 'Kundeservice',
                             'Utlandstjenesten', 'Privat_abo', 'Tvangsavvikling']}
  colors_fig_3 = {'Dekning':'#ecc541', 'Bedre_tilbud':'#929291', 'Data/surfekvalitet':'#0080ff', 'Annet':'#be3e84', 'Kundeservice':'#987d27',
               'Utlandstjenesten':'#444e60', 'Privat_abo':'#00ffff', 'Tvangsavvikling':'#1c1915'}
  fig_3 = px.bar(tb_1a_m, x='period', y='perc', color='question', category_orders=cat_ord_3, 
                  text_auto='1%', title=f'Avgangsundersøkelse over tid', width=800, height=500)#,color_discrete_map=colors_fig_3)
  fig_3.update_layout({'plot_bgcolor': '#ffffff'}, title_font_size=16, title_x=0.5, yaxis_tickformat = '1%')
  fig_3.show()

plot_fig_3()

In [None]:
# Plotting figure 4
def plot_fig_4():
  cat_ord_4 = {'question': ['Dekning', 'Bedre_tilbud', 'Data/surfekvalitet', 'Annet', 'Kundeservice',
                             'Utlandstjenesten', 'Privat_abo', 'Tvangsavvikling']}
  fig_4 = px.area(tb_1a_m, x='period', y='perc', facet_col='question', color='question', facet_col_wrap=2, category_orders=cat_ord_4,
                  text='perc', facet_col_spacing=0.1, title=f'Trend etter årsak', width=900, height=500)
  fig_4.update_layout(title_font_size=16, title_x=0.5)
  fig_4.update_traces(textfont_size=10, textposition='top left', texttemplate='%{y:1%}')
  fig_4.update_yaxes(tickformat='1%', tickfont = dict(size=10))
  fig_4.update_xaxes(tickfont = dict(size=10))
  fig_4.show()

plot_fig_4()

In [None]:
# Plotting figure 5
def plot_fig_5():
  colors_5 = {'Ble_kontaktet': '#ecc541', 'Initiativ_selv': '#929291'}
  fig_5 = px.bar(tb_1b_m, x='period', y='perc', color='question', title=f'Kontakttype over tid',
                  text_auto='1%', width=800, height=500, color_discrete_map=colors_5)
  fig_5.update_layout({'plot_bgcolor': '#ffffff'}, title_font_size=16, title_x=0.5, yaxis_tickformat = '1%')
  fig_5.show()

plot_fig_5()

In [None]:
# Plotting figure 6
def plot_fig_6():
  colors_6 = {'Dekning': '#ecc541', 'Annet': '#929291'}
  df = tb_1a_m.copy().groupby(by=['period', 'dekning']).agg(dekning_samlet = ('perc', 'sum')).reset_index()
  fig_6 = px.bar(df, x='period', y='dekning_samlet', color='dekning', title='Dekning kontra andre årsaker over tid', barmode='group',
                   text_auto='1%', width=800, height=500, color_discrete_map=colors_6)
  fig_6.update_layout({'plot_bgcolor': '#ffffff'}, title_font_size=16, title_x=0.5, yaxis_tickformat = '1%')
  #fig_6.update_yaxes(range=[0.0, 1])
  fig_6.show()

plot_fig_6()

In [None]:
# Plotting figure 7
def plot_fig_7():
  df = df1[~(df1['period']=='2022-Q3')][['year','mh_context', 'month']].copy()
  df = df.groupby(by=['mh_context']).agg(responses = ('year', 'count')).reset_index()
  df = df.rename(columns={'mh_context': 'date'})
  fig_7 = px.histogram(df, x='date', y='responses', text_auto=True,
                        nbins=18, title=f'Antall svar over tid', width=800, height=500)
  fig_7.update_layout({'plot_bgcolor': '#ffffff'}, title_font_size=16, title_x=0.5, bargap=0.05)
  fig_7.update_traces(marker_color='#ecc541')
  fig_7.show()

plot_fig_7()

In [None]:
# Plotting figure 8
@interact(Year = year_list)
def plot_fig_8(Year):
  df = df1[df1['year']==Year][['year', 'day']].copy()
  df = df.groupby(by=['year', 'day']).agg(responses = ('year', 'count')).reset_index()
  df = df.assign(day = df['day'].str[:3])
  cat_ord_8 = {'day': ['Mon','Tue','Wed','Thu','Fri','Sat','Sun']}
  fig_8 = px.histogram(df, x='day', y='responses', text_auto=True, category_orders=cat_ord_8,
                        title=f'{Year} - Svarer etter ukedager', width=400, height=500)
  fig_8.update_layout({'plot_bgcolor': '#ffffff'}, title_font_size=16, title_x=0.5, bargap=0.05)
  fig_8.update_traces(marker_color='#ecc541')
  fig_8.show()

In [None]:
# Plotting figure 9
@interact(Year = year_list)
def plot_fig_9(Year):
  df = df1[df1['year']==Year][['year', 'hour']].copy()
  df = df.groupby(by=['year', 'hour']).agg(responses = ('year', 'count')).reset_index()
  df = df.assign(perc = round(df['responses']/sum(df['responses']), 3))
  fig_9 = px.histogram(df, x='hour', y='perc', text_auto='1%', nbins=12,
                        title=f'{Year} - Svarer etter timer', width=400, height=500)
  fig_9.update_layout({'plot_bgcolor': '#ffffff'}, title_font_size=16, title_x=0.5, bargap=0.05)
  fig_9.update_traces(marker_color='#ecc541')
  fig_9.update_yaxes(tickformat='1%')
  fig_9.show()

In [None]:
# Plotting figure 10
def plot_fig_10():
  df = df1_e.groupby(by=['category']).agg(perc_total = ('perc', 'sum')).reset_index().copy()
  colors_10 = {'dekning':'#ecc541', 'pris/tilbud':'#b2b2b2', 'kundeservice':'#ff6973', 'annet':'#ff8700'}
  fig_10 = px.bar(df, x='category', y='perc_total', color='category', title='Hva gjorde at dere valgte å forlate oss akkurat nå?', 
                       text_auto='1%', width=400, height=500, color_discrete_map=colors_10)
  fig_10.update_layout({'plot_bgcolor': '#ffffff'}, title_font_size=13, title_x=0.5, yaxis_tickformat='1%')
  fig_10.show()

plot_fig_10()

In [None]:
# Plotting figure 11
def plot_fig_11():
  df = df1_g.groupby(by=['category']).agg(perc_total = ('perc', 'sum')).reset_index().copy()
  colors_11 = {'dekning':'#ecc541', 'pris/tilbud':'#b2b2b2', 'kundeservice':'#ff6973', 'annet':'#ff8700'}
  fig_11 = px.bar(df, x='category', y='perc_total', color='category', title='Hva gjorde at dere valgte den nye operatøren?', 
                       text_auto='1%', width=400, height=500, color_discrete_map=colors_11)
  fig_11.update_layout({'plot_bgcolor': '#ffffff'}, title_font_size=13, title_x=0.5, yaxis_tickformat='1%')
  fig_11.show()

plot_fig_11()