<a href="https://colab.research.google.com/github/gabrielborja/parc_de_montjuic/blob/main/quarterly_survey.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Survey data preparation

## Importing libraries

In [None]:
#Upgrading Plotly
!pip install plotly --upgrade

In [1]:
# Importing python libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## Loading Data

In [None]:
# Loading data from local drive
# Colab-only step: `uploaded1` is indexed by file name in the next cell
# (it expects an entry called 'survey_Q2.xlsx').
from google.colab import files
uploaded1 = files.upload()

In [3]:
# Reading the uploaded workbook into a pandas dataframe
import io
survey_bytes = uploaded1['survey_Q2.xlsx']   # raw content stored under the uploaded file name
survey_buffer = io.BytesIO(survey_bytes)     # file-like wrapper handed to pandas
df1 = pd.read_excel(survey_buffer)

In [4]:
# Asserting the loaded data has the correct columns
def check_columns(df=None):
  """Assert that the survey dataframe has exactly the expected columns, in order.

  Args:
    df: Dataframe to validate. When omitted, the notebook-level `df1` is used,
      so a bare `check_columns()` call keeps working.

  Raises:
    AssertionError: If the column list differs from the expected one. The
      message lists the expected columns plus the missing and unexpected ones.
  """
  survey_col = ['answer_form_id', 'is_anonymous',  'cust_domain_id', 'mh_customer_id', 'mh_context', 'channel_name', 'broadcast_id', 'completed',
                'answered_timestamp', 'Q1', 'Q2_A1',  'Q2_A2', 'Q2_A3', 'Q2_A4', 'Q2_A5', 'Q2_A7', 'Q2_A8', 'Q2_A9', 'Q3', 'Q4_A1', 'Q4_A2', 'Q5']
  if df is None:
    df = df1
  actual_col = df.columns.to_list()
  # Computed up front so a failure says exactly what is wrong with the file
  missing = [c for c in survey_col if c not in actual_col]
  unexpected = [c for c in actual_col if c not in survey_col]
  assert actual_col == survey_col, (
      f'Columns should be: {survey_col}. Missing: {missing}. Unexpected: {unexpected}')
  print('The file contains all correct columns')

# Stop here with an AssertionError if the uploaded file does not have the expected columns
check_columns()

The file contains all correct columns


## Data cleaning

In [65]:
# Creating function to clean up the dataframe
def df1_clean_up(df):
  """Return a cleaned copy of the raw survey dataframe with calendar categories added.

  Steps: parse the two timestamp columns, sort rows by `mh_context`, derive
  year/quarter/month/week/day/hour columns and a `period` label such as
  '2022-Q2', repair mis-decoded letters in text values, and turn the 'X'
  marks of the multiple-choice columns into 1.

  Args:
    df: Raw survey dataframe. Must contain `mh_context`, `answered_timestamp`
      and the Q2_A*/Q4_A* columns listed in `x_cols`.

  Returns:
    A new dataframe; the frame passed in is not modified.
  """
  # Converting timestamps to datetime
  df = df.assign(mh_context = pd.to_datetime(df['mh_context']),
                 answered_timestamp = pd.to_datetime(df['answered_timestamp']))
  # Sorting rows chronologically by mh_context
  df = df.sort_values(by='mh_context').reset_index(drop=True)
  # Assigning categories from datetime (hour comes from the answer timestamp)
  df = df.assign(year = df['mh_context'].dt.year,
                 quarter = df['mh_context'].dt.quarter,
                 month = df['mh_context'].dt.month,
                 month_name = df['mh_context'].dt.month_name(),
                 weeknum = df['mh_context'].dt.isocalendar().week,
                 day = df['mh_context'].dt.day_name(),
                 hour = df['answered_timestamp'].dt.hour
                 )
  # Replacing quarter numbers with strings. The result is assigned back: a chained
  # `df['quarter'].replace(..., inplace=True)` does not update `df` under copy-on-write.
  quarter_dict = {1: 'Q1', 2: 'Q2', 3: 'Q3', 4: 'Q4'}
  df['quarter'] = df['quarter'].map(quarter_dict)
  # Creating year-quarter category
  df = df.assign(period = df['year'].astype(str) + "-" + df['quarter'])
  # Repairing mis-decoded UTF-8 letters. 'Ã¥' is 'å' and 'Ã¦' is 'æ'.
  # Order matters: the catch-all `Ã.` must run last, it maps any remaining
  # 'Ã' + one character sequence to 'ø'.
  df = df.replace(to_replace=r'Ã¥', value='å', regex=True)
  df = df.replace(to_replace=r'Ã¦', value='æ', regex=True)
  df = df.replace(to_replace=r'Ã.', value='ø', regex=True)
  # Replacing X values with 1
  x_cols = ['Q2_A1', 'Q2_A2', 'Q2_A3', 'Q2_A4', 'Q2_A5', 'Q2_A7', 'Q2_A8', 'Q2_A9', 'Q4_A1', 'Q4_A2']
  df[x_cols] = df[x_cols].replace('X', value=1)
  return df

In [66]:
# Cleaning up the dataframe
# NOTE: df1 is overwritten here, so the raw upload is only available again by re-running the loading cells
df1 = df1_clean_up(df1)

In [7]:
#df1['Q2_A1'].value_counts(normalize=True, dropna=False)

In [67]:
# Melting dataframe questions to long format
# Identifier columns repeated on every melted row
melt1_cols = [
    'answer_form_id', 'is_anonymous', 'cust_domain_id', 'mh_customer_id', 'mh_context', 'channel_name', 'completed',
    'year', 'quarter', 'period', 'month', 'month_name', 'weeknum', 'day', 'hour',
]
# Question columns that are turned into (question, answer) pairs
melt2_cols = ['Q1', 'Q2_A1', 'Q2_A2', 'Q2_A3', 'Q2_A4', 'Q2_A5', 'Q2_A7', 'Q2_A8', 'Q2_A9', 'Q3', 'Q4_A1', 'Q4_A2', 'Q5']
# Rows without an answer are dropped, so only questions actually answered remain
df1_melt = (
    df1.melt(id_vars=list(melt1_cols), value_vars=list(melt2_cols), var_name='question', value_name='answer')
       .dropna(axis=0, subset=['answer'])
)

In [68]:
# Splitting melted dataframe into survey sections
questions_1a = ['Q2_A1', 'Q2_A2', 'Q2_A3', 'Q2_A4', 'Q2_A5', 'Q2_A7', 'Q2_A8', 'Q2_A9']  # Q2 answer options
questions_1b = ['Q4_A1', 'Q4_A2']                                                        # Q4 answer options
questions_1c = ['Q1', 'Q3', 'Q5']                                                        # remaining questions
df1_a = df1_melt.loc[df1_melt['question'].isin(questions_1a)].reset_index(drop=True)
df1_b = df1_melt.loc[df1_melt['question'].isin(questions_1b)].reset_index(drop=True)
df1_c = df1_melt.loc[df1_melt['question'].isin(questions_1c)].reset_index(drop=True)

In [None]:
# Creating summary table for multiple choice questions
def summary_table_1a(df):
  """Sum the multiple-choice answers per question and period, with readable labels.

  Args:
    df: Long-format answers with `question`, `period` and `answer` columns.

  Returns:
    Dataframe with a `question` column (codes Q2_A1..Q2_A9 replaced by their
    labels) and one column per period holding the summed answers.
  """
  pv = df.pivot_table(index='question', columns=['period'], values='answer', aggfunc='sum').reset_index()
  dict_1a = {'Q2_A1':'Bedre_tilbud', 'Q2_A2':'Tvangsavvikling', 'Q2_A3':'Privat_abo', 'Q2_A4':'Dekning',
                 'Q2_A5':'Data/surfekvalitet', 'Q2_A7':'Utlandstjenesten', 'Q2_A8':'Kundeservice', 'Q2_A9':'Annet'}
  # Assign the result back: an `inplace` replace on the selected column is a
  # chained operation and does not update `pv` under pandas copy-on-write.
  pv['question'] = pv['question'].replace(to_replace=dict_1a)
  return pv
  
# Summary of the Q2 answer options (rows of df1_a) per period; displayed as the cell output
tb_1a = summary_table_1a(df1_a)
tb_1a

In [None]:
# Creating summary table for true/false question
def summary_table_1b(df):
  """Sum the Q4 answers per question and period, with readable labels.

  Args:
    df: Long-format answers with `question`, `period` and `answer` columns.

  Returns:
    Dataframe with a `question` column (codes Q4_A1/Q4_A2 replaced by their
    labels) and one column per period holding the summed answers.
  """
  pv = df.pivot_table(index='question', columns=['period'], values='answer', aggfunc='sum').reset_index()
  dict_1b = {'Q4_A1':'Ble_kontaktet', 'Q4_A2':'Initiativ_selv'}
  # Assign the result back: an `inplace` replace on the selected column is a
  # chained operation and does not update `pv` under pandas copy-on-write.
  pv['question'] = pv['question'].replace(to_replace=dict_1b)
  return pv

# Summary of the two Q4 answer options (rows of df1_b) per period; displayed as the cell output
tb_1b = summary_table_1b(df1_b)
tb_1b

In [None]:
# Creating function to compute totals
def compute_total_by_quarter(df):
  """Count the distinct `answer_form_id` values per period.

  Args:
    df: Dataframe with `period` and `answer_form_id` columns.

  Returns:
    One-row dataframe indexed by 'total', with one column per period in order
    of first appearance in `df`. Rows with a missing period are ignored.
  """
  # Single grouped pass instead of re-filtering the frame once per period.
  # sort=False keeps first-appearance order; dropna=False counts a missing id
  # as one value, like len(Series.unique()) does.
  totals = df.groupby('period', sort=False)['answer_form_id'].nunique(dropna=False)
  return totals.rename('total').to_frame().T

# Per-period totals, later used as the denominator of the shares.
# NOTE: df1_a only holds rows with a non-null answer to a Q2 option (see the melt and split cells),
# so a form with no Q2 option marked is not counted in these totals.
tb_1t = compute_total_by_quarter(df1_a)
tb_1t

In [None]:
# Creating long-format table of period totals, without the current period
def melt_table_1t(df):
  """Transpose the one-row totals table to long format and drop the last period.

  Args:
    df: Totals table with one column per period (columns named 'period').

  Returns:
    Dataframe with one row per period, excluding the period that appears last.
  """
  long_totals = df.T.reset_index()
  current_period = long_totals['period'].unique()[-1]  # last period listed = current one
  is_past_period = long_totals['period'] != current_period
  return long_totals[is_past_period]
  
# Long-format totals without the last period; the last rows are shown as a quick check
tb_1t_m = melt_table_1t(tb_1t)
tb_1t_m.tail()

In [None]:
# Creating melted table with totals for multiple choice questions
def melt_table_w_totals(df, totals=None):
  """Melt a per-period summary table and add each answer's share of the period total.

  Args:
    df: Summary table with a `question` column and one column per period.
    totals: Long-format totals with `period` and `total` columns. When omitted,
      the notebook-level `tb_1t_m` is used, so existing one-argument calls
      keep working.

  Returns:
    Long-format dataframe with `question`, `period`, `answer`, `total` and
    `perc` (answer / total, rounded to 2 decimals). Periods missing from
    `totals` are dropped by the inner merge.
  """
  if totals is None:
    totals = tb_1t_m
  # var_name is explicit so the merge key exists even if df.columns has no name;
  # with value_vars omitted, melt uses every column except `question`.
  df_m = pd.melt(df, id_vars='question', var_name='period', value_name='answer')
  df_m = pd.merge(df_m, totals, how='inner', on='period')
  df_m = df_m.assign(perc = (df_m['answer']/df_m['total']).round(2))
  return df_m
  
tb_1a_m = melt_table_w_totals(tb_1a)
# Flag the two coverage-related answers as 'Dekning'; every other answer becomes 'Annet'
coverage_questions = ['Dekning', 'Data/surfekvalitet']
is_coverage = tb_1a_m['question'].isin(coverage_questions)
tb_1a_m = tb_1a_m.assign(dekning = np.where(is_coverage, 'Dekning', 'Annet'))
tb_1a_m.tail()

In [None]:
# Creating melted table with totals for true/false question
# Shares are computed against the same per-period totals (tb_1t_m) as the multiple-choice table
tb_1b_m = melt_table_w_totals(tb_1b)
tb_1b_m.tail()

## Exporting results to local drive

In [None]:
# Exporting all tables to one Excel workbook and downloading it
export_path = '2022_Q2_cleaned_survey.xlsx'
# Sheet name -> dataframe, written in this order
export_sheets = {
    '00_clean_survey': df1,
    '01_num_data_8q': df1_a,
    '02_table_8q': tb_1a,
    '03_num_data_2q': df1_b,
    '04_table_2q': tb_1b,
    '05_period_totals': tb_1t,
    '06_text_data': df1_c,
}
with pd.ExcelWriter(export_path, engine='openpyxl') as writer:
  for sheet_name, frame in export_sheets.items():
    frame.to_excel(writer, sheet_name=sheet_name, index=False)

files.download(export_path)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## Data Visualization

In [16]:
# Importing visualization libraries
import plotly.express as px
import seaborn as sns
from ipywidgets import interact

In [None]:
#@interact(Columns = survey_col)
#def view_columns_proportion(Columns):
#    return df1[Columns].value_counts(normalize=True, dropna=False)

In [36]:
# Lists of years and periods offered as choices by the interactive plots below
year_list = df1['year'].unique().tolist()
period_list = df1['period'].unique().tolist()

In [None]:
# Plotting slide 2
@interact(Period = period_list)
def plot_fig_1a(Period):
  """Plot, for the selected period, each multiple-choice answer's share of the period total.

  Args:
    Period: Column label of `tb_1a` and `tb_1t` to plot (one of `period_list`).
  """
  # tb_1t holds a single row of totals labelled 'total'. Select it by position
  # with .iloc: `tb_1t[Period][0]` used an integer key on a label-indexed
  # Series, which pandas has deprecated.
  period_total = tb_1t[Period].iloc[0]
  df = tb_1a[['question', Period]].copy()
  df = df.assign(perc = round(df[Period]/period_total, 2))
  df = df.sort_values(by='perc', ascending=False)
  fig_1a = px.bar(df, x='perc', y='question', title=f'{Period}',
                       text_auto='1%', width=400, height=500)
  fig_1a.update_layout({'plot_bgcolor': '#ffffff'}, title_font_size=16, yaxis=dict(autorange="reversed"), title_x=0.5, xaxis_tickformat='1%')
  fig_1a.update_traces(marker_color='#ecc541')
  fig_1a.show()

In [None]:
# Plotting slide 3
@interact(Period = period_list)
def plot_fig_1b(Period):
  """Plot, for the selected period, the share of each Q4 answer as one stacked bar.

  Args:
    Period: Column label of `tb_1b` and `tb_1t` to plot (one of `period_list`).
  """
  # tb_1t holds a single row of totals labelled 'total'. Select it by position
  # with .iloc: `tb_1t[Period][0]` used an integer key on a label-indexed
  # Series, which pandas has deprecated.
  period_total = tb_1t[Period].iloc[0]
  df = tb_1b[['question', Period]].copy()
  df = df.assign(perc = round(df[Period]/period_total, 2),
                 kontaktmodus = Period)  # constant column, used as the single x category
  df = df.sort_values(by='perc', ascending=False)
  colors_1b = {'Ble_kontaktet': '#ecc541', 'Initiativ_selv': '#929291'}
  fig_1b = px.bar(df, x='kontaktmodus', y='perc', color='question', title=f'{Period}', barmode='stack', 
                       text_auto='1%', width=400, height=500, color_discrete_map=colors_1b)
  fig_1b.update_layout({'plot_bgcolor': '#ffffff'}, title_font_size=16, title_x=0.5, yaxis_tickformat='1%')
  fig_1b.show()

In [None]:
# Plotting slide 4
def plot_fig_1c():
  """Bar chart of the multiple-choice answer shares per period, coloured by question."""
  question_order = ['Dekning', 'Bedre_tilbud', 'Data/surfekvalitet', 'Annet', 'Kundeservice',
                    'Utlandstjenesten', 'Privat_abo', 'Tvangsavvikling']
  # Palette kept for reference only: it is not passed to px.bar below
  colors_1c = {
      'Dekning': '#ecc541', 'Bedre_tilbud': '#929291', 'Data/surfekvalitet': '#0080ff', 'Annet': '#be3e84',
      'Kundeservice': '#987d27', 'Utlandstjenesten': '#444e60', 'Privat_abo': '#00ffff', 'Tvangsavvikling': '#1c1915',
  }
  fig_1c = px.bar(
      tb_1a_m,
      x='period',
      y='perc',
      color='question',
      category_orders={'question': question_order},
      text_auto='1%',
      title='Avgangsundersøkelse over tid',
      width=800,
      height=500,
  )
  fig_1c.update_layout({'plot_bgcolor': '#ffffff'}, title_font_size=16, title_x=0.5, yaxis_tickformat='1%')
  fig_1c.show()

plot_fig_1c()

In [None]:
# Plotting slide 5
def plot_fig_1d():
  """Bar chart of the Q4 answer shares per period, coloured by question."""
  palette = {'Ble_kontaktet': '#ecc541', 'Initiativ_selv': '#929291'}
  fig_1d = px.bar(
      tb_1b_m,
      x='period',
      y='perc',
      color='question',
      title='Kontakttype over tid',
      text_auto='1%',
      width=800,
      height=500,
      color_discrete_map=palette,
  )
  fig_1d.update_layout({'plot_bgcolor': '#ffffff'}, title_font_size=16, title_x=0.5, yaxis_tickformat='1%')
  fig_1d.show()

plot_fig_1d()

In [None]:
# Plotting slide 6
def plot_fig_1e():
  """Grouped bar chart comparing the summed 'Dekning' shares with all other answers, per period."""
  palette = {'Dekning': '#ecc541', 'Annet': '#929291'}
  # Sum the shares inside each (period, dekning) group; relies on the `dekning` column of tb_1a_m
  grouped = tb_1a_m.groupby(by=['period', 'dekning'])
  df = grouped.agg(dekning_samlet = ('perc', 'sum')).reset_index()
  fig_1e = px.bar(
      df,
      x='period',
      y='dekning_samlet',
      color='dekning',
      title='Dekning kontra andre årsaker over tid',
      barmode='group',
      text_auto='1%',
      width=800,
      height=500,
      color_discrete_map=palette,
  )
  fig_1e.update_layout({'plot_bgcolor': '#ffffff'}, title_font_size=16, title_x=0.5, yaxis_tickformat='1%')
  # Optional fixed y-range, currently disabled:
  #fig_1e.update_yaxes(range=[0.0, 1])
  fig_1e.show()

plot_fig_1e()

In [None]:
# Plotting slide 7
def plot_fig_1f():
  """Area charts of the answer share over the periods, one facet per question."""
  question_order = ['Dekning', 'Bedre_tilbud', 'Data/surfekvalitet', 'Annet', 'Kundeservice',
                    'Utlandstjenesten', 'Privat_abo', 'Tvangsavvikling']
  fig_1f = px.area(
      tb_1a_m,
      x='period',
      y='perc',
      facet_col='question',
      color='question',
      facet_col_wrap=2,
      category_orders={'question': question_order},
      facet_col_spacing=0.1,
      title='Trend etter årsak',
      width=900,
      height=500,
  )
  fig_1f.update_layout(title_font_size=16, title_x=0.5)
  fig_1f.update_yaxes(tickformat='1%')
  fig_1f.show()

plot_fig_1f()

In [None]:
# Plotting slide 8
def plot_fig_1g(exclude_period='2022-Q3'):
  """Histogram of the number of survey responses over time.

  Args:
    exclude_period: Period label (e.g. '2022-Q3') whose rows are left out of
      the chart. The default keeps the previously hard-coded behaviour; pass
      another label when a different period should be excluded.
  """
  df = df1[df1['period'] != exclude_period][['year','mh_context', 'month']].copy()
  # One row per mh_context value with the number of responses carrying it
  df = df.groupby(by=['mh_context']).agg(responses = ('year', 'count')).reset_index()
  df = df.rename(columns={'mh_context': 'date'})
  fig_1g = px.histogram(df, x='date', y='responses', text_auto=True,
                        nbins=18, title=f'Antall svar over tid', width=800, height=500)
  fig_1g.update_layout({'plot_bgcolor': '#ffffff'}, title_font_size=16, title_x=0.5, bargap=0.05)
  fig_1g.update_traces(marker_color='#ecc541')
  fig_1g.show()

plot_fig_1g()

In [None]:
# Plotting slide 9a
@interact(Year = year_list)
def plot_fig_1h(Year):
  """Plot the number of responses per weekday for the selected year.

  Args:
    Year: Value of the `year` column of `df1` to plot (one of `year_list`).
  """
  weekday_order = ['Mon','Tue','Wed','Thu','Fri','Sat','Sun']
  in_year = df1['year'] == Year
  counts = df1.loc[in_year, ['year', 'day']].copy()
  counts = counts.groupby(by=['year', 'day']).agg(responses = ('year', 'count')).reset_index()
  # Shorten the day names to three letters so they match `weekday_order`
  counts = counts.assign(day = counts['day'].str[:3])
  fig_1h = px.histogram(
      counts,
      x='day',
      y='responses',
      text_auto=True,
      category_orders={'day': weekday_order},
      title=f'{Year} - Svarer etter ukedager',
      width=400,
      height=500,
  )
  fig_1h.update_layout({'plot_bgcolor': '#ffffff'}, title_font_size=16, title_x=0.5, bargap=0.05)
  fig_1h.update_traces(marker_color='#ecc541')
  fig_1h.show()

In [None]:
# Plotting slide 9b
@interact(Year = year_list)
def plot_fig_1i(Year):
  """Plot the share of responses per answering hour for the selected year.

  Args:
    Year: Value of the `year` column of `df1` to plot (one of `year_list`).
  """
  in_year = df1['year'] == Year
  hourly = df1.loc[in_year, ['year', 'hour']].copy()
  hourly = hourly.groupby(by=['year', 'hour']).agg(responses = ('year', 'count')).reset_index()
  # Share of the year's responses that fall in each hour, rounded to 3 decimals
  total_responses = hourly['responses'].sum()
  hourly = hourly.assign(perc = (hourly['responses'] / total_responses).round(3))
  fig_1i = px.histogram(
      hourly,
      x='hour',
      y='perc',
      text_auto='1%',
      nbins=12,
      title=f'{Year} - Svarer etter timer',
      width=400,
      height=500,
  )
  fig_1i.update_layout({'plot_bgcolor': '#ffffff'}, title_font_size=16, title_x=0.5, bargap=0.05)
  fig_1i.update_traces(marker_color='#ecc541')
  fig_1i.update_yaxes(tickformat='1%')
  fig_1i.show()