<a href="https://colab.research.google.com/github/gabrielborja/parc_de_montjuic/blob/main/quarterly_survey.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Survey data preparation

## Importing libraries

In [1]:
# Importing python libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## Loading Data

In [None]:
# Loading data from local drive
# The upload result is stored in `uploaded1`; the next cell indexes it by
# file name ('survey_Q2.xlsx') and wraps the value in io.BytesIO.
from google.colab import files
uploaded1 = files.upload()

In [3]:
# Reading the uploaded workbook into a pandas dataframe
import io
survey_bytes = uploaded1['survey_Q2.xlsx']
df1 = pd.read_excel(io.BytesIO(survey_bytes))

In [4]:
# Asserting the loaded data is correct
# Expected column layout of the raw survey export, in order.
# Kept at notebook level because later cells (the column-proportion widget)
# read `survey_col` directly.
survey_col = ['answer_form_id', 'is_anonymous',  'cust_domain_id', 'mh_customer_id', 'mh_context', 'channel_name', 'broadcast_id', 'completed',
              'answered_timestamp', 'Q1', 'Q2_A1',  'Q2_A2', 'Q2_A3', 'Q2_A4', 'Q2_A5', 'Q2_A7', 'Q2_A8', 'Q2_A9', 'Q3', 'Q4_A1', 'Q4_A2', 'Q5']

def check_columns(df=None, expected=None):
  """Verify that a survey dataframe has exactly the expected columns, in order.

  Args:
    df: DataFrame to validate. Defaults to the notebook-level `df1`.
    expected: Expected column names, in order. Defaults to `survey_col`.

  Raises:
    AssertionError: if the dataframe's columns differ from the expected list.
  """
  df = df1 if df is None else df
  expected = survey_col if expected is None else list(expected)
  # Raise explicitly instead of using `assert`, so the check is not stripped
  # when Python runs with optimizations enabled.
  if df.columns.to_list() != expected:
    raise AssertionError(f'Columns should be: {expected}')
  print('The file contains all correct columns')

# Stop here if the uploaded file does not have the expected column layout
check_columns()

The file contains all correct columns


## Data cleaning

In [5]:
# Parsing both timestamp columns into pandas datetimes
timestamp_cols = ['mh_context', 'answered_timestamp']
df1 = df1.assign(**{col: pd.to_datetime(df1[col]) for col in timestamp_cols})

In [6]:
# Ordering rows chronologically by mh_context and renumbering the index from 0
df1 = df1.sort_values(by='mh_context', ignore_index=True)

In [7]:
# Deriving calendar categories: everything comes from mh_context except
# `hour`, which is taken from answered_timestamp
context_ts = df1['mh_context'].dt
answered_ts = df1['answered_timestamp'].dt
df1 = df1.assign(
    year=context_ts.year,
    quarter=context_ts.quarter,
    month=context_ts.month_name(),
    weeknum=context_ts.isocalendar().week,
    day=context_ts.day_name(),
    hour=answered_ts.hour,
)

In [8]:
# Replacing quarter numbers with string labels
quarter_dict = {1: 'Q1', 2: 'Q2', 3: 'Q3', 4: 'Q4'}
# Assign the result back to the column: calling replace(..., inplace=True) on
# the `df1['quarter']` selection acts on an intermediate object and does not
# reliably update df1 (it is a no-op under pandas copy-on-write).
# Re-running this cell is harmless because the labels are not keys of the dict.
df1['quarter'] = df1['quarter'].replace(quarter_dict)

In [None]:
# Comparing the date part (first 11 characters of the string form, stripped)
# of mh_context and answered_timestamp
context_date = df1['mh_context'].astype(str).str.slice(stop=11).str.strip()
answered_date = df1['answered_timestamp'].astype(str).str.slice(stop=11).str.strip()
df1 = df1.assign(date1 = context_date,
                 date2 = answered_date,
                 date_check = (context_date == answered_date).astype(int))
# Share of rows where both dates match (1) versus differ (0)
df1['date_check'].value_counts(normalize=True)

In [9]:
# Repairing mojibake: UTF-8 encoded Norwegian letters that were decoded as Latin-1
# Each key is the two-character garbled form of the letter it maps to.
# The previous version mapped 'Ã¦' to 'Å' (it is 'æ') and used the regex
# wildcard 'Ã.' for 'ø', which rewrote every other 'Ã' + character sequence
# as 'ø'. Unlisted sequences are now left untouched so they remain visible.
mojibake_map = {'Ã¥': 'å', 'Ã¦': 'æ', 'Ã¸': 'ø'}
df1 = df1.replace(regex=mojibake_map)

In [10]:
# Replacing X marks with 1 in the tick-box columns
# Missing values are left as NaN here; the melt step further down drops rows
# whose answer is missing.
x_cols = [
    'Q2_A1', 'Q2_A2', 'Q2_A3', 'Q2_A4', 'Q2_A5', 'Q2_A7', 'Q2_A8', 'Q2_A9',
    'Q4_A1', 'Q4_A2',
]
df1[x_cols] = df1[x_cols].replace(to_replace='X', value=1)

In [None]:
# Share of each value in Q2_A1, with missing values counted as their own group
df1['Q2_A1'].value_counts(normalize=True, dropna=False)

In [11]:
# Reshaping to long format: one row per (response, question) pair
# Identifier columns repeated on every melted row
melt1_cols = [
    'answer_form_id', 'is_anonymous', 'cust_domain_id', 'mh_customer_id',
    'mh_context', 'channel_name', 'completed',
    'year', 'quarter', 'month', 'weeknum', 'day', 'hour',
]
# Question columns that are stacked into the 'question' / 'answer' pair
melt2_cols = [
    'Q1', 'Q2_A1', 'Q2_A2', 'Q2_A3', 'Q2_A4', 'Q2_A5', 'Q2_A7', 'Q2_A8', 'Q2_A9',
    'Q3', 'Q4_A1', 'Q4_A2', 'Q5',
]
df1_long = df1.melt(id_vars=melt1_cols, value_vars=melt2_cols,
                    var_name='question', value_name='answer')
# Keep only the questions that were actually answered
df1_melt = df1_long.dropna(axis=0, subset=['answer'])

In [82]:
# Splitting the melted dataframe: tick-box questions go to df1_a, the rest to df1_b
numeric_questions = ['Q2_A1', 'Q2_A2', 'Q2_A3', 'Q2_A4', 'Q2_A5', 'Q2_A7', 'Q2_A8', 'Q2_A9', 'Q4_A1', 'Q4_A2']
text_questions = ['Q1', 'Q3', 'Q5']
melted_question = df1_melt['question']
df1_a = df1_melt.loc[melted_question.isin(numeric_questions)].copy()
df1_b = df1_melt.loc[melted_question.isin(text_questions)].copy()

In [None]:
# Listing which questions ended up in the text dataframe
df1_b['question'].unique()

In [None]:
# Inspecting column dtypes and non-null counts of the numeric dataframe
df1_a.info()

## Exporting results to local drive

In [75]:
# Writing the three dataframes to one workbook (one sheet each) and downloading it
export_path = 'cleaned_survey.xlsx'
export_sheets = {
    '01_clean_survey': df1,
    '02_numeric': df1_a,
    '03_free_text': df1_b,
}
with pd.ExcelWriter(export_path, engine='openpyxl') as writer:
  for sheet_name, sheet_df in export_sheets.items():
    sheet_df.to_excel(writer, sheet_name=sheet_name, index=False)
files.download(export_path)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## Data Visualization

In [None]:
#Upgrading Plotly
!pip install plotly --upgrade

In [13]:
# Importing visualization libraries
import plotly.express as px
import seaborn as sns
from ipywidgets import interact

In [None]:
# Dropdown over the columns of df1. The options are read from the dataframe
# itself: `survey_col` only exists inside check_columns(), so referencing it
# here raised a NameError.
@interact(Columns = df1.columns.to_list())
def view_columns_proportion(Columns):
    """Return the share of each distinct value in the selected df1 column.

    Args:
        Columns: Name of the df1 column picked in the dropdown.

    Returns:
        Series of normalised value counts; missing values are counted too.
    """
    return df1[Columns].value_counts(normalize=True, dropna=False)

In [15]:
# Dropdown options for the interactive plots: years present in the data and the four quarter labels
year_1 = pd.unique(df1['year'])
quarter_1 = [
    'Q1',
    'Q2',
    'Q3',
    'Q4',
]

In [None]:
# Histogram of the answer hour for question Q2_A1 in 2021, quarter Q1
is_2021_q1 = (df1_a['year']==2021) & (df1_a['quarter']=='Q1')
is_q2_a1 = df1_a['question']=='Q2_A1'
sns.histplot(data=df1_a[is_2021_q1 & is_q2_a1], x='hour')
plt.show()

In [None]:
# Grouped histogram of the tick-box answers for a selected year and quarter
@interact(Year = year_1, Quarter = quarter_1)
def plot_melted_survey(Year, Quarter):
  """Show a grouped plotly histogram of df1_a answers, coloured by question.

  Args:
    Year: Year selected in the dropdown.
    Quarter: Quarter label selected in the dropdown.
  """
  in_period = (df1_a['year']==Year) & (df1_a['quarter']==Quarter)
  period_answers = df1_a.loc[in_period].copy()
  fig_2a = px.histogram(
      period_answers,
      x='answer',
      color='question',
      barmode='group',
      title=f'{Year} - {Quarter}',
      text_auto=True,
      width=800,
      height=500,
  )
  fig_2a.update_layout(title_font_size=12)
  fig_2a.show()

In [None]:
# Plotting slide 2a
@interact(Year = year_1, Quarter = quarter_1)
def plot_slide_2a(Year, Quarter):
  """Bar chart of the Q2 answer options as a share of respondents in a period.

  For the selected year and quarter, the answers of every question except
  Q4_A1 / Q4_A2 are summed per question and divided by the number of distinct
  answer_form_id values in that period.

  Args:
    Year: Year selected in the dropdown.
    Quarter: Quarter label selected in the dropdown.
  """
  in_period = (df1_a['year']==Year) & (df1_a['quarter']==Quarter)
  is_q4 = df1_a['question'].isin(['Q4_A1', 'Q4_A2'])
  df = df1_a[in_period & ~is_q4]
  # The denominator counts respondents over all questions of the period, Q4 included
  num_resp = df1_a.loc[in_period, 'answer_form_id'].nunique(dropna=False)
  df_gr = df.groupby(by=['year', 'quarter', 'question']).agg(cat_count = ('answer', 'sum')).reset_index()
  df_gr = df_gr.assign(perc = round((df_gr['cat_count']/num_resp),2))
  df_gr = df_gr.sort_values(by='perc', ascending=False)
  reason_dict = {'Q2_A1':'Bedre_tilbud', 'Q2_A2':'Tvangsavvikling', 'Q2_A3':'Privat_abo', 'Q2_A4':'Dekning',
                 'Q2_A5':'Data/surfekvalitet', 'Q2_A7':'Utlandstjenesten', 'Q2_A8':'Kundeservice', 'Q2_A9':'Annet'}
  # Assign the relabelled column back: replace(..., inplace=True) on the
  # `df_gr['question']` selection does not reliably update df_gr (it is a
  # no-op under pandas copy-on-write), which left the raw codes on the axis.
  df_gr['question'] = df_gr['question'].replace(reason_dict)
  # NOTE(review): text_auto='1%' is kept as-is; confirm the bar labels render
  # with the intended precision.
  fig_slide_2a = px.bar(df_gr, x='perc', y='question', title=f'Year: {Year} - {Quarter}',
                       text_auto='1%', width=400, height=500)
  # Reversed y axis puts the first row of the sorted frame (largest share) at the top
  fig_slide_2a.update_layout({'plot_bgcolor': '#FFFFFF'}, title_font_size=16, yaxis=dict(autorange="reversed"), title_x=0.5)
  fig_slide_2a.update_traces(marker_color='#ECC541')
  fig_slide_2a.show()

In [None]:
# Plotting slide 2b
@interact(Year = year_1, Quarter = quarter_1)
def plot_slide_2b(Year, Quarter):
  """Bar chart of the Q4_A1 / Q4_A2 answers as a share of respondents in a period.

  For the selected year and quarter, the answers of Q4_A1 and Q4_A2 are summed
  per question and divided by the number of distinct answer_form_id values in
  that period.

  Args:
    Year: Year selected in the dropdown.
    Quarter: Quarter label selected in the dropdown.
  """
  in_period = (df1_a['year']==Year) & (df1_a['quarter']==Quarter)
  is_q4 = df1_a['question'].isin(['Q4_A1', 'Q4_A2'])
  df = df1_a[in_period & is_q4]
  # The denominator counts respondents over all questions of the period, not only Q4
  num_resp = df1_a.loc[in_period, 'answer_form_id'].nunique(dropna=False)
  df_gr = df.groupby(by=['year', 'quarter', 'question']).agg(cat_count = ('answer', 'sum')).reset_index()
  df_gr = df_gr.assign(perc = round((df_gr['cat_count']/num_resp),2))
  df_gr = df_gr.sort_values(by='perc', ascending=False)
  reason_dict = {'Q4_A1':'Ble_kontaktet', 'Q4_A2':'Initiativ_selv'}
  # Assign the relabelled column back: replace(..., inplace=True) on the
  # `df_gr['question']` selection does not reliably update df_gr (it is a
  # no-op under pandas copy-on-write).
  df_gr['question'] = df_gr['question'].replace(reason_dict)
  # The bar call only references 'quarter' and 'perc'; the two bars are told
  # apart by the per-bar colour list set below.
  # NOTE(review): text_auto='1%' is kept as-is; confirm the bar labels render
  # with the intended precision.
  fig_slide_2b = px.bar(df_gr, x='quarter', y='perc', title=f'Year: {Year} - {Quarter}', barmode='stack',
                       text_auto='1%', width=400, height=500)
  fig_slide_2b.update_layout({'plot_bgcolor': '#FFFFFF'}, title_font_size=16, title_x=0.5)
  fig_slide_2b.update_traces(marker_color= ['#ECC541', '#929291'])
  fig_slide_2b.show()