In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
plt.style.use('bmh')

import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls

In [None]:
# Load the survey export, treating the first two rows as a two-level column header.
# NOTE(review): the CSV path is an empty string here - supply the real file path before running.
df_survey = pd.read_csv("", header=[0, 1])
# One row per survey column, with the two header levels as 'question' and 'options'.
codebook = df_survey.columns.to_frame(index=False, name=['question', 'options'])

In [None]:
# Preview the first five rows of the loaded survey.
df_survey.head(n=5)

In [None]:
# Number of rows and columns in the loaded survey frame.
df_survey.shape

In [None]:
# Re-read the CSV without a header row so columns can be addressed as 'Q0', 'Q1', ...
# `read_csv(prefix=...)` was deprecated in pandas 1.4 and removed in pandas 2.0;
# prefixing the default integer column labels after loading yields the same names
# on every pandas version.
# The first two rows are skipped (presumably the two header rows that the earlier
# `header=[0, 1]` read consumed - confirm both reads target the same file).
# NOTE(review): the CSV path is an empty string here - supply the real file path before running.
df_survey = pd.read_csv("", header=None).add_prefix('Q').iloc[2:]
df_survey.head()

In [None]:
# Show the Q20 answers that are not missing.
df_survey.loc[:, 'Q20'].dropna()

In [None]:
# Keep only respondents who answered Q20 and store their answers as strings.
# Missing values are dropped directly instead of being cast to the text 'nan' and
# filtered afterwards: this avoids assigning a column into the `.iloc` slice created
# when the data was loaded and does not depend on a string sentinel.
df_survey = df_survey.dropna(subset=['Q20']).astype({'Q20': str})

In [None]:
# Pie chart: percentage of respondents per Q20 answer.
temp_series = df_survey['Q20'].value_counts()
labels = temp_series.index.to_numpy()
# Convert raw counts to percentages of the total.
sizes = temp_series.div(temp_series.sum()).mul(100).to_numpy()

trace = go.Pie(labels=labels, values=sizes)
layout = go.Layout(title='Employment Status Distribution')
data = [trace]
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename="employmentstatus")

In [None]:
# Pie chart: percentage of respondents per Q9 answer.
temp_series = df_survey['Q9'].value_counts()
labels = temp_series.index.to_numpy()
# Convert raw counts to percentages of the total.
sizes = temp_series.div(temp_series.sum()).mul(100).to_numpy()

trace = go.Pie(labels=labels, values=sizes)
layout = go.Layout(title='Management Type Distribution')
data = [trace]
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename="management")

In [None]:
# Scatter plot: one fixed-size marker per Q15 answer, coloured by how often it occurs.
cnt_srs = df_survey['Q15'].value_counts()

trace = go.Scatter(
    x=cnt_srs.index,
    y=cnt_srs.values,
    mode='markers',
    marker={
        'sizemode': 'diameter',
        'sizeref': 1,
        'size': 50,
        'color': cnt_srs.values,  # colour encodes the answer count
        'colorscale': 'Portland',
        'showscale': True,
    },
)
layout = go.Layout(title='Tenure Distribution')

data = [trace]
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename="tenure")

In [None]:
# Bar chart: how many respondents gave each 1-10 answer to Q32.
# The lookup keys are the strings "1".."10"; any other Q32 value maps to NaN
# and is therefore left out of the counts.
net_promoter_map = {str(score): score for score in range(1, 11)}
df_survey["Q32_new"] = df_survey["Q32"].map(net_promoter_map)
cnt_srs = df_survey["Q32_new"].value_counts()

trace = go.Bar(
    x=cnt_srs.index,
    y=cnt_srs.values,
    marker={
        'color': cnt_srs.values,  # colour encodes the answer count
        'colorscale': 'Rainbow',
        'reversescale': True,
    },
)
layout = go.Layout(
    title='How likely is it that you would recommend Maximus to a friend or colleague?'
)

data = [trace]
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename="PromoterScore")

In [None]:
# Generate a chart to visualize single-answer questions
def gen_chart_radiobutton(data, question_name, index, group_column_name):
    """Plot, for each group, the percentage of respondents choosing each answer.

    Prints the number of non-missing answers per group, then draws a horizontal
    grouped bar chart (one hue per group) with the x axis fixed to 0-100.

    Parameters
    ----------
    data : pandas.DataFrame
        Survey responses; must contain ``question_name`` and ``group_column_name``.
    question_name : str
        Column holding the single-answer question to chart.
    index : list
        Answer labels, in the order they should appear on the y axis.
    group_column_name : str
        Column whose values define the groups.

    Returns
    -------
    Whatever ``plt.show()`` returns.
    """
    print("Number of answers in each group: ")
    print(data[[question_name, group_column_name]].groupby(group_column_name).count())

    # Share of each answer within its group, as a percentage rounded to 2 decimals.
    i_counts = (
        data.groupby([group_column_name])[question_name]
        .value_counts(normalize=True)
        .rename("percentage(%)")
        .mul(100)
        .reset_index()
        .round(2)
    )

    # groupby() skips rows whose group is missing, so they are skipped here as well;
    # otherwise a NaN among string labels makes the sort raise TypeError.
    group_order = sorted(data[group_column_name].dropna().unique())

    fig, ax = plt.subplots(figsize=(10, 8))

    # Draw on the axes created above instead of relying on the implicit current axes.
    sns.barplot(
        x="percentage(%)",
        y=question_name,
        order=index,
        hue=group_column_name,
        hue_order=group_order,
        data=i_counts,
        ax=ax,
    )

    ax.set_title(None)
    ax.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.0, title="group")
    ax.set(xlim=(0, 100))
    ax.set_ylabel("")
    return plt.show()

In [None]:
# Agreement-scale answer labels, ordered from "Strongly disagree" to "Strongly agree".
index = [
    "Strongly disagree",
    "Disagree",
    "Neither agree nor disagree",
    "Agree",
    "Strongly agree",
]

In [None]:
# Chart the Q16 answers, split by the groups in Q13.
gen_chart_radiobutton(df_survey, "Q16", index, "Q13")

In [None]:
def gen_table(data, group_name, col_range, group_column_name='Q13'):
    """Summarise agreement-scale answers of one group as mean / std / count per item.

    Parameters
    ----------
    data : pandas.DataFrame
        Survey responses whose columns are named ``Q<n>``.
    group_name : object
        Value of ``group_column_name`` selecting the respondents to summarise.
    col_range : range or list of int
        Positional column indices of the items to include.
    group_column_name : str, default 'Q13'
        Column that holds the group label.

    Returns
    -------
    pandas.DataFrame
        One row per selected column with ``mean``, ``std`` and ``count`` of the
        1-5 scores, ``item`` (looked up in the module-level ``codebook``) and
        ``item_n`` (0-based row position).
    """
    # NOTE(review): the neutral answer is spelled "Undecided" here, while the
    # `index` list elsewhere in this notebook uses "Neither agree nor disagree".
    # Labels missing from this map become NaN and drop out of the statistics -
    # confirm which wording the data actually contains.
    likert_scores = {
        "Strongly agree": 5,
        "Agree": 4,
        "Undecided": 3,
        "Disagree": 2,
        "Strongly disagree": 1,
    }

    group_rows = data[data[group_column_name] == group_name]
    answers = group_rows.iloc[:, col_range].dropna(how="all")

    # Build a new frame of scores instead of assigning column by column into the
    # filtered slice, which pandas may flag with SettingWithCopyWarning.
    scores = answers.apply(lambda column: column.map(likert_scores))
    table = scores.describe().loc[["mean", "std", "count"]].T

    # Column names look like "Q<n>"; <n> is used as the row position in `codebook`,
    # whose second column supplies the item label.
    table["item"] = [codebook.iloc[int(name[1:]), 1] for name in table.index]

    table["item_n"] = range(len(table))
    return table

# Generate a chart to compare the mean item scores across any number of groups
def compare_importance(data, groups, col_range, group_column_name='Q13'):
    """Plot each group's mean score per item with a 1.96 * std / sqrt(count) error bar.

    Parameters
    ----------
    data : pandas.DataFrame
        Survey responses, passed through to ``gen_table``.
    groups : iterable
        Group labels (values of ``group_column_name``) to compare.
    col_range : range or list of int
        Positional column indices of the items to include.
    group_column_name : str, default 'Q13'
        Column that holds the group label.

    Returns
    -------
    Whatever ``plt.show()`` returns.

    Raises
    ------
    ValueError
        If ``groups`` is empty.
    """
    # Materialise once so a one-shot iterable can be traversed more than once.
    groups = list(groups)
    if not groups:
        raise ValueError("groups must contain at least one group name")

    # Summary table (mean / std / count per item) for every group.
    group_name_to_describe_data = {
        group_name: gen_table(data, group_name, col_range, group_column_name)
        for group_name in groups
    }

    # Item labels and positions come from the last group's table; every table is
    # built from the same col_range.
    reference = group_name_to_describe_data[groups[-1]]
    items = reference["item"].tolist()
    item_n = reference["item_n"].tolist()

    # Visualize the mean value with the 95% confidence interval
    # Change the figsize if you have more yticks
    fig, ax = plt.subplots(figsize=(10, 8), dpi=90, facecolor="w", edgecolor="k")

    for i, group_name in enumerate(groups):
        summary = group_name_to_describe_data[group_name]
        half_width = 1.96 * (summary["std"].astype(float) / (summary["count"] ** 0.5))
        # Each group is shifted down by 0.1 so its markers do not overlap the others.
        # `fmt="o"` already selects the marker; also passing marker="o" makes
        # matplotlib warn that the marker is redundantly defined.
        ax.errorbar(
            summary["mean"].astype(float),
            summary["item_n"] - 0.1 * i,
            xerr=half_width,
            fmt="o",
            elinewidth=1,
            capsize=4,
            ms=4,
            label=group_name,
        )

    ax.set_xlim(1, 5)

    ax.set_yticks(item_n)
    ax.set_yticklabels(labels=items, fontdict={'fontsize':8})

    ax.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.0, title="group")
    ax.set_title("Compare the mean values across groups (scale 1-5)", fontsize=15)

    return plt.show()

In [None]:
# NOTE(review): template call - 'x', 'y', 'z' and the two `#` range bounds are
# placeholders. As written, `#` starts a comment, so the call is never closed and
# this cell is a SyntaxError until real group labels and column positions are filled in.
compare_importance(df_survey, ['x', 
                               'y', 
                               'z', ], range(#,#))

In [None]:
# bucket rating score

def net_promoter_score(rating):
    """Classify a rating as 'Detractor', 'Passive' or 'Promoter'.

    Parameters
    ----------
    rating : int or str
        Any value ``int()`` accepts, e.g. ``9`` or ``"9"``.

    Returns
    -------
    str
        ``'Detractor'`` for ratings below 7 (negative values included),
        ``'Passive'`` for 7 or 8, ``'Promoter'`` for 9 or 10.

    Raises
    ------
    ValueError
        If the rating is greater than 10. Errors raised by ``int()`` itself
        propagate unchanged.
    """
    rating = int(rating)

    if rating < 7:
        return 'Detractor'
    if rating < 9:
        return 'Passive'
    if rating <= 10:
        return 'Promoter'

    # A rating above 10 used to fall through every branch and fail with an
    # UnboundLocalError on `bucket`; report the bad value explicitly instead.
    raise ValueError("rating must not exceed 10, got %d" % rating)

In [None]:
# Keep only respondents who answered Q32, then preview the remaining answers.
df_survey = df_survey.loc[df_survey['Q32'].notna()]
df_survey['Q32'].head(n=5)

In [None]:
# Bucket every Q32 rating with net_promoter_score and preview the first three results.
df_survey['NPS'] = df_survey['Q32'].apply(net_promoter_score)
df_survey['NPS'].head(n=3)

In [None]:
# Generate a chart to visualize single-answer questions
# NOTE(review): this redefines gen_chart_radiobutton, which is already defined
# earlier in this notebook with the same behaviour (its third parameter is named
# `index` there). The later definition silently replaces the earlier one -
# consider keeping a single definition.
def gen_chart_radiobutton(data, question_name, category, group_column_name):
    """Plot, for each group, the percentage of respondents choosing each answer.

    Prints the number of non-missing answers per group, then draws a horizontal
    grouped bar chart (one hue per group) with the x axis fixed to 0-100.

    Parameters
    ----------
    data : pandas.DataFrame
        Survey responses; must contain ``question_name`` and ``group_column_name``.
    question_name : str
        Column holding the single-answer question to chart.
    category : list
        Answer labels, in the order they should appear on the y axis.
    group_column_name : str
        Column whose values define the groups.

    Returns
    -------
    Whatever ``plt.show()`` returns.
    """
    print("Number of answers in each group: ")
    print(data[[question_name, group_column_name]].groupby(group_column_name).count())

    # Share of each answer within its group, as a percentage rounded to 2 decimals.
    i_counts = (
        data.groupby([group_column_name])[question_name]
        .value_counts(normalize=True)
        .rename("percentage(%)")
        .mul(100)
        .reset_index()
        .round(2)
    )

    # groupby() skips rows whose group is missing, so they are skipped here as well;
    # otherwise a NaN among string labels makes the sort raise TypeError.
    group_order = sorted(data[group_column_name].dropna().unique())

    fig, ax = plt.subplots(figsize=(10, 8))

    # Draw on the axes created above instead of relying on the implicit current axes.
    sns.barplot(
        x="percentage(%)",
        y=question_name,
        order=category,
        hue=group_column_name,
        hue_order=group_order,
        data=i_counts,
        ax=ax,
    )

    ax.set_title(None)
    ax.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.0, title="group")
    ax.set(xlim=(0, 100))
    ax.set_ylabel("")
    return plt.show()

In [None]:
# Bucket labels produced by net_promoter_score, in the order used for the chart's y axis.
category = [
    "Detractor",
    "Passive",
    "Promoter",
]

In [None]:
# Chart the NPS buckets, split by the groups in Q20.
gen_chart_radiobutton(
    df_survey,
    "NPS",
    category,
    "Q20",
)