# Stack Overflow Surveys

## 2. Data understanding

### 2.1. Load datasets

In [1]:
import numpy as np
import pandas as pd

import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib.ticker import StrMethodFormatter
plt.style.use('./style/minimal.mplstyle')
%config InlineBackend.figure_format='retina'

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Lift the display limits on column width and on the number of columns
for option in ('display.max_colwidth', 'display.max_columns'):
    pd.set_option(option, None)

In [None]:
root = 'data/zipped_folder/survey_results_public_'

# The 2011-2015 files are read with the second row as the header and with
# every column kept as text
legacy_options = {'header': 1, 'dtype': str}
df_11, df_12, df_13, df_14, df_15 = (
    pd.read_csv(f'{root}{year}.csv', **legacy_options)
    for year in range(2011, 2016)
)

# From 2016 onwards the files are read with the pandas defaults
df_16, df_17, df_18, df_19, df_20, df_21 = (
    pd.read_csv(f'{root}{year}.csv')
    for year in range(2016, 2022)
)

In [None]:
# Preview the first five rows of the 2021 survey
df_21.head(n=5)

### 2.2. Exploratory data analysis

Selecting questions:

1. How many caffeinated beverages per day?
2. Tabs or spaces? (2015 and 2017)
3. How much do you agree or disagree with the following statement? I want to go to Mars right now, even if there's a chance I never come back.
4. Star Wars or Star Trek?
5. Dogs or cats?
6. Do you believe in aliens?
7. How do you pronounce "GIF"?
8. Are you the "IT support person" for your family?

In [None]:
# How many caffeinated beverages per day? (2015 survey)
# value_counts() leaves missing answers out by default
q1 = df_15.loc[:, 'How many caffeinated beverages per day?'].value_counts()
q1

In [None]:
# Tabs or spaces? (2015 survey)
q2_15 = df_15.loc[:, 'Tabs or Spaces'].value_counts()
q2_15

In [None]:
# Tabs or spaces? (2017 survey)
q2_17 = df_17.loc[:, 'TabsSpaces'].value_counts()
q2_17

In [None]:
# Name each series after its survey year, so that the years become the
# column labels of the combined table
q2_15.name = '2015'
q2_17.name = '2017'

q2 = pd.concat([q2_15, q2_17], axis='columns')
q2

In [None]:
# How much do you agree or disagree with the following statement?
# I want to go to Mars right now, even if there's a chance I never come back.
# (2016 survey)
q3 = df_16.loc[:, 'agree_mars'].value_counts()
q3

In [None]:
# Star Wars or Star Trek? (2016 survey)
q4 = df_16.loc[:, 'star_wars_vs_star_trek'].value_counts()
q4

In [None]:
# Dogs or cats? (2016 survey)
q5 = df_16.loc[:, 'dogs_vs_cats'].value_counts()
q5

In [None]:
# Do you believe in aliens? (2016 survey)
q6 = df_16.loc[:, 'aliens'].value_counts()
q6

In [None]:
# How do you pronounce "GIF"? (2017 survey)
q7 = df_17.loc[:, 'PronounceGIF'].value_counts()
q7

In [None]:
# Are you the "IT support person" for your family? (2019 survey)
q8 = df_19.loc[:, 'ITperson'].value_counts()
q8

### 2.3. Null values

In [None]:
# Pairs of (survey DataFrame, question column) to inspect for null values
dfs_columns = [
    (df_15, 'How many caffeinated beverages per day?'),
    (df_15, 'Tabs or Spaces'),
    (df_17, 'TabsSpaces'),
    (df_16, 'agree_mars'),
    (df_16, 'star_wars_vs_star_trek'),
    (df_16, 'dogs_vs_cats'),
    (df_16, 'aliens'),
    (df_17, 'PronounceGIF'),
    (df_19, 'ITperson'),
]

In [None]:
space = '   '

# Header row of the plain-text table
headers = ['Respondents:',
           'Not nulls (abs):',
           'Nulls (abs):',
           'Nulls (per):',
           'Question:']
print(space.join(headers))

for df, column in dfs_columns:
    respondents = df.shape[0]
    # The mean of a boolean mask is the fraction of null answers
    null_per = df[column].isnull().mean()
    null_abs = respondents*null_per
    not_null_abs = respondents - null_abs

    # Each value is right-aligned to the length of its header
    cells = [f'{respondents:>12,}',  # 12 is the length of 'Respondents:'
             f'{not_null_abs:>16,.0f}',  # 16 is for 'Not nulls (abs):'
             f'{null_abs:>12,.0f}',  # 12 is the length of 'Nulls (abs):'
             f'{null_per:>12.2%}',  # 12 is the length of 'Nulls (per):'
             f'{column}']
    print(space.join(cells))

Some questions have a relatively high percentage of null values, such as Star Wars vs. Star Trek, with 38% nulls. However, the absolute number of valid answers is still high enough (34,398 respondents in that example) to provide representative descriptive statistics. Since any imputation method for the missing data would introduce bias, the null values will simply be removed.

## 3. Data preparation

### 3.1. Adjusting labels

In [None]:
# Dropping non-comparable items (2017 survey doesn't have 'Huh?')
q2.drop(index='Huh?', inplace=True)

In [None]:
# Standardizing and simplifying labels
q2.rename(index={'It depends': 'Both'}, inplace=True)
q4.rename(index={'Star Wars; Star Trek': 'Both'}, inplace=True)
q5.rename(index={'Other (please specify)': 'Other'}, inplace=True)
q6.rename(index={'Other (please specify)': 'Other'}, inplace=True)
q8.rename(index={'SIGH': 'Sigh'}, inplace=True)

In [None]:
# Rows sharing the same index label are summed into a single row; sum()
# skips NaNs, so the combined rows end up without them
q2 = q2.groupby(level=0).sum()

In [None]:
# Ordering in a more intuitive way
q1 = q1.reindex(index=['0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
                       'More than 10'])
q2 = q2.reindex(index=['Tabs', 'Spaces', 'Both'])
q3 = q3.reindex(index=['Disagree completely',
                       'Disagree somewhat',
                       'Neutral',
                       'Agree somewhat',
                       'Agree completely'])
q8 = q8.reindex(index=['Yes',
                       'Also Yes',
                       'Sigh',
                       'Fortunately, someone else has that title'])

In [None]:
# Breaking label lines to fit on the charts
q1.rename(index={'More than 10': 'More\nthan\n10'}, inplace=True)

q3.rename(
    index={
        'Disagree completely': 'Disagree\ncompletely',
        'Disagree somewhat': 'Disagree\nsomewhat',
        'Agree somewhat': 'Agree\nsomewhat',
        'Agree completely': 'Agree\ncompletely',
    },
    inplace=True,
)

q4.rename(
    index={
        'Star Wars': 'Star\nWars',
        'Star Trek': 'Star\nTrek',
    },
    inplace=True,
)

# Besides the line break, the comma is moved outside the quotation marks
q7.rename(
    index={
        'With a hard "g," like "gift"': 'With a hard "g",\nlike "gift"',
        'With a soft "g," like "jiff"': 'With a soft "g",\nlike "jiff"',
        'Enunciating each letter: "gee eye eff"': 'Enunciating each letter:\n"gee eye eff"',
    },
    inplace=True,
)

q8.rename(
    index={
        'Fortunately, someone else has that title': 'Fortunately, someone\nelse has that title',
    },
    inplace=True,
)

### 3.2. Getting x and y values

In [None]:
# Survey years, from 2011 to 2021
q0_x = list(range(2011, 2022))

surveys = [df_11, df_12, df_13, df_14, df_15, df_16, df_17, df_18, df_19,
           df_20, df_21]

# Number of rows (respondents) of each survey, in the same order as q0_x
q0_y = [len(survey) for survey in surveys]

In [None]:
# The convention used is labels as x and values as y
q1_x, q1_y = q1.index.tolist(), q1.tolist()

# q2 has the survey years as columns and one row of values per answer
q2_x = q2.columns.tolist()
q2_y_t, q2_y_s, q2_y_b = (q2.loc[answer].tolist()
                          for answer in ('Tabs', 'Spaces', 'Both'))
q2_l = q2.index.tolist()  # Legend

q3_x, q3_y = q3.index.tolist(), q3.tolist()
q4_x, q4_y = q4.index.tolist(), q4.tolist()
q5_x, q5_y = q5.index.tolist(), q5.tolist()
q6_x, q6_y = q6.index.tolist(), q6.tolist()
q7_x, q7_y = q7.index.tolist(), q7.tolist()
q8_x, q8_y = q8.index.tolist(), q8.tolist()

### 3.3. Transforming into percentages

In [None]:
def get_percentages(abs_values):
    """
    Express each absolute number as a fraction of the total sum.

    Parameters:
        abs_values (list of floats): A list with absolute numbers.

    Return:
        per_values (list of floats): A list with the share of the total sum
            that each absolute number represents, as a fraction (0.25 means
            25%).
    """
    counts = np.array(abs_values)
    # The total is taken along the first axis
    shares = counts/counts.sum(axis=0)

    return list(shares)

In [None]:
# Share of each answer for every question with a single series of values
(q1_y_per, q3_y_per, q4_y_per, q5_y_per,
 q6_y_per, q7_y_per, q8_y_per) = map(
    get_percentages,
    (q1_y, q3_y, q4_y, q5_y, q6_y, q7_y, q8_y),
)

In [None]:
# One array per answer (tabs, spaces, both), with one value per survey year
q2_y_t_abs, q2_y_s_abs, q2_y_b_abs = (
    np.array(values) for values in (q2_y_t, q2_y_s, q2_y_b)
)

# Total of the three answers for each survey year
total = np.sum([q2_y_t_abs, q2_y_s_abs, q2_y_b_abs], axis=0)

# Share of each answer within its own survey year
q2_y_per = [list(abs_values/total)
            for abs_values in (q2_y_t_abs, q2_y_s_abs, q2_y_b_abs)]
q2_y_t_per, q2_y_s_per, q2_y_b_per = q2_y_per

## 4. Descriptive statistics

### 4.1. Support functions

In [None]:
def calc_bar_positions(classes, space=1/2):
    """
    Compute the bar centers and the bar size for an axis of length 1.

    Every gap (between two bars, and between the outer bars and the axis
    limits) measures `space` times the size of a bar.

    Parameters:
        classes (list of strings): A list of the major tick labels for the x
            or y axis. In other words, the classes. Only its length is used.
        space (float): The space between bars, in terms of the size (width or
            height) of them.

    Return:
        coords (list of floats): A list with the x or y coordinates of the
            center of each bar.
        size (float): The width of the bar (x axis) or the height of the bar
            (y axis).

    Example:
        With 2 classes and space=1/4, the axis is split, in bar sizes, as:

        1/4 (from the start of the axis to the first bar)
        + 1 (first bar)
        + 1/4 (space between bars)
        + 1 (second bar)
        + 1/4 (from the second bar to the end of the axis)
        = 2.75

        Since the axis has length 1, the bar size is 1/2.75. In general:

        size = 1/(1 + 2*space + (n - 1)*(1 + space))

        The center of the bar i (counting from 0) is then placed at:

        coordinate = size*(1/2 + space + i*(1 + space))

        For the second bar of the example, that is:

        1/4 (from the start of the axis to the first bar)
        + 1 (first bar)
        + 1/4 (space between bars)
        + 1/2 (half of the second bar)
        = 2 bar sizes, or 2/2.75 in axis units
    """
    n = len(classes)
    # Distance between the centers of two neighboring bars, in bar sizes
    step = 1 + space
    size = 1/(1 + 2*(space) + (n - 1)*step)

    # Center of every bar, converted from bar sizes to axis units
    coords = [size*(1/2 + space + i*step) for i in range(n)]

    return coords, size

In [None]:
def annotate_plot(text, xy, xytext, **kwargs):
    """
    Annotate the point xy of the current axes with text and offset it.

    Parameters:
        text (string): The text to be annotated.
        xy (tuple of floats): The x and y coordinates of the point to
            annotate.
        xytext (tuple of floats): The amount of x and y points to offset the
            text from xy.
        **kwargs: Accepts any keyword argument from the annotate function,
            except textcoords, which is always 'offset points'. The font
            size defaults to 11 and can be overridden with fontsize.

    Return:
        None.
    """
    # Apply the default font size only when the caller did not choose one.
    # Passing it as a fixed keyword would raise a TypeError whenever
    # fontsize is also present in **kwargs
    kwargs.setdefault('fontsize', 11)

    plt.gca().annotate(text,  # This is the text
                       xy,  # These are the coordinates of the point
                       textcoords='offset points',  # How to position the text
                       xytext=xytext,  # Distance from text to point (x, y)
                       **kwargs)
    return

In [None]:
def style_vertical():
    """Apply a minimalist style to the current vertical bar chart plot"""
    ax = plt.gca()
    # Show the values of the y axis as percentages without decimal places
    ax.yaxis.set_major_formatter(StrMethodFormatter('{x:,.0%}'))
    # Hide the tick marks and the spine at the bottom of the chart
    ax.tick_params(bottom=False, width=1)
    ax.spines['bottom'].set_visible(False)

In [None]:
def style_horizontal():
    """Apply a minimalist style to the current horizontal bar chart plot"""
    ax = plt.gca()
    # Show the values of the x axis as percentages without decimal places
    ax.xaxis.set_major_formatter(StrMethodFormatter('{x:,.0%}'))
    # Hide the tick marks and the spine at the left of the chart
    ax.tick_params(left=False, width=1)
    ax.spines['left'].set_visible(False)

In [None]:
def plot_line(x, y, title, xlabel, ylabel, xlim, ylim, **kwargs):
    """
    Draw and show a line chart with a minimalist style.

    Parameters:
        x (list of floats): A list with the x values to plot.
        y (list of floats): A list with the y values to plot.
        title (string): The title of the chart.
        xlabel (string): The name of the x axis metric.
        ylabel (string): The name of the y axis metric.
        xlim (tuple of floats): A tuple with the left and right limits for
            the x axis. The right limit is extended by 0.1 when drawing.
        ylim (tuple of floats): A tuple with the bottom and top limits for
            the y axis.
        **kwargs: Accepts any keyword argument from the figure function.

    Return:
        fig (figure): A figure with the final plot stored.
    """
    fig = plt.figure(**kwargs)
    ax = plt.gca()

    ax.plot(x, y)
    ax.set_title(title)

    # The bottom spine stops at the requested limits, so it does not run
    # into the extra 0.1 margin added to the x axis
    ax.spines['bottom'].set_bounds(xlim)
    # The margin avoids trimming the round end of the line
    ax.set_xlim((xlim[0], xlim[1] + 0.1))
    ax.set_ylim(ylim)

    # Right align the xlabel to the last major tick label
    ax.set_xlabel(xlabel, va='top', ha='right', x=1.015)
    # Vertically align the ylabel to the last major tick label
    ax.set_ylabel(ylabel, va='bottom', ha='right', y=1.026)
    # Show major ticks as multiples of 1 on the x axis
    ax.xaxis.set_major_locator(plt.MultipleLocator(1))

    fig.tight_layout(h_pad=4, w_pad=4)
    plt.show()

    return fig

In [None]:
def plot_bar(x, y, title, horizontal=False, xlim=(0, 1), ylim=(0, 1),
             space=1/2, **kwargs):
    """
    Draw and show a bar chart with a minimalist style.

    Parameters:
        x (list of strings): A list with the name of the labels of the
            classes.
        y (list of floats): A list with the value of each class.
        title (string): The title of the chart.
        horizontal (bool): The orientation of the chart. The default is False,
            which is vertical.
        xlim (tuple of floats): A tuple with the left and right limits for
            the x axis. The default is (0, 1), useful for percentages.
        ylim (tuple of floats): A tuple with the bottom and top limits for
            the y axis. The default is (0, 1), useful for percentages.
        space (float): The space between bars, in terms of the size (width or
            height) of them.
        **kwargs: Accepts any keyword argument from the figure function.

    Return:
        fig (figure): A figure with the final plot stored.
    """
    fig = plt.figure(**kwargs)
    positions, thickness = calc_bar_positions(x, space)

    if not horizontal:
        plt.bar(positions, y, width=thickness)
        plt.xticks(positions, x)
        style_vertical()
    else:
        # Reversed positions place the first class at the top of the chart
        top_down = positions[::-1]
        plt.barh(top_down, y, height=thickness)
        plt.yticks(top_down, x)
        style_horizontal()

    # Styling and showing the chart
    ax = plt.gca()
    ax.set_title(title)
    ax.set_xlim(xlim)
    ax.set_ylim(ylim)
    fig.tight_layout(h_pad=4, w_pad=4)
    plt.show()

    return fig

In [None]:
def plot_slope(dates, series, legends, title, ylim=(0, 1), **kwargs):
    """
    Draw and show a slopegraph with a minimalist style.

    Each class is drawn as a line with markers. Its values are annotated as
    percentages next to the markers, and its legend is written to the left
    of the first value.

    Parameters:
        dates (list of strings): A list with two date labels, always.
        series (list of list of floats): A list with other lists inside,
            each one with the y values from a specific class.
        legends (list of strings): A list with the legend of each class.
        title (string): The title of the chart.
        ylim (tuple of floats): A tuple with the bottom and top limits for
            the y axis. The default is (0, 1), useful for percentages.
        **kwargs: Accepts any keyword argument from the figure function.

    Return:
        fig (figure): A figure with the final plot stored.
    """
    fig = plt.figure(**kwargs)

    # Pick the label color from the style sheet file
    text_style = {'va': 'center', 'color': mpl.rcParams['axes.labelcolor']}

    for values, legend in zip(series, legends):
        plt.plot(dates, values, 'o-', markersize=10)

        for position, (date, value) in enumerate(zip(dates, values)):
            percent = f'{value:.0%}'
            point = (date, value)

            if position % 2:  # Second date label, written to the right
                annotate_plot(percent, point, (10, 0), ha='left',
                              **text_style)
            else:  # First date label, written to the left with the legend
                annotate_plot(percent, point, (-10, 0), ha='right',
                              **text_style)
                annotate_plot(legend, point, (-45, 0), ha='right',
                              **text_style)

    # Styling and showing the chart
    ax = plt.gca()
    ax.set_title(title)
    ax.set_ylim(ylim)
    ax.margins(x=0.05)
    ax.tick_params(left=False, width=1)
    # The values are already annotated, so the y axis is hidden
    ax.yaxis.set_visible(False)
    ax.spines['left'].set_visible(False)
    # Limit the bottom spine to the span between 0 and 1 on the x axis
    ax.spines['bottom'].set_bounds((0, 1))
    fig.tight_layout(h_pad=4, w_pad=4)
    plt.show()

    return fig

In [None]:
def plot_hor_stacked_bars(values, legends, title, bar_colors, leg_colors,
                          xlim=(0, 1), ylim=(0, 1), **kwargs):
    """
    Draw and show a single horizontal stacked bar with a minimalist style.

    The segments are placed side by side, from left to right, and each
    legend is written above its own segment.

    Parameters:
        values (list of floats): A list with the size of each bar.
        legends (list of strings): A list with the legend of each bar.
        title (string): The title of the chart.
        bar_colors (list of strings): A list with the bar colors in the
            hexadecimal format, including the #.
        leg_colors (list of strings): A list with legend colors in the
            hexadecimal format, including the #.
        xlim (tuple of floats): A tuple with the left and right limits for
            the x axis. The default is (0, 1), useful for percentages.
        ylim (tuple of floats): A tuple with the bottom and top limits for
            the y axis. The default is (0, 1), useful for percentages.
        **kwargs: Accepts any keyword argument from the figure function.

    Return:
        fig (figure): A figure with the final plot stored.
    """
    fig = plt.figure(**kwargs)

    # Left edge of the next segment, advanced after each one is drawn
    offset = 0
    for value, legend, bar_color, leg_color in zip(values, legends,
                                                   bar_colors, leg_colors):
        segment = plt.barh(y=3/4, width=value, height=1, left=offset,
                           color=bar_color)[0]

        # Anchor the legend at the horizontal center of the top edge
        anchor = (segment.get_x() + segment.get_width()/2,
                  segment.get_y() + segment.get_height())
        annotate_plot(legend, anchor, (0, 5), ha='center', va='bottom',
                      color=leg_color)

        offset += value

    # Styling and showing the chart
    style_horizontal()
    ax = plt.gca()
    ax.set_title(title)
    ax.set_xlim(xlim)
    ax.set_ylim(ylim)
    # Show major ticks as multiples of 0.1 on the x axis
    ax.xaxis.set_major_locator(plt.MultipleLocator(0.1))
    ax.yaxis.set_visible(False)
    fig.tight_layout(h_pad=4, w_pad=4)
    plt.show()

    return fig

### 4.2. Visualizations

In [None]:
# Chart titles, numbered after the question they belong to (t0 is the
# respondents per year chart)
t0 = 'How many survey respondents per year?'
t1 = 'How many caffeinated beverages per day?'
t2 = 'Tabs or spaces?'
t3 = ('How much do you agree or disagree with the following statement?\n'
      "I want to go to Mars right now, even if there's a chance I never come "
      'back.')
t4 = 'Star Wars or Star Trek?'
t5 = 'Dogs or cats?'
t6 = 'Do you believe in aliens?'
t7 = 'How do you pronounce "GIF"?'
t8 = 'Are you the "IT support person" for your family?'

#### 4.2.1. Survey respondents

In [None]:
# Line chart with the number of respondents of each survey
fig0 = plot_line(x=q0_x, y=q0_y, title=t0,
                 xlabel='Year', ylabel='Respondents',
                 xlim=(2011, 2021), ylim=(0, 120000),
                 figsize=(10, 4.5))

#### 4.2.2. Caffeinated beverages

In [None]:
# Vertical bar chart, with narrower gaps between the bars than the default
fig1 = plot_bar(x=q1_x, y=q1_y_per, title=t1, ylim=(0, 0.25), space=1/3)

#### 4.2.3. Tabs or spaces

In [None]:
# Slopegraph comparing the share of each answer between the two surveys
fig2 = plot_slope(dates=q2_x, series=q2_y_per, legends=q2_l, title=t2,
                  ylim=(0, 0.5), figsize=(4, 4.5))

#### 4.2.4. Desire to go to Mars

In [None]:
# Defining a color palette to plot the chart
bar_colors = ['#888888', '#B4B4B4', '#E0E0E0', '#F69B51', '#F2740E']

# The legends reuse the bar colors, except for the third one:
# C2C2C2 is darker and more readable than E0E0E0
leg_colors = list(bar_colors)
leg_colors[2] = '#C2C2C2'

fig3 = plot_hor_stacked_bars(values=q3_y_per, legends=q3_x, title=t3,
                             bar_colors=bar_colors, leg_colors=leg_colors,
                             ylim=(0, 2.2), figsize=(11, 3))

#### 4.2.5. Star Wars or Star Trek

In [None]:
# Narrow vertical bar chart
fig4 = plot_bar(x=q4_x, y=q4_y_per, title=t4,
                ylim=(0, 0.7), figsize=(3.4, 4.5))

#### 4.2.6. Dogs or cats

In [None]:
# Narrow vertical bar chart
fig5 = plot_bar(x=q5_x, y=q5_y_per, title=t5,
                ylim=(0, 0.5), figsize=(3.4, 4.5))

#### 4.2.7. Aliens belief

In [None]:
# Narrow vertical bar chart
fig6 = plot_bar(x=q6_x, y=q6_y_per, title=t6,
                ylim=(0, 0.6), figsize=(3.4, 4.5))

#### 4.2.8. GIF pronunciation

In [None]:
# Horizontal bar chart, which leaves room for the long answer labels
fig7 = plot_bar(x=q7_x, y=q7_y_per, title=t7, horizontal=True,
                xlim=(0, 0.7), figsize=(8, 4))

#### 4.2.9. IT support person

In [None]:
# Horizontal bar chart, which leaves room for the long answer labels
fig8 = plot_bar(x=q8_x, y=q8_y_per, title=t8, horizontal=True,
                xlim=(0, 0.6), figsize=(8, 4))

### 4.3. Export images

In [None]:
# Figures to export, paired by position with the file names below
figures = [
    fig0, fig1, fig2,
    fig3, fig4, fig5,
    fig6, fig7, fig8,
]

# File names without extension, one per figure
file_names = [
    'q0_survey_respondents',
    'q1_caffeinated_beverages',
    'q2_tabs_or_spaces',
    'q3_desire_to_go_to_mars',
    'q4_star_wars_or_star_trek',
    'q5_dogs_or_cats',
    'q6_aliens_belief',
    'q7_gif_pronunciation',
    'q8_it_support_person',
]

In [None]:
from pathlib import Path

# savefig() does not create missing folders, so make sure the output folder
# exists before exporting
output_dir = Path('images')
output_dir.mkdir(parents=True, exist_ok=True)

# Export every figure as a high resolution PNG, trimming the extra
# whitespace around it
for fig, file_name in zip(figures, file_names):
    fig.savefig(output_dir / f'{file_name}.png', bbox_inches='tight', dpi=500)