id: C001

In [2]:
import pandas as pd
import numpy as np
import configparser

question_id = 'C001'
#question_title='How were you made aware of the opportunity to do a PhD at the HPI?'

#-----------------------------------------------------------------

config = configparser.ConfigParser()
config.read('config.ini')
data_folder = config['file.loading']['data_folder_path']
question_index_file_name = config['file.loading']['question_index_file_name']
data_file_name = config['file.loading']['data_file_name']
suffix = config['file.loading']['suffix']

#----------------------------------------------------------------
#LOAD QUESTION INDEX (Questions id, Question text, Answer alternatives, Details)
question_index_path = f'{data_folder}{question_index_file_name}'
id_data_frame = pd.read_csv(question_index_path,encoding = "ISO-8859-1")
filtered_df = id_data_frame[id_data_frame['ID']==question_id]

question_title = filtered_df[['Question']]

print("ID:"+question_id+" Question Title:" + question_title)

#-------------------------------------------------------------------
#LOAD Alternatives for the Question into a dictionary

# Split the string into key-value pairs
answer_options = filtered_df['Alternatives']
print(answer_options)

# Split the string into key-value pairs and flatten the list of lists (because Pandas DataFrame is horrible)
all_options_list = [option for sublist in answer_options.str.split(';') for option in sublist]
print(all_options_list)

for pair in all_options_list:
    print("pair= "+pair)

if(all_options_list!=None):
    # Create a dictionary to store the key-value pairs
    options_dict = {}

    # Iterate over the pairs and populate the dictionary
    for pair in all_options_list:
        key, value = pair.split(":")
        options_dict[int(key)] = value

    # Print the resulting dictionary
    print(options_dict)

#----------------------------------------------------------------
#LOAD DATA FILE
file_name = f'{data_path}{question_id}{sep}{question_title}{suffix}'
data_frame = pd.read_csv(file_name)
tables_path = f'{file_path}latex_tables/{question_id[:2]}/'
tables_file_name = f'{question_id}{sep}{question_title}'

program_column = 'Enrollment status:'


#----------------------------------------------------------------
#LOAD DATA FILE
#sep ='_'
#program_column = 'Enrollment status:'
#file_path = 'C:/Users/Christian/Documents/GitHub/'
#data_path = f'{file_path}data/'
#file_name = f'{data_path}{question_id}{sep}{question_title}{suffix}'
#data_frame = pd.read_csv(file_name)
#tables_path = f'{file_path}latex_tables/{question_id[:2]}/'
#tables_file_name = f'{question_id}{sep}{question_title}'

#----------------------------------------------------------------
#RENAMING 
#DataFrame Column with "Options" instead of Question Text
data_frame = data_frame.rename(columns={data_frame.columns[0]: "Options"})
data_frame = data_frame.rename(columns={data_frame.columns[1]: "Comments"})
#Make Program name shorter
data_frame = data_frame.rename(columns={program_column: "Enrollment"})
data_frame["Enrollment"] = data_frame["Program"].replace("University of Potsdam,Another University,Not Enrolled")
data_frame["Enrollment"] = data_frame["Enrollment"].replace("Other","Others")
#--------------

# Set the style for aligning columns to the left
styles = [
    {
        'selector': 'th',
        'props': [('text-align', 'left')]
    },
    {
        'selector': 'td',
        'props': [('text-align', 'left')]
    }
]

# Apply the style to the DataFrame
#df_styled = data_frame.style.set_table_styles(styles)
#display(df_styled)

                                             Question
18  ID:C001 Question Title:How were you made aware...
18    C001_01:Web;C001_02:Newsletters;C001_03:Academ...
Name: Alternatives, dtype: object
18    [C001_01:Web, C001_02:Newsletters, C001_03:Aca...
Name: Alternatives, dtype: object
['C001_01:Web', 'C001_02:Newsletters', 'C001_03:Academic conferences', 'C001_04:Friends/colleagues', 'C001_05:I completed my Bachelor/Master at HPI', 'C001_06:Other']
pair= C001_01:Web
pair= C001_02:Newsletters
pair= C001_03:Academic conferences
pair= C001_04:Friends/colleagues
pair= C001_05:I completed my Bachelor/Master at HPI
pair= C001_06:Other


### Answer count and percentages

In [None]:
import module_table_writer as table_writer
# ALL:
latex_table = table_writer.display_value_counts(data_frame, question_id, 
                                                question_title, filter_value='All', styles=None)
table_writer.table_to_file(latex_table,tables_path,tables_file_name)

# Filter by Program
program_name_array = data_frame['Program'].unique()
new_program_array= np.setdiff1d(program_name_array, ['Not answered'])

for program_name in new_program_array:
    latex_table = table_writer.display_value_counts(data_frame, question_id, question_title, 
                     filter_column= 'Program', 
                     filter_value= program_name, 
                     styles=None)
    table_writer.table_to_file(latex_table,tables_path,tables_file_name)


### Free text answers

In [None]:
df_text = pd.DataFrame(data_frame[['Comments','Program']]).dropna()
df_text = df_text.sort_values(by=['Program'])
#df_styled = df_text.style.set_table_styles(styles)

table_caption=f'Comments for {question_id}-{question_title}'
label_name = f'{question_id}-FreeText_table'
latex_table = df_text.to_latex(index=False,
                               caption=table_caption, 
                               label=label_name,
                               column_format='@{}p{0.65\\textwidth}p{0.35\\textwidth}')

table_writer.table_to_file(latex_table,tables_path,tables_file_name)


### Making sense of the data

#### Surprising findings:
- Number of recommendations from friends and colleagues (maybe a program to incentivize that?)
- Web pages or search amounts to more than 34%, which is also surprising.
- Low proportion on directed/planned initiatives like conferences and magazines.