id: C001

In [2]:
import pandas as pd
import numpy as np
import configparser
import module_table_writer as table_writer

# Setup cell for one survey question: loads the configuration, looks up the
# question text and answer options, loads the survey data restricted to the
# columns needed for this question, and prepares the LaTeX output location.

question_id = "C001"
#question_title='How were you made aware of the opportunity to do a PhD at the HPI?'
enrollment_column = "B003"
gender_column = "B004"
enrollment_column_name = "Enrolled"
gender_column_name = "Gender"

#-----------------------------------------------------------------
# LOAD CONFIGURATION FILE

config = configparser.ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed, so fail here with a clear message instead of a
# later KeyError on the missing section.
if not config.read('config.ini'):
    raise FileNotFoundError("Could not read configuration file 'config.ini'")
file_loading = config['file.loading']
project_path = file_loading['project_path']
data_folder = file_loading['data_folder']
latex_tables_folder = file_loading['latex_tables_folder']
question_index_file = file_loading['question_index_file']
data_file = file_loading['data_file']
# `sep` is used below as the separator inside the output file name.
sep = file_loading['sep']

#----------------------------------------------------------------
#LOAD QUESTION INDEX (Questions id, Question text, Answer alternatives, Details)
question_index_path = f'{project_path}/{data_folder}/{question_index_file}'
#print(question_index_path)
id_data_frame = pd.read_csv(question_index_path, encoding="ISO-8859-1")
filtered_df = id_data_frame[id_data_frame['ID'] == question_id]
if filtered_df.empty:
    raise ValueError(
        f"Question id {question_id!r} not found in {question_index_path}")

# Take the title as a plain string. Selecting with [['Question']] would yield
# a one-column DataFrame, whose repr would end up in the printed message, the
# table captions and the output file name.
question_title = str(filtered_df['Question'].iloc[0])

print(f"ID:{question_id} Question Title:{question_title}")

# NOTE(review): presumably maps answer-option column ids to their labels;
# only its keys are used here, as additional column names to select.
options_dict = table_writer.generate_options_dictionary(filtered_df)

#----------------------------------------------------------------
#LOAD DATA FILE AND FILTER NECESSARY COLUMNS
file_name_path = f'{project_path}/{data_folder}/{data_file}'
data_frame = pd.read_csv(file_name_path, encoding="ISO-8859-1")

my_list = [enrollment_column, gender_column, question_id]

# Get a list of all keys
keys_list = list(options_dict.keys())

selected_columns = my_list + keys_list

# Create a new DataFrame with only the selected columns
data_frame = data_frame[selected_columns]

#RENAMING
# Give the enrollment and gender columns readable names. They are the first
# two selected columns, so renaming by id is equivalent to renaming by
# position and states the intent explicitly.
data_frame = data_frame.rename(columns={
    enrollment_column: enrollment_column_name,
    gender_column: gender_column_name,
})
print(data_frame.head())

#Make Program name shorter
#data_frame = data_frame.rename(columns={enrollment_filter: "Enrollment"})
#data_frame["Enrollment"] = data_frame["Program"].replace("University of Potsdam,Another University,Not Enrolled")
#data_frame["Enrollment"] = data_frame["Enrollment"].replace("Other","Others")

#----------------------------------------------------------------
#CREATE LATEX TABLE FILE
# Tables are grouped in a sub-folder named after the first two characters of
# the question id.
tables_path = f'{project_path}/{latex_tables_folder}/{question_id[:2]}/'
tables_file_name = f'{question_id}{sep}{question_title}'

# Set the style for aligning columns to the left
styles = [
    {
        'selector': 'th',
        'props': [('text-align', 'left')]
    },
    {
        'selector': 'td',
        'props': [('text-align', 'left')]
    }
]

# Apply the style to the DataFrame
#df_styled = data_frame.style.set_table_styles(styles)
#display(df_styled)

In [3]:
# Scratch check: the fixed id columns followed by the keys of a dictionary
# give the full list of column names to select.
question_id = 'C001'
enrollment_column, gender_column = "B003", "B004"
my_dict = dict(a=1, b=2, c=3)

# Fixed columns first: enrollment, gender, then the question itself.
my_list = [enrollment_column, gender_column, question_id]
print(my_list)

# Iterating a dict yields its keys in insertion order.
keys_list = [*my_dict]
print(keys_list)

# Join both lists into one flat list.
concatenated_lists = [*my_list, *keys_list]
print(concatenated_lists)

['B003', 'B004', 'C001']
['a', 'b', 'c']
['B003', 'B004', 'C001', 'a', 'b', 'c']


### Answer count and percentages

In [None]:
# Build the answer-count LaTeX table for all respondents, then one table per
# enrollment group, and hand each one to table_writer.table_to_file.
# NOTE(review): every table is written with the same tables_file_name;
# confirm that table_to_file appends or derives a distinct name, otherwise
# later tables replace earlier ones.

# ALL:
latex_table = table_writer.display_value_counts(
    data_frame, question_id, question_title,
    filter_value='All', styles=None)
table_writer.table_to_file(latex_table, tables_path, tables_file_name)

# Filter by Enrollment
# The enrollment column was renamed to `enrollment_column_name` in the setup
# cell, so look it up through that variable rather than a hard-coded label.
# Missing values are dropped first: np.setdiff1d sorts its input, which fails
# on an object array that mixes strings with NaN.
program_name_array = data_frame[enrollment_column_name].dropna().unique()
new_program_array = np.setdiff1d(program_name_array, ['Not answered'])

for program_name in new_program_array:
    latex_table = table_writer.display_value_counts(
        data_frame, question_id, question_title,
        filter_column=enrollment_column_name,
        filter_value=program_name,
        styles=None)
    table_writer.table_to_file(latex_table, tables_path, tables_file_name)


### Free text answers

In [None]:
# Collect the free-text comments together with the respondent's enrollment
# group, sorted by group, and export them as a two-column LaTeX table.
# The enrollment column was renamed to `enrollment_column_name` in the setup
# cell, so it is referenced through that variable.
# NOTE(review): 'Comments' is not among the columns explicitly selected in the
# setup cell; confirm it is one of the option columns kept in data_frame.
df_text = data_frame[['Comments', enrollment_column_name]].dropna()
df_text = df_text.sort_values(by=[enrollment_column_name])
#df_styled = df_text.style.set_table_styles(styles)

table_caption = f'Comments for {question_id}-{question_title}'
label_name = f'{question_id}-FreeText_table'
# Wide first column for the comment text, narrower second one for the group.
latex_table = df_text.to_latex(index=False,
                               caption=table_caption,
                               label=label_name,
                               column_format='@{}p{0.65\\textwidth}p{0.35\\textwidth}')

# NOTE(review): this reuses the file name of the answer-count tables; confirm
# that table_to_file does not overwrite them.
table_writer.table_to_file(latex_table, tables_path, tables_file_name)


### Making sense of the data

#### Surprising findings:
- Number of recommendations from friends and colleagues (maybe a program to incentivize that?)
- Web pages and web search together account for more than 34% of the answers, which is also surprising.
- Low proportion of answers naming directed/planned initiatives such as conferences and magazines.