id: C001

In [1]:
import pandas as pd
import numpy as np
import configparser
import module_table_writer as table_writer
import module_data_wrangling as dw

# Survey question analysed in this notebook and the columns used to slice it.
question_id = "C001"
#question_title='How were you made aware of the opportunity to do a PhD at the HPI?'
enrolled_column = "B003"
gender_column = "B004"
enrolled_column_name = "Enrolled"
gender_column_name = "Gender"

#-----------------------------------------------------------------
# LOAD CONFIGURATION FILE

config = configparser.ConfigParser()
# ConfigParser.read() skips files it cannot open and returns the list of files
# it actually parsed. Fail here with a clear message instead of a later
# KeyError on the 'file.loading' section.
if not config.read('config.ini'):
    raise FileNotFoundError("config.ini could not be read from the current working directory")

file_loading = config['file.loading']
project_path = file_loading['project_path']
data_folder = file_loading['data_folder']
latex_tables_folder = file_loading['latex_tables_folder']
question_index_file = file_loading['question_index_file']
data_file = file_loading['data_file']
# NOTE(review): `sep` is loaded but not passed to either pd.read_csv call below;
# confirm whether the CSV files really use the default separator.
sep = file_loading['sep']

#----------------------------------------------------------------
#LOAD QUESTION INDEX (Questions id, Question text, Answer alternatives, Details)
question_index_path = f'{project_path}/{data_folder}/{question_index_file}'
id_data_frame = pd.read_csv(question_index_path, encoding="ISO-8859-1")
filtered_df = id_data_frame[id_data_frame['ID'] == question_id]
# Without this guard an unknown question id surfaces as a bare IndexError
# from .iloc[0] on an empty frame.
if filtered_df.empty:
    raise ValueError(f"Question id {question_id!r} not found in {question_index_path}")
question_title = filtered_df['Question'].iloc[0]

print("ID:"+question_id+" Question Title:" + question_title)

# Mapping of option column id -> option label (see the printed dictionary in
# the cell output), built by the project helper from the question index rows.
options_dict = dw.generate_options_dictionary(filtered_df)

#----------------------------------------------------------------
#LOAD DATA FILE AND FILTER NECESSARY COLUMNS
file_name_path = f'{project_path}/{data_folder}/{data_file}'
data_frame = pd.read_csv(file_name_path, encoding="ISO-8859-1")

my_list = [enrolled_column, gender_column, question_id]

# Option column ids and their labels, kept as parallel lists for the helpers.
options_columns_list = list(options_dict.keys())
options_names_list = list(options_dict.values())

selected_columns = my_list + options_columns_list

# Keep only the selected columns and give the two filter columns readable
# names. Renaming by explicit name is equivalent to the positional rename of
# columns 0 and 1, because `selected_columns` starts with exactly these two.
data_frame = data_frame[selected_columns].rename(
    columns={
        enrolled_column: enrolled_column_name,
        gender_column: gender_column_name,
    }
)

#----------------------------------------------------------------
#CREATE LATEX TABLE FILE
# Tables are grouped in a sub-folder named after the first two characters of
# the question id (e.g. "C0" for "C001").
tables_path = f'{project_path}/{latex_tables_folder}/{question_id[:2]}/'
tables_file_name = f'{question_id}'

# Left-align header and data cells; intended for DataFrame.style.set_table_styles.
styles = [
    {
        'selector': 'th',
        'props': [('text-align', 'left')]
    },
    {
        'selector': 'td',
        'props': [('text-align', 'left')]
    }
]

# Apply the style to the DataFrame
#df_styled = data_frame.style.set_table_styles(styles)
#display(df_styled)

ID:C001 Question Title:How were you made aware of the opportunity to do a PhD at the HPI?
{'C001_01': 'Web', 'C001_02': 'Newsletters', 'C001_03': 'Academic conferences', 'C001_04': 'Friends/colleagues', 'C001_05': 'I completed my Bachelor/Master at HPI', 'C001_06': 'Other'}
   Enrolled  Gender  C001  C001_01  C001_02  C001_03  C001_04  C001_05  \
0         0       2   1.0      2.0      1.0      1.0      1.0      1.0   
1         0       1   1.0      1.0      1.0      1.0      2.0      1.0   
2         0       2   1.0      1.0      1.0      1.0      2.0      1.0   
3         0       2   1.0      2.0      1.0      1.0      1.0      1.0   
4         0       2   1.0      1.0      1.0      1.0      2.0      1.0   

   C001_06  
0      1.0  
1      1.0  
2      1.0  
3      1.0  
4      1.0  
C:/Users/Christian/Documents/GitHub/survey_analysis/latex_tables/C0/
C001


### Enrollment count and percentages

In [2]:
# Percentage table over all respondents.
# NOTE(review): selected_code=2 is forwarded to the project helper; the option
# columns in the data preview hold 1.0/2.0, so 2 presumably marks a selected
# option -- confirm against dw.percentage_options.
count_df = dw.percentage_options(df_data=data_frame,
                                 options_columns=options_columns_list,
                                 options_names=options_names_list,
                                 selected_code=2)
display(count_df)
latex_table = table_writer.write_latex_table(count_df, False, "All",
                                             question_id, question_title,
                                             column_format="@{}lcc")
table_writer.table_to_file(latex_table, tables_path, tables_file_name)

# Filter by enrolled
enrollment_option_array = data_frame[enrolled_column_name].unique()
print(enrollment_option_array)
#new_array= np.setdiff1d(enrollment_option_array, ['Not answered'])

filter_column = enrolled_column_name
# Rich display instead of print() so the preview renders as a table.
display(data_frame.head())
print(filter_column)

# NOTE(review): the per-enrollment breakdown below is disabled. It used to be
# wrapped in a bare triple-quoted string, which made Jupyter echo the whole
# source as the cell's output; it is kept as real comments until it is either
# re-enabled or removed.
#for enrollment_option in enrollment_option_array:
#    filtered_df = data_frame[data_frame[filter_column] == enrollment_option]
#    print(filtered_df.head())
#
#    count_df = dw.percentage_options(df_data = filtered_df,
#                                options_columns=options_columns_list,
#                                options_names=options_names_list,
#                                selected_code=2)
#    display(count_df)
#
#    latex_table = table_writer.write_latex_table(count_df,False, enrollment_option,
#                                    question_id, question_title,column_format="@{}lcc")
#    table_writer.table_to_file(latex_table,tables_path,tables_file_name)

file_nameC:/Users/Christian/Documents/GitHub/survey_analysis/latex_tables/C0/C001_tables.tex
[0 1 2]
   Enrolled  Gender  C001  C001_01  C001_02  C001_03  C001_04  C001_05  \
0         0       2   1.0      2.0      1.0      1.0      1.0      1.0   
1         0       1   1.0      1.0      1.0      1.0      2.0      1.0   
2         0       2   1.0      1.0      1.0      1.0      2.0      1.0   
3         0       2   1.0      2.0      1.0      1.0      1.0      1.0   
4         0       2   1.0      1.0      1.0      1.0      2.0      1.0   

   C001_06  
0      1.0  
1      1.0  
2      1.0  
3      1.0  
4      1.0  
Enrolled


'for enrollment_option in enrollment_option_array:\n    filtered_df = data_frame[data_frame[filter_column] == enrollment_option]    \n    print(filtered_df.head())\n    \n    count_df = dw.percentage_options(df_data = filtered_df,\n                                options_columns=options_columns_list,\n                                options_names=options_names_list,\n                                selected_code=2)\n    display(count_df)\n\n    latex_table = table_writer.write_latex_table(count_df,False, enrollment_option, \n                                    question_id, question_title,column_format="@{}lcc")\n    table_writer.table_to_file(latex_table,tables_path,tables_file_name)\n'

### Gender count and percentages

In [None]:
# Value-count table over every respondent (no gender filter column given).
latex_table = table_writer.display_value_counts(
    data_frame,
    question_id,
    question_title,
    filter_value='All',
    styles=None,
)
table_writer.table_to_file(latex_table, tables_path, tables_file_name)

# Distinct values of the gender column, with the 'Not answered' label removed;
# np.setdiff1d also returns the remaining values sorted and unique.
gender_selection_array = data_frame[gender_column_name].unique()
new_array = np.setdiff1d(gender_selection_array, ['Not answered'])

# One table per remaining gender value, each written through table_to_file
# with the same path and file name as the "All" table.
for gender_selection in new_array:
    latex_table = table_writer.display_value_counts(
        data_frame,
        question_id,
        question_title,
        filter_column=gender_column_name,
        filter_value=gender_selection,
        styles=None,
    )
    table_writer.table_to_file(latex_table, tables_path, tables_file_name)

### Free text answers

In [None]:
# Free-text answers listed next to the respondent's enrollment status.
# The enrollment column was renamed to `enrolled_column_name` ("Enrolled") when
# the data was loaded, so the lowercase 'enrolled' used before raised KeyError.
comments_column = 'Comments'
required_columns = [comments_column, enrolled_column_name]
missing_columns = [column for column in required_columns
                   if column not in data_frame.columns]

if missing_columns:
    # NOTE(review): `data_frame` only keeps the columns selected in the loading
    # cell, and 'Comments' is not among them -- add the question's free-text
    # column to that selection if this table is wanted.
    print(f'Skipping free-text table for {question_id}: '
          f'missing column(s) {missing_columns}')
else:
    # Drop rows without a comment or enrollment value, then group visually by
    # enrollment status.
    df_text = (
        data_frame[required_columns]
        .dropna()
        .sort_values(by=[enrolled_column_name])
    )
    #df_styled = df_text.style.set_table_styles(styles)

    table_caption = f'Comments for {question_id}-{question_title}'
    label_name = f'{question_id}-FreeText_table'
    # Two paragraph columns: wide for the comment text, narrow for enrollment.
    latex_table = df_text.to_latex(index=False,
                                   caption=table_caption,
                                   label=label_name,
                                   column_format='@{}p{0.65\\textwidth}p{0.35\\textwidth}')

    table_writer.table_to_file(latex_table, tables_path, tables_file_name)


### Making sense of the data

#### Surprising findings:
- Number of recommendations from friends and colleagues (maybe a program to incentivize that?)
- Web pages or web search account for more than 34%, which is also surprising.
- Low proportion of directed/planned initiatives like conferences and magazines.