In [32]:
import pandas as pd
from ast import literal_eval

In [33]:
# Load the survey results CSV; the relative path is resolved against the
# notebook's current working directory.
df = pd.read_csv('datavc_survey_results.csv')

In [34]:
# Keep only rows that have both a `session` and a `position` value and where
# `mc_empirical` equals 1.
# The explicit .copy() makes `df` an independent frame, so the column
# assignments in later cells do not operate on a slice of the raw data
# (which is what triggers pandas' SettingWithCopyWarning).
df = df.loc[
    df['session'].notna()
    & df['position'].notna()
    & (df['mc_empirical'] == 1)
].copy()

In [35]:
# Constant helper column: counting it per group yields row frequencies
# (this is the column `descriptives` aggregates).
df['count'] = 1

In [36]:
# List the columns available after loading and filtering, including the
# helper `count` column added above.
df.columns

Index(['session', 'created', 'modified', 'ended', 'expired', 'position',
       'mc_empirical', 'mc_group', 'mc_data_type', 'mc_data_size',
       'pre_process', 'rep_process', 'prob_rep', 'tool1', 'tool2',
       'other_tools', 'storage', 'storag2', 'other_store', 'vercon1',
       'vercon2', 'vercon3', 'other_verco', 'dvc', 'feedback', 'count'],
      dtype='object')

In [37]:
def descriptives(df, column):
    """Count the rows of ``df`` for each distinct value of ``column``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``column`` and a helper column named ``'count'``.
    column : str
        Name of the column whose values become the index of the result.

    Returns
    -------
    pandas.DataFrame
        Indexed by the values of ``column``, with a single ``'count'`` column
        holding the number of non-null ``'count'`` entries per value.
    """
    # Restrict to the two relevant columns so the pivot aggregates nothing else.
    subset = df.loc[:, [column, 'count']]
    return subset.pivot_table(index=[column], aggfunc='count')

In [38]:
# Number of rows per reported `position`.
descriptives(df, 'position')

Unnamed: 0_level_0,count
position,Unnamed: 1_level_1
PhD student,4
Postdoctoral researcher,12
Research Software Engineer,1
Researcher,1


In [39]:
# Number of rows per `mc_group` code (the labels behind the numeric codes are
# not shown in this part of the notebook).
descriptives(df, 'mc_group')

Unnamed: 0_level_0,count
mc_group,Unnamed: 1_level_1
1.0,6
2.0,8
3.0,1
5.0,3


In [40]:
# Number of rows per `mc_data_size` code (the labels behind the numeric codes
# are not shown in this part of the notebook).
descriptives(df, 'mc_data_size')

Unnamed: 0_level_0,count
mc_data_size,Unnamed: 1_level_1
1.0,1
2.0,6
4.0,4
5.0,4
6.0,1
7.0,1


In [41]:
# Number of rows per raw `mc_data_type` answer. Multi-select answers are still
# comma-separated strings here (e.g. "1, 2"), so each combination is counted
# as its own category; they are parsed into Python values in the next cell.
descriptives(df, 'mc_data_type')

Unnamed: 0_level_0,count
mc_data_type,Unnamed: 1_level_1
1,1
"1, 2",5
"1, 2, 4, 5, 6",1
2,4
"2, 3",4
"2, 3, 4",1
"2, 4, 5",1


In [42]:
# Columns whose cells hold comma-separated numeric codes such as "1, 2".
MULTI_SELECT_COLUMNS = [
    'mc_data_type',
    'tool1',
    'tool2',
    'storage',
    'storag2',
    'vercon1',
    'vercon2',
    'vercon3',
]


def _parse_codes(value):
    """Turn a string such as "1, 2" into (1, 2); return anything else unchanged.

    Only strings are handed to ``literal_eval``. Missing values (NaN), numbers
    that pandas already inferred, and values parsed by an earlier run of this
    cell pass through untouched, so the cell can be re-run safely.
    """
    if isinstance(value, str):
        return literal_eval(value)
    return value


for _column in MULTI_SELECT_COLUMNS:
    df[_column] = df[_column].apply(_parse_codes)

In [43]:
# Labels for the tool questions. Order matters: `tool()` translates numeric
# code k into entry k-1 of this list.
tool_list = [
    'Python',
    'R',
    'Stata',
    'SPSS',
    'SAS',
    'Excel',
    'Word',
    'Nvivo',
    'MAXQDA',
    'Atlas.ti',
    'Other',
]

In [44]:
# Labels for the storage questions. Order matters: `storage()` translates
# numeric code k into entry k-1 of this list.
storage_list = [
    'Google Drive',
    'OneDrive',
    'Sharepoint',
    'Dropbox',
    'Nextcloud',
    'AWS',
    'Google Cloud',
    'Microsoft Azure',
    'osf.io',
    'zenodo',
    'Own computer',
    'External hard drive',
    'University servers',
    'Other',
]

In [54]:
def tool(df, number, labels=None):
    """Tally how often each tool was selected in column ``'tool<number>'``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose ``'tool<number>'`` column holds either a single numeric
        code or a list-like of codes per row; missing rows are ignored.
    number : int or str
        Suffix appended to ``'tool'`` to build the column name.
    labels : sequence of str, optional
        Labels for the codes, where code ``k`` maps to ``labels[k - 1]``.
        Defaults to the module-level ``tool_list``.

    Returns
    -------
    pandas.DataFrame
        Columns ``'tool<number>'`` (label) and ``'count'``, sorted by count in
        descending order.

    Notes
    -----
    The code-to-label mapping is derived from the full label list. The former
    hand-written mapping stopped at code 9, so codes 10 ('Atlas.ti') and
    11 ('Other') were left as raw numbers in the result.
    """
    if labels is None:
        labels = tool_list
    tool_number = 'tool' + str(number)
    # Echo the column name so the displayed table is identifiable.
    print(tool_number)
    # One row per selected code: multi-select answers are spread over rows.
    selections = df[[tool_number]].dropna().explode(tool_number)
    selections['count'] = 1
    counts = (
        selections.groupby([tool_number]).sum()
        .sort_values(by='count', ascending=False)
        .reset_index()
    )
    # Codes are 1-based positions in the label list.
    code_to_label = dict(enumerate(labels, start=1))
    return counts.replace({tool_number: code_to_label})

In [55]:
def storage(df, number):
    """Tally how often each storage option was selected in one storage column.

    The column name is ``'storag'`` followed by ``str(number)``, so
    ``number='e'`` reads ``'storage'`` and ``number=2`` reads ``'storag2'``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose target column holds either a single numeric code or a
        list-like of codes per row; missing rows are ignored.
    number : int or str
        Suffix used to build the column name.

    Returns
    -------
    pandas.DataFrame
        Columns ``<column name>`` (label) and ``'count'``, sorted by count in
        descending order. Codes 1 to 14 are replaced by the matching entries
        of the module-level ``storage_list``.
    """
    column = 'storag{}'.format(number)
    # One row per selected code: multi-select answers are spread over rows.
    selections = df[[column]].dropna().explode(column)
    selections['count'] = 1
    totals = selections.groupby([column]).sum()
    ranked = totals.sort_values(by='count', ascending=False).reset_index()
    # Codes 1..14 correspond to positions 0..13 of storage_list.
    code_to_label = {code: storage_list[code - 1] for code in range(1, 15)}
    return ranked.replace({column: code_to_label})

In [56]:
def vercon(df, number):
    """Tally how often each answer was selected in column ``'vercon<number>'``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose ``'vercon<number>'`` column holds either a single numeric
        code or a list-like of codes per row; missing rows are ignored.
    number : int or str
        Suffix appended to ``'vercon'`` to build the column name.

    Returns
    -------
    pandas.DataFrame
        Columns ``'vercon<number>'`` (label) and ``'count'``, sorted by count
        in descending order. Codes 1 to 5 are replaced by their text labels.
    """
    column = 'vercon{}'.format(number)
    code_to_label = {
        1: 'version control',
        2: 'file_names',
        3: 'built-in',
        4: 'none',
        5: 'other',
    }
    # One row per selected code: multi-select answers are spread over rows.
    answers = df[[column]].dropna().explode(column)
    answers['count'] = 1
    totals = answers.groupby([column]).sum()
    ranked = totals.sort_values(by='count', ascending=False).reset_index()
    return ranked.replace({column: code_to_label})

In [57]:
# pre-processing: frequency of each tool selected in column `tool1`
tool(df, 1)

tool1


Unnamed: 0,tool1,count
0,Python,9
1,R,5
2,Excel,5
3,Stata,4
4,SPSS,1
5,SAS,1
6,Word,1


In [58]:
# analysis: frequency of each tool selected in column `tool2`
tool(df, 2)

tool2


Unnamed: 0,tool2,count
0,Python,7
1,R,6
2,Stata,4
3,Excel,4
4,Word,1


In [120]:
# preliminary: frequency of each option selected in column `storage`.
# The helper builds the column name as 'storag' + str(number), so passing 'e'
# selects the column named 'storage'.
storage(df, 'e')

Unnamed: 0,storage,count
0,Own computer,11
1,University servers,8
2,Google Drive,5
3,External hard drive,5
4,Dropbox,4
5,Nextcloud,4
6,osf.io,4
7,Other,4
8,OneDrive,1
9,zenodo,1


In [121]:
# final: frequency of each option selected in column `storag2`
storage(df, 2)

Unnamed: 0,storag2,count
0,Own computer,9
1,University servers,7
2,Other,7
3,zenodo,4
4,External hard drive,4
5,osf.io,3
6,Google Drive,2
7,Nextcloud,2
8,Dropbox,1


In [140]:
# manuscript: frequency of each answer selected in column `vercon1`
vercon(df, 1)

Unnamed: 0,vercon1,count
0,file_names,10
1,built-in,7
2,version control,4


In [141]:
# analysis: frequency of each answer selected in column `vercon2`
vercon(df, 2)

Unnamed: 0,vercon2,count
0,version control,9
1,file_names,3
2,built-in,3
3,none,2
4,other,1


In [142]:
# data: frequency of each answer selected in column `vercon3`
vercon(df, 3)

Unnamed: 0,vercon3,count
0,file_names,9
1,version control,4
2,built-in,3
3,none,1
4,other,1


In [60]:
# Text answers stored in `other_verco`; most rows are NaN in the output below.
# NOTE(review): `df['other_verco'].dropna()` would list only the answers given.
df['other_verco']

1                                                    NaN
19                                                   NaN
20                                                   NaN
21                                                   NaN
22                                                   NaN
24                                                   NaN
25                                                   NaN
27                                                   NaN
28                                                   NaN
29                                                   NaN
74                                                   NaN
76                                                   NaN
125                                                  NaN
153                                                  NaN
163                                                  NaN
165    For datasets I use the original datasets. I ne...
167                                                  NaN
187                            