# Data Processing

In [1]:
import pandas as pd
import re

Load the survey data, keeping only the columns we need, and drop respondents who left either web-framework question unanswered

In [92]:
# Only these three columns are read from the raw survey export.
cols = ['Country', 'WebframeHaveWorkedWith', 'WebframeWantToWorkWith']

# thresh=2 over the two web-framework columns keeps a respondent only when
# both questions were answered.
survey = (
	pd.read_csv('./data-raw/survey_results_public.csv', usecols=cols)
	.dropna(thresh=2, subset=cols[1:])
)

# survey.head(5)

Split the semicolon-delimited framework columns into lists, then count how often each framework appears

In [93]:
def split_column(value):
	"""Split a semicolon-delimited answer string into a list of entries.

	Leading/trailing whitespace on the whole string and any whitespace
	around each ``;`` is discarded. Empty entries (produced by an empty
	string, a trailing ``;`` or a doubled ``;;``) are dropped so they are
	not later counted as a framework named ``''``.

	Parameters
	----------
	value : str
		Raw cell value, e.g. ``'React; Vue.js;Django'``.

	Returns
	-------
	list of str
		The non-empty entries in their original order; ``[]`` when the
		string contains no entries.
	"""
	# The pattern consumes the whitespace on both sides of each delimiter.
	parts = re.split(r'\s*;\s*', value.strip())
	# re.split yields '' for empty input and around stray delimiters.
	return [part for part in parts if part]

# Replace each raw answer string with its list of frameworks.
# NOTE: this overwrites the columns, so the cell cannot be re-run without
# reloading `survey` first (split_column calls .strip(), which lists lack).
for webframe_col in ('WebframeHaveWorkedWith', 'WebframeWantToWorkWith'):
	survey[webframe_col] = survey[webframe_col].map(split_column)

# One row per (respondent, framework) after exploding, then count occurrences
# of each framework across all respondents.
have_used = survey['WebframeHaveWorkedWith'].explode().value_counts()
want_use = survey['WebframeWantToWorkWith'].explode().value_counts()

Build frameworks dataset

In [94]:
# Stack the two count Series as rows, flip so every framework becomes a row,
# label the two count columns, and move the framework names out of the index
# into a regular 'framework' column.
frameworks = (
	pd.DataFrame([have_used, want_use])
	.transpose()
	.set_axis(['have used', 'want to use'], axis=1)
	.reset_index(level=0, names=['framework'])
)

# Export for the app (the positional index is written as the first column).
frameworks.to_csv('../app/data/frameworks.csv')

# frameworks.head(5)

Build countries dataset

In [63]:
def get_country_users(data, col, type):
	"""Count users per (country, framework) pair for one survey question.

	Parameters
	----------
	data : pandas.DataFrame
		Frame with a 'Country' column and a column `col` whose cells are
		list-like and get exploded into one row per element.
	col : str
		Name of the list-valued column to explode and count.
	type : object
		Value written into every row of the 'type' column of the result.

	Returns
	-------
	pandas.DataFrame
		Columns 'country', 'framework', 'users', 'type', one row per
		distinct (country, framework) combination found in `data`.
	"""
	# One row per (country, single framework), then tally identical pairs.
	pair_counts = data[['Country', col]].explode(col).value_counts()
	return (
		pair_counts
		.reset_index()
		.set_axis(['country', 'framework', 'users'], axis=1)
		.assign(type=type)
	)

# Build one long-format table per survey question, tagged by its 'type'.
have_used_countries = get_country_users(
	survey, col='WebframeHaveWorkedWith', type='have used'
)
want_use_countries = get_country_users(
	survey, col='WebframeWantToWorkWith', type='want to use'
)

# Stack both tables row-wise; each part keeps its own row labels.
countries = pd.concat([have_used_countries, want_use_countries], axis=0)
countries.to_csv('../app/data/countries.csv')

Load the tags dataset (parsing comma thousands separators) and add derived columns for answered and solved posts

In [None]:
# thousands=',' makes pandas parse values such as "1,234" as numbers.
tags = pd.read_csv('./data-raw/tags.csv', thousands=',')

# Derive the complementary counts from the totals.
tags['answered posts'] = tags['total posts'].sub(tags['unanswered posts'])
tags['solved posts'] = tags['total posts'].sub(tags['unsolved posts'])

# Export for the app, then preview the first rows as the cell output.
tags.to_csv('../app/data/tags.csv')
tags.head(5)