In [15]:
import pandas as pd
import json
import re
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np

In [2]:
proposals = pd.read_pickle('proposals.pkl')
themes = pd.read_pickle('themes.pkl')

In [3]:
region_order = [
    'North East',
    'North West',
    'Yorkshire and The Humber',
    'East Midlands',
    'West Midlands',
    'East of England',
    'London',
    'South East',
    'South West',
    'England',
    'Scotland',
    'Wales',
    'Northern Ireland',
    'UK wide',
    'Overseas',
    'UK and overseas',
]

In [26]:
# Get the most common words for a dataset
def most_common_words(df, n=10):
    text = df[
        ['title', 'description']
    ].fillna("")\
        .apply(" ".join, axis=1)\
        .str.lower()\
        .map(lambda x: re.sub(' +', ' ', re.sub('[^ a-z0-9]', ' ', x)))
    count_vectorizer = CountVectorizer(stop_words='english')
    count_data = count_vectorizer.fit_transform(text)
    words = count_vectorizer.get_feature_names()
    total_counts = np.zeros(len(words))
    for t in count_data:
        total_counts+=t.toarray()[0]
    
    count_dict = (zip(words, total_counts))
    count_dict = sorted(count_dict, key=lambda x:x[1], reverse=True)[0:n]
    return dict(count_dict)

In [27]:
def create_profile(p, t):
    areas = p['area'].value_counts().to_dict()
    return {
        'recipient_category': p['recipient_category'].value_counts().to_dict(),
        'recipient_income_band': p['recipient_income_band'].value_counts().sort_index().to_dict(),
        'recipient_operating_for': p['recipient_operating_for'].value_counts().sort_index().to_dict(),
        'geographic_scale': p['geographic_scale'].value_counts().to_dict(),
        'category': p['category'].value_counts().to_dict(),
        'amount_stats': p['min_amount'].describe().to_dict(),
        'amount_bins': p['amount_bins'].value_counts().sort_index().to_dict(),
        'duration_bins': p['duration_bins'].value_counts().sort_index().to_dict(),
        'area': {
            r: areas.get(r, 0)
            for r in region_order
        },
        'themes': t[t['proposal_id'].isin(p.index)]['parent_name'].value_counts().to_dict(),
        'examples': p.loc[
            (p.created_at.dt.year >= 2018) & (p.recipient_category != "An individual"),
            ["title", "description", 'geographic_scale', 
            'recipient_operating_for', 'recipient_income_band', 
            'area', 'category', 'recipient_category', 
            'amount_bins', 'duration_bins']
        ].sample(15).to_dict(orient='records'),
        'word_counts': most_common_words(p, n=30)
    }

In [28]:
t = themes[themes['parent_name']!="Groups"]
results = {
    "area": {},
    "theme": {},
}

# full dataset
results['all'] = create_profile(proposals, t)

# by area
for k in proposals.area.dropna().unique():
    print(k)
    results['area'][k] = create_profile(proposals[proposals.area==k], t)

# by theme
for k in t['parent_name'].unique():
    print(k)
    p = proposals.loc[themes.loc[themes['parent_name']==k, 'proposal_id'], :]
    results['theme'][k] = create_profile(p, t)

# add existing examples
with open('docs/results.json', 'r') as b:
    existing_results = json.load(b)
    results['all']['examples'] = existing_results['all']['examples']
    for k in ['area', 'theme']:
        for p in results[k]:
            if existing_results[k].get(p, {}).get('examples'):
                results[k][p]['examples'] = existing_results[k][p]['examples']
    
with open('docs/results.json', 'w') as a:
    json.dump(results, a, indent=4)

East of England
Scotland
South West
West Midlands
North West
Overseas
North East
England
South East
UK and overseas
East Midlands
Wales
Yorkshire and The Humber
London
UK wide
Northern Ireland
Education and training
Public and societal benefit


Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#deprecate-loc-reindex-listlike
  return self._getitem_tuple(key)


Health and medicine
Social welfare
Crime and justice
Employment
Climate and the environment
Community improvement and capacity building
International and foreign affairs
Arts and recreation
Science and technology


In [33]:
proposals.created_at.dt.year.value_counts().sort_index()

2015       45
2016     4400
2017    10225
2018     6174
2019     1982
2020      530
Name: created_at, dtype: int64