In [1]:
# Import Modules

import json
import pandas as pd
import numpy as np

# json_normalize has lived in the top-level pandas namespace since pandas 1.0,
# and the old pandas.io.json import path is no longer importable in pandas 2.x.
# Try the current location first and fall back for older installs so this cell
# runs on a fresh kernel either way.
try:
    from pandas import json_normalize
except ImportError:
    from pandas.io.json import json_normalize

### regular read_json method

In [2]:
# Load the country file directly with pandas; nested objects such as
# 'geometry' and 'properties' stay as dict-valued columns.

df1 = pd.read_json(path_or_buf="country.json")
df1.head(5)

Unnamed: 0,geometry,id,properties,type
0,"{'type': 'Polygon', 'coordinates': [[[61.21081...",AFG,{'name': 'Afghanistan'},Feature
1,"{'type': 'MultiPolygon', 'coordinates': [[[[16...",AGO,{'name': 'Angola'},Feature
2,"{'type': 'Polygon', 'coordinates': [[[20.59024...",ALB,{'name': 'Albania'},Feature
3,"{'type': 'Polygon', 'coordinates': [[[51.57951...",ARE,{'name': 'United Arab Emirates'},Feature
4,"{'type': 'MultiPolygon', 'coordinates': [[[[-6...",ARG,{'name': 'Argentina'},Feature


In [3]:
# Round-trip check: write the frame back out as a list of records so the result
# can be compared with the layout of the original file.

df1.to_json(path_or_buf='test1.json', orient='records')

### json_normalize method

In [4]:
# open/load json file
# JSON text is UTF-8 by specification, so name the encoding explicitly instead
# of relying on the platform's locale default, which can mis-decode (or fail on)
# non-ASCII characters in the file.

with open('country.json', encoding='utf-8') as data_file:
    data = json.load(data_file)

In [5]:
# use json_normalize to read into df
# Flattens the nested records into one column per leaf key
# (e.g. geometry.type, properties.name).
# Call the public top-level pd.json_normalize (pandas >= 1.0): the
# pd.io.json.json_normalize path was deprecated in pandas 1.0 and is not
# available in pandas 2.x.

df = pd.json_normalize(data)
df.head()

Unnamed: 0,geometry.coordinates,geometry.type,id,properties.name,type
0,"[[[61.210817, 35.650072], [62.230651, 35.27066...",Polygon,AFG,Afghanistan,Feature
1,"[[[[16.326528, -5.87747], [16.57318, -6.622645...",MultiPolygon,AGO,Angola,Feature
2,"[[[20.590247, 41.855404], [20.463175, 41.51508...",Polygon,ALB,Albania,Feature
3,"[[[51.579519, 24.245497], [51.757441, 24.29407...",Polygon,ARE,United Arab Emirates,Feature
4,"[[[[-65.5, -55.2], [-66.45, -55.25], [-66.9599...",MultiPolygon,ARG,Argentina,Feature


In [6]:
# Load the wine review table (one row per wine).

df2 = pd.read_csv(filepath_or_buffer='../db/wine_data.csv')
df2.head(5)

Unnamed: 0,wine_id,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery,year
0,1,Portugal,"This is ripe and fruity, a wine that is smooth...",Avidagos,87,15.0,Douro,,,Roger Voss,@vossroger,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red,Quinta dos Avidagos,2011.0
1,2,US,"Tart and snappy, the flavors of lime flesh and...",,87,14.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Rainstorm 2013 Pinot Gris (Willamette Valley),Pinot Gris,Rainstorm,2013.0
2,3,US,"Pineapple rind, lemon pith and orange blossom ...",Reserve Late Harvest,87,13.0,Michigan,Lake Michigan Shore,,Alexander Peartree,,St. Julian 2013 Reserve Late Harvest Riesling ...,Riesling,St. Julian,2013.0
3,4,US,"Much like the regular bottling from 2012, this...",Vintner's Reserve Wild Child Block,87,65.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir,Sweet Cheeks,2012.0
4,5,Spain,Blackberry and raspberry aromas show a typical...,Ars In Vitro,87,15.0,Northern Spain,Navarra,,Michael Schachner,@wineschach,Tandem 2011 Ars In Vitro Tempranillo-Merlot (N...,Tempranillo-Merlot,Tandem,2011.0


In [7]:
# Keep only the grouping key and the numeric columns; the remaining text
# columns are not used in the per-country aggregation.

df2 = df2.loc[:, ['country', 'points', 'price']]
df2.head(5)

Unnamed: 0,country,points,price
0,Portugal,87,15.0
1,US,87,14.0
2,US,87,13.0
3,US,87,65.0
4,Spain,87,15.0


In [8]:
# Group the wine rows by country; the aggregates below reuse this object.

sort_df = df2.groupby(by=['country'])

In [9]:
# Mean 'points' per country.

avg_score = sort_df['points'].aggregate('mean')
avg_score.head(5)

country
Argentina                 86.760598
Armenia                   87.500000
Australia                 88.596642
Austria                   90.210774
Bosnia and Herzegovina    86.500000
Name: points, dtype: float64

In [10]:
# Number of wine rows per country (non-null count of the 'country' column
# within each group).

total_wines = sort_df['country'].aggregate('count')
total_wines.head(5)

country
Argentina                 3680
Armenia                      2
Australia                 2204
Austria                   2766
Bosnia and Herzegovina       2
Name: country, dtype: int64

In [11]:
# Combine the two per-country aggregates into one frame, then move the
# 'country' index into an ordinary column so it can be used as a merge key.

df3 = pd.DataFrame(
    {"avg_score": avg_score, "total_wines": total_wines}
).reset_index()
df3.head(5)

Unnamed: 0,country,avg_score,total_wines
0,Argentina,86.760598,3680
1,Armenia,87.5,2
2,Australia,88.596642,2204
3,Austria,90.210774,2766
4,Bosnia and Herzegovina,86.5,2


In [12]:
# merge wine data and json_normalize data
# The join key is the free-text country name, and an inner merge silently drops
# every wine country whose spelling has no exact match among the map features
# (the wine data uses short forms such as 'US'). List the dropped names so the
# loss is visible; merge_df itself is built exactly as before.
merge_df = df.merge(df3, how = 'inner', left_on = 'properties.name', right_on = 'country')

unmatched_countries = df3.loc[~df3['country'].isin(df['properties.name']), 'country'].tolist()
if unmatched_countries:
    print('Wine countries with no matching map feature (dropped by the merge):', unmatched_countries)

merge_df.head()

Unnamed: 0,geometry.coordinates,geometry.type,id,properties.name,type,country,avg_score,total_wines
0,"[[[[-65.5, -55.2], [-66.45, -55.25], [-66.9599...",MultiPolygon,ARG,Argentina,Feature,Argentina,86.760598,3680
1,"[[[43.582746, 41.092143], [44.97248, 41.248129...",Polygon,ARM,Armenia,Feature,Armenia,87.5,2
2,"[[[[145.397978, -40.792549], [146.364121, -41....",MultiPolygon,AUS,Australia,Feature,Australia,88.596642,2204
3,"[[[16.979667, 48.123497], [16.903754, 47.71486...",Polygon,AUT,Austria,Feature,Austria,90.210774,2766
4,"[[[22.65715, 44.234923], [22.944832, 43.823785...",Polygon,BGR,Bulgaria,Feature,Bulgaria,87.928058,139


In [13]:
# Give the aggregate columns the same dotted 'properties.' prefix that
# properties.name already has, so they sit alongside it in the output.

merge_df = merge_df.rename(
    columns={
        'avg_score': 'properties.avg_score',
        'total_wines': 'properties.total_wines',
    }
)

In [14]:
# 'country' duplicates properties.name after the merge, so remove it.

merge_df = merge_df.drop(labels=['country'], axis='columns')

In [15]:
# Write the merged frame out with the 'table' orient to inspect how the
# resulting JSON is laid out.

merge_df.to_json(path_or_buf='test2.json', orient='table')