# Load Pickle Files

In [7]:
import pandas as pd
import matplotlib.pyplot as plt

In [8]:
import pickle

### Merge crop and trades DataFrame

In [10]:
# Load the previously pickled DataFrame from disk.
# NOTE(review): unpickling can execute arbitrary code, so only load pickle files from a trusted source.
unpickled_df = pd.read_pickle("data/big_ass_df.pickle")

In [91]:
# Preview the first three rows of the loaded frame
unpickled_df.head(3)

Unnamed: 0,area_code,area_crops,item_code_crops,item_crops,element_crops,year,unit_crops,value_crops,item_key,item_code_trades,item_trades,element_trades,unit_trades,value_trades
0,2,Afghanistan,221,"Almonds, with shell",Area harvested,1975,ha,0.0,almond,231,Almonds shelled,Export Quantity,tonnes,0.0
1,2,Afghanistan,221,"Almonds, with shell",Area harvested,1975,ha,0.0,almond,231,Almonds shelled,Export Value,1000 US$,0.0
2,2,Afghanistan,221,"Almonds, with shell",Production,1975,tonnes,0.0,almond,231,Almonds shelled,Export Quantity,tonnes,0.0


In [12]:
# Clean unpickled_df: drop the element-code, flag and trade-area columns.
# errors='ignore' keeps this cell re-runnable: on a second execution the columns
# are already gone and a plain drop() would raise KeyError.
unpickled_df = unpickled_df.drop(
    columns=['element_code_crops', 'flag', 'area_trades', 'element_code_trades'],
    errors='ignore',
)

In [90]:
def double_pivot(raw_df):
    '''
    Pivot ``raw_df`` twice so that every crop element and every trade element
    ends up in its own column.

    Pass 1 spreads 'value_crops' over the entries of 'element_crops'; the
    'Area harvested', 'Production' and 'Yield' columns are renamed to
    'area_harvested', 'production' and 'yield'.
    Pass 2 spreads 'value_trades' over the entries of 'element_trades'; the
    export/import quantity and value columns are renamed to 'export_q',
    'export_v', 'import_q' and 'import_v'.

    Returns a new DataFrame with flat (single-level) column names; ``raw_df``
    itself is not modified.

    NOTE(review): value columns ('value_trades', then 'area_harvested',
    'production', 'yield') are used as pivot index keys, so rows with missing
    values in them are presumably dropped by pandas' grouping -- confirm that
    this is intended.
    '''
    # identifier columns carried through both pivots
    shared_keys = ['area_code', 'area_crops', 'item_code_crops', 'item_crops', 'year', 'item_trades']

    def spread(frame, value_column, element_column, extra_keys, new_names):
        # one pivot pass: pivot, flatten the two-level header, apply our nomenclature
        wide = frame.pivot_table(values=[value_column],
                                 index=shared_keys + extra_keys,
                                 columns=[element_column]).reset_index()
        # header entries are (value, element) tuples; join them and trim the
        # trailing blank left on the index columns
        wide.columns = [' '.join(labels).strip() for labels in wide.columns.values]
        return wide.rename(columns=new_names)

    # pass 1: column 'element_crops'
    crops_wide = spread(raw_df, 'value_crops', 'element_crops',
                        ['element_trades', 'value_trades'],
                        {'value_crops Area harvested': 'area_harvested',
                         'value_crops Production': 'production',
                         'value_crops Yield': 'yield'})

    # pass 2: column 'element_trades'
    return spread(crops_wide, 'value_trades', 'element_trades',
                  ['area_harvested', 'production', 'yield'],
                  {'value_trades Export Quantity': 'export_q',
                   'value_trades Export Value': 'export_v',
                   'value_trades Import Quantity': 'import_q',
                   'value_trades Import Value': 'import_v'})

In [88]:
# Reshape the loaded frame with double_pivot so each element gets its own column
our_df = double_pivot(unpickled_df)

In [89]:
# Preview the first rows of the pivoted frame
our_df.head()

Unnamed: 0,area_code,area_crops,item_code_crops,item_crops,year,item_trades,area_harvested,production,yield,export_q,export_v,import_q,import_v
0,1,Armenia,15,Wheat,1992,Wheat,65500.0,141483.0,21600.0,,,400000.0,60000.0
1,1,Armenia,15,Wheat,1993,Wheat,97900.0,217900.0,22257.0,,,408000.0,59000.0
2,1,Armenia,15,Wheat,1993,"Flour, wheat",97900.0,217900.0,22257.0,,,46000.0,9400.0
3,1,Armenia,15,Wheat,1994,Wheat,85697.0,152900.0,17842.0,,,327000.0,52000.0
4,1,Armenia,15,Wheat,1994,"Flour, wheat",85697.0,152900.0,17842.0,,,55000.0,14700.0


### Group by countries

In [None]:
# Load the country-group lookup table.
# The folder is spelled 'data/' (lowercase) like the notebook's other loaders;
# the previous 'Data/' spelling only resolves on case-insensitive filesystems.
df_country = pd.read_csv('data/GroupsCountry.csv')
# Normalise column names: lower case, spaces replaced by underscores
df_country.columns = [name.lower().replace(' ', '_') for name in df_country.columns]
df_country.head()

In [10]:
# Build a dictionary: country-group code -> array of the distinct country codes in that group
list_group = df_country['country_group_code'].unique()
keys = list_group
dicts = {
    group_code: df_country.loc[df_country['country_group_code'] == group_code, 'country_code'].unique()
    for group_code in list_group
}

In [11]:
# Build a dictionary: country-group code -> group (region / subregion) name.
# Code and name are read from the same rows (first occurrence of each code),
# so the pairing no longer relies on two independent unique() arrays lining up.
group_name_by_code = (
    df_country
    .drop_duplicates(subset='country_group_code')
    .set_index('country_group_code')['country_group']
)
keys = list_group
dicts_country = {group_code: group_name_by_code.loc[group_code] for group_code in list_group}

In [12]:
# Display the regions / subregions and their names (first 15 entries), indexed by group code
pd.DataFrame.from_dict(dicts_country,orient='index',columns=['regions']).head(15)

Unnamed: 0,regions
5100,Africa
5200,Americas
5300,Asia
5501,Australia and New Zealand
5206,Caribbean
5204,Central America
5301,Central Asia
5101,Eastern Africa
5302,Eastern Asia
5401,Eastern Europe


### Group by crops

In [13]:
# Quickly load the crop-group data
group_crops_df = pd.read_csv('data/GroupCrops.csv')
# Normalise column names: lower case, spaces replaced by underscores
group_crops_df.columns = [name.lower().replace(' ', '_') for name in group_crops_df.columns]
group_crops_df.shape

(2415, 9)

In [14]:
# Load the UN Comtrade Commodity Classifications from the Excel workbook
UNCCC_df = pd.read_excel('data/UNCCC.xlsx')

In [15]:
# Keep only the rows classified as 'H4'
# (presumably the HS 2012 edition, judging by the variable name -- confirm)
HS12_df = UNCCC_df.loc[UNCCC_df['Classification'] == 'H4']

In [16]:
# Crop-group rows that actually carry an hs12_code (missing codes removed)
group_crops_cleaned_df = group_crops_df[group_crops_df['hs12_code'].notnull()]

In [17]:
# If an item is assigned multiple comma-separated labels, keep only the first one.
# The resulting Series is named 0 (the first column of the expanded split).
HS12_Code = group_crops_cleaned_df['hs12_code'].str.split(', ', expand=True)[0]

In [18]:
# Append the single-label code column (named 0) and drop the columns no longer
# needed, including the original multi-label 'hs12_code'
group_crops_cleaned_df = pd.concat([group_crops_cleaned_df, HS12_Code], axis=1).drop(
    columns=['factor', 'hs_code', 'hs07_code', 'cpc_code', 'hs12_code']
)

In [19]:
# Give the appended single-label column (named 0) its final name
group_crops_cleaned_df = group_crops_cleaned_df.rename(columns={0: 'hs12_code'})

In [20]:
# Derive the grouping levels from the code prefix:
# first two characters -> parent_group, first four characters -> child_group
group_crops_cleaned_df = group_crops_cleaned_df.assign(
    parent_group=lambda frame: frame['hs12_code'].str.slice(stop=2).astype(int),
    child_group=lambda frame: frame['hs12_code'].str.slice(stop=4).astype(int),
)

In [21]:
# Final dataset used for the mapping of each crop:
# one row per distinct (item, item_code, hs12_code, parent_group, child_group) combination
group_crops_cleaned_df = (
    group_crops_cleaned_df
    .drop(columns=['item_group_code', 'item_group'])
    .groupby(['item', 'item_code', 'hs12_code', 'parent_group', 'child_group'])
    .count()
    .reset_index()
)

In [22]:
# Show the final crop-mapping table
group_crops_cleaned_df

Unnamed: 0,item,item_code,hs12_code,parent_group,child_group
0,Agave fibres nes,800,530500,53,5305
1,Alfalfa meal and pellets,862,121410,12,1214
2,Almonds shelled,231,080212,8,802
3,"Almonds, with shell",221,080211,8,802
4,Animals live nes,1171,010611,1,106
...,...,...,...,...,...
558,"Wool, shoddy",1007,510400,51,5104
559,Yams,137,071430,7,714
560,Yautia (cocoyam),135,071450,7,714
561,Yoghurt,891,040310,4,403


In [None]:
# Merge on item_code only, for the trades frame at the moment.
# NOTE(review): trades_df is not defined in any cell visible here -- confirm where it comes from.
df_final = trades_df.merge(group_crops_cleaned_df, on='item_code')
df_final.head()

In [None]:
def _index_labels_to_names(index_labels, names):
    """Replace the row labels produced by idxmin/idxmax with the matching entries of ``names``."""
    return pd.DataFrame(
        {feature: names.loc[index_labels[feature].values].values for feature in index_labels.columns},
        index=index_labels.index,
    )


def badassfunction(df_import_export, group_country=None, country=None, group_crops=None,
                   crops=None, year=None, by_crop=False, by_country=False):
    """
    Filter the import/export frame and summarise it per crop or per country.

    INPUTS :
    - df_import_export : DataFrame holding at least the columns 'area', 'area_code',
      'item_x', 'parent_group', 'child_group', 'year', 'export_quantity',
      'export_value', 'import_quantity' and 'import_value'. It is not modified.
    - group_country : country-group code; keeps the rows whose 'area_code' is listed
      for that code in the notebook-level ``dicts`` mapping
    - country : country code; compared with 'country_code' when that column exists,
      otherwise with 'area_code' (the column every other step of this function uses)
    - group_crops : value compared with 'parent_group'
    - crops : value compared with 'child_group'
    - year : value compared with 'year'
    - by_crop : group the summary by ('item_x', 'parent_group', 'child_group')
    - by_country : group the summary by ('area', 'area_code'); takes precedence
      when both flags are set
    OUTPUT :
    - one row per group with, for each feature (export/import quantity and value,
      'diff_quantity' = export - import quantity, 'profit' = export - import value):
      the sum, '<feature>_min', '<feature>_min_names', '<feature>_max' and
      '<feature>_max_names'. The *_names columns hold the 'area' (by_crop) or
      'item_x' (by_country) of the row where the extreme value occurs.
    RAISES :
    - ValueError if neither by_crop nor by_country is set
    - KeyError if group_country is not a key of ``dicts``

    NOTE(review): the name lookup goes through row labels, so it assumes the
    frame's index is unique -- confirm for frames that were concatenated.
    """
    if not (by_crop or by_country):
        raise ValueError("set by_crop=True or by_country=True to choose how the summary is grouped")

    df_cleaned = df_import_export
    # Extract country
    if group_country is not None:
        member_codes = dicts.get(group_country)
        if member_codes is None:
            raise KeyError("unknown country group code: {!r}".format(group_country))
        df_cleaned = df_cleaned[df_cleaned['area_code'].isin(member_codes)]
    if country is not None:
        # keep the original column when it is there, fall back to 'area_code' otherwise
        country_column = 'country_code' if 'country_code' in df_cleaned.columns else 'area_code'
        df_cleaned = df_cleaned[df_cleaned[country_column] == country]
    if group_crops is not None:
        df_cleaned = df_cleaned[df_cleaned['parent_group'] == group_crops]
    if crops is not None:
        df_cleaned = df_cleaned[df_cleaned['child_group'] == crops]
    if year is not None:
        df_cleaned = df_cleaned[df_cleaned['year'] == year]

    # Add new features; assign() builds a new frame, so neither the caller's
    # frame nor a filtered slice of it is written to
    df_cleaned = df_cleaned.assign(
        diff_quantity=df_cleaned['export_quantity'] - df_cleaned['import_quantity'],
        profit=df_cleaned['export_value'] - df_cleaned['import_value'],
    )

    # If a feature is added above then add its name to this list
    list_features = ['export_quantity', 'export_value', 'import_quantity', 'import_value',
                     'diff_quantity', 'profit']

    # by_country wins when both flags are set
    if by_country:
        group_keys = ['area', 'area_code']
        replace_item = 'item_x'
    else:
        group_keys = ['item_x', 'parent_group', 'child_group']
        replace_item = 'area'

    grouped = df_cleaned.groupby(group_keys)[list_features]
    names = df_cleaned[replace_item]

    # Every aggregate shares the same group index, so the pieces are aligned on
    # the full group key instead of being merged on its first column only
    summary_parts = [
        grouped.sum(),
        grouped.min().add_suffix('_min'),
        _index_labels_to_names(grouped.idxmin(), names).add_suffix('_min_names'),
        grouped.max().add_suffix('_max'),
        _index_labels_to_names(grouped.idxmax(), names).add_suffix('_max_names'),
    ]
    return pd.concat(summary_parts, axis=1).reset_index()