# Load Pickle Files

In [1]:
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
import pickle

### Merge crops and trades DataFrames

In [3]:
def double_pivot(raw_df):
    '''
    Reshape the merged crops/trades table from long to wide format.

    Two pivots are applied in sequence: the first spreads the values of
    'element_crops' into columns filled with 'value_crops', the second spreads
    the values of 'element_trades' into columns filled with 'value_trades'.
    After each pivot the two-level column labels are flattened and renamed, so
    the result carries 'area_harvested', 'production', 'yield', 'export_q',
    'export_v', 'import_q' and 'import_v' next to the identifying columns.
    '''
    # identifying columns shared by both pivots
    id_columns = ['area_code', 'area_crops', 'item_code_crops', 'item_crops', 'year', 'item_trades']

    def flatten(labels):
        # join both column levels with a space; strip the padding left by empty levels
        return [' '.join(parts).strip() for parts in labels]

    # first pivot: one column per value of 'element_crops'
    crops_wide = raw_df.pivot_table(
        values=['value_crops'],
        index=id_columns + ['element_trades', 'value_trades'],
        columns=['element_crops'],
    ).reset_index()
    crops_wide.columns = flatten(crops_wide.columns.values)
    crops_wide = crops_wide.rename(columns={
        'value_crops Area harvested': 'area_harvested',
        'value_crops Production': 'production',
        'value_crops Yield': 'yield',
    })

    # second pivot: one column per value of 'element_trades'
    fully_wide = crops_wide.pivot_table(
        values=['value_trades'],
        index=id_columns + ['area_harvested', 'production', 'yield'],
        columns=['element_trades'],
    ).reset_index()
    fully_wide.columns = flatten(fully_wide.columns.values)
    fully_wide = fully_wide.rename(columns={
        'value_trades Export Quantity': 'export_q',
        'value_trades Export Value': 'export_v',
        'value_trades Import Quantity': 'import_q',
        'value_trades Import Value': 'import_v',
    })

    return fully_wide

### Group crops

In [4]:
def load_group_crops_df():
    '''
    Build the crop-group lookup table.

    Reads 'data/GroupCrops.csv' and 'data/UNCCC.xlsx', keeps the first HS12
    code of every crop row, derives a parent group (first two characters of
    the code) and a child group (first four characters), and attaches the
    description of both groups taken from the 'H4' classification rows.
    Rows without an HS12 code, or whose parent or child group has no
    description, are not part of the result.
    '''
    crops = pd.read_csv('data/GroupCrops.csv')
    # normalise column names: lower case, spaces replaced by underscores
    crops.columns = [label.lower().replace(' ', '_') for label in crops.columns]

    # UN Comtrade commodity classifications; only the 'H4' rows are used
    classifications = pd.read_excel('data/UNCCC.xlsx')
    descriptions = classifications[classifications.Classification == 'H4'][['Code', 'Description']]

    # discard crops that have no HS12 code
    crops = crops[crops['hs12_code'].notnull()]

    # when several comma-separated codes are listed, only the first one is kept
    first_code = crops['hs12_code'].str.split(', ', expand=True).loc[:, 0]

    unused_columns = ['factor', 'hs_code', 'hs07_code', 'cpc_code', 'hs12_code',
                      'item_group_code', 'item_group', 'item']
    crops = pd.concat([crops, first_code], axis=1)
    crops = crops.drop(unused_columns, axis=1)
    crops = crops.rename(columns={0: 'hs12_code'})

    # parent and child group codes are prefixes of the HS12 code
    crops['parent_group'] = crops['hs12_code'].str[:2]
    crops['child_group'] = crops['hs12_code'].str[:4]

    # attach the description of the parent group, then of the child group
    for group_column, description_column in (('parent_group', 'parent_description'),
                                             ('child_group', 'child_description')):
        crops = crops.merge(descriptions, how='inner', left_on=group_column, right_on='Code')
        crops = crops.drop(['Code'], axis=1)
        crops = crops.rename(columns={'Description': description_column})

    return crops.drop_duplicates()

In [5]:
def final_df() :
    '''
    Assemble the final table: pivoted crops/trades rows enriched with crop groups.

    Loads 'data/big_ass_df.pickle', removes the columns that are not needed,
    reshapes the table with double_pivot() and left-joins the crop-group table
    returned by load_group_crops_df() on the item code. Both item code columns
    are removed from the result.
    '''
    raw = pd.read_pickle("data/big_ass_df.pickle")

    # columns that play no role in the pivots
    raw = raw.drop(columns=['element_code_crops', 'flag', 'area_trades', 'element_code_trades'])

    # long -> wide, one column per crop element and per trade element
    items_df = double_pivot(raw)

    # crop-group lookup table
    groups_df = load_group_crops_df()

    # left join keeps the items that have no matching group
    joined = items_df.merge(groups_df, how='left', left_on='item_code_crops', right_on='item_code')

    return joined.drop(columns=['item_code_crops', 'item_code'])

In [6]:
# Build the final table.
# NOTE: this rebinds the name `final_df` from the function to the DataFrame it
# returns, so running this statement a second time calls a DataFrame and fails.
final_df = final_df()

In [275]:
# Trade items of the rows that have no child group, i.e. rows that found no
# match in the crop-group table during the left join
final_df[final_df.child_group.isnull()].item_trades.unique()

array(['Cigars, cheroots', 'Vegetables, preserved nes',
       'Vegetables in vinegar', 'Vegetables, dehydrated',
       'Vegetables, frozen', 'Vegetables, temporarily preserved',
       'Oil, vegetable origin nes',
       'Vegetables, homogenized preparations',
       'Vegetables, preserved, frozen', 'Waxes vegetable',
       'Feed, vegetable products nes',
       'Leeks, other alliaceous vegetables', 'Cereals, breakfast',
       'Flour, cereals', 'Juice, citrus, concentrated',
       'Juice, citrus, single strength', 'Flax fibre and tow',
       'Flax fibre raw', 'Oil, boiled etc', 'Oil, essential nes',
       'Oils, fats of animal nes', 'Oil, citronella',
       'Hair, goat, coarse', 'Pyrethrum, dried', 'Pyrethrum, extraction',
       'Peppermint'], dtype=object)

In [11]:
# Distinct child descriptions of the rows whose parent group is '10'
final_df[final_df.parent_group=='10'].child_description.unique()

array(['Wheat and meslin', 'Barley', 'Maize (corn)', 'Rye', 'Oats',
       'Rice', 'Buckwheat, millet and canary seeds; other cereals',
       'Grain sorghum'], dtype=object)

In [13]:
# Save the final DataFrame as a pickle file
final_df.to_pickle('data/final_df.pkl')

### Country groups

In [92]:
# Load the country-group dataset.
# The path uses the same lower-case 'data/' directory as every other file read
# or written in this notebook, so it also resolves on case-sensitive file systems.
df_country = pd.read_csv('data/GroupsCountry.csv')
# Normalise column names: lower case, spaces replaced by underscores
df_country.columns = [name.lower().replace(' ', '_') for name in df_country.columns]
df_country.head()

Unnamed: 0,country_group_code,country_group,country_code,country,m49_code,iso2_code,iso3_code
0,5100,Africa,4,Algeria,12,DZ,DZA
1,5100,Africa,7,Angola,24,AO,AGO
2,5100,Africa,53,Benin,204,BJ,BEN
3,5100,Africa,20,Botswana,72,BW,BWA
4,5100,Africa,233,Burkina Faso,854,BF,BFA


In [93]:
# Map every country group code to the array of distinct country codes in that group
list_group = df_country['country_group_code'].unique()
keys = list_group
dicts = {
    group_code: df_country.loc[df_country['country_group_code'] == group_code, 'country_code'].unique()
    for group_code in list_group
}

In [94]:
# Map every country group code to the name of that group.
# Code and name are read from the same row (the first row of each code), so the
# pairing does not rely on the distinct codes and the distinct names happening
# to come out in the same order and in equal numbers.
keys = list_group
dicts_country = (
    df_country
    .drop_duplicates(subset='country_group_code')
    .set_index('country_group_code')['country_group']
    .to_dict()
)

In [95]:
# Display the first 15 regions/subregions and their names
pd.DataFrame.from_dict(dicts_country,orient='index',columns=['regions']).head(15)

Unnamed: 0,regions
5100,Africa
5200,Americas
5300,Asia
5501,Australia and New Zealand
5206,Caribbean
5204,Central America
5301,Central Asia
5101,Eastern Africa
5302,Eastern Asia
5401,Eastern Europe


In [None]:
def badassfunction(df_import_export, group_country=None, country=None, group_crops=None,
                   crops=None, year=None, by_crop=False, by_country=False):
    """
    Filter the import/export table and summarise it per crop or per country.

    INPUTS :
    - df_import_export : DataFrame holding the columns 'export_quantity',
      'export_value', 'import_quantity', 'import_value', the grouping columns
      ('item_x', 'parent_group', 'child_group' for by_crop; 'area', 'area_code'
      for by_country) and the columns read by the requested filters.
      The DataFrame is left unmodified.
    - group_country : key of the module-level `dicts` mapping; keeps the rows
      whose 'area_code' is listed for that group
    - country : country code to keep; compared with 'country_code' when that
      column exists, otherwise with 'area_code'
    - group_crops : keeps the rows whose 'parent_group' equals this value
    - crops : keeps the rows whose 'child_group' equals this value
    - year : keeps the rows whose 'year' equals this value
    - by_crop : group the summary by ('item_x', 'parent_group', 'child_group')
    - by_country : group the summary by ('area', 'area_code'); takes precedence
      when both flags are set
    OUTPUT :
    - DataFrame with one row per group holding, for every feature
      (the four trade columns plus 'diff_quantity' and 'profit'):
      the sum (plain feature name), the minimum and maximum ('<feature>_min',
      '<feature>_max') and the area (by_crop) or item (by_country) of the row
      where that extreme occurs ('<feature>_min_names', '<feature>_max_names')
    RAISES :
    - ValueError when neither by_crop nor by_country is set
    """
    if not (by_crop or by_country):
        raise ValueError("badassfunction needs by_crop=True or by_country=True")

    df_cleaned = df_import_export

    # Apply the requested filters
    if group_country is not None:
        df_cleaned = df_cleaned[df_cleaned['area_code'].isin(dicts.get(group_country))]
    if country is not None:
        # The rest of this function identifies countries through 'area_code';
        # fall back to it when the table has no 'country_code' column.
        country_column = 'country_code' if 'country_code' in df_cleaned.columns else 'area_code'
        df_cleaned = df_cleaned[df_cleaned[country_column] == country]
    if group_crops is not None:
        df_cleaned = df_cleaned[df_cleaned['parent_group'] == group_crops]
    if crops is not None:
        df_cleaned = df_cleaned[df_cleaned['child_group'] == crops]
    if year is not None:
        df_cleaned = df_cleaned[df_cleaned['year'] == year]

    # Add new features on a new frame, so that neither the caller's DataFrame
    # nor a filtered slice of it is written to.
    df_cleaned = df_cleaned.assign(
        diff_quantity=df_cleaned['export_quantity'] - df_cleaned['import_quantity'],
        profit=df_cleaned['export_value'] - df_cleaned['import_value'],
    )

    # If a feature is added above then add its name to that list
    list_features = ['export_quantity', 'export_value', 'import_quantity', 'import_value',
                     'diff_quantity', 'profit']

    # Grouping columns, and the column that names the row holding an extreme
    if by_country:
        group_keys = ['area', 'area_code']
        replace_item = 'item_x'
    else:
        group_keys = ['item_x', 'parent_group', 'child_group']
        replace_item = 'area'

    grouped = df_cleaned.groupby(group_keys)[list_features]
    df_sum = grouped.sum().reset_index()
    df_max = grouped.max().reset_index()
    df_min = grouped.min().reset_index()
    df_max_index = grouped.idxmax().reset_index()
    df_min_index = grouped.idxmin().reset_index()

    # Turn the row labels of the extremes into the matching area / item names.
    # Whole-column assignment replaces the label column instead of writing
    # strings into it in place.
    for feature in list_features:
        df_max_index[feature] = df_cleaned.loc[df_max_index[feature], replace_item].values
        df_min_index[feature] = df_cleaned.loc[df_min_index[feature], replace_item].values

    # Merge on every grouping column so that groups sharing the same first key
    # are not cross-joined.
    df_max_merged = pd.merge(df_max, df_max_index, on=group_keys, how='inner',
                             suffixes=('_max', '_max_names'))
    df_min_merged = pd.merge(df_min, df_min_index, on=group_keys, how='inner',
                             suffixes=('_min', '_min_names'))
    df_merged_tot = pd.merge(df_min_merged, df_max_merged, on=group_keys)

    return pd.merge(df_sum, df_merged_tot, on=group_keys)