In [30]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import GridSearchCV

In [2]:
DATA_PATH = '/data/'

#### Read Data

Get data from the grain source and the land matrix source, clean it and merge it.

In [3]:
df_2k12 = pd.read_excel(DATA_PATH+"grain_2016_v2.xlsx", sheet_name='2012')
df_2k16 = pd.read_excel(DATA_PATH+"grain_2016_v2.xlsx", sheet_name='2016')
df_all = pd.read_excel(DATA_PATH+"land_matrix_all.xls")
df = pd.concat([df_2k12, df_2k16])
df['label'] = 1
df = df.drop(['projected_investment'], axis=1)
df = df[['transaction_id', 'target_country', 'size_of_land', 'crop', 'label']]
df

Unnamed: 0,transaction_id,target_country,size_of_land,crop,label
0,,Brazil,424000.0,"Soybeans, sugar cane",1
1,,Argentina,320000.0,"Maize, soybeans, wheat",1
2,,Australia,2500.0,Sugar cane,1
3,,Sierra Leone,30000.0,Rice,1
4,,Liberia,220000.0,Oil palm,1
5,,Ethiopia,10000.0,"Cereals, pulses, rice",1
6,,East Timor,100000.0,Sugar cane,1
7,,Papua New Guinea,33000.0,Cassava,1
8,,Sierra Leone,12000.0,Oil palm,1
9,,Liberia,169000.0,Oil palm,1


In [4]:
df_land_matrix = df_all[['target_country', 'size_of_land', 'crop']]

nan_idx = np.where(df_land_matrix['size_of_land'].isnull())

for idx in nan_idx:
    intended_size = df_all.loc[idx, 'intended_size']
    production_size = df_all.loc[idx, 'production_size']
    
    if intended_size is not '':
        df_land_matrix.loc[idx, 'size_of_land'] = intended_size
    elif intended_size is not 'nan':
        df_land_matrix.loc[idx, 'size_of_land'] = intended_size
    elif intended_size is not '0':
        df_land_matrix.loc[idx, 'size_of_land'] = intended_size
    elif intended_size is not 0:
        df_land_matrix.loc[idx, 'size_of_land'] = intended_size
    else:
        df_land_matrix.loc[idx, 'size_of_land'] = production_size

df_land_matrix['label'] = 0
df_land_matrix['transaction_id'] = 0
df_land_matrix = df_land_matrix[['transaction_id', 'target_country', 'size_of_land', 'crop', 'label']]
df_land_matrix


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Unnamed: 0,transaction_id,target_country,size_of_land,crop,label
0,0,Myanmar,20234.0,"Onion, Tea, Corn (Maize), Rice, Soya Beans, Su...",0
1,0,Bangladesh,5000.0,,0
2,0,Cambodia,9380.0,,0
3,0,Cambodia,7000.0,"Accacia, Rubber",0
4,0,Cambodia,7000.0,"Accacia, Rubber",0
5,0,Cambodia,25000.0,Rice,0
6,0,Cambodia,9863.0,Accacia,0
7,0,Cambodia,6523.0,Sugar Cane,0
8,0,Cambodia,10000.0,Teak,0
9,0,Cambodia,10603.0,"Cashew, Rubber",0


In [5]:
df_merged = pd.concat([df, df_land_matrix])
df_merged

Unnamed: 0,transaction_id,target_country,size_of_land,crop,label
0,,Brazil,424000.0,"Soybeans, sugar cane",1
1,,Argentina,320000.0,"Maize, soybeans, wheat",1
2,,Australia,2500.0,Sugar cane,1
3,,Sierra Leone,30000.0,Rice,1
4,,Liberia,220000.0,Oil palm,1
5,,Ethiopia,10000.0,"Cereals, pulses, rice",1
6,,East Timor,100000.0,Sugar cane,1
7,,Papua New Guinea,33000.0,Cassava,1
8,,Sierra Leone,12000.0,Oil palm,1
9,,Liberia,169000.0,Oil palm,1


#### Vectorize the crop column and integer encode the countries column

In [6]:
product_list = list(df_merged.crop.unique())
cleaned_list  = [x for x in product_list if str(x) != 'nan']
uniq_list = []
for item in cleaned_list:
    if ',' in item:
        temp = item.split()
        lower  = [x.lower().replace(',', '').replace('(', '').replace(')', '').strip() for x in temp]
        uniq_list.extend(lower)
    else:
        uniq_list.append(item.lower())
uniq_set = set(uniq_list)
uniq_list = list(uniq_set)
del uniq_list[0]
uniq_list

['cherries',
 'buckwheat',
 'trees',
 'coconut',
 'sugar beet',
 'oats',
 'vera',
 'production',
 'pork',
 'onions',
 'castor oil plant',
 'allfalfa',
 'sweet sorghum',
 'rye',
 'livestock',
 'eucalyptus',
 'sun flower',
 'peanuts',
 'groundnut',
 'sunflower',
 'other',
 'alfalfa ',
 'stevia',
 'cereal',
 'nuts',
 'mixed farming',
 'tea',
 'palm',
 'pulses',
 'grapes',
 'fruits',
 'no',
 'seed production',
 'maize ',
 'accacia',
 'bananas',
 'sunflowers',
 'palm oil',
 'cassava',
 'mustard',
 'wool',
 'oil palm',
 'herbs',
 'soybeans',
 'cereals (no specification)',
 'cotton',
 'pasture',
 'beans',
 'poultry/game',
 'almond',
 'roses',
 'grain',
 'citrus fruits',
 'passion',
 'mango',
 'raspberries',
 'sugar cane',
 'oilseeds',
 'oleagionous plant',
 'sugarcane and cassava',
 'dragon fruit',
 'ethanol',
 'peanut',
 'herbs (no specification)',
 'rice',
 'cashew',
 'vegetable',
 'date',
 'papaya',
 'tomatoes',
 'soya',
 'pinnata',
 'bamboo',
 'aquaculture',
 'sorhgum',
 'feed',
 'olives'

In [7]:
binarized_df = df_merged.copy()

for i in range (0, len(uniq_list)):
    label = uniq_list[i]
    binarized_df[label] = np.where((df_merged['crop'].str.contains(label)), 1, 0)
binarized_df

  """


Unnamed: 0,transaction_id,target_country,size_of_land,crop,label,cherries,buckwheat,trees,coconut,sugar beet,...,cattle,pongamia,fodder,seeds,banana,sugarcane,sesame,sugar,soybean,soya beans
0,,Brazil,424000.0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,,Argentina,320000.0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,,Australia,2500.0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,,Sierra Leone,30000.0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,,Liberia,220000.0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,,Ethiopia,10000.0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,,East Timor,100000.0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,,Papua New Guinea,33000.0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,,Sierra Leone,12000.0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,,Liberia,169000.0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [8]:
binarized_df.transaction_id = np.arange(start=1, stop=len(binarized_df)+1)
binarized_df.index = np.arange(len(binarized_df))

In [9]:
country_list = list(binarized_df.target_country.unique())
country_codes = {}

for i in range(0, len(country_list)):
    country_codes.update({country_list[i]:i})
    
for index, row in binarized_df.iterrows():
    curr_val = row.target_country
    binarized_df.loc[index, 'target_country'] = country_codes[curr_val]
binarized_df = binarized_df.drop(np.where(np.isnan(binarized_df))[0])
binarized_df

Unnamed: 0,transaction_id,target_country,size_of_land,crop,label,cherries,buckwheat,trees,coconut,sugar beet,...,cattle,pongamia,fodder,seeds,banana,sugarcane,sesame,sugar,soybean,soya beans
0,1,0,424000.0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,2,1,320000.0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,3,2,2500.0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,3,30000.0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,4,220000.0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,6,5,10000.0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,7,6,100000.0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,8,7,33000.0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,9,3,12000.0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,10,8,169000.0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [10]:
columns = ['transaction_id', 'target_country', 'size_of_land'] + uniq_list
final_df = binarized_df[columns]
final_df
labels = binarized_df['label']

In [11]:
X_train, X_test, y_train, y_test = train_test_split(final_df, labels, stratify=labels, test_size=0.3, random_state=7)
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

(2191, 175) (2191,) (940, 175) (940,)


In [35]:
from sklearn.model_selection import GridSearchCV

model = RandomForestClassifier(class_weight='balanced')

n_estimates_range = np.arange(start=2, stop=25)
depth_range = np.array([11,21,31,41,51])
kfold = 5
skf = StratifiedKFold(n_splits=kfold, random_state=7)

model_grid = [{'max_depth': depth_range, 'n_estimators': n_estimates_range}]
grid = GridSearchCV(model, model_grid, cv=skf, scoring='f1_weighted')

grid.fit(X_train, y_train)
grid.best_params_

{'max_depth': 11, 'n_estimators': 16}

In [36]:
best_model = RandomForestClassifier(n_estimators = 16, class_weight='balanced', max_depth=11)

In [38]:
best_model.fit(X_train, y_train)
y_pred = best_model.predict(X_test)
f1_score(y_test.values, y_pred)

0.993939393939394

This model is Overfitting due to lack of sample points for the minority class.
Due to time constraints, we were not able to try to mitigate this problem. Proposed solutions could be to use a Minority Oversampling technique or collecting more data.

Additionally, we would also have liked to include more features such as:
The Regional GDP, Economic development index, human development index, land quality(arability of the land), corruption perception index, etc.

Relevant datasets are included in the attached excel file.
