In [80]:
import pandas as pd
from sklearn.utils import resample
from sklearn.model_selection import train_test_split


In [81]:
# Load the dataset and discard the "Unnamed: 0" column that comes with the CSV.
datos_final = pd.read_csv("datos_final.csv").drop(columns=["Unnamed: 0"])

In [82]:
# Preview the first five rows of the loaded frame.
datos_final.head(n=5)

Unnamed: 0,BP,Departamento,Hombres,Mujeres,Total Víctimas,Valor total cofinanciación ADR,Valor Contrapartida,Hectáreas,Vigencia,CADENA_PRODUCTIVA_ADR
0,6,CHOCO,30,5,0,250000000,513500000,100,2017,PLATANO
1,17,META,31,9,0,500000000,106600000,40,2017,CAFE
2,23,ANTIOQUIA,20,5,23,401373250,355140000,69,2017,PLATANO
3,23,ANTIOQUIA,20,5,23,401373250,355140000,69,2017,PLATANO
4,25,META,16,12,10,255000000,41570000,93,2017,CAUCHO


In [83]:
# Show the column labels of the loaded frame.
datos_final.columns

Index(['BP', 'Departamento', 'Hombres', 'Mujeres', 'Total Víctimas',
       'Valor total cofinanciación ADR', 'Valor Contrapartida', 'Hectáreas',
       'Vigencia', 'CADENA_PRODUCTIVA_ADR'],
      dtype='object')

In [84]:
# Count how many rows belong to each crop category (the target variable).
datos_final["CADENA_PRODUCTIVA_ADR"].value_counts()

PLATANO       31
CAFE          22
MARACUYA       9
TOMATE         9
AGUACATE       8
FRIJOL         4
MAIZ           4
COCO           3
CACAO          3
PAPA           2
TANGELO        2
PERA           2
ÑAME           1
CAUCHO         1
FRESA          1
MORA           1
GRANADILLA     1
MANGO          1
Name: CADENA_PRODUCTIVA_ADR, dtype: int64

In [85]:
# Split the frame into the target column (y) and the remaining feature columns (X).
y = datos_final["CADENA_PRODUCTIVA_ADR"]
X = datos_final.drop(columns=["CADENA_PRODUCTIVA_ADR"])

In [86]:
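# Setting up testing and training sets (currently disabled: the split below is
# commented out, so the resampling that follows operates on the full dataset).
# NOTE(review): if a hold-out set is needed, split first and resample only the
# training part; otherwise copies of the same rows end up in both sets.
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=27)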

In [87]:
# Re-attach the target column to the features so rows can be filtered per class.
# The target is dropped first (a no-op on the first run) so that re-executing
# this cell does not append a duplicate target column to X.
X = pd.concat([X.drop(columns=[y.name], errors="ignore"), y], axis=1)

In [88]:
# Distinct crop categories, in order of first appearance in the data.
categories = datos_final["CADENA_PRODUCTIVA_ADR"].unique()

In [89]:
# Display the array of distinct crop categories.
categories

array(['PLATANO', 'CAFE', 'CAUCHO', 'TOMATE', 'MANGO', 'CACAO', 'ÑAME',
       'MORA', 'AGUACATE', 'GRANADILLA', 'TANGELO', 'PERA', 'MAIZ',
       'PAPA', 'FRIJOL', 'MARACUYA', 'FRESA', 'COCO'], dtype=object)

In [90]:
# Split the data into one sub-frame per crop category, keyed by category name.
# Keys follow the order of `categories`; each value holds only that category's rows.
classes = {
    str(category): X[X["CADENA_PRODUCTIVA_ADR"] == category]
    for category in categories
}

In [91]:
# Display the per-category dictionary built above.
# NOTE(review): this prints every sub-frame in full; consider showing only the
# row count per category to keep the output short.
classes

{'PLATANO':       BP        Departamento  Hombres  Mujeres  Total Víctimas  \
 0      6               CHOCO       30        5               0   
 2     23           ANTIOQUIA       20        5              23   
 3     23           ANTIOQUIA       20        5              23   
 6     44               CHOCO       52       21              48   
 7     44               CHOCO       52       21              48   
 8     44               CHOCO       52       21              48   
 19    77               CHOCO       55       17              12   
 20    77               CHOCO       55       17              12   
 27   100             CORDOBA       36       30              13   
 28   101                META        4       43              17   
 37   124             BOLIVAR      179       21              45   
 41   132                META       87       61              62   
 42   132                META       87       61              62   
 45   136               CHOCO      115       12    

In [92]:
# Minority categories to upsample: every category except the most frequent one.
# The majority class is derived from the class counts instead of a fixed slice,
# so this stays correct if the row order or the number of categories changes.
categories_oversampling = categories[
    categories != datos_final["CADENA_PRODUCTIVA_ADR"].value_counts().idxmax()
]

In [93]:
# Display the categories selected for oversampling.
categories_oversampling

array(['CAFE', 'CAUCHO', 'TOMATE', 'MANGO', 'CACAO', 'ÑAME', 'MORA',
       'AGUACATE', 'GRANADILLA', 'TANGELO', 'PERA', 'MAIZ', 'PAPA',
       'FRIJOL', 'MARACUYA', 'FRESA', 'COCO'], dtype=object)

In [95]:
# Upsample every minority category (sampling with replacement) to the size of
# the largest category so that all classes end up equally represented.
# The target size is derived from the data rather than hard-coded, and a fixed
# random_state makes the draw reproducible across kernel restarts.
target_size = max(len(frame) for frame in classes.values())
upsample = {
    str(category) + "_upsample": resample(
        classes[category],
        replace=True,
        n_samples=target_size,
        random_state=27,
    )
    for category in categories_oversampling
}

In [102]:
# Combine the majority class with every upsampled minority class.
# All frames are collected first and concatenated once; concatenating inside a
# loop would re-copy the growing frame on every iteration.
upsample_df = pd.concat(
    [classes["PLATANO"]]
    + [upsample[str(category) + "_upsample"] for category in categories_oversampling]
)

In [105]:
# Total number of rows after balancing.
upsample_df.shape[0]

558

In [108]:
# Verify the balancing: count the rows per category in the combined frame.
upsample_df["CADENA_PRODUCTIVA_ADR"].value_counts()

MARACUYA      31
CAFE          31
MAIZ          31
FRESA         31
CACAO         31
ÑAME          31
PERA          31
GRANADILLA    31
FRIJOL        31
TOMATE        31
COCO          31
PAPA          31
PLATANO       31
TANGELO       31
CAUCHO        31
MORA          31
MANGO         31
AGUACATE      31
Name: CADENA_PRODUCTIVA_ADR, dtype: int64