# Generate placeholder data

In [5]:
import numpy as np
import pandas as pd

In [20]:
n = 2000
n_test = 400

def generate_data(n):
    categories = ['category1', 'category2', 'category3', 'category4']
    
    column_int = np.random.randint(1, 100, n)
    column_float = np.random.randn(n)
    column_categorical = np.random.choice(categories, n)
    column_boolean = np.random.randint(0, 2, n)
    column_label = np.random.randint(0, 2, n)
    
    df = pd.DataFrame.from_dict({'column_int': column_int,\
                                 'column_float': column_float,\
                                 'column_categorical': column_categorical,\
                                 'column_boolean': column_boolean,\
                                 'column_label': column_label})
    
    return df

df_train = generate_data(n)
df_test = generate_data(n_test)
df_train.to_csv('train.csv')
df_test.to_csv('test.csv')

# Light GBM

In [46]:
import numpy as np
import pandas as pd
import lightgbm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer


#
# Prepare the data
#

train = pd.read_csv('train.csv')

# get the labels
y = train.column_label.values
print(train["column_categorical"].value_counts())

train.replace({'column_categorical': {'category4': 4, 'category3': 3,'category1': 1, 'category2': 2}}, inplace=True)
print(train)
train.drop(['id', 'column_label'], inplace=True, axis=1)

x = np.array(train)

#
# Create training and validation sets
#
x, x_test, y, y_test = train_test_split(x, y, test_size=0.2, random_state=42, stratify=y)

#
# Create the LightGBM data containers
#

categorical_features = [c for c, col in enumerate(train.columns) if 'cat' in col]
categorical_features = [2]
train_data = lightgbm.Dataset(x, label=y, feature_name=['1', '2', '3', '4'], categorical_feature=['3'])
test_data = lightgbm.Dataset(x_test, label=y_test)

#
# Train the model
#

parameters = {
    'application': 'binary',
    'objective': 'binary',
    'metric': 'auc',
    'is_unbalance': 'true',
    'boosting': 'gbdt',
    'num_leaves': 31,
    'feature_fraction': 0.5,
    'bagging_fraction': 0.5,
    'bagging_freq': 20,
    'learning_rate': 0.05,
    'verbose': 0
}

model = lightgbm.train(parameters,
                       train_data,
                       valid_sets=test_data,
                       num_boost_round=5000,
                       early_stopping_rounds=100)
#
# Create a submission
#

submission = pd.read_csv('test.csv')
submission.replace({'column_categorical': {'category4': 4, 'category3': 3,'category1': 1, 'category2': 2}}, inplace=True)

ids = submission['id'].values
submission.drop(['id', 'column_label'], inplace=True, axis=1)


x = submission.values
y = model.predict(x)

output = pd.DataFrame({'id': ids, 'target': y})
output.to_csv("submission.csv", index=False)

category4    524
category3    513
category1    511
category2    452
Name: column_categorical, dtype: int64
        id  column_int  column_float  column_categorical  column_boolean  \
0        0          49     -0.623765                   2               0   
1        1          77     -2.543974                   1               1   
2        2          45      1.058069                   2               0   
3        3          67     -0.024730                   3               1   
4        4          77      1.947579                   3               1   
5        5          51      0.597288                   4               1   
6        6          29     -0.710717                   2               1   
7        7          52      1.366495                   4               0   
8        8          92     -0.099043                   3               0   
9        9           9      1.488385                   3               1   
10      10          17     -0.515218                   2 

New categorical_feature is ['3']
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))
