In [1]:
# Notebook-wide configuration

# --- Preprocessing ---
# Number of KMeans clusters built from the latitude/longitude pairs
CLUSTERS = 300
# Companies with at most this many rows are merged into the shared code 0
MAX_UNITS_VALUE = 10

# --- Training ---
EPOCHS = 10
BATCH_SIZE = 128
# Reference accuracy printed next to the measured one
TARGET_ACCURACY = 0.768

# --- Reproducibility ---
# Seed passed to KMeans and train_test_split
RANDOM_SEED = 13

# --- Artifacts ---
# Location where ModelCheckpoint stores (and load_weights reads) the best weights
CHECKPOINT_PATH = "/checkpoint"

In [2]:
import pandas as pd
from sklearn import cluster
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Dense, Input, concatenate, BatchNormalization, Embedding, Flatten
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adamax
from tensorflow.keras.callbacks import ModelCheckpoint

In [3]:
# KMeans estimator used to turn (latitude, longitude) pairs into cluster indexes
location_clusters_predictor = cluster.KMeans(
    random_state=RANDOM_SEED,
    n_clusters=CLUSTERS,
)

In [4]:

def prepare_location(dataframe, predictor=None, fit=True):
  """Replace the latitude/longitude columns with a cluster-index column.

  Args:
    dataframe: DataFrame containing "latitude" and "longitude" columns. It is
      not modified; a new DataFrame is returned.
    predictor: Clustering estimator exposing `fit_predict` and `predict`.
      Defaults to the notebook-level `location_clusters_predictor`.
    fit: When True (the default, matching the previous behaviour) the
      predictor is re-fitted on `dataframe` before the rows are labelled.
      Pass False to label the rows with clusters that were already fitted on
      another frame, so that the indexes of both frames refer to the same
      clusters.

  Returns:
    A new DataFrame without "latitude"/"longitude" and with a trailing
    "location" column holding the cluster index of every row.
  """
  if predictor is None:
    predictor = location_clusters_predictor

  coordinates = dataframe[["latitude", "longitude"]]
  if fit:
    clusters = predictor.fit_predict(coordinates)
  else:
    clusters = predictor.predict(coordinates)

  # drop() returns a new frame, so the caller's DataFrame never gains a
  # "location" column as a side effect.
  return dataframe.drop(columns=["latitude", "longitude"]).assign(location=clusters)

def prepare_companies(dataframe, max_units_value=None):
  """Re-encode the "company" column into compact integer codes.

  Companies are counted by the number of rows they own in `dataframe`:
    - companies with at most `max_units_value` rows all receive code 0;
    - every other company receives its 1-based position among the remaining
      companies, ordered by ascending original company id.

  The codes are derived from the frame passed in, so two different frames
  generally get different codes for the same company.

  Args:
    dataframe: DataFrame with a "company" column. It is not modified; a new
      DataFrame is returned.
    max_units_value: Row-count threshold. Defaults to the notebook-level
      `MAX_UNITS_VALUE`.

  Returns:
    A new DataFrame equal to `dataframe` except for the re-encoded "company"
    column.
  """
  if max_units_value is None:
    max_units_value = MAX_UNITS_VALUE

  # Rows per company; groupby sorts the keys, which fixes the code order.
  units = dataframe.groupby("company").size()

  # Companies whose units count is > max_units_value keep an individual code.
  frequent_companies = units[units > max_units_value].index

  # Every counted company starts at the shared code 0, then the frequent ones
  # are overwritten with 1..K.
  codes = dict.fromkeys(units.index, 0)
  codes.update(
      (company, code) for code, company in enumerate(frequent_companies, start=1)
  )

  # A single vectorised lookup replaces the former per-row iterrows() loop.
  return dataframe.assign(company=dataframe["company"].map(codes))

In [5]:
def get_train_input(dataframe):
  """Split `dataframe` into the four column groups fed to the model.

  Returns a list of DataFrames in this order: the numeric features
  (fin_1..fin_4 and is_local), then "company", "type" and "location".
  """
  column_groups = (
      ["fin_1", "fin_2", "fin_3", "fin_4", "is_local"],
      ["company"],
      ["type"],
      ["location"],
  )
  return [dataframe[columns] for columns in column_groups]

In [6]:
def setup_model(dataframe):
  """Build and compile the four-branch binary classifier.

  The model takes four inputs, in the same order as the list returned by
  `get_train_input`: five numeric features, the company code, the type code
  and the location cluster index. Each branch is reduced to a dense vector,
  the vectors are concatenated and passed through two dense blocks ending in
  a single sigmoid unit.

  Args:
    dataframe: Prepared DataFrame; its "company" and "type" columns are only
      used to size the corresponding embedding tables.

  Returns:
    A Keras `Model` compiled with binary cross-entropy, Adamax
    (learning rate 1e-5) and the accuracy metric.
  """
  # Numeric branch
  fin_input = Input(shape=(5,))
  fin_flow = Dense(32, activation="relu")(fin_input)
  fin_flow = BatchNormalization()(fin_flow)
  fin_flow = Dense(32, activation="relu")(fin_flow)
  fin_flow = BatchNormalization()(fin_flow)

  # Company branch: one embedding row per distinct company code, plus one
  # spare row of headroom.
  companies_classes = len(dataframe["company"].unique()) + 1
  company_input = Input(shape=(1,))
  company_flow = Embedding(companies_classes, 128)(company_input)
  company_flow = Flatten()(company_flow)
  company_flow = Dense(32, activation="relu")(company_flow)

  # Type branch
  # NOTE(review): sizing by nunique() assumes the type codes are exactly
  # 0..nunique-1 - TODO confirm against the data.
  unique_types = dataframe["type"].nunique()
  types_input = Input(shape=(1,))
  types_flow = Embedding(unique_types, 16)(types_input)
  types_flow = Flatten()(types_flow)
  types_flow = Dense(16, activation="relu")(types_flow)

  # Location branch: the table size follows CLUSTERS (the KMeans cluster
  # count) instead of a duplicated literal, so the two cannot drift apart.
  location_input = Input(shape=(1,))
  location_flow = Embedding(CLUSTERS, 128)(location_input)
  location_flow = Flatten()(location_flow)
  location_flow = Dense(32, activation="relu")(location_flow)

  # Merge the branches; both lists keep the get_train_input order.
  branch_inputs = [fin_input, company_input, types_input, location_input]
  branch_outputs = [fin_flow, company_flow, types_flow, location_flow]

  general_input = concatenate(branch_outputs)
  flow = Dense(64, activation="relu")(general_input)
  flow = BatchNormalization()(flow)
  flow = Dense(64, activation="relu")(flow)
  flow = BatchNormalization()(flow)
  flow = Dense(1, activation="sigmoid")(flow)

  model = Model(inputs=branch_inputs, outputs=flow)
  opt = Adamax(learning_rate=1e-5)
  model.compile(loss="binary_crossentropy", optimizer=opt, metrics=["accuracy"])

  return model

In [7]:
# Load the training set; the first CSV column is used as the index
train_dataframe = pd.read_csv(filepath_or_buffer="train.csv", index_col=0)
train_dataframe.head(n=5)

Unnamed: 0,latitude,longitude,company,is_local,type,fin_1,fin_2,fin_3,fin_4,target
0,40.10891,-83.09286,8336,0,3,-135060.089443,86013.396489,1206.094242,52287.082257,0
1,39.86542,-84.0628,18403,1,0,-1766.845055,14985.64018,477.494992,168836.215743,1
2,39.10266,-84.52468,14022,0,3,-177302.873693,44881.958005,1463.339889,130388.243325,0
3,39.10148,-84.52341,11051,0,0,209049.99746,0.0,95.340075,103267.727546,1
4,41.06213,-81.53784,3243,0,3,8669.269507,0.0,399.421926,177532.206618,1


In [8]:
# Number of rows in the training set
train_dataframe.shape[0]

160001

In [9]:
# Count missing values per column
train_dataframe.isna().sum()

latitude     0
longitude    0
company      0
is_local     0
type         0
fin_1        0
fin_2        0
fin_3        0
fin_4        0
target       0
dtype: int64

In [10]:
# Count the distinct companies in the training set
train_dataframe["company"].nunique(dropna=True)

3686

In [11]:
# Encode locations first, then companies, before the model is trained
train_dataframe = prepare_companies(prepare_location(train_dataframe))

train_dataframe.head(n=5)

Unnamed: 0,company,is_local,type,fin_1,fin_2,fin_3,fin_4,target,location
0,561,0,3,-135060.089443,86013.396489,1206.094242,52287.082257,0,48
1,1273,1,0,-1766.845055,14985.64018,477.494992,168836.215743,1,24
2,973,0,3,-177302.873693,44881.958005,1463.339889,130388.243325,0,116
3,757,0,0,209049.99746,0.0,95.340075,103267.727546,1,116
4,229,0,3,8669.269507,0.0,399.421926,177532.206618,1,245


In [12]:
# Build the network from the prepared frame and list its layers
model = setup_model(dataframe=train_dataframe)
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 5)]          0           []                               
                                                                                                  
 dense (Dense)                  (None, 32)           192         ['input_1[0][0]']                
                                                                                                  
 input_2 (InputLayer)           [(None, 1)]          0           []                               
                                                                                                  
 input_3 (InputLayer)           [(None, 1)]          0           []                               
                                                                                              

In [19]:
# Separate features from the label and hold out 10% of the rows for validation
X = train_dataframe.drop(columns=["target"])
y = train_dataframe["target"]
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=RANDOM_SEED,
)
validation_data = (get_train_input(X_test), y_test)

In [20]:
# Keep only the weights of the epoch with the highest validation accuracy
model_checkpoint_callback = ModelCheckpoint(
    filepath=CHECKPOINT_PATH,
    monitor="val_accuracy",
    mode="max",
    save_best_only=True,
    save_weights_only=True,
)

# Fit the model on the training split, validating after every epoch
model.fit(
    get_train_input(X_train),
    y_train,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_data=validation_data,
    callbacks=[model_checkpoint_callback],
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f2894312650>

In [21]:
# Restore the best weights written by the checkpoint callback
model.load_weights(CHECKPOINT_PATH)

# Score the model on the hold-out split. Inputs and targets are both read from
# `validation_data`, so the pair stays aligned even when this cell is re-run
# after the global `y_test` has been rebound to the test.csv targets.
loss, accuracy = model.evaluate(validation_data[0], validation_data[1], verbose=2)
print(f"Accuracy: {100 * accuracy}%")
print(f"Expected accuracy: {100 * TARGET_ACCURACY}%")

501/501 - 1s - loss: 0.4445 - accuracy: 0.7723 - 740ms/epoch - 1ms/step
Accuracy: 77.23267078399658%
Expected accuracy: 76.8%


In [22]:
# Load the test set; the first CSV column is used as the index
test_dataframe = pd.read_csv(filepath_or_buffer="test.csv", index_col=0)
test_dataframe.head(n=5)

Unnamed: 0,latitude,longitude,company,is_local,type,fin_1,fin_2,fin_3,fin_4,target
160000,39.28431,-76.73535,18187,1,1,113033.389907,0.0,270.906219,31222.780176,0
160001,36.758509,-76.344861,11208,0,0,-87239.590275,73759.38751,759.194862,237587.544996,1
160002,43.402802,-75.2171,7437,1,4,-122084.49862,15528.109943,1039.655934,29612.346982,0
160003,41.87116,-87.84857,17362,0,1,87355.127256,0.0,214.594205,163526.475818,1
160004,42.161296,-88.129184,11515,0,3,33014.437946,0.0,379.819724,67499.397999,1


In [23]:
# Encode the test frame, then split it into features and label.
# NOTE(review): prepare_location re-fits KMeans and prepare_companies
# recomputes the company codes on this frame, so "location" and "company"
# values here are not guaranteed to match the codes the model was trained on.
test_dataframe = prepare_companies(prepare_location(test_dataframe))
X_test, y_test = test_dataframe.drop(columns="target"), test_dataframe["target"]

test_dataframe.head(n=5)

Unnamed: 0,company,is_local,type,fin_1,fin_2,fin_3,fin_4,target,location
160000,0,1,1,113033.389907,0.0,270.906219,31222.780176,0,216
160001,306,0,0,-87239.590275,73759.38751,759.194862,237587.544996,1,96
160002,0,1,4,-122084.49862,15528.109943,1039.655934,29612.346982,0,248
160003,0,0,1,87355.127256,0.0,214.594205,163526.475818,1,5
160004,316,0,3,33014.437946,0.0,379.819724,67499.397999,1,63


In [24]:
# Score the trained model on the prepared test.csv rows
loss, accuracy = model.evaluate(
    x=get_train_input(X_test),
    y=y_test,
    verbose=2,
)
print(f"Accuracy: {100 * accuracy}%")
print(f"Expected accuracy: {100 * TARGET_ACCURACY}%")

1250/1250 - 2s - loss: 0.4487 - accuracy: 0.7709 - 2s/epoch - 2ms/step
Accuracy: 77.08500027656555%
Expected accuracy: 76.8%
