In [None]:
# Mount Google Drive at /content/drive so the competition archive stored there
# can be read from this Colab runtime.
from google.colab import drive
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Extract the HackTheRealty archive from the mounted Drive into the current
# working directory (creates ./HackTheRealty/E and ./HackTheRealty/P).
!unzip /content/drive/My\ Drive/HackTheRealty.zip

Archive:  /content/drive/My Drive/HackTheRealty.zip
  inflating: HackTheRealty/E/exposition_sample_submission.tsv  
  inflating: HackTheRealty/E/exposition_test.tsv  
  inflating: HackTheRealty/E/exposition_train.tsv  
  inflating: HackTheRealty/E/fields_exposition_train.md  
  inflating: HackTheRealty/P/fields_price_housebase.md  
  inflating: HackTheRealty/P/fields_price_train.md  
  inflating: HackTheRealty/P/price_housebase.tsv  
  inflating: HackTheRealty/P/price_sample_submission.tsv  
  inflating: HackTheRealty/P/price_test.tsv  
  inflating: HackTheRealty/P/price_train.tsv  
  inflating: HackTheRealty/quadkey.py  


In [None]:
# Sanity check: list the files extracted for the exposition ("E") task.
ls HackTheRealty/E

exposition_sample_submission.tsv  exposition_train.tsv
exposition_test.tsv               fields_exposition_train.md


In [None]:
import numpy as np
import pandas as pd
import lightgbm as lgb

from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split


def train_lgb(X_, X_val, Y_, Y_val):
    """Train a LightGBM multiclass model and print hold-out metrics.

    Parameters
    ----------
    X_, Y_ : training features and integer class labels (pandas objects).
    X_val, Y_val : hold-out features and labels. They drive early stopping
        and are used for the printed accuracy / F1 scores.

    Returns
    -------
    The booster returned by ``lgb.train``.
    """
    # LightGBM requires every label to lie in [0, num_class). Size the output
    # from the largest label present rather than from the number of distinct
    # labels, so gaps in the label set cannot produce an out-of-range class.
    categ_nums = int(max(Y_.max(), Y_val.max())) + 1
    params = {
        "objective": "multiclass",
        "max_depth": 5,
        "num_leaves": 20,
        # Previously misspelled as "colsample_bytre", which LightGBM does not
        # recognise, so the feature subsampling was never applied.
        "colsample_bytree": 0.1,
        # NOTE(review): row subsampling only takes effect when a bagging
        # frequency is also configured; confirm whether that was intended.
        "subsample": 0.1,
        "learning_rate": 0.01,
        "num_class": categ_nums,
        "early_stopping_rounds": 100,
        "verbose": 100,
    }

    lgtrain = lgb.Dataset(X_, Y_)
    lgval = lgb.Dataset(X_val, Y_val)

    # Progress logging goes through a callback because the ``verbose_eval``
    # keyword is not accepted by current LightGBM releases. Older releases
    # expose the same callback under the name ``print_evaluation``.
    make_logger = getattr(lgb, "log_evaluation", None) or getattr(lgb, "print_evaluation")

    # A single iteration budget replaces the three conflicting values the
    # parameters used to carry (n_estimators=200, num_trees=5000 and a
    # positional 2000). Early stopping decides the actual number of trees.
    gbm = lgb.train(
        params,
        lgtrain,
        num_boost_round=5000,
        valid_sets=[lgtrain, lgval],
        callbacks=[make_logger(100)],
    )

    preds = np.argmax(gbm.predict(X_val), axis=1)
    print("preds", preds[:10])
    print("y_val", Y_val[:10])
    print("accuracy_score", accuracy_score(Y_val, preds))
    # The default binary averaging raises on multiclass targets; the old bare
    # ``except`` hid that error, so the F1 score was never reported.
    print("f1_score", f1_score(Y_val, preds, average="macro"))
    return gbm


# Load the tab-separated exposition train and test tables from the extracted archive.
train = pd.read_csv("HackTheRealty/E/exposition_train.tsv", sep="\t")
test = pd.read_csv("HackTheRealty/E/exposition_test.tsv", sep="\t")

# Columns dropped by clean_df before modelling.
del_cols = {"id", "target_string", "main_image", "building_id", "unified_address"}
# Columns cast to the pandas ``category`` dtype by clean_df.
cat_cols = {"parking", "day", "building_type", "balcony", "locality_name", "renovation", "public"}

def clean_df(df):
    """Return a cleaned copy of ``df`` ready to be fed to the model.

    Steps:
      * drop every column listed in ``del_cols``;
      * replace missing values with -1 in every column except ``target``
        (missing targets are left alone);
      * cast every column in ``cat_cols`` to ``category``.

    The input frame is left unmodified. A ``KeyError`` is raised if a column
    named in ``cat_cols`` is absent from ``df``.
    """
    kept = [c for c in df.columns if c not in del_cols]
    # Explicit copy: assigning columns on a bare column selection writes to a
    # slice of the caller's frame and triggers pandas' SettingWithCopyWarning.
    df = df[kept].copy()
    for col in df.columns:
        if col == "target":
            continue
        df[col] = df[col].fillna(-1)
    for col in cat_cols:
        df[col] = df[col].astype('category')
    return df

# Stack train and test so both pass through clean_df together; test rows are
# recognisable afterwards by their missing target.
test["target"] = None
df = clean_df(pd.concat([train, test]))

# Split the cleaned frame back apart using the target marker.
has_target = df["target"].notna()
train = df[has_target]
test = df[~has_target].drop(columns="target")

# Hold out 20% of the labelled rows for validation / early stopping.
train, valid = train_test_split(train, test_size=0.2, random_state=42)

train_X = train.drop(columns="target")
train_Y = train["target"].astype(int)
valid_X = valid.drop(columns="target")
valid_Y = valid["target"].astype(int)

gbm = train_lgb(train_X, valid_X, train_Y, valid_Y)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Training until validation scores don't improve for 100 rounds.
[100]	training's multi_logloss: 1.48603	valid_1's multi_logloss: 1.49428
[200]	training's multi_logloss: 1.44271	valid_1's multi_logloss: 1.4576
[300]	training's multi_logloss: 1.41856	valid_1's multi_logloss: 1.43952
[400]	training's multi_logloss: 1.40279	valid_1's multi_logloss: 1.42916
[500]	training's multi_logloss: 1.39175	valid_1's multi_logloss: 1.42329
[600]	training's multi_logloss: 1.38266	valid_1's multi_logloss: 1.41904
[700]	training's multi_logloss: 1.37469	valid_1's multi_logloss: 1.41569
[800]	training's multi_logloss: 1.36807	valid_1's multi_logloss: 1.41323
[900]	training's multi_logloss: 1.36192	valid_1's multi_logloss: 1.41116
[1000]	training's multi_logloss: 1.35621	valid_1's multi_logloss: 1.40949
[1100]	training's multi_logloss: 1.35053	valid_1's multi_logloss: 1.40795
[1200]	training's multi_logloss: 1.34509	valid_1's multi_logloss: 1.40661
[1300]	training's multi_logloss: 1.33985	valid_1's multi_lo

In [None]:
# Per-class scores for every test row; the predicted label is the index of
# the highest-scoring column.
test_proba = gbm.predict(test)
test_preds = test_proba.argmax(axis=1)

In [None]:
# Use the sample submission as a template, overwrite its target column with
# the model's predictions and save the result as a TSV without the index.
submission = pd.read_csv(
    "HackTheRealty/E/exposition_sample_submission.tsv",
    sep="\t",
).assign(target=test_preds)
submission.to_csv("submission_e2.tsv", sep="\t", index=None)