In [1]:
!pip install -U lightautoml
!pip install h2o
!pip install mljar-supervised

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting lightautoml
  Downloading lightautoml-0.3.7.1-py3-none-any.whl (319 kB)
[K     |████████████████████████████████| 319 kB 7.4 MB/s 
[?25hCollecting poetry-core<2.0.0,>=1.0.0
  Downloading poetry_core-1.2.0-py3-none-any.whl (525 kB)
[K     |████████████████████████████████| 525 kB 59.0 MB/s 
[?25hCollecting cmaes
  Downloading cmaes-0.8.2-py3-none-any.whl (15 kB)
Collecting importlib-metadata<2.0,>=1.0
  Downloading importlib_metadata-1.7.0-py2.py3-none-any.whl (31 kB)
Collecting lightgbm==3.2.1
  Downloading lightgbm-3.2.1-py3-none-manylinux1_x86_64.whl (2.0 MB)
[K     |████████████████████████████████| 2.0 MB 46.6 MB/s 
Collecting json2html
  Downloading json2html-1.3.0.tar.gz (7.0 kB)
Collecting autowoe>=1.2
  Downloading AutoWoE-1.3.2-py3-none-any.whl (215 kB)
[K     |████████████████████████████████| 215 kB 54.3 MB/s 
Collecting optuna
  Downloading optuna-3.0.2-py3-no

In [2]:
import os
import re
import numpy as np
import pandas as pd
import torch
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize
import nltk
import h2o
from h2o.estimators import H2OKMeansEstimator
from lightautoml.automl.presets.tabular_presets import TabularAutoML
from lightautoml.tasks import Task
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [3]:
N_THREADS = 1
RANDOM_STATE = 42
ISSUES_SUMMARY_FIELD = "summary"
ISSUES_TARGET = "overall_worklogs"
ISSUES_SUMMARY_D2V_MODEL_PATH = "/content/drive/MyDrive/champ/collector/summary.d2v.model"
COMMENTS_TEXT_D2V_MODEL_PATH = "/content/drive/MyDrive/champ/collector/comments.d2v.model"
ISSUES_TRAIN_PATH = "/content/drive/MyDrive/champ/collector/train_issues.csv"
COMMENT_TRAIN_PATH = "/content/drive/MyDrive/champ/collector/train_comments.csv"
EMPLOYEES_PATH = "/content/drive/MyDrive/champ/collector/employees.csv"
ISSUES_TEST = "/content/drive/MyDrive/champ/collector/test_issues.csv"
COMMENT_TEST = "/content/drive/MyDrive/champ/collector/test_comments.csv"

In [4]:
# Reproducibility

np.random.seed(RANDOM_STATE)
torch.set_num_threads(N_THREADS)
h2o.init(nthreads=N_THREADS)

Checking whether there is an H2O instance running at http://localhost:54321 ..... not found.
Attempting to start a local H2O server...
  Java Version: openjdk version "11.0.16" 2022-07-19; OpenJDK Runtime Environment (build 11.0.16+8-post-Ubuntu-0ubuntu118.04); OpenJDK 64-Bit Server VM (build 11.0.16+8-post-Ubuntu-0ubuntu118.04, mixed mode, sharing)
  Starting server from /usr/local/lib/python3.7/dist-packages/h2o/backend/bin/h2o.jar
  Ice root: /tmp/tmp41alp6n7
  JVM stdout: /tmp/tmp41alp6n7/h2o_unknownUser_started_from_python.out
  JVM stderr: /tmp/tmp41alp6n7/h2o_unknownUser_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O_cluster_uptime:,04 secs
H2O_cluster_timezone:,Etc/UTC
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.38.0.1
H2O_cluster_version_age:,10 days
H2O_cluster_name:,H2O_from_python_unknownUser_xttuaw
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,3.172 Gb
H2O_cluster_total_cores:,2
H2O_cluster_allowed_cores:,1


In [5]:
# Read original data

df_issues_train = pd.read_csv(ISSUES_TRAIN_PATH)
df_comment_train = pd.read_csv(COMMENT_TRAIN_PATH)
df_emp = pd.read_csv(EMPLOYEES_PATH)
df_issues_test = pd.read_csv(ISSUES_TEST)
df_comment_test = pd.read_csv(COMMENT_TEST)

In [6]:
# Merge train and test parts of ISSUES datasets
df_issues_train_without_overall_worklogs = \
  df_issues_train.loc[:, df_issues_train.columns!=ISSUES_TARGET]
df_issues_all = pd.concat([df_issues_train_without_overall_worklogs, df_issues_test])

# Merge train and test parts of COMMENTS datasets
df_comment_all = pd.concat([df_comment_train, df_comment_test])

In [7]:
def is_exists(fpath):
  if os.path.isfile(fpath): return True
  else: return False

def build_d2v(input_texts, output_path, **params):
  tagged_data = [TaggedDocument(words=word_tokenize(_d.lower()), tags=[str(i)]) for i, _d in enumerate(input_texts)]
  max_epochs = params.get('max_epochs', 100)
  vec_size = params.get('vec_size', 20)
  alpha = params.get('alpha', 0.025)
  model = Doc2Vec(vector_size=vec_size,
                  alpha=alpha, 
                  min_alpha=0.00025,
                  min_count=1,
                  dm=0)
  model.build_vocab(tagged_data)
  for epoch in range(max_epochs):
      print('{1}: iteration {0}'.format(epoch, output_path))
      model.train(tagged_data,
                  total_examples=model.corpus_count,
                  epochs=model.epochs)
      # decrease the learning rate
      model.alpha -= 0.0002
      # fix the learning rate, no decay
      model.min_alpha = model.alpha
      model.save(output_path)

def to_vec(text, model, **params):
  alpha = params.get('alpha', 0.1)
  min_alpha = params.get('min_alpha', 0.0001)
  steps = params.get('steps', 50)
  def prepare_text(text):
    replace = {
      r'\[\~.*?\]': r"X_MENTION",
      r'!.*?!': r"X_SCREENSHOT",
      r'{quote}.*?{quote}': r"X_QUOTE",
      r'((http|https)\:\/\/)?[a-zA-Z0-9\.\/\?\:@\-_=#]+\.([a-zA-Z]){2,6}([a-zA-Z0-9\.\&\/\?\:@\-_=#])*': r"X_LINK",
      r'\[(.*?)\|.*?\]': r"\1"
    }
    s = text
    for k, v in replace.items():
      s = re.sub(k, v, s)
    return s
  def cast_vector(row):
    r = np.array(list(map(lambda x: x.astype('double'), row)))
    return r
  test_data = word_tokenize(prepare_text(text.lower()))
  return cast_vector(model.infer_vector(test_data, alpha=alpha, 
                                        min_alpha=min_alpha,steps=steps))

def cluster_vecs(vecs_all, vecs_train, vecs_test, **params):
  k = params.get('k', 100)
  vecs_train_h2o = h2o.H2OFrame(vecs_train)
  vecs_test_h2o = h2o.H2OFrame(vecs_test)
  kmeans = H2OKMeansEstimator(k=k,
                              estimate_k=False,
                              standardize=False,
                              seed=RANDOM_STATE)
  kmeans.train(x=vecs_all.columns.to_numpy().tolist(),
                    training_frame=vecs_train_h2o,
                    validation_frame=vecs_test_h2o)
  vcs_h2o = h2o.H2OFrame(vecs_all)
  return kmeans.predict(vcs_h2o).as_data_frame()

def cluster_by_clustered_count(dataframe, col1, col2, col1_unique, col2_unique, **params):
  k = params.get('k', 100)
  gr = dataframe.groupby([col1, col2])[col2].count()
  counts = gr.reset_index(name='count')
  def count_col1_col2(col1_value, col2_value):
    res = counts[(counts[col1] == col1_value) & (counts[col2] == col2_value)]
    if res.shape[0] > 0:
      return res.iloc[0]['count']
    else:
      return 0  
  matrix = []
  for col1_unique_value in col1_unique:
    row = []
    for col2_unique_value in col2_unique:
      row.append(count_col1_col2(col1_unique_value, col2_unique_value))
    matrix.append(row)
  pass
  df = pd.DataFrame(data=np.array(matrix))
  df_train, df_test = train_test_split(df, test_size=0.25, shuffle=False, random_state=RANDOM_STATE)
  # df_train = df.sample(frac = 0.75)
  # df_test = df.drop(df_train.index)
  df_clustered = pd.DataFrame(data=[[x] for x in col1_unique], columns=[col1])
  df_clustered[col2] = cluster_vecs(df, df_train, df_test, k=k)
  return df_clustered

def smart_fill_na(dataframe, id_field, known_fields, target_field):
  df = dataframe[[id_field] + known_fields + [target_field]]
  
  df_test = df[df[target_field].isnull()]
  df_test = df_test[df_test.columns[1:-1]]

  if df_test.shape[0] == 0:
    return dataframe

  from supervised.automl import AutoML
  automl = AutoML(eval_metric="accuracy", ml_task="multiclass_classification", random_state=RANDOM_STATE)
  X_train = df[df[target_field].notnull()]
  X_train = X_train[X_train.columns[1:-1]]
  y_train = df[df[target_field].notnull()][target_field]
  automl.fit(X_train, y_train)
  y_pred = automl.predict(df_test)

  res = df[df[target_field].isnull()]
  res = res[[id_field]]
  res['pred'] = y_pred
  res
  def get_empty_field(emp_id):
    r = res[res[id_field] == emp_id]
    if r.shape[0] > 0:
      return r.iloc[0]['pred']
    else:
      return ''
  df_restored = dataframe
  df_restored[target_field] = df_restored.apply(lambda row: get_empty_field(row[id_field]) if pd.isnull(row[target_field]) else row[target_field], axis=1)
  return df_restored

def encode_caterorical_feature(dataframe, column_name):
  from sklearn.preprocessing import OneHotEncoder
  enc = OneHotEncoder(handle_unknown='ignore')
  X = dataframe[[column_name]]
  enc.fit(X)
  position_encoded = pd.DataFrame(data=enc.transform(X).toarray(), columns=[column_name + "_" + str(v) for i,v in enumerate(enc.categories_[0])])
  #enc.categories_
  result = pd.concat([dataframe, position_encoded], axis=1).drop([column_name], axis = 1)
  return result

In [8]:
# Clustering issues by summary
if not is_exists(ISSUES_SUMMARY_D2V_MODEL_PATH):
  build_d2v(df_issues_all[ISSUES_SUMMARY_FIELD], ISSUES_SUMMARY_D2V_MODEL_PATH, 
            max_epochs=100, vec_size=20, alpha=0.025)
issues_d2v_summary_model = Doc2Vec.load(ISSUES_SUMMARY_D2V_MODEL_PATH)

summary_vecs_data = df_issues_all.apply(lambda row: to_vec(row[ISSUES_SUMMARY_FIELD], issues_d2v_summary_model, alpha=0.025, min_alpha=0.0001, steps=100), axis=1)
summary_vecs_all = pd.DataFrame(data=[vec for vec in summary_vecs_data]).add_prefix("summary_vec_")
summary_vecs_train = summary_vecs_all.iloc[:df_issues_train.shape[0],:]
summary_vecs_test = summary_vecs_all.iloc[df_issues_train.shape[0]:,:]

summary_cluster_all = cluster_vecs(summary_vecs_all, summary_vecs_train, summary_vecs_test, k=100)
summary_cluster_train = summary_cluster_all.iloc[:df_issues_train.shape[0],:]
summary_cluster_test = summary_cluster_all.iloc[df_issues_train.shape[0]:,:]

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
kmeans Model Build progress: |███████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
kmeans prediction progress: |████████████████████████████████████████████████████| (done) 100%


In [9]:
# Clustering comments by text
if not is_exists(COMMENTS_TEXT_D2V_MODEL_PATH):
  build_d2v(df_comment_all['text'], COMMENTS_TEXT_D2V_MODEL_PATH, 
            max_epochs=100, vec_size=20, alpha=0.025)
comment_d2v_text_model = Doc2Vec.load(COMMENTS_TEXT_D2V_MODEL_PATH)

comment_vecs_data = df_comment_all.apply(lambda row: to_vec(row['text'], comment_d2v_text_model, alpha=0.025, min_alpha=0.0001, steps=100), axis=1)
comment_vecs_all = pd.DataFrame(data=[vec for vec in comment_vecs_data]).add_prefix("comment_vec_")
comment_vecs_train = comment_vecs_all.iloc[:df_comment_train.shape[0],:]
comment_vecs_test = comment_vecs_all.iloc[df_comment_train.shape[0]:,:]

comment_cluster_all = cluster_vecs(comment_vecs_all, comment_vecs_train, comment_vecs_test, k=100)
comment_cluster_train = summary_cluster_all.iloc[:df_issues_train.shape[0],:]
comment_cluster_test = summary_cluster_all.iloc[df_issues_train.shape[0]:,:]

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
kmeans Model Build progress: |███████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
kmeans prediction progress: |████████████████████████████████████████████████████| (done) 100%


In [10]:
# Cluster employees
users_comments_clusters_data = df_comment_all[['author_id']]
users_comments_clusters_data['cluster'] = comment_cluster_all.values
clusters = comment_cluster_all['predict'].unique()
users = df_emp['id'].unique()
user_comments_cluster = cluster_by_clustered_count(users_comments_clusters_data, 'author_id', 'cluster', users, clusters, k=100)

users_issues_assignee_data = df_issues_all[['assignee_id']]
users_issues_assignee_data['cluster'] = summary_cluster_all.values
clusters = summary_cluster_all['predict'].unique()
users = df_emp['id'].unique()
issue_assignee_cluster = cluster_by_clustered_count(users_issues_assignee_data, 'assignee_id', 'cluster', users, clusters, k=100)

users_issues_cretor_data = df_issues_all[['creator_id']]
users_issues_cretor_data['cluster'] = summary_cluster_all.values
clusters = summary_cluster_all['predict'].unique()
users = df_emp['id'].unique()
issue_creator_cluster = cluster_by_clustered_count(users_issues_cretor_data, 'creator_id', 'cluster', users, clusters, k=100)

employee_cluster = pd.DataFrame()
employee_cluster['id'] = users
employee_cluster['comments_cluster'] = user_comments_cluster['cluster'].values
employee_cluster['assignee_cluster'] = issue_assignee_cluster['cluster'].values
employee_cluster['creator_cluster'] = issue_creator_cluster['cluster'].values

employee_cluster_train_tmp, employee_cluster_test_tmp = train_test_split(employee_cluster, test_size=0.25, shuffle=False, random_state=RANDOM_STATE)
employee_cluster_result = pd.DataFrame()
employee_cluster_result['id'] = employee_cluster['id']
employee_cluster_result['cluster'] = cluster_vecs(employee_cluster, employee_cluster_train_tmp, employee_cluster_test_tmp, k=100)
employee_cluster_result

# Cluster issues by comments
issues_comments_data = df_comment_all[['issue_id']]
issues_comments_data['cluster'] = comment_cluster_all.values
clusters = comment_cluster_all['predict'].unique()
issues = df_comment_all['issue_id'].unique()
issue_comments_cluster = cluster_by_clustered_count(issues_comments_data, 'issue_id', 'cluster', issues, clusters, k=100)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
kmeans Model Build progress: |███████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
kmeans prediction progress: |████████████████████████████████████████████████████| (done) 100%


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':


Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
kmeans Model Build progress: |███████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
kmeans prediction progress: |████████████████████████████████████████████████████| (done) 100%


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app


Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
kmeans Model Build progress: |███████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
kmeans prediction progress: |████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
kmeans Model Build progress: |███████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
kmeans prediction progress: |████████████████████████████████████████████████████| (done) 100%


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
kmeans Model Build progress: |



███████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
kmeans prediction progress: |████████████████████████████████████████████████████| (done) 100%


In [12]:
# Restore empty fields of employees
df_emp2 = df_emp
df_emp2['comments_cluster'] = user_comments_cluster['cluster'].values
df_emp2['assignee_cluster'] = issue_assignee_cluster['cluster'].values
df_emp2['creator_cluster'] = issue_creator_cluster['cluster'].values
#df['position'] = df_emp['position']
employees = smart_fill_na(df_emp2, 'id', 
    ['is_nda_signed', 'is_labor_contract_signed', 'is_added_to_internal_chats', 
     'is_added_one_to_one', 'comments_cluster', 'assignee_cluster', 
     'creator_cluster'], 
     'position')
employees = smart_fill_na(employees, 'id', 
    ['is_nda_signed', 'is_labor_contract_signed', 'is_added_to_internal_chats', 
     'is_added_one_to_one', 'comments_cluster', 'assignee_cluster', 
     'creator_cluster', 'position'], 
     'hiring_type')
employees = smart_fill_na(employees, 'id', 
    ['is_nda_signed', 'is_labor_contract_signed', 'is_added_to_internal_chats', 
     'is_added_one_to_one', 'comments_cluster', 'assignee_cluster', 
     'creator_cluster', 'position', 'hiring_type'], 
     'payment_type')
employees = smart_fill_na(employees, 'id', 
    ['is_nda_signed', 'is_labor_contract_signed', 'is_added_to_internal_chats', 
     'is_added_one_to_one', 'comments_cluster', 'assignee_cluster', 
     'creator_cluster', 'position', 'hiring_type', 'payment_type'], 
     'salary_calculation_type')
employees = smart_fill_na(employees, 'id', 
    ['is_nda_signed', 'is_labor_contract_signed', 'is_added_to_internal_chats', 
     'is_added_one_to_one', 'comments_cluster', 'assignee_cluster', 
     'creator_cluster', 'position', 'hiring_type', 'payment_type', 'salary_calculation_type'], 
     'english_level')
employees = employees.drop(["active","full_name", "salary_calculation_type", "passport", 'is_nda_signed', 'is_labor_contract_signed', 'is_added_to_internal_chats', 
     'is_added_one_to_one', 'payment_type', 'salary_calculation_type'], axis = 1)
#employees = encode_caterorical_feature(employees, 'position')
employees = encode_caterorical_feature(employees, 'hiring_type')
#employees = encode_caterorical_feature(employees, 'payment_type')
#employees = encode_caterorical_feature(employees, 'salary_calculation_type')

english_le = LabelEncoder()
english_le.fit(['A1', 'A2', 'B1', 'B2', 'C1'])
english_le.transform(['A1'])[0]
employees['english_level'] = employees.apply(lambda row: english_le.transform([row['english_level']])[0], axis=1)

employees_prefixed_a = employees[['id', 'assignee_cluster', 'comments_cluster', 'position']].add_prefix("assignee_")
employees_prefixed_c = employees[['id', 'creator_cluster', 'comments_cluster', 'position']].add_prefix("creator_")
issues_merged = pd.merge(df_issues_all, employees_prefixed_a, left_on="assignee_id", right_on="assignee_id", how='inner')
issues_merged = pd.merge(issues_merged, employees_prefixed_c, left_on="creator_id", right_on="creator_id", how='inner')
issues_merged = issues_merged.drop(['id', 'key', 'created', 'summary'], axis=1)

issues_merged_train = issues_merged.iloc[:df_issues_train.shape[0],:]
issues_merged_test = issues_merged.iloc[df_issues_train.shape[0]:,:]

AutoML directory: AutoML_2
The task is multiclass_classification with evaluation metric accuracy
AutoML will use algorithms: ['Baseline', 'Linear', 'Decision Tree', 'Random Forest', 'Xgboost', 'Neural Network']
AutoML will ensemble available models
AutoML steps: ['simple_algorithms', 'default_algorithms', 'ensemble']
* Step simple_algorithms will try to check up to 3 models
1_Baseline accuracy 0.081712 trained in 4.76 seconds


Tight layout not applied. The bottom and top margins cannot be made large enough to accommodate all axes decorations. 
Tight layout not applied. The bottom and top margins cannot be made large enough to accommodate all axes decorations. 
Tight layout not applied. The bottom and top margins cannot be made large enough to accommodate all axes decorations. 
Tight layout not applied. The bottom and top margins cannot be made large enough to accommodate all axes decorations. 
Tight layout not applied. The bottom and top margins cannot be made large enough to accommodate all axes decorations. 
Tight layout not applied. The bottom and top margins cannot be made large enough to accommodate all axes decorations. 
Tight layout not applied. The bottom and top margins cannot be made large enough to accommodate all axes decorations. 
Tight layout not applied. The bottom and top margins cannot be made large enough to accommodate all axes decorations. 
Tight layout not applied. The bottom and top mar

2_DecisionTree accuracy 0.198444 trained in 58.74 seconds


Tight layout not applied. The bottom and top margins cannot be made large enough to accommodate all axes decorations. 
Tight layout not applied. The bottom and top margins cannot be made large enough to accommodate all axes decorations. 
Tight layout not applied. The bottom and top margins cannot be made large enough to accommodate all axes decorations. 
Tight layout not applied. The bottom and top margins cannot be made large enough to accommodate all axes decorations. 
Tight layout not applied. The bottom and top margins cannot be made large enough to accommodate all axes decorations. 
Tight layout not applied. The bottom and top margins cannot be made large enough to accommodate all axes decorations. 
Tight layout not applied. The bottom and top margins cannot be made large enough to accommodate all axes decorations. 
Tight layout not applied. The bottom and top margins cannot be made large enough to accommodate all axes decorations. 
Tight layout not applied. The bottom and top mar

3_Linear accuracy 0.249027 trained in 57.44 seconds
* Step default_algorithms will try to check up to 3 models


`feval` is deprecated, use `custom_metric` instead.  They have different behavior when custom objective is also used.See https://xgboost.readthedocs.io/en/latest/tutorials/custom_metric_obj.html for details on the `custom_metric`.
Tight layout not applied. The bottom and top margins cannot be made large enough to accommodate all axes decorations. 
Tight layout not applied. The bottom and top margins cannot be made large enough to accommodate all axes decorations. 
Tight layout not applied. The bottom and top margins cannot be made large enough to accommodate all axes decorations. 
Tight layout not applied. The bottom and top margins cannot be made large enough to accommodate all axes decorations. 
Tight layout not applied. The bottom and top margins cannot be made large enough to accommodate all axes decorations. 
Tight layout not applied. The bottom and top margins cannot be made large enough to accommodate all axes decorations. 
Tight layout not applied. The bottom and top margins ca

4_Default_Xgboost accuracy 0.287938 trained in 59.85 seconds
5_Default_NeuralNetwork accuracy 0.287938 trained in 7.22 seconds


Tight layout not applied. The bottom and top margins cannot be made large enough to accommodate all axes decorations. 
Tight layout not applied. The bottom and top margins cannot be made large enough to accommodate all axes decorations. 
Tight layout not applied. The bottom and top margins cannot be made large enough to accommodate all axes decorations. 
Tight layout not applied. The bottom and top margins cannot be made large enough to accommodate all axes decorations. 
Tight layout not applied. The bottom and top margins cannot be made large enough to accommodate all axes decorations. 
Tight layout not applied. The bottom and top margins cannot be made large enough to accommodate all axes decorations. 
Tight layout not applied. The bottom and top margins cannot be made large enough to accommodate all axes decorations. 
Tight layout not applied. The bottom and top margins cannot be made large enough to accommodate all axes decorations. 
Tight layout not applied. The bottom and top mar

6_Default_RandomForest accuracy 0.256809 trained in 61.62 seconds
* Step ensemble will try to check up to 1 model
Ensemble accuracy 0.29572 trained in 1.58 seconds
AutoML fit time: 287.12 seconds
AutoML best model: Ensemble
AutoML directory: AutoML_3
The task is multiclass_classification with evaluation metric accuracy
AutoML will use algorithms: ['Baseline', 'Linear', 'Decision Tree', 'Random Forest', 'Xgboost', 'Neural Network']
AutoML will ensemble available models
AutoML steps: ['simple_algorithms', 'default_algorithms', 'ensemble']
* Step simple_algorithms will try to check up to 3 models
1_Baseline accuracy 0.569231 trained in 5.41 seconds
y_true and y_pred contain different number of classes 2, 4. Please provide the true labels explicitly through the labels argument. Classes found in y_true: [0 1]
Problem during computing permutation importance. Skipping ...




2_DecisionTree accuracy 0.769231 trained in 11.67 seconds
y_true and y_pred contain different number of classes 2, 4. Please provide the true labels explicitly through the labels argument. Classes found in y_true: [0 1]
Problem during computing permutation importance. Skipping ...
Exception while producing SHAP explanations. order must be str, not int
Continuing ...
There was an error during 3_Linear training.
Please check AutoML_3/errors.md for details.
* Step default_algorithms will try to check up to 3 models


`feval` is deprecated, use `custom_metric` instead.  They have different behavior when custom objective is also used.See https://xgboost.readthedocs.io/en/latest/tutorials/custom_metric_obj.html for details on the `custom_metric`.


y_true and y_pred contain different number of classes 2, 4. Please provide the true labels explicitly through the labels argument. Classes found in y_true: [0 1]
Problem during computing permutation importance. Skipping ...


Tight layout not applied. tight_layout cannot make axes width small enough to accommodate all axes decorations


3_Default_Xgboost accuracy 0.846154 trained in 12.03 seconds
y_true and y_pred contain different number of classes 2, 4. Please provide the true labels explicitly through the labels argument. Classes found in y_true: [0 1]
Problem during computing permutation importance. Skipping ...
4_Default_NeuralNetwork accuracy 0.784615 trained in 5.4 seconds
y_true and y_pred contain different number of classes 2, 4. Please provide the true labels explicitly through the labels argument. Classes found in y_true: [0 1]
Problem during computing permutation importance. Skipping ...
5_Default_RandomForest accuracy 0.830769 trained in 10.8 seconds
* Step ensemble will try to check up to 1 model
Ensemble accuracy 0.846154 trained in 0.14 seconds
AutoML fit time: 52.55 seconds
AutoML best model: 3_Default_Xgboost
AutoML directory: AutoML_4
The task is multiclass_classification with evaluation metric accuracy
AutoML will use algorithms: ['Baseline', 'Linear', 'Decision Tree', 'Random Forest', 'Xgboost', '

`feval` is deprecated, use `custom_metric` instead.  They have different behavior when custom objective is also used.See https://xgboost.readthedocs.io/en/latest/tutorials/custom_metric_obj.html for details on the `custom_metric`.


y_true and y_pred contain different number of classes 2, 4. Please provide the true labels explicitly through the labels argument. Classes found in y_true: [0 1]
Problem during computing permutation importance. Skipping ...


Tight layout not applied. tight_layout cannot make axes width small enough to accommodate all axes decorations


3_Default_Xgboost accuracy 0.839286 trained in 12.02 seconds
y_true and y_pred contain different number of classes 2, 4. Please provide the true labels explicitly through the labels argument. Classes found in y_true: [0 1]
Problem during computing permutation importance. Skipping ...
4_Default_NeuralNetwork accuracy 0.857143 trained in 7.83 seconds
y_true and y_pred contain different number of classes 2, 4. Please provide the true labels explicitly through the labels argument. Classes found in y_true: [0 1]
Problem during computing permutation importance. Skipping ...
5_Default_RandomForest accuracy 0.857143 trained in 11.43 seconds
* Step ensemble will try to check up to 1 model
Ensemble accuracy 0.857143 trained in 0.14 seconds
AutoML fit time: 56.82 seconds
AutoML best model: 2_DecisionTree
AutoML directory: AutoML_5
The task is multiclass_classification with evaluation metric accuracy
AutoML will use algorithms: ['Baseline', 'Linear', 'Decision Tree', 'Random Forest', 'Xgboost', 'N

`feval` is deprecated, use `custom_metric` instead.  They have different behavior when custom objective is also used.See https://xgboost.readthedocs.io/en/latest/tutorials/custom_metric_obj.html for details on the `custom_metric`.


y_true and y_pred contain different number of classes 2, 4. Please provide the true labels explicitly through the labels argument. Classes found in y_true: [0 1]
Problem during computing permutation importance. Skipping ...


Tight layout not applied. tight_layout cannot make axes width small enough to accommodate all axes decorations


3_Default_Xgboost accuracy 0.7 trained in 12.29 seconds
y_true and y_pred contain different number of classes 2, 4. Please provide the true labels explicitly through the labels argument. Classes found in y_true: [0 1]
Problem during computing permutation importance. Skipping ...
4_Default_NeuralNetwork accuracy 0.6 trained in 5.98 seconds
y_true and y_pred contain different number of classes 2, 4. Please provide the true labels explicitly through the labels argument. Classes found in y_true: [0 1]
Problem during computing permutation importance. Skipping ...
5_Default_RandomForest accuracy 0.5 trained in 11.68 seconds
* Step ensemble will try to check up to 1 model
Ensemble accuracy 0.8 trained in 0.14 seconds
AutoML fit time: 54.99 seconds
AutoML best model: 2_DecisionTree
AutoML directory: AutoML_6
The task is multiclass_classification with evaluation metric accuracy
AutoML will use algorithms: ['Baseline', 'Linear', 'Decision Tree', 'Random Forest', 'Xgboost', 'Neural Network']
Auto

`feval` is deprecated, use `custom_metric` instead.  They have different behavior when custom objective is also used.See https://xgboost.readthedocs.io/en/latest/tutorials/custom_metric_obj.html for details on the `custom_metric`.


4_Default_Xgboost accuracy 0.92 trained in 18.24 seconds
5_Default_NeuralNetwork accuracy 0.92 trained in 6.76 seconds
6_Default_RandomForest accuracy 0.72 trained in 18.97 seconds
* Step ensemble will try to check up to 1 model
Ensemble accuracy 0.92 trained in 0.18 seconds
AutoML fit time: 92.35 seconds
AutoML best model: 2_DecisionTree


In [13]:
df_result_train = issues_merged_train
df_result_train['summary_cluster'] = summary_cluster_train.values
#df_result_train['assignee_cluster'] = df_issues_train.apply(lambda row: employee_cluster_result[employee_cluster_result['id'] == row['assignee_id']].iloc[0]['cluster'], axis=1)
#df_result_train['creator_cluster'] = df_issues_train.apply(lambda row: employee_cluster_result[employee_cluster_result['id'] == row['creator_id']].iloc[0]['cluster'], axis=1)

#df_result_train['assignee_comments_cluster'] = df_issues_train.apply(lambda row: employee_cluster[employee_cluster['id'] == row['assignee_id']].iloc[0]['comments_cluster'], axis=1)
#df_result_train['assignee_assignee_cluster'] = df_issues_train.apply(lambda row: employee_cluster[employee_cluster['id'] == row['assignee_id']].iloc[0]['assignee_cluster'], axis=1)
#df_result_train['assignee_creator_cluster'] = df_issues_train.apply(lambda row: employee_cluster[employee_cluster['id'] == row['assignee_id']].iloc[0]['creator_cluster'], axis=1)

#df_result_train['creator_comments_cluster'] = df_issues_train.apply(lambda row: employee_cluster[employee_cluster['id'] == row['creator_id']].iloc[0]['comments_cluster'], axis=1)
#df_result_train['creator_assignee_cluster'] = df_issues_train.apply(lambda row: employee_cluster[employee_cluster['id'] == row['creator_id']].iloc[0]['assignee_cluster'], axis=1)
#df_result_train['creator_creator_cluster'] = df_issues_train.apply(lambda row: employee_cluster[employee_cluster['id'] == row['creator_id']].iloc[0]['creator_cluster'], axis=1)

df_result_train = df_result_train.drop(['creator_id', 'assignee_id'], axis=1)

def get_comments_cluster(issue_id):
  res = issue_comments_cluster[issue_comments_cluster['issue_id'] == issue_id]
  if res.shape[0] > 0:
    return res.iloc[0]['cluster']
  else:
    return -1
df_result_train['comments_cluster'] = df_issues_train.apply(lambda row: get_comments_cluster(row['id']), axis=1)
df_result_train[ISSUES_TARGET] = df_issues_train[ISSUES_TARGET]

automl = TabularAutoML(
    task = Task('reg', metric = lambda y_true, y_pred: r2_score(y_true, y_pred)), 
    reader_params = {'random_state': RANDOM_STATE},
)
oof_pred = automl.fit_predict(df_result_train,  roles = {'target': ISSUES_TARGET})
#df_result_train


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
INFO:lightautoml.automl.presets.base:Stdout logging level is ERROR.
INFO:lightautoml.automl.presets.base:Task: reg

INFO:lightautoml.automl.presets.base:Start automl preset with listed constraints:
INFO:lightautoml.automl.presets.base:- time: 3600.00 seconds
INFO:lightautoml.automl.presets.base:- CPU: 4 cores
INFO:lightautoml.automl.presets.base:- memory: 16 GB

INFO:lightautoml.reader.base:[1mTrain data shape: (9589, 10)[0m

INFO3:lightautoml.reader.base:Feats was rejected during automatic roles guess: []
INFO:lightautoml.automl.base:Layer [1m1[0m train process start. Time left 3595.38 secs
INFO:lightautoml.ml_algo.base:Start fitting [1mLvl_0_Pipe_0_Mod_0_LinearL2[0m ...
DEBUG:lightautoml.ml_algo.base:Training params: {

In [14]:
df_result_train.columns.to_numpy()

array(['project_id', 'assignee_assignee_cluster',
       'assignee_comments_cluster', 'assignee_position',
       'creator_creator_cluster', 'creator_comments_cluster',
       'creator_position', 'summary_cluster', 'comments_cluster',
       'overall_worklogs'], dtype=object)

In [15]:
df_result_test = issues_merged_test
df_result_test['summary_cluster'] = summary_cluster_test.values
#df_result_test['assignee_cluster'] = df_issues_test.apply(lambda row: employee_cluster_result[employee_cluster_result['id'] == row['assignee_id']].iloc[0]['cluster'], axis=1)
#df_result_test['creator_cluster'] = df_issues_test.apply(lambda row: employee_cluster_result[employee_cluster_result['id'] == row['creator_id']].iloc[0]['cluster'], axis=1)

# df_result_test['assignee_comments_cluster'] = df_issues_test.apply(lambda row: employee_cluster[employee_cluster['id'] == row['assignee_id']].iloc[0]['comments_cluster'], axis=1)
# df_result_test['assignee_assignee_cluster'] = df_issues_test.apply(lambda row: employee_cluster[employee_cluster['id'] == row['assignee_id']].iloc[0]['assignee_cluster'], axis=1)
# df_result_test['assignee_creator_cluster'] = df_issues_test.apply(lambda row: employee_cluster[employee_cluster['id'] == row['assignee_id']].iloc[0]['creator_cluster'], axis=1)
# df_result_test['creator_comments_cluster'] = df_issues_test.apply(lambda row: employee_cluster[employee_cluster['id'] == row['creator_id']].iloc[0]['comments_cluster'], axis=1)
# df_result_test['creator_assignee_cluster'] = df_issues_test.apply(lambda row: employee_cluster[employee_cluster['id'] == row['creator_id']].iloc[0]['assignee_cluster'], axis=1)
# df_result_test['creator_creator_cluster'] = df_issues_test.apply(lambda row: employee_cluster[employee_cluster['id'] == row['creator_id']].iloc[0]['creator_cluster'], axis=1)
df_result_test['comments_cluster'] = df_issues_test.apply(lambda row: get_comments_cluster(row['id']), axis=1)
df_result_test = df_result_test.drop(['creator_id', 'assignee_id'], axis=1)

#df_result_test[ISSUES_TARGET] = df_issues_train[ISSUES_TARGET]

test_pred = automl.predict(df_result_test).data
test_pred = [np.floor(row[0]).astype(np.int64) for row in test_pred]

df_result = pd.DataFrame()
df_result["id"] = df_issues_test["id"]
df_result[ISSUES_TARGET] = test_pred
df_result.to_csv('solution.csv', index=False)
df_result


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0,id,overall_worklogs
0,675975,21790
1,675972,16908
2,675965,20073
3,675961,17223
4,675955,17223
...,...,...
1065,702545,15386
1066,702528,15085
1067,702499,15386
1068,702376,15146


In [16]:
from supervised.automl import AutoML
automl = AutoML(mode="Explain", explain_level=2, eval_metric="r2", ml_task="regression", random_state=RANDOM_STATE)
X_train = df_result_train[df_result_train.columns[:-1]]
y_train = df_result_train[ISSUES_TARGET]
automl.fit(X_train, y_train)
y_pred = automl.predict(df_result_test)
y_pred = [np.floor(x).astype(np.int64) for x in y_pred]

df_result = pd.DataFrame()
df_result["id"] = df_issues_test["id"]
df_result[ISSUES_TARGET] = y_pred
df_result.to_csv('solution_mljar.csv', index=False)
df_result

AutoML directory: AutoML_7
The task is regression with evaluation metric r2
AutoML will use algorithms: ['Linear', 'Random Forest', 'LightGBM', 'Xgboost', 'CatBoost', 'Neural Network']
AutoML will ensemble available models
AutoML steps: ['simple_algorithms', 'default_algorithms', 'not_so_random', 'golden_features', 'insert_random_feature', 'features_selection', 'hill_climbing_1', 'hill_climbing_2', 'ensemble']
* Step simple_algorithms will try to check up to 1 model
1_Linear r2 -0.01029 trained in 35.43 seconds (1-sample predict time 0.091 seconds)
* Step default_algorithms will try to check up to 5 models
[1]	train's r2: 0.0124997	validation's r2: 0.00719717
[2]	train's r2: 0.0262983	validation's r2: 0.0162908
[3]	train's r2: 0.0367675	validation's r2: 0.0230093
[4]	train's r2: 0.0473737	validation's r2: 0.030515
[5]	train's r2: 0.0579209	validation's r2: 0.0376154
[6]	train's r2: 0.0678543	validation's r2: 0.0432311
[7]	train's r2: 0.0759226	validation's r2: 0.0485774
[8]	train's r2:

`feval` is deprecated, use `custom_metric` instead.  They have different behavior when custom objective is also used.See https://xgboost.readthedocs.io/en/latest/tutorials/custom_metric_obj.html for details on the `custom_metric`.
`feval` is deprecated, use `custom_metric` instead.  They have different behavior when custom objective is also used.See https://xgboost.readthedocs.io/en/latest/tutorials/custom_metric_obj.html for details on the `custom_metric`.
`feval` is deprecated, use `custom_metric` instead.  They have different behavior when custom objective is also used.See https://xgboost.readthedocs.io/en/latest/tutorials/custom_metric_obj.html for details on the `custom_metric`.
`feval` is deprecated, use `custom_metric` instead.  They have different behavior when custom objective is also used.See https://xgboost.readthedocs.io/en/latest/tutorials/custom_metric_obj.html for details on the `custom_metric`.
`feval` is deprecated, use `custom_metric` instead.  They have different beh

3_Default_Xgboost r2 0.001993 trained in 50.32 seconds (1-sample predict time 0.0921 seconds)
4_Default_CatBoost r2 -0.001657 trained in 19.03 seconds (1-sample predict time 0.057 seconds)
5_Default_NeuralNetwork r2 -0.007613 trained in 9.81 seconds (1-sample predict time 0.0966 seconds)
6_Default_RandomForest r2 -0.007063 trained in 41.34 seconds (1-sample predict time 0.5757 seconds)
* Step not_so_random will try to check up to 20 models
[1]	train's r2: 0.0231292	validation's r2: 0.0134886
[2]	train's r2: 0.039387	validation's r2: 0.0271409
[3]	train's r2: 0.0521009	validation's r2: 0.0375956
[4]	train's r2: 0.0640921	validation's r2: 0.0458578
[5]	train's r2: 0.0733206	validation's r2: 0.0526201
[6]	train's r2: 0.089048	validation's r2: 0.0613032
[7]	train's r2: 0.102269	validation's r2: 0.0678856
[8]	train's r2: 0.113537	validation's r2: 0.0732707
[9]	train's r2: 0.122817	validation's r2: 0.0773937
[10]	train's r2: 0.135527	validation's r2: 0.082278
[11]	train's r2: 0.145588	valida

`feval` is deprecated, use `custom_metric` instead.  They have different behavior when custom objective is also used.See https://xgboost.readthedocs.io/en/latest/tutorials/custom_metric_obj.html for details on the `custom_metric`.
`feval` is deprecated, use `custom_metric` instead.  They have different behavior when custom objective is also used.See https://xgboost.readthedocs.io/en/latest/tutorials/custom_metric_obj.html for details on the `custom_metric`.
`feval` is deprecated, use `custom_metric` instead.  They have different behavior when custom objective is also used.See https://xgboost.readthedocs.io/en/latest/tutorials/custom_metric_obj.html for details on the `custom_metric`.
`feval` is deprecated, use `custom_metric` instead.  They have different behavior when custom objective is also used.See https://xgboost.readthedocs.io/en/latest/tutorials/custom_metric_obj.html for details on the `custom_metric`.
`feval` is deprecated, use `custom_metric` instead.  They have different beh

7_Xgboost r2 0.000171 trained in 47.51 seconds (1-sample predict time 0.0897 seconds)
15_CatBoost r2 -0.002862 trained in 18.7 seconds (1-sample predict time 0.057 seconds)
19_RandomForest r2 -0.007186 trained in 39.34 seconds (1-sample predict time 0.5762 seconds)
23_NeuralNetwork r2 -0.010506 trained in 11.1 seconds (1-sample predict time 0.09 seconds)
[1]	train's r2: 0.0160196	validation's r2: 0.0111382
[2]	train's r2: 0.0296162	validation's r2: 0.0231937
[3]	train's r2: 0.0404488	validation's r2: 0.032522
[4]	train's r2: 0.0507448	validation's r2: 0.0402313
[5]	train's r2: 0.0596692	validation's r2: 0.0462484
[6]	train's r2: 0.0667661	validation's r2: 0.050799
[7]	train's r2: 0.0732724	validation's r2: 0.0551737
[8]	train's r2: 0.0797005	validation's r2: 0.0595422
[9]	train's r2: 0.0855951	validation's r2: 0.0639223
[10]	train's r2: 0.0928102	validation's r2: 0.0671086
[11]	train's r2: 0.0974212	validation's r2: 0.0687081
[12]	train's r2: 0.101606	validation's r2: 0.0706528
[13]	tr

`feval` is deprecated, use `custom_metric` instead.  They have different behavior when custom objective is also used.See https://xgboost.readthedocs.io/en/latest/tutorials/custom_metric_obj.html for details on the `custom_metric`.
`feval` is deprecated, use `custom_metric` instead.  They have different behavior when custom objective is also used.See https://xgboost.readthedocs.io/en/latest/tutorials/custom_metric_obj.html for details on the `custom_metric`.
`feval` is deprecated, use `custom_metric` instead.  They have different behavior when custom objective is also used.See https://xgboost.readthedocs.io/en/latest/tutorials/custom_metric_obj.html for details on the `custom_metric`.
`feval` is deprecated, use `custom_metric` instead.  They have different behavior when custom objective is also used.See https://xgboost.readthedocs.io/en/latest/tutorials/custom_metric_obj.html for details on the `custom_metric`.
`feval` is deprecated, use `custom_metric` instead.  They have different beh

8_Xgboost r2 -0.003462 trained in 44.49 seconds (1-sample predict time 0.0899 seconds)
16_CatBoost r2 -0.002568 trained in 19.3 seconds (1-sample predict time 0.0554 seconds)
20_RandomForest r2 -0.00575 trained in 43.84 seconds (1-sample predict time 0.5752 seconds)
24_NeuralNetwork r2 -0.008548 trained in 16.13 seconds (1-sample predict time 0.0899 seconds)
[1]	train's r2: 0.00984778	validation's r2: 0.00591614
[2]	train's r2: 0.0200441	validation's r2: 0.0128585
[3]	train's r2: 0.0290439	validation's r2: 0.0203163
[4]	train's r2: 0.0379651	validation's r2: 0.0260837
[5]	train's r2: 0.0460361	validation's r2: 0.0310826
[6]	train's r2: 0.0531163	validation's r2: 0.0365498
[7]	train's r2: 0.0590298	validation's r2: 0.0406835
[8]	train's r2: 0.065266	validation's r2: 0.044465
[9]	train's r2: 0.0709127	validation's r2: 0.0482511
[10]	train's r2: 0.0766271	validation's r2: 0.0523704
[11]	train's r2: 0.0818131	validation's r2: 0.0557604
[12]	train's r2: 0.086369	validation's r2: 0.0588805
[

`feval` is deprecated, use `custom_metric` instead.  They have different behavior when custom objective is also used.See https://xgboost.readthedocs.io/en/latest/tutorials/custom_metric_obj.html for details on the `custom_metric`.
`feval` is deprecated, use `custom_metric` instead.  They have different behavior when custom objective is also used.See https://xgboost.readthedocs.io/en/latest/tutorials/custom_metric_obj.html for details on the `custom_metric`.
`feval` is deprecated, use `custom_metric` instead.  They have different behavior when custom objective is also used.See https://xgboost.readthedocs.io/en/latest/tutorials/custom_metric_obj.html for details on the `custom_metric`.
`feval` is deprecated, use `custom_metric` instead.  They have different behavior when custom objective is also used.See https://xgboost.readthedocs.io/en/latest/tutorials/custom_metric_obj.html for details on the `custom_metric`.
`feval` is deprecated, use `custom_metric` instead.  They have different beh

9_Xgboost r2 -0.001269 trained in 65.13 seconds (1-sample predict time 0.0918 seconds)
17_CatBoost r2 -0.006053 trained in 46.95 seconds (1-sample predict time 0.0558 seconds)
21_RandomForest r2 -0.008115 trained in 41.51 seconds (1-sample predict time 0.5776 seconds)
25_NeuralNetwork r2 -0.007053 trained in 18.96 seconds (1-sample predict time 0.0959 seconds)
[1]	train's r2: 0.0167389	validation's r2: 0.0087742
[2]	train's r2: 0.0320891	validation's r2: 0.0181313
[3]	train's r2: 0.0461744	validation's r2: 0.0260698
[4]	train's r2: 0.0589478	validation's r2: 0.0336319
[5]	train's r2: 0.0712374	validation's r2: 0.0405803
[6]	train's r2: 0.0820966	validation's r2: 0.0463517
[7]	train's r2: 0.0921573	validation's r2: 0.052559
[8]	train's r2: 0.101941	validation's r2: 0.0574272
[9]	train's r2: 0.110695	validation's r2: 0.0619657
[10]	train's r2: 0.119026	validation's r2: 0.066228
[11]	train's r2: 0.126987	validation's r2: 0.0703881
[12]	train's r2: 0.134314	validation's r2: 0.073916
[13]	t

`feval` is deprecated, use `custom_metric` instead.  They have different behavior when custom objective is also used.See https://xgboost.readthedocs.io/en/latest/tutorials/custom_metric_obj.html for details on the `custom_metric`.
`feval` is deprecated, use `custom_metric` instead.  They have different behavior when custom objective is also used.See https://xgboost.readthedocs.io/en/latest/tutorials/custom_metric_obj.html for details on the `custom_metric`.
`feval` is deprecated, use `custom_metric` instead.  They have different behavior when custom objective is also used.See https://xgboost.readthedocs.io/en/latest/tutorials/custom_metric_obj.html for details on the `custom_metric`.
`feval` is deprecated, use `custom_metric` instead.  They have different behavior when custom objective is also used.See https://xgboost.readthedocs.io/en/latest/tutorials/custom_metric_obj.html for details on the `custom_metric`.
`feval` is deprecated, use `custom_metric` instead.  They have different beh

10_Xgboost r2 -0.002043 trained in 68.76 seconds (1-sample predict time 0.0908 seconds)
18_CatBoost r2 -0.00514 trained in 25.11 seconds (1-sample predict time 0.0565 seconds)
22_RandomForest r2 -0.007171 trained in 43.7 seconds (1-sample predict time 0.5797 seconds)
26_NeuralNetwork r2 -0.006789 trained in 19.53 seconds (1-sample predict time 0.1379 seconds)
* Step golden_features will try to check up to 3 models
None 10
Add Golden Feature: project_id_diff_creator_creator_cluster
Add Golden Feature: project_id_diff_creator_comments_cluster
Add Golden Feature: comments_cluster_ratio_creator_creator_cluster
Add Golden Feature: creator_creator_cluster_sum_project_id
Add Golden Feature: project_id_ratio_creator_creator_cluster
Add Golden Feature: creator_creator_cluster_ratio_project_id
Add Golden Feature: creator_creator_cluster_ratio_comments_cluster
Add Golden Feature: creator_creator_cluster_multiply_project_id
Add Golden Feature: comments_cluster_sum_creator_creator_cluster
Add Golde

`feval` is deprecated, use `custom_metric` instead.  They have different behavior when custom objective is also used.See https://xgboost.readthedocs.io/en/latest/tutorials/custom_metric_obj.html for details on the `custom_metric`.
`feval` is deprecated, use `custom_metric` instead.  They have different behavior when custom objective is also used.See https://xgboost.readthedocs.io/en/latest/tutorials/custom_metric_obj.html for details on the `custom_metric`.
`feval` is deprecated, use `custom_metric` instead.  They have different behavior when custom objective is also used.See https://xgboost.readthedocs.io/en/latest/tutorials/custom_metric_obj.html for details on the `custom_metric`.
`feval` is deprecated, use `custom_metric` instead.  They have different behavior when custom objective is also used.See https://xgboost.readthedocs.io/en/latest/tutorials/custom_metric_obj.html for details on the `custom_metric`.
`feval` is deprecated, use `custom_metric` instead.  They have different beh

3_Default_Xgboost_GoldenFeatures r2 0.004112 trained in 69.04 seconds (1-sample predict time 0.1242 seconds)
[1]	train's r2: 0.0175486	validation's r2: 0.012688
[2]	train's r2: 0.0321749	validation's r2: 0.023639
[3]	train's r2: 0.0448123	validation's r2: 0.0340809
[4]	train's r2: 0.0546947	validation's r2: 0.0422255
[5]	train's r2: 0.0644007	validation's r2: 0.0486692
[6]	train's r2: 0.073611	validation's r2: 0.0530664
[7]	train's r2: 0.0819084	validation's r2: 0.0565983
[8]	train's r2: 0.0886419	validation's r2: 0.0604887
[9]	train's r2: 0.0936261	validation's r2: 0.0629311
[10]	train's r2: 0.100158	validation's r2: 0.0663426
[11]	train's r2: 0.104436	validation's r2: 0.0677895
[12]	train's r2: 0.109161	validation's r2: 0.0704577
[13]	train's r2: 0.112975	validation's r2: 0.0722221
[14]	train's r2: 0.11695	validation's r2: 0.0739004
[15]	train's r2: 0.120442	validation's r2: 0.0760969
[16]	train's r2: 0.12323	validation's r2: 0.0771971
[17]	train's r2: 0.126101	validation's r2: 0.078

Tight layout not applied. The left and right margins cannot be made large enough to accommodate all axes decorations. 


[1]	train's r2: 0.0159842	validation's r2: 0.0165492
[2]	train's r2: 0.0294817	validation's r2: 0.0298174
[3]	train's r2: 0.041542	validation's r2: 0.0420177
[4]	train's r2: 0.0507785	validation's r2: 0.0515106
[5]	train's r2: 0.0605264	validation's r2: 0.0588757
[6]	train's r2: 0.0678334	validation's r2: 0.0657726
[7]	train's r2: 0.0743214	validation's r2: 0.0722917
[8]	train's r2: 0.0807723	validation's r2: 0.0772392
[9]	train's r2: 0.0858866	validation's r2: 0.0809989
[10]	train's r2: 0.0910096	validation's r2: 0.0847836
[11]	train's r2: 0.0948004	validation's r2: 0.0883002
[12]	train's r2: 0.0992204	validation's r2: 0.090772
[13]	train's r2: 0.103073	validation's r2: 0.0940761
[14]	train's r2: 0.106532	validation's r2: 0.0972197
[15]	train's r2: 0.109173	validation's r2: 0.0998783
[16]	train's r2: 0.112652	validation's r2: 0.102424
[17]	train's r2: 0.116962	validation's r2: 0.105539
[18]	train's r2: 0.119953	validation's r2: 0.108074
[19]	train's r2: 0.122215	validation's r2: 0.109

`feval` is deprecated, use `custom_metric` instead.  They have different behavior when custom objective is also used.See https://xgboost.readthedocs.io/en/latest/tutorials/custom_metric_obj.html for details on the `custom_metric`.
Tight layout not applied. The left and right margins cannot be made large enough to accommodate all axes decorations. 
`feval` is deprecated, use `custom_metric` instead.  They have different behavior when custom objective is also used.See https://xgboost.readthedocs.io/en/latest/tutorials/custom_metric_obj.html for details on the `custom_metric`.
`feval` is deprecated, use `custom_metric` instead.  They have different behavior when custom objective is also used.See https://xgboost.readthedocs.io/en/latest/tutorials/custom_metric_obj.html for details on the `custom_metric`.
`feval` is deprecated, use `custom_metric` instead.  They have different behavior when custom objective is also used.See https://xgboost.readthedocs.io/en/latest/tutorials/custom_metric_ob

7_Xgboost_GoldenFeatures r2 0.00259 trained in 65.84 seconds (1-sample predict time 0.1344 seconds)
* Step insert_random_feature will try to check up to 1 model


`feval` is deprecated, use `custom_metric` instead.  They have different behavior when custom objective is also used.See https://xgboost.readthedocs.io/en/latest/tutorials/custom_metric_obj.html for details on the `custom_metric`.
`feval` is deprecated, use `custom_metric` instead.  They have different behavior when custom objective is also used.See https://xgboost.readthedocs.io/en/latest/tutorials/custom_metric_obj.html for details on the `custom_metric`.
`feval` is deprecated, use `custom_metric` instead.  They have different behavior when custom objective is also used.See https://xgboost.readthedocs.io/en/latest/tutorials/custom_metric_obj.html for details on the `custom_metric`.
`feval` is deprecated, use `custom_metric` instead.  They have different behavior when custom objective is also used.See https://xgboost.readthedocs.io/en/latest/tutorials/custom_metric_obj.html for details on the `custom_metric`.
`feval` is deprecated, use `custom_metric` instead.  They have different beh

3_Default_Xgboost_GoldenFeatures_RandomFeature r2 -0.000986 trained in 44.19 seconds (1-sample predict time 0.1255 seconds)
Drop features ['assignee_comments_cluster', 'project_id_diff_creator_comments_cluster', 'creator_creator_cluster_multiply_project_id', 'creator_position', 'assignee_position', 'random_feature', 'project_id', 'project_id_ratio_creator_creator_cluster', 'project_id_diff_creator_creator_cluster', 'comments_cluster_sum_creator_creator_cluster', 'creator_creator_cluster_ratio_project_id']
* Step features_selection will try to check up to 5 models


`feval` is deprecated, use `custom_metric` instead.  They have different behavior when custom objective is also used.See https://xgboost.readthedocs.io/en/latest/tutorials/custom_metric_obj.html for details on the `custom_metric`.
Tight layout not applied. The left and right margins cannot be made large enough to accommodate all axes decorations. 
`feval` is deprecated, use `custom_metric` instead.  They have different behavior when custom objective is also used.See https://xgboost.readthedocs.io/en/latest/tutorials/custom_metric_obj.html for details on the `custom_metric`.
`feval` is deprecated, use `custom_metric` instead.  They have different behavior when custom objective is also used.See https://xgboost.readthedocs.io/en/latest/tutorials/custom_metric_obj.html for details on the `custom_metric`.
`feval` is deprecated, use `custom_metric` instead.  They have different behavior when custom objective is also used.See https://xgboost.readthedocs.io/en/latest/tutorials/custom_metric_ob

3_Default_Xgboost_GoldenFeatures_SelectedFeatures r2 0.001785 trained in 62.76 seconds (1-sample predict time 0.1338 seconds)
[1]	train's r2: 0.0152619	validation's r2: 0.0113831
[2]	train's r2: 0.0280248	validation's r2: 0.02214
[3]	train's r2: 0.038433	validation's r2: 0.0306639
[4]	train's r2: 0.0475451	validation's r2: 0.0384592
[5]	train's r2: 0.0556447	validation's r2: 0.0440988
[6]	train's r2: 0.0622265	validation's r2: 0.0494902
[7]	train's r2: 0.0696091	validation's r2: 0.0543544
[8]	train's r2: 0.0774739	validation's r2: 0.059497
[9]	train's r2: 0.0824788	validation's r2: 0.0632351
[10]	train's r2: 0.0873139	validation's r2: 0.0659545
[11]	train's r2: 0.0920694	validation's r2: 0.0679367
[12]	train's r2: 0.0951017	validation's r2: 0.0707341
[13]	train's r2: 0.100209	validation's r2: 0.0723543
[14]	train's r2: 0.105206	validation's r2: 0.075085
[15]	train's r2: 0.108336	validation's r2: 0.0775827
[16]	train's r2: 0.111352	validation's r2: 0.0798049
[17]	train's r2: 0.113753	va

`feval` is deprecated, use `custom_metric` instead.  They have different behavior when custom objective is also used.See https://xgboost.readthedocs.io/en/latest/tutorials/custom_metric_obj.html for details on the `custom_metric`.
`feval` is deprecated, use `custom_metric` instead.  They have different behavior when custom objective is also used.See https://xgboost.readthedocs.io/en/latest/tutorials/custom_metric_obj.html for details on the `custom_metric`.
`feval` is deprecated, use `custom_metric` instead.  They have different behavior when custom objective is also used.See https://xgboost.readthedocs.io/en/latest/tutorials/custom_metric_obj.html for details on the `custom_metric`.
`feval` is deprecated, use `custom_metric` instead.  They have different behavior when custom objective is also used.See https://xgboost.readthedocs.io/en/latest/tutorials/custom_metric_obj.html for details on the `custom_metric`.
`feval` is deprecated, use `custom_metric` instead.  They have different beh

27_Xgboost_GoldenFeatures r2 0.006793 trained in 73.01 seconds (1-sample predict time 0.1265 seconds)


`feval` is deprecated, use `custom_metric` instead.  They have different behavior when custom objective is also used.See https://xgboost.readthedocs.io/en/latest/tutorials/custom_metric_obj.html for details on the `custom_metric`.
Tight layout not applied. The left and right margins cannot be made large enough to accommodate all axes decorations. 
`feval` is deprecated, use `custom_metric` instead.  They have different behavior when custom objective is also used.See https://xgboost.readthedocs.io/en/latest/tutorials/custom_metric_obj.html for details on the `custom_metric`.
`feval` is deprecated, use `custom_metric` instead.  They have different behavior when custom objective is also used.See https://xgboost.readthedocs.io/en/latest/tutorials/custom_metric_obj.html for details on the `custom_metric`.
`feval` is deprecated, use `custom_metric` instead.  They have different behavior when custom objective is also used.See https://xgboost.readthedocs.io/en/latest/tutorials/custom_metric_ob

28_Xgboost_GoldenFeatures r2 0.000967 trained in 65.34 seconds (1-sample predict time 0.1225 seconds)
[1]	train's r2: 0.0160196	validation's r2: 0.0111382
[2]	train's r2: 0.0296162	validation's r2: 0.0231937
[3]	train's r2: 0.0404488	validation's r2: 0.032522
[4]	train's r2: 0.0507448	validation's r2: 0.0402313
[5]	train's r2: 0.0596692	validation's r2: 0.0462484
[6]	train's r2: 0.0667661	validation's r2: 0.050799
[7]	train's r2: 0.0732724	validation's r2: 0.0551737
[8]	train's r2: 0.0797005	validation's r2: 0.0595422
[9]	train's r2: 0.0855951	validation's r2: 0.0639223
[10]	train's r2: 0.0928102	validation's r2: 0.0671086
[11]	train's r2: 0.0974212	validation's r2: 0.0687081
[12]	train's r2: 0.101606	validation's r2: 0.0706528
[13]	train's r2: 0.105648	validation's r2: 0.0724481
[14]	train's r2: 0.109315	validation's r2: 0.0741748
[15]	train's r2: 0.11333	validation's r2: 0.0762202
[16]	train's r2: 0.116312	validation's r2: 0.0783277
[17]	train's r2: 0.119563	validation's r2: 0.080373

`feval` is deprecated, use `custom_metric` instead.  They have different behavior when custom objective is also used.See https://xgboost.readthedocs.io/en/latest/tutorials/custom_metric_obj.html for details on the `custom_metric`.
`feval` is deprecated, use `custom_metric` instead.  They have different behavior when custom objective is also used.See https://xgboost.readthedocs.io/en/latest/tutorials/custom_metric_obj.html for details on the `custom_metric`.
`feval` is deprecated, use `custom_metric` instead.  They have different behavior when custom objective is also used.See https://xgboost.readthedocs.io/en/latest/tutorials/custom_metric_obj.html for details on the `custom_metric`.
`feval` is deprecated, use `custom_metric` instead.  They have different behavior when custom objective is also used.See https://xgboost.readthedocs.io/en/latest/tutorials/custom_metric_obj.html for details on the `custom_metric`.
`feval` is deprecated, use `custom_metric` instead.  They have different beh

39_Xgboost_GoldenFeatures r2 0.005232 trained in 82.16 seconds (1-sample predict time 0.123 seconds)


`feval` is deprecated, use `custom_metric` instead.  They have different behavior when custom objective is also used.See https://xgboost.readthedocs.io/en/latest/tutorials/custom_metric_obj.html for details on the `custom_metric`.
Tight layout not applied. The left and right margins cannot be made large enough to accommodate all axes decorations. 
`feval` is deprecated, use `custom_metric` instead.  They have different behavior when custom objective is also used.See https://xgboost.readthedocs.io/en/latest/tutorials/custom_metric_obj.html for details on the `custom_metric`.
`feval` is deprecated, use `custom_metric` instead.  They have different behavior when custom objective is also used.See https://xgboost.readthedocs.io/en/latest/tutorials/custom_metric_obj.html for details on the `custom_metric`.
`feval` is deprecated, use `custom_metric` instead.  They have different behavior when custom objective is also used.See https://xgboost.readthedocs.io/en/latest/tutorials/custom_metric_ob

40_Xgboost_GoldenFeatures r2 0.002264 trained in 73.46 seconds (1-sample predict time 0.1254 seconds)


`feval` is deprecated, use `custom_metric` instead.  They have different behavior when custom objective is also used.See https://xgboost.readthedocs.io/en/latest/tutorials/custom_metric_obj.html for details on the `custom_metric`.
Tight layout not applied. The left and right margins cannot be made large enough to accommodate all axes decorations. 
`feval` is deprecated, use `custom_metric` instead.  They have different behavior when custom objective is also used.See https://xgboost.readthedocs.io/en/latest/tutorials/custom_metric_obj.html for details on the `custom_metric`.
`feval` is deprecated, use `custom_metric` instead.  They have different behavior when custom objective is also used.See https://xgboost.readthedocs.io/en/latest/tutorials/custom_metric_obj.html for details on the `custom_metric`.
`feval` is deprecated, use `custom_metric` instead.  They have different behavior when custom objective is also used.See https://xgboost.readthedocs.io/en/latest/tutorials/custom_metric_ob

41_Xgboost_GoldenFeatures r2 0.005443 trained in 83.6 seconds (1-sample predict time 0.1347 seconds)


`feval` is deprecated, use `custom_metric` instead.  They have different behavior when custom objective is also used.See https://xgboost.readthedocs.io/en/latest/tutorials/custom_metric_obj.html for details on the `custom_metric`.
Tight layout not applied. The left and right margins cannot be made large enough to accommodate all axes decorations. 
`feval` is deprecated, use `custom_metric` instead.  They have different behavior when custom objective is also used.See https://xgboost.readthedocs.io/en/latest/tutorials/custom_metric_obj.html for details on the `custom_metric`.
`feval` is deprecated, use `custom_metric` instead.  They have different behavior when custom objective is also used.See https://xgboost.readthedocs.io/en/latest/tutorials/custom_metric_obj.html for details on the `custom_metric`.
`feval` is deprecated, use `custom_metric` instead.  They have different behavior when custom objective is also used.See https://xgboost.readthedocs.io/en/latest/tutorials/custom_metric_ob

42_Xgboost_GoldenFeatures r2 0.002585 trained in 77.55 seconds (1-sample predict time 0.123 seconds)
[1]	train's r2: 0.00822057	validation's r2: 0.00504684
[2]	train's r2: 0.0157728	validation's r2: 0.0117635
[3]	train's r2: 0.022338	validation's r2: 0.0175844
[4]	train's r2: 0.0287991	validation's r2: 0.0229259
[5]	train's r2: 0.0344182	validation's r2: 0.0280166
[6]	train's r2: 0.039376	validation's r2: 0.0317861
[7]	train's r2: 0.044019	validation's r2: 0.0357501
[8]	train's r2: 0.0487964	validation's r2: 0.0394322
[9]	train's r2: 0.0534667	validation's r2: 0.0428706
[10]	train's r2: 0.0585614	validation's r2: 0.0464694
[11]	train's r2: 0.0621971	validation's r2: 0.0487914
[12]	train's r2: 0.0663115	validation's r2: 0.0513918
[13]	train's r2: 0.0701681	validation's r2: 0.0537205
[14]	train's r2: 0.0735744	validation's r2: 0.055875
[15]	train's r2: 0.076973	validation's r2: 0.0579625
[16]	train's r2: 0.0796918	validation's r2: 0.0594876
[17]	train's r2: 0.0827385	validation's r2: 0.0

All-NaN slice encountered
All-NaN slice encountered
All-NaN slice encountered
All-NaN slice encountered
All-NaN slice encountered


Unnamed: 0,id,overall_worklogs
0,675975,8523
1,675972,7842
2,675965,8552
3,675961,7565
4,675955,7565
...,...,...
1065,702545,5178
1066,702528,3903
1067,702499,5178
1068,702376,5528


In [17]:
automl.to_json()

AttributeError: ignored

In [None]:
# emp_prefixed_a = df_emp.add_prefix("assignee_")
# emp_prefixed_c = df_emp.add_prefix("creator_")
# exp_issues_merged = pd.merge(df_issues_train, emp_prefixed_a, left_on="assignee_id", right_on="assignee_id", how='inner')
# exp_issues_merged = pd.merge(exp_issues_merged, emp_prefixed_c, left_on="creator_id", right_on="creator_id", how='inner')
#exp_issues_merged.corrwith(exp_issues_merged['overall_worklogs'])


# import seaborn as sns
# corr = df_result_train.corr()
