In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# section for uploading file with training data
from google.colab import files
uploaded = files.upload()

Saving test.csv.zip to test.csv.zip
Saving train.csv.zip to train.csv.zip


In [None]:
!pip install ipython-autotime
%load_ext autotime

Collecting ipython-autotime
  Downloading ipython_autotime-0.3.1-py2.py3-none-any.whl (6.8 kB)
Installing collected packages: ipython-autotime
Successfully installed ipython-autotime-0.3.1
time: 155 µs (started: 2021-08-03 18:08:43 +00:00)


In [None]:
def read_binary_file(path):
    with open(path, 'rb') as f:
        return f.read()

time: 1.39 ms (started: 2021-08-03 18:08:44 +00:00)


In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from google.colab import files
from io import BytesIO
import os

# take information from dataframes
train_df = None
test_df = None

# !!!!!!! uncomment this section if it is needed to reupload files to colab !!!!!!!!
# for file in uploaded.keys():
#     if file == 'train.csv.zip':
#         train_df = pd.read_csv(BytesIO(uploaded[file]), compression='zip')
#     elif file == 'test.csv.zip':
#         test_df = pd.read_csv(BytesIO(uploaded[file]), compression='zip')

for file in os.listdir():
    if file == 'train.csv.zip':
        train_df = pd.read_csv(BytesIO(read_binary_file(file)), compression='zip')
    elif file == 'test.csv.zip':
        test_df = pd.read_csv(BytesIO(read_binary_file(file)), compression='zip')

time: 3.88 s (started: 2021-08-03 18:08:45 +00:00)


In [None]:
# lowest left corner point
low_x_threshold = -122.52
low_y_threshold =  36.65

# highest right corner point
high_x_threshold = -122.36
high_y_threshold =  40

time: 2.66 ms (started: 2021-08-03 18:08:49 +00:00)


# Functions

In [None]:
def get_harmonic_tuple(value, period=24):
    """
    remaps cyclical data from line axis to the circular axis ->
    important for correct data interpretation by regression
    """
    value *= 2 * np.pi / period
    return np.cos(value), np.sin(value)


def get_outlier_removed_col(df: pd.DataFrame, column, up_threshold, low_threshold):
    """
    return dataframe where column is filtered from outliers
    by removing elements that are not in given interval
    """
    return df[(df[column] < up_threshold) & (df[column] > low_threshold)]


def get_count_table(df: pd.DataFrame, category, time):
    """
    return dataframe of counting elements by groups in pair with time categories
    """
    count_df = df
    count_df["Count"] = 1
    count_df = count_df[[category, time, 'Count']]
    count_df = count_df.groupby([category, time]).agg('sum')
    count_df = count_df.reset_index()
    return count_df


def get_cols_names_below_threshold(df: pd.DataFrame, threshold):
    """
    Return array of columns that are below defined threshold
    """
    categories_below_threshold = df[df["Count"] < threshold]['Category'].unique()
    return categories_below_threshold


def get_count_table_by_street(df: pd.DataFrame):
    """
    Form new dataframe that shows crime frequency depending on street or intersection by categories
    """
    local_df = pd.DataFrame({})
    for elem in df['Street'].unique():
        if elem != 0:
            count = df[df['Street'] == elem]['Category'].value_counts()
            local_df = pd.concat([local_df, pd.Series(count, name=elem)], axis=1)
            
    return local_df


def get_nan_records(df: pd.DataFrame):
    """
    get all records that have NaN values in any column
    """
    return df[df.isnull().any(axis=1)]

time: 32.2 ms (started: 2021-08-03 18:08:49 +00:00)


In [None]:
def make_streets_intersections_cols(df: pd.DataFrame):
    """
    Form two new columns called 'street' and 'intersection' that take address by reges from
    specified column
    """
    
    intersection = df.Address.str.extract(r'(\w+\s\w+\s[/]\s\w+\s\w+)').fillna(' ')
    street       = df.Address.str.extract(r'\d+\s\w+\s\w+\s(\w+\s\w+)').fillna(' ')

    intersection = intersection.rename(columns = {0 : 'Intersection'})
    street       = street.rename(columns = {0 : 'Street'})

    df = pd.concat([df, street, intersection], axis=1)
    return df


def make_time_cols(df: pd.DataFrame, timestamp_column: str):
    """
    return dataframe with time columns formed from string-formatted 
    timestamps
    """
    df[timestamp_column] = pd.to_datetime(df[timestamp_column])
    df["Year"]           = df[timestamp_column].dt.year
    df["Month"]          = df[timestamp_column].dt.month
    df["Day"]            = df[timestamp_column].dt.date
    df["DayOfYear"]      = df[timestamp_column].dt.dayofyear
    df["Hour"]           = df[timestamp_column].dt.hour
    df["Minute"]         = df[timestamp_column].dt.minute
    return df


def make_weekday_to_num(df: pd.DataFrame, column: str):
    """
    return column where string-type weekdays will be replaced
    by numerical values
    """
    week_day_mapper = {
        'Monday': 0, 'Tuesday': 1, 'Wednesday': 2, 'Thursday': 3, 'Friday': 4,
        'Saturday': 5, 'Sunday': 6,
    }

    df["weekdayNumerical"] = df[column].map(week_day_mapper).astype("int64")
    return df


def make_address_encoding_col(df: pd.DataFrame, delimeter: str, column):
    """
    Perform address encoding, where street is 1 and intersection is 0
    """
    address = df[column].apply(lambda record: any([delimeter in record]))
    df['address_encoded'] = np.fromiter(address, dtype=bool).astype(int)

    return df


def make_seasons_col(df: pd.DataFrame, column):
    """
    Form new column called 'seasons' depending on month
    """
    df['Season'] = df[column]
    df['Season'] = df['Season'].map({1 : 1, 2 : 1, 3 : 2, 4 : 2, 5 : 2, 
                                     6 : 3, 7 : 3, 8 : 3, 9 : 4, 10: 4, 11: 4, 12: 1
    })
    return df

time: 35.9 ms (started: 2021-08-03 18:08:49 +00:00)


In [None]:
def get_true_pred_perc(predictions, answers):
    """
    get percentage value of true predictions
    """
    collisions = 0
    for index in range(len(answers)):
        if answers[index] == predictions[index]:
            collisions += 1

    return collisions * 100/len(answers)

def make_submission_csv(data_to_save, columns, filename='submission_cretu.csv'):
    """
    get percentage of true predictions
    """
    submission_dataframe = pd.DataFrame(data=data_to_save, columns=columns)
    submission_dataframe['Id'] = test_df['Id'].astype('int32')
    submission_dataframe.to_csv(filename, index=False)
    

time: 9.03 ms (started: 2021-08-03 18:08:50 +00:00)


# Data transformations

In [None]:
# remove locational outliers
train_df = get_outlier_removed_col(train_df, 'Y', 
                up_threshold=high_y_threshold, low_threshold=low_y_threshold)
train_df = get_outlier_removed_col(train_df, 'X', 
                up_threshold=high_x_threshold, low_threshold=low_x_threshold)

# remove data duplicates
train_df.drop_duplicates(inplace=True)

# remove unnecessary columns
train_df.drop(columns=['Resolution', 'Descript'], inplace=True)

time: 949 ms (started: 2021-08-03 18:08:50 +00:00)


In [None]:
# form time columns extracted from string-formatted timestamps
train_df = make_time_cols(train_df, 'Dates')
test_df =  make_time_cols(test_df,  'Dates')

# make new column that transforms weekdays from string-formatted records in numerical ones
train_df = make_weekday_to_num(train_df, 'DayOfWeek')
test_df = make_weekday_to_num(test_df, 'DayOfWeek')

# form seasons basing on months
train_df = make_seasons_col(train_df, "Month")
test_df = make_seasons_col(test_df, "Month")

time: 2.03 s (started: 2021-08-03 18:08:51 +00:00)


In [None]:
# make harmonic variables that will be necessary for performing effective analysis of periodic features
train_df['harmonicHourX'], train_df['harmonicHourY'] = get_harmonic_tuple(train_df['Hour'])
train_df['harmonicMinuteX'], train_df['harmonicMinuteY'] = get_harmonic_tuple(train_df['Minute'])
train_df['harmonicWeekdayX'], train_df['harmonicWeekdayY'] = get_harmonic_tuple(train_df['weekdayNumerical'])
train_df['harmonicDayX'], train_df['harmonicDayY'] = get_harmonic_tuple(train_df['DayOfYear'])
train_df['harmonicMonthX'], train_df['harmonicMonthY'] = get_harmonic_tuple(train_df['Month'])

test_df['harmonicHourX'], test_df['harmonicHourY'] = get_harmonic_tuple(test_df['Hour'])
test_df['harmonicMinuteX'], test_df['harmonicMinuteY'] = get_harmonic_tuple(test_df['Minute'])
test_df['harmonicWeekdayX'], test_df['harmonicWeekdayY'] = get_harmonic_tuple(test_df['weekdayNumerical'])
test_df['harmonicDayX'], test_df['harmonicDayY'] = get_harmonic_tuple(test_df['DayOfYear'])
test_df['harmonicMonthX'], test_df['harmonicMonthY'] = get_harmonic_tuple(test_df['Month'])

time: 606 ms (started: 2021-08-03 18:08:53 +00:00)


In [None]:
# generate new column with encoded address by values 0 and 1
train_df = make_address_encoding_col(train_df, '/', 'Address')
test_df = make_address_encoding_col(test_df, '/', 'Address')

# generate new column with extracted streets and intersections
train_df = make_streets_intersections_cols(train_df)
test_df = make_streets_intersections_cols(test_df)

time: 9.4 s (started: 2021-08-03 18:08:54 +00:00)


In [None]:
# columns for training and testing info
int_columns =   ['Month', 'DayOfYear', 'Hour', 'Minute', 'weekdayNumerical', 'address_encoded', 'Season', 
                 'Year']
float_columns = ['X', 'Y', 'harmonicHourX', 'harmonicHourY', 'harmonicWeekdayX', 'harmonicWeekdayY', 
                 'harmonicDayX', 'harmonicDayY', 'harmonicMonthX', 'harmonicMonthY', 'harmonicMinuteX', 
                 'harmonicMinuteY']
categorical_columns = ['DayOfWeek', 'PdDistrict', 'Address', 'Street', 'Intersection']
numerical_columns =   int_columns + float_columns


# columns that were filtered from 'bad' by authors opinion
clean_int_columns = ['address_encoded', 'Year']
clean_float_columns = ['X', 'Y', 'harmonicHourX', 'harmonicHourY', 'harmonicWeekdayX', 'harmonicWeekdayY', 
                       'harmonicDayX', 'harmonicDayY', 'harmonicMonthX', 'harmonicMonthY', 'harmonicMinuteX', 
                       'harmonicMinuteY']
clean_numerical_columns =   clean_int_columns + clean_float_columns

# what is the target column for classification
target = 'Category'


time: 9.7 ms (started: 2021-08-03 18:09:04 +00:00)


# Importing additional encoders, scalers...

In [None]:
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import log_loss
import joblib

minmax = MinMaxScaler()
standard_scaler = StandardScaler()

time: 139 ms (started: 2021-08-03 18:09:04 +00:00)


# CatBoost

## Catboost install/import

In [None]:
!pip install catboost
import catboost

Collecting catboost
  Downloading catboost-0.26-cp37-none-manylinux1_x86_64.whl (69.2 MB)
[K     |████████████████████████████████| 69.2 MB 5.0 kB/s 
Installing collected packages: catboost
Successfully installed catboost-0.26
time: 11.1 s (started: 2021-08-03 18:09:05 +00:00)


## CB numerical, minmaxXY, standardization, use of Mihai's parameters

In [None]:
cb = catboost.CatBoostClassifier(l2_leaf_reg = 5, 
                                learning_rate = 0.01, 
                                logging_level = 'Verbose', 
                                iterations = 10000, 
                                depth = 6, 
                                random_seed = 42,
                                loss_function = 'MultiClass',
                                task_type = 'GPU')

# input
x = train_df[numerical_columns]
x[['X', 'Y']] = minmax.fit_transform(x[['X', 'Y']])
x = standard_scaler.fit_transform(x)

# take elements that model must find
y = train_df[target]
y = np.array(y)
y = np.reshape(y, (-1, 1))

# input for testing
x_test = test_df[numerical_columns]
x_test[['X', 'Y']] = minmax.transform(x_test[['X', 'Y']])
x_test = standard_scaler.transform(x_test)

print(">>> starting fit process")
cb.fit(x, y)
print('>>> finished fit')

# probabilities
print('>>> start predict')
probs = cb.predict_proba(x_test)
print(probs)
print('>>> finished predict')


make_submission_csv(probs, train_df[target].unique())
joblib.dump(cb, 'catboost_numerical_minmaxXY_st.pkl')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if sys.path[0] == '':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value[:, i].tolist())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the doc

>>> starting fit process


Custom logger is already specified. Specify more than one logger at same time is not thread safe.

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
5016:	learn: 2.2434555	total: 22m 44s	remaining: 22m 34s
5017:	learn: 2.2434458	total: 22m 44s	remaining: 22m 34s
5018:	learn: 2.2434357	total: 22m 44s	remaining: 22m 34s
5019:	learn: 2.2434148	total: 22m 44s	remaining: 22m 33s
5020:	learn: 2.2433944	total: 22m 45s	remaining: 22m 33s
5021:	learn: 2.2433810	total: 22m 45s	remaining: 22m 33s
5022:	learn: 2.2433697	total: 22m 45s	remaining: 22m 33s
5023:	learn: 2.2433587	total: 22m 45s	remaining: 22m 32s
5024:	learn: 2.2433453	total: 22m 46s	remaining: 22m 32s
5025:	learn: 2.2433306	total: 22m 46s	remaining: 22m 32s
5026:	learn: 2.2433183	total: 22m 46s	remaining: 22m 32s
5027:	learn: 2.2433088	total: 22m 46s	remaining: 22m 31s
5028:	learn: 2.2432939	total: 22m 47s	remaining: 22m 31s
5029:	learn: 2.2432795	total: 22m 47s	remaining: 22m 31s
5030:	learn: 2.2432686	total: 22m 47s	remaining: 22m 30s
5031:	learn: 2.2432565	total: 22m 48s	remaining: 22m 30s
5032:	learn: 2.2432402	

['catboost_numerical_minmaxXY_st.pkl']

time: 49min 33s (started: 2021-08-03 11:30:03 +00:00)


## CB numerical, minmaxXY, Mihai's params

In [None]:
cb = catboost.CatBoostClassifier(l2_leaf_reg = 5, 
                                            learning_rate = 0.01, 
                                            logging_level = 'Verbose', 
                                            iterations = 10000, 
                                            depth = 6, 
                                            random_seed = 42,
                                            loss_function = 'MultiClass',
                                            task_type = 'GPU')

# input
x = train_df[numerical_columns]
x[['X', 'Y']] = minmax.fit_transform(x[['X', 'Y']])


# take elements that model must find
y = train_df[target]
y = np.array(y)
y = np.reshape(y, (-1, 1))

# input for testing
x_test = test_df[numerical_columns]
x_test[['X', 'Y']] = minmax.transform(x_test[['X', 'Y']])

print(">>> starting fit process")
cb.fit(x, y)
print('>>> finished fit')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if sys.path[0] == '':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value[:, i].tolist())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the doc

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
5002:	learn: 2.2431963	total: 23m 19s	remaining: 23m 17s
5003:	learn: 2.2431833	total: 23m 19s	remaining: 23m 17s
5004:	learn: 2.2431694	total: 23m 19s	remaining: 23m 16s
5005:	learn: 2.2431562	total: 23m 20s	remaining: 23m 16s
5006:	learn: 2.2431426	total: 23m 20s	remaining: 23m 16s
5007:	learn: 2.2431255	total: 23m 20s	remaining: 23m 16s
5008:	learn: 2.2431116	total: 23m 20s	remaining: 23m 15s
5009:	learn: 2.2431012	total: 23m 21s	remaining: 23m 15s
5010:	learn: 2.2430802	total: 23m 21s	remaining: 23m 15s
5011:	learn: 2.2430595	total: 23m 21s	remaining: 23m 14s
5012:	learn: 2.2430465	total: 23m 21s	remaining: 23m 14s
5013:	learn: 2.2430400	total: 23m 22s	remaining: 23m 14s
5014:	learn: 2.2430287	total: 23m 22s	remaining: 23m 14s
5015:	learn: 2.2430161	total: 23m 22s	remaining: 23m 13s
5016:	learn: 2.2430017	total: 23m 23s	remaining: 23m 13s
5017:	learn: 2.2429911	total: 23m 23s	remaining: 23m 13s
5018:	learn: 2.2429823	

In [None]:
# probabilities
print('>>> start predict')
nonst_probs = cb.predict_proba(x_test)
print(nonst_probs)
print('>>> finished predict')


make_submission_csv(nonst_probs, train_df[target].unique(), 'submission_catboost_minmaxXY.csv')
joblib.dump(cb, 'catboost_numerical_minmaxXY.pkl')

>>> start predict
[[2.90383203e-03 1.03957556e-01 4.70198109e-06 ... 2.44135966e-01
  3.20250190e-02 2.54791762e-02]
 [1.08105769e-03 3.91794201e-02 1.64309347e-06 ... 7.46804512e-03
  8.70963612e-02 3.91088821e-02]
 [5.31373927e-03 1.43007521e-01 1.21768262e-05 ... 3.48321723e-02
  2.25765725e-02 7.08900345e-03]
 ...
 [9.84866119e-04 8.83046496e-02 1.61092680e-03 ... 1.52289421e-01
  1.40587410e-02 1.50976832e-03]
 [1.96686157e-03 8.61718337e-02 6.29889198e-03 ... 7.21309147e-02
  2.02668940e-02 4.92676688e-03]
 [7.53801366e-04 3.62339308e-02 1.68928506e-03 ... 7.43703316e-02
  6.57891178e-03 1.31859263e-03]]
>>> finished predict


['catboost_numerical_minmaxXY.pkl']

time: 4min 1s (started: 2021-08-03 15:21:21 +00:00)


In [None]:
catboost_pure = catboost.CatBoostClassifier(l2_leaf_reg = 5, 
                                            learning_rate = 0.01, 
                                            logging_level = 'Verbose', 
                                            iterations = 10000, 
                                            depth = 6, 
                                            random_seed = 42,
                                            loss_function = 'MultiClass',
                                            task_type = 'GPU')

# input
x = train_df[numerical_columns]

# take elements that model must find
y = train_df[target]
y = np.array(y)
y = np.reshape(y, (-1, 1))

# input for testing
x_test = test_df[numerical_columns]

print(">>> starting fit process")
catboost_pure.fit(x, y)
print('>>> finished fit')

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
5002:	learn: 2.2430003	total: 23m 22s	remaining: 23m 21s
5003:	learn: 2.2429886	total: 23m 23s	remaining: 23m 20s
5004:	learn: 2.2429747	total: 23m 23s	remaining: 23m 20s
5005:	learn: 2.2429619	total: 23m 23s	remaining: 23m 20s
5006:	learn: 2.2429480	total: 23m 23s	remaining: 23m 20s
5007:	learn: 2.2429303	total: 23m 24s	remaining: 23m 19s
5008:	learn: 2.2429198	total: 23m 24s	remaining: 23m 19s
5009:	learn: 2.2429083	total: 23m 24s	remaining: 23m 19s
5010:	learn: 2.2428904	total: 23m 25s	remaining: 23m 18s
5011:	learn: 2.2428707	total: 23m 25s	remaining: 23m 18s
5012:	learn: 2.2428602	total: 23m 25s	remaining: 23m 18s
5013:	learn: 2.2428530	total: 23m 25s	remaining: 23m 18s
5014:	learn: 2.2428364	total: 23m 26s	remaining: 23m 17s
5015:	learn: 2.2428251	total: 23m 26s	remaining: 23m 17s
5016:	learn: 2.2428094	total: 23m 26s	remaining: 23m 17s
5017:	learn: 2.2427980	total: 23m 27s	remaining: 23m 16s
5018:	learn: 2.2427859	

In [None]:
# probabilities
print('>>> start predict')
pure_probs = cb.predict_proba(x_test)
print(pure_probs)
print('>>> finished predict')

make_submission_csv(pure_probs, train_df[target].unique(), 'submission_catboost_pure.csv')
joblib.dump(cb, 'catboost_numerical_pure.pkl')

>>> start predict
[[1.81895066e-03 3.35735146e-02 9.46516818e-06 ... 4.98892025e-02
  1.05505448e-02 1.68419276e-03]
 [1.25897265e-03 4.22056505e-02 1.36738369e-05 ... 1.21733360e-02
  4.04549520e-02 2.84666204e-03]
 [1.98786950e-03 6.01468241e-02 1.04786175e-05 ... 1.42357883e-02
  1.37545470e-02 2.42465272e-03]
 ...
 [8.51399796e-04 4.15282436e-02 2.39499585e-03 ... 2.90996580e-02
  6.22667488e-03 1.31264054e-03]
 [8.51399796e-04 4.15282436e-02 2.39499585e-03 ... 2.90996580e-02
  6.22667488e-03 1.31264054e-03]
 [8.51399796e-04 4.15282436e-02 2.39499585e-03 ... 2.90996580e-02
  6.22667488e-03 1.31264054e-03]]
>>> finished predict


['catboost_numerical_pure.pkl']

time: 3min 42s (started: 2021-08-03 16:18:16 +00:00)


## Experiment: what if we'll change columns for catboost?

In [None]:
catboost_experimental_columns = ['Month', 'DayOfYear', 'Hour', 'Minute', 'weekdayNumerical', 
                                 'address_encoded', 'Season', 'Year', 'X', 'Y']

time: 2.1 ms (started: 2021-08-03 18:09:16 +00:00)


In [None]:
catboost_pure = catboost.CatBoostClassifier(l2_leaf_reg = 5, 
                                            learning_rate = 0.01, 
                                            logging_level = 'Verbose', 
                                            iterations = 10000, 
                                            depth = 6, 
                                            random_seed = 42,
                                            loss_function = 'MultiClass',
                                            task_type = 'GPU')

# input
x = train_df[catboost_experimental_columns]

# take elements that model must find
y = train_df[target]
y = np.array(y)
y = np.reshape(y, (-1, 1))

# input for testing
x_test = test_df[catboost_experimental_columns]

print(">>> starting fit process")
catboost_pure.fit(x, y)
print('>>> finished fit')

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
5002:	learn: 2.2521447	total: 11m 37s	remaining: 11m 36s
5003:	learn: 2.2521265	total: 11m 37s	remaining: 11m 36s
5004:	learn: 2.2521091	total: 11m 37s	remaining: 11m 36s
5005:	learn: 2.2520986	total: 11m 37s	remaining: 11m 35s
5006:	learn: 2.2520880	total: 11m 37s	remaining: 11m 35s
5007:	learn: 2.2520723	total: 11m 37s	remaining: 11m 35s
5008:	learn: 2.2520610	total: 11m 38s	remaining: 11m 35s
5009:	learn: 2.2520506	total: 11m 38s	remaining: 11m 35s
5010:	learn: 2.2520335	total: 11m 38s	remaining: 11m 35s
5011:	learn: 2.2520175	total: 11m 38s	remaining: 11m 35s
5012:	learn: 2.2520071	total: 11m 38s	remaining: 11m 34s
5013:	learn: 2.2519994	total: 11m 38s	remaining: 11m 34s
5014:	learn: 2.2519914	total: 11m 38s	remaining: 11m 34s
5015:	learn: 2.2519818	total: 11m 39s	remaining: 11m 34s
5016:	learn: 2.2519641	total: 11m 39s	remaining: 11m 34s
5017:	learn: 2.2519512	total: 11m 39s	remaining: 11m 34s
5018:	learn: 2.2519394	

In [None]:
# probabilities
print('>>> start predict')
exp_probs = catboost_pure.predict_proba(x_test)
print(exp_probs)
print('>>> finished predict')

make_submission_csv(exp_probs, train_df[target].unique(), 'submission_catboost_experiment.csv')
joblib.dump(catboost_pure, 'catboost_numerical_experiment.pkl')

>>> start predict
[[2.05357974e-03 9.12145802e-02 8.51108354e-06 ... 2.27713238e-01
  3.14590346e-02 2.64109259e-02]
 [6.87573242e-04 4.21458787e-02 2.10910459e-06 ... 6.87423207e-03
  9.32046378e-02 3.43879210e-02]
 [6.53816568e-03 1.54048318e-01 5.48491145e-05 ... 3.27693176e-02
  2.32942106e-02 7.75652749e-03]
 ...
 [1.32813974e-03 9.23965250e-02 2.89189408e-03 ... 2.46655335e-01
  1.44023321e-02 1.39074447e-03]
 [2.34604477e-03 7.96629170e-02 9.57376405e-03 ... 9.77152094e-02
  2.26475936e-02 3.35820580e-03]
 [7.93527697e-04 3.38387385e-02 4.65080616e-03 ... 1.31894909e-01
  9.17035073e-03 2.04425099e-03]]
>>> finished predict


['catboost_numerical_experiment.pkl']

time: 832 ms (started: 2021-08-03 18:41:04 +00:00)


## What if experimental with standardization?

In [None]:
catboost_pure = catboost.CatBoostClassifier(l2_leaf_reg = 5, 
                                            learning_rate = 0.01, 
                                            logging_level = 'Verbose', 
                                            iterations = 10000, 
                                            depth = 6, 
                                            random_seed = 42,
                                            loss_function = 'MultiClass',
                                            task_type = 'GPU')

# input
x = train_df[catboost_experimental_columns]
x = standard_scaler.fit_transform(x)

# take elements that model must find
y = train_df[target]
y = np.array(y)
y = np.reshape(y, (-1, 1))

# input for testing
x_test = test_df[catboost_experimental_columns]

print(">>> starting fit process")
catboost_pure.fit(x, y)
print('>>> finished fit')

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
5002:	learn: 2.2523981	total: 11m 41s	remaining: 11m 40s
5003:	learn: 2.2523853	total: 11m 41s	remaining: 11m 40s
5004:	learn: 2.2523714	total: 11m 41s	remaining: 11m 40s
5005:	learn: 2.2523578	total: 11m 41s	remaining: 11m 40s
5006:	learn: 2.2523495	total: 11m 41s	remaining: 11m 39s
5007:	learn: 2.2523341	total: 11m 42s	remaining: 11m 39s
5008:	learn: 2.2523218	total: 11m 42s	remaining: 11m 39s
5009:	learn: 2.2523078	total: 11m 42s	remaining: 11m 39s
5010:	learn: 2.2522957	total: 11m 42s	remaining: 11m 39s
5011:	learn: 2.2522769	total: 11m 42s	remaining: 11m 39s
5012:	learn: 2.2522681	total: 11m 42s	remaining: 11m 39s
5013:	learn: 2.2522577	total: 11m 42s	remaining: 11m 39s
5014:	learn: 2.2522450	total: 11m 43s	remaining: 11m 38s
5015:	learn: 2.2522369	total: 11m 43s	remaining: 11m 38s
5016:	learn: 2.2522295	total: 11m 43s	remaining: 11m 38s
5017:	learn: 2.2522185	total: 11m 43s	remaining: 11m 38s
5018:	learn: 2.2522063	

In [None]:
# probabilities
print('>>> start predict')
exp_st_probs = catboost_pure.predict_proba(x_test)
print(exp_st_probs)
print('>>> finished predict')

make_submission_csv(exp_st_probs, train_df[target].unique(), 'submission_catboost_experiment_st.csv')
joblib.dump(catboost_pure, 'catboost_numerical_experiment_st.pkl')

>>> start predict
[[1.88421455e-03 2.63544743e-02 8.04119644e-06 ... 1.92900900e-02
  7.42241076e-03 1.11631717e-03]
 [6.63237677e-04 1.41210680e-02 5.69133914e-06 ... 2.23197724e-02
  1.11436034e-02 1.07718874e-03]
 [1.88421455e-03 2.63544743e-02 8.04119644e-06 ... 1.92900900e-02
  7.42241076e-03 1.11631717e-03]
 ...
 [6.20686337e-04 1.04394249e-02 8.18348106e-06 ... 4.11712599e-03
  1.28749807e-02 9.57942219e-04]
 [6.20686337e-04 1.04394249e-02 8.18348106e-06 ... 4.11712599e-03
  1.28749807e-02 9.57942219e-04]
 [6.20686337e-04 1.04394249e-02 8.18348106e-06 ... 4.11712599e-03
  1.28749807e-02 9.57942219e-04]]
>>> finished predict


['catboost_numerical_experiment_st.pkl']

time: 3min 3s (started: 2021-08-03 19:10:48 +00:00)
