In [None]:
# Upload the data archives (train.csv.zip / test.csv.zip) from the local machine
# into the Colab working directory; the result of the upload call is kept in
# `uploaded`.
from google.colab import files
uploaded = files.upload()

Saving test.csv.zip to test.csv.zip
Saving train.csv.zip to train.csv.zip


In [None]:
# Install and enable ipython-autotime so that every executed cell reports its
# run time. NOTE(review): the package version is not pinned.
!pip install ipython-autotime
%load_ext autotime

Collecting ipython-autotime
  Downloading ipython_autotime-0.3.1-py2.py3-none-any.whl (6.8 kB)
Installing collected packages: ipython-autotime
Successfully installed ipython-autotime-0.3.1
time: 769 µs (started: 2021-08-04 08:13:29 +00:00)


In [None]:
def read_binary_file(path):
    """
    return the complete content of the file at `path` as bytes
    """
    handle = open(path, 'rb')
    try:
        content = handle.read()
    finally:
        # always release the file handle, even when reading fails
        handle.close()
    return content

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from google.colab import files
from io import BytesIO
import os

# Load the train / test archives that were uploaded into the working directory.
archive_names = {'train': 'train.csv.zip', 'test': 'test.csv.zip'}

# Fail early with an actionable message: previously a missing archive left the
# dataframe set to None and the cell died with an AttributeError on `.shape`.
missing_archives = [name for name in archive_names.values() if not os.path.isfile(name)]
if missing_archives:
    raise FileNotFoundError('missing uploaded archive(s): ' + ', '.join(missing_archives))

train_df = pd.read_csv(BytesIO(read_binary_file(archive_names['train'])), compression='zip')
test_df = pd.read_csv(BytesIO(read_binary_file(archive_names['test'])), compression='zip')

# print shape of data to make sure that it was read
print(train_df.shape)
print(test_df.shape)

(878049, 9)
(884262, 7)


In [None]:
# Bounding box for the X / Y coordinates:
# (low_x, low_y) is the lower-left corner, (high_x, high_y) the upper-right one.
low_x_threshold, low_y_threshold = -122.52, 36.65
high_x_threshold, high_y_threshold = -122.36, 40

# Data transformation functions

In [None]:
def get_harmonic_tuple(value, period=24):
    """
    remaps cyclical data from line axis to the circular axis ->
    important for correct data interpretation by regression

    The value is converted to the angle ``2 * pi * value / period`` and the
    cosine and sine of that angle are returned, so both ends of the cycle
    become neighbours.

    value:  scalar, numpy array or pandas Series with positions inside the
            cycle; the argument itself is never modified
    period: length of one full cycle, in the same units as `value`

    returns the tuple (cos(angle), sin(angle)), shaped like `value`
    """
    # Build a new object instead of `value *= ...`: the in-place form overwrote
    # the caller's Series / array and raised on integer numpy arrays.
    angle = value * (2 * np.pi / period)
    return np.cos(angle), np.sin(angle)


def get_outlier_removed_col(df: pd.DataFrame, column, up_threshold, low_threshold):
    """
    keep only the rows whose value in `column` lies strictly between
    `low_threshold` and `up_threshold`; both bounds are excluded
    """
    values = df[column]
    below_upper_bound = values < up_threshold
    above_lower_bound = values > low_threshold
    return df[below_upper_bound & above_lower_bound]


def get_count_table(df: pd.DataFrame, category, time):
    """
    return dataframe of counting elements by groups in pair with time categories

    The result has the columns [category, time, 'Count'], one row per
    (category, time) combination, ordered by the group keys.
    The input dataframe is left untouched.
    """
    # groupby().size() counts the rows per group directly; the previous version
    # aliased `df` and wrote a helper "Count" column into the caller's frame.
    return (
        df.groupby([category, time])
        .size()
        .reset_index(name='Count')
    )


def get_cols_names_below_threshold(df: pd.DataFrame, threshold):
    """
    Return array of the distinct 'Category' values that have at least one row
    whose 'Count' is strictly below the threshold
    """
    is_below_threshold = df["Count"] < threshold
    return df.loc[is_below_threshold, 'Category'].unique()


def get_count_table_by_street(df: pd.DataFrame):
    """
    Form new dataframe that shows crime frequency depending on street or intersection by categories

    Rows are the 'Category' values, columns are the distinct 'Street' values
    (a street equal to 0 is skipped); a cell holds how often the category
    occurs on that street and is NaN when it never does.
    An empty dataframe is returned when there is no street to report.
    """
    per_street_counts = []
    for street in df['Street'].unique():
        if street != 0:
            counts = df[df['Street'] == street]['Category'].value_counts()
            per_street_counts.append(pd.Series(counts, name=street))

    # pd.concat refuses an empty list, so handle "no streets" explicitly
    if not per_street_counts:
        return pd.DataFrame({})

    # one concat at the end instead of re-copying the growing frame per street
    return pd.concat(per_street_counts, axis=1)


def get_nan_records(df: pd.DataFrame):
    """
    select the rows that contain a missing value in at least one column
    """
    has_missing_value = df.isna().any(axis='columns')
    return df.loc[has_missing_value]

In [None]:
def make_streets_intersections_cols(df: pd.DataFrame):
    """
    Return a copy of the dataframe extended with the columns 'Street' and
    'Intersection', both extracted from the 'Address' column by regex.
    A row that does not match a pattern gets a single space in that column.
    """
    address = df['Address']

    street = (
        address.str.extract(r'\d+\s\w+\s\w+\s(\w+\s\w+)', expand=False)
        .fillna(' ')
        .rename('Street')
    )
    intersection = (
        address.str.extract(r'(\w+\s\w+\s[/]\s\w+\s\w+)', expand=False)
        .fillna(' ')
        .rename('Intersection')
    )

    return pd.concat([df, street, intersection], axis=1)


def make_time_cols(df: pd.DataFrame, timestamp_column: str):
    """
    Parse `timestamp_column` to datetimes and derive the calendar columns
    Year, Month, Day (the calendar date), DayOfYear, Hour and Minute from it.

    The dataframe is modified in place and also returned.
    """
    df[timestamp_column] = pd.to_datetime(df[timestamp_column])
    stamp_parts = df[timestamp_column].dt

    # (new column, datetime attribute) pairs, added in this order
    derived_columns = (
        ("Year", "year"),
        ("Month", "month"),
        ("Day", "date"),
        ("DayOfYear", "dayofyear"),
        ("Hour", "hour"),
        ("Minute", "minute"),
    )
    for new_column, attribute in derived_columns:
        df[new_column] = getattr(stamp_parts, attribute)
    return df


def make_weekday_to_num(df: pd.DataFrame, column: str):
    """
    Add the int64 column 'weekdayNumerical' that encodes the weekday names
    found in `column` as Monday = 0 ... Sunday = 6.

    The dataframe is modified in place and also returned.
    """
    ordered_days = ('Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday',
                    'Saturday', 'Sunday')
    day_to_number = {day: number for number, day in enumerate(ordered_days)}

    numbers = df[column].map(day_to_number)
    df["weekdayNumerical"] = numbers.astype("int64")
    return df


def make_address_encoding_col(df: pd.DataFrame, delimeter: str, column):
    """
    Perform address encoding: add the integer column 'address_encoded' that is
    1 when the value in `column` contains `delimeter` and 0 otherwise.

    With the '/' delimiter used in this notebook an intersection
    ("A ST / B ST") is therefore encoded as 1 and a plain street address as 0.
    Missing addresses are encoded as 0. `delimeter` is matched literally,
    not as a regular expression.

    The dataframe is modified in place and also returned.
    """
    contains_delimeter = df[column].str.contains(delimeter, regex=False, na=False)
    df['address_encoded'] = contains_delimeter.astype(int)

    return df


def make_seasons_col(df: pd.DataFrame, column):
    """
    Add the column 'Season', derived from the month numbers in `column`:
    1 = Dec-Feb, 2 = Mar-May, 3 = Jun-Aug, 4 = Sep-Nov.

    The dataframe is modified in place and also returned.
    """
    months_by_season = {
        1: (12, 1, 2),
        2: (3, 4, 5),
        3: (6, 7, 8),
        4: (9, 10, 11),
    }
    month_to_season = {
        month: season
        for season, months in months_by_season.items()
        for month in months
    }
    df['Season'] = df[column].map(month_to_season)
    return df

In [None]:
def get_true_pred_perc(predictions, answers):
    """
    get percentage value of true predictions

    predictions: predicted labels (list, numpy array or pandas Series)
    answers:     expected labels; a column vector of shape (n, 1) is accepted

    Elements are compared by position. Returns a float in [0, 100];
    0.0 is returned for empty `answers`.
    Raises IndexError when `predictions` has fewer elements than `answers`.
    """
    # Flatten to plain arrays so the comparison is positional: indexing a
    # pandas Series with 0..n-1 looks up labels and breaks on a custom index.
    expected = np.asarray(answers).ravel()
    predicted = np.asarray(predictions).ravel()

    total = expected.size
    if total == 0:
        # nothing to score; avoids the division by zero
        return 0.0
    if predicted.size < total:
        raise IndexError('predictions has fewer elements than answers')

    matches = int(np.count_nonzero(predicted[:total] == expected))
    return matches * 100 / total

def make_submission_csv(data_to_save, columns, filename='submission_cretu.csv'):
    """
    Write a submission file: `data_to_save` becomes a dataframe with the given
    `columns`, an 'Id' column taken from the global `test_df` (cast to int32)
    is added, and the result is saved to `filename` as CSV without the index.
    """
    submission = pd.DataFrame(data=data_to_save, columns=columns).assign(
        Id=test_df['Id'].astype('int32')
    )
    submission.to_csv(filename, index=False)
    

# Data preparation and transformation

Based on most of the classification results, weather data does not help to raise the accuracy of the model. Categorical data does not raise classification accuracy either. The data that shows the best results is numerical: a combination of the int and float columns.

In [None]:
# remove locational outliers
train_df = get_outlier_removed_col(train_df, 'Y',
                up_threshold=high_y_threshold, low_threshold=low_y_threshold)
train_df = get_outlier_removed_col(train_df, 'X',
                up_threshold=high_x_threshold, low_threshold=low_x_threshold)

# Remove exact duplicate rows first (while 'Resolution' and 'Descript' still
# take part in the comparison), then drop those two unused columns.
# Re-binding instead of inplace=True avoids mutating a filtered slice, and
# errors='ignore' keeps the cell re-runnable once the columns are gone.
train_df = (
    train_df
    .drop_duplicates()
    .drop(columns=['Resolution', 'Descript'], errors='ignore')
)

In [None]:
# Derive the calendar features for both dataframes. The order matters:
# the seasons are computed from the 'Month' column created by make_time_cols.
prepared_frames = []
for frame in (train_df, test_df):
    # time columns extracted from the string-formatted timestamps
    frame = make_time_cols(frame, 'Dates')
    # weekday names -> numbers
    frame = make_weekday_to_num(frame, 'DayOfWeek')
    # seasons based on months
    frame = make_seasons_col(frame, "Month")
    prepared_frames.append(frame)

train_df, test_df = prepared_frames

In [None]:
# make harmonic variables that will be necessary for performing effective analysis of periodic features
# Every feature is encoded with its own cycle length; encoding all of them with
# the default 24-unit period does not close the circle for minutes, weekdays,
# days of the year or months.
harmonic_features = (
    # (output column prefix, source column, period)
    ('harmonicHour', 'Hour', 24),
    ('harmonicMinute', 'Minute', 60),
    ('harmonicWeekday', 'weekdayNumerical', 7),
    ('harmonicDay', 'DayOfYear', 365.25),  # mean year length, covers leap years
    ('harmonicMonth', 'Month', 12),
)

for frame in (train_df, test_df):
    for prefix, source_column, period in harmonic_features:
        # hand over a float copy so the source column itself is never altered
        source_values = frame[source_column].astype('float64')
        frame[prefix + 'X'], frame[prefix + 'Y'] = get_harmonic_tuple(source_values, period=period)

In [None]:
# For both dataframes: first add the 0 / 1 'address_encoded' flag ('/' marks an
# intersection), then add the extracted 'Street' and 'Intersection' columns.
train_df, test_df = (
    make_streets_intersections_cols(make_address_encoding_col(frame, '/', 'Address'))
    for frame in (train_df, test_df)
)

# Column specifications by type, information (for models' fit_transform)

In [None]:
# feature columns grouped by data type
int_columns = [
    'Month', 'DayOfYear', 'Hour', 'Minute',
    'weekdayNumerical', 'address_encoded', 'Season', 'Year',
]
float_columns = [
    'X', 'Y',
    'harmonicHourX', 'harmonicHourY',
    'harmonicWeekdayX', 'harmonicWeekdayY',
    'harmonicDayX', 'harmonicDayY',
    'harmonicMonthX', 'harmonicMonthY',
    'harmonicMinuteX', 'harmonicMinuteY',
]
categorical_columns = ['DayOfWeek', 'PdDistrict', 'Address', 'Street', 'Intersection']
numerical_columns = int_columns + float_columns

# reduced column sets: the columns the author considers 'bad' are left out
clean_int_columns = ['address_encoded', 'Season', 'Year']
clean_float_columns = list(float_columns)  # same float features as above
clean_categorical_columns = ['PdDistrict', 'Street', 'Intersection']
clean_numerical_columns = clean_int_columns + clean_float_columns

# target column for classification
target = 'Category'


# Check data integrity and validate transformations

**Заметка для себя**: смотри внимательно на последовательность преобразований данных в ячейке, один раз попал на косяк неправильного преобразования сезонов, потому что трансформация не успела еще сформировать месяца, на основании которых определяется сезон

In [None]:
# Print every record that has a NaN in any column; an "Empty DataFrame" in the
# output means the corresponding dataset has no missing values.
nan_reports = (
    ('\n\ttraining df:\n', train_df),
    ('\n\ttest df:\n', test_df),
)
for header, frame in nan_reports:
    print(header + str(get_nan_records(frame)))


	training df:
Empty DataFrame
Columns: [Dates, Category, DayOfWeek, PdDistrict, Address, X, Y, Year, Month, Day, DayOfYear, Hour, Minute, weekdayNumerical, Season, harmonicHourX, harmonicHourY, harmonicMinuteX, harmonicMinuteY, harmonicWeekdayX, harmonicWeekdayY, harmonicDayX, harmonicDayY, harmonicMonthX, harmonicMonthY, address_encoded, Street, Intersection]
Index: []

	test df:
Empty DataFrame
Columns: [Id, Dates, DayOfWeek, PdDistrict, Address, X, Y]
Index: []
time: 640 ms (started: 2021-08-03 15:23:33 +00:00)


# Training models and results

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split
import joblib
import lightgbm as lgb

# Logistic Regression

In [None]:
import warnings
# NOTE(review): this silences every warning from here on, including the
# pandas / scikit-learn warnings raised by the cells below.
warnings.filterwarnings("ignore")

# Shared preprocessing objects. The cells below call fit_transform on them
# again and again, so every call replaces the previously fitted state.
#  note: if you want ordinal encoder to be able to encode unknown classes write in constructor:
# "handle_unknown='use_encoded_value', unknown_value=-1"
ordinal_encoder = OrdinalEncoder()
standard_scaler = StandardScaler()
minmax = MinMaxScaler()


## LR numerical data, minmaxXY, standardization

**THIS ONE SHOWS THE BEST LOG LOSS RESULT**

In [None]:
log_model = LogisticRegression(multi_class='multinomial', solver='saga')

# Scale the features: min-max on the coordinates, then standardization of all
# columns. An explicit copy is taken because assigning into a bare column
# selection of train_df is chained assignment (SettingWithCopyWarning).
# NOTE(review): both scalers are fitted on the full dataset before the split,
# so the hold-out rows influence the scaling parameters.
x = train_df[numerical_columns].copy()
x[['X', 'Y']] = minmax.fit_transform(x[['X', 'Y']])
x = standard_scaler.fit_transform(x)

# encode the target labels as numeric codes; the estimator expects a 1-D target
y = train_df[target].to_numpy().reshape(-1, 1)
y = ordinal_encoder.fit_transform(y).ravel()

# split data into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=10, stratify=y)
log_model.fit(x_train, y_train)

# report the multi-class log loss on the hold-out split
print('>>> log loss coefficient = ' + str(log_loss(y_test, log_model.predict_proba(x_test))))
# >>> log loss coefficient = 2.522534169330668

>>> log loss coefficient = 2.522534169330668


## LR numerical + categorical, minmaxXY, standardization, ordinal encoding

In [None]:
log_cat_model = LogisticRegression(multi_class='multinomial', solver='saga')

# Numerical features: min-max on the coordinates, then standardization.
# An explicit copy is taken because assigning into a bare column selection of
# train_df is chained assignment (SettingWithCopyWarning).
# NOTE(review): scalers and encoder are fitted on the full dataset before the split.
x_num = train_df[numerical_columns].copy()
x_num[['X', 'Y']] = minmax.fit_transform(x_num[['X', 'Y']])
x_num = standard_scaler.fit_transform(x_num)

# categorical features: ordinal codes, appended to the numerical block
x_cat = ordinal_encoder.fit_transform(train_df[categorical_columns])
x = np.concatenate((x_num, x_cat), axis=1)

# encode the target labels as numeric codes; the estimator expects a 1-D target
y = train_df[target].to_numpy().reshape(-1, 1)
y = ordinal_encoder.fit_transform(y).ravel()

# split data into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=10, stratify=y)
log_cat_model.fit(x_train, y_train)

# report the multi-class log loss on the hold-out split
print('>>> log loss coefficient = ' + str(log_loss(y_test, log_cat_model.predict_proba(x_test))))
# >>> log loss coefficient = 2.6964637018643476

>>> log loss coefficient = 2.6964637018643476


## LR numerical, minmaxXY

In [None]:
# NOTE(review): this re-binds `log_cat_model`, replacing the model fitted on
# numerical + categorical data, although only numerical columns are used here.
log_cat_model = LogisticRegression(multi_class='multinomial', solver='saga')

# Numerical features with min-max scaled coordinates only (no standardization).
# An explicit copy is taken because assigning into a bare column selection of
# train_df is chained assignment (SettingWithCopyWarning).
x = train_df[numerical_columns].copy()
x[['X', 'Y']] = minmax.fit_transform(x[['X', 'Y']])

# encode the target labels as numeric codes; the estimator expects a 1-D target
y = train_df[target].to_numpy().reshape(-1, 1)
y = ordinal_encoder.fit_transform(y).ravel()

# split data into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=10, stratify=y)
log_cat_model.fit(x_train, y_train)

# report the multi-class log loss on the hold-out split
print('>>> log loss coefficient = ' + str(log_loss(y_test, log_cat_model.predict_proba(x_test))))
# >>> log loss coefficient = 2.592240271616436

>>> log loss coefficient = 2.592240271616436


# Random forest

## RFC minmaxXY

**THIS ONE HAS THE BEST RESULT**

In [None]:
rfc = RandomForestClassifier(max_depth = 15, min_samples_leaf = 6, max_features = 'auto',
                             min_samples_split = 4, min_weight_fraction_leaf = 0.0, n_estimators = 700,
                             n_jobs = -1, random_state = 42, verbose = 2)

# Numerical features with min-max scaled coordinates. An explicit copy is taken
# because assigning into a bare column selection of train_df is chained
# assignment (SettingWithCopyWarning).
x = train_df[numerical_columns].copy()
x[['X', 'Y']] = minmax.fit_transform(x[['X', 'Y']])

# encode the target labels as numeric codes; the estimator expects a 1-D target
y = train_df[target].to_numpy().reshape(-1, 1)
y = ordinal_encoder.fit_transform(y).ravel()

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=10, stratify=y)
rfc.fit(x_train, y_train)

# current code execution out:
# [Parallel(n_jobs=-1)]: Done 700 out of 700 | elapsed: 13.2min finished

# RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
#                        criterion='gini', max_depth=15, max_features='auto',
#                        max_leaf_nodes=None, max_samples=None,
#                        min_impurity_decrease=0.0, min_impurity_split=None,
#                        min_samples_leaf=6, min_samples_split=4,
#                        min_weight_fraction_leaf=0.0, n_estimators=700,
#                        n_jobs=-1, oob_score=False, random_state=42, verbose=2,
#                        warm_start=False)

In [None]:
# Multi-class log loss of `rfc` on the hold-out split; x_test / y_test must
# still be the ones created in the cell that fitted `rfc`.
print('>>> log loss coefficient = ' + str(log_loss(y_test, rfc.predict_proba(x_test))))
# recorded result: >>> log loss coefficient = 2.320314742290441

[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  37 tasks      | elapsed:    1.7s
[Parallel(n_jobs=2)]: Done 158 tasks      | elapsed:    7.1s
[Parallel(n_jobs=2)]: Done 361 tasks      | elapsed:   16.1s
[Parallel(n_jobs=2)]: Done 644 tasks      | elapsed:   28.5s
[Parallel(n_jobs=2)]: Done 700 out of 700 | elapsed:   31.0s finished


>>> log loss coefficient = 2.320314742290441


## RFC minmaxXY, standardization

In [None]:
st_rfc = RandomForestClassifier(max_depth = 15, min_samples_leaf = 6, max_features = 'auto',
                             min_samples_split = 4, min_weight_fraction_leaf = 0.0, n_estimators = 700,
                             n_jobs = -1, random_state = 42, verbose = 2)

# Numerical features: min-max on the coordinates, then standardization.
# An explicit copy is taken because assigning into a bare column selection of
# train_df is chained assignment (SettingWithCopyWarning).
# NOTE(review): both scalers are fitted on the full dataset before the split.
x = train_df[numerical_columns].copy()
x[['X', 'Y']] = minmax.fit_transform(x[['X', 'Y']])
x = standard_scaler.fit_transform(x)

# encode the target labels as numeric codes; the estimator expects a 1-D target
y = train_df[target].to_numpy().reshape(-1, 1)
y = ordinal_encoder.fit_transform(y).ravel()

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=10, stratify=y)
st_rfc.fit(x_train, y_train)

# current code execution out:
# [Parallel(n_jobs=-1)]: Done 700 out of 700 | elapsed: 13.8min finished

# RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
#                        criterion='gini', max_depth=15, max_features='auto',
#                        max_leaf_nodes=None, max_samples=None,
#                        min_impurity_decrease=0.0, min_impurity_split=None,
#                        min_samples_leaf=6, min_samples_split=4,
#                        min_weight_fraction_leaf=0.0, n_estimators=700,
#                        n_jobs=-1, oob_score=False, random_state=42, verbose=2,
#                        warm_start=False)

In [None]:
# Multi-class log loss of `st_rfc` on the hold-out split; x_test / y_test must
# still be the ones created in the cell that fitted `st_rfc`.
print('>>> log loss coefficient = ' + str(log_loss(y_test, st_rfc.predict_proba(x_test))))
# recorded result: >>> log loss coefficient = 2.3204917991506244

[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  37 tasks      | elapsed:    1.5s
[Parallel(n_jobs=2)]: Done 158 tasks      | elapsed:    6.4s
[Parallel(n_jobs=2)]: Done 361 tasks      | elapsed:   14.2s
[Parallel(n_jobs=2)]: Done 644 tasks      | elapsed:   25.1s
[Parallel(n_jobs=2)]: Done 700 out of 700 | elapsed:   27.2s finished


>>> log loss coefficient = 2.3204917991506244


# Light GBM

## LGBM numerical + categorical, minmaxXY

In [None]:
# Numerical + ordinal-encoded categorical features, coordinates min-max scaled.
# An explicit copy is taken because assigning into a bare column selection of
# train_df is chained assignment (SettingWithCopyWarning).
x = train_df[numerical_columns + categorical_columns].copy()
x[categorical_columns] = ordinal_encoder.fit_transform(x[categorical_columns])
x[['X', 'Y']] = minmax.fit_transform(x[['X', 'Y']])

# Encode the target labels as numeric codes. ravel() yields a real 1-D array;
# `.flat` only returned an iterator object over the column vector.
y = train_df[target].to_numpy().reshape(-1, 1)
y = ordinal_encoder.fit_transform(y).ravel()

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=10, stratify=y)
train_data = lgb.Dataset(x_train, label=y_train)

# number of classes taken from the encoder that was just fitted on the target
parameters = {'num_class': len(ordinal_encoder.categories_[0]), 'objective': 'multiclass',
              'metric': 'multi_logloss',
              'device': 'CPU', 'boosting_type': 'gbdt', 'learning_rate': 0.03}
num_rounds = 1000
lgb_model = lgb.train(parameters, train_data, num_rounds)

preds = lgb_model.predict(x_test)
print('>>> log loss coefficient = ' + str(log_loss(y_test, preds)))
# >>> log loss coefficient = 2.5086015150608554

>>> log loss coefficient = 2.5086015150608554


## LGBM numerical, minmaxXY, standardization

**THIS VERSION SHOWS BEST RESULT FROM ALL MODELS**

In [None]:
# Numerical features: min-max on the coordinates, then standardization.
# An explicit copy is taken because assigning into a bare column selection of
# train_df is chained assignment (SettingWithCopyWarning).
# NOTE(review): both scalers are fitted on the full dataset before the split.
x = train_df[numerical_columns].copy()
x[['X', 'Y']] = minmax.fit_transform(x[['X', 'Y']])
x = standard_scaler.fit_transform(x)

# Encode the target labels as numeric codes. ravel() yields a real 1-D array;
# `.flat` only returned an iterator object over the column vector.
y = train_df[target].to_numpy().reshape(-1, 1)
y = ordinal_encoder.fit_transform(y).ravel()

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=10, stratify=y)
train_data = lgb.Dataset(x_train, label=y_train)

# number of classes taken from the encoder that was just fitted on the target
parameters = {'num_class': len(ordinal_encoder.categories_[0]), 'objective': 'multiclass',
              'metric': 'multi_logloss',
              'device': 'CPU', 'boosting_type': 'gbdt', 'learning_rate': 0.01,
              'verbose': True
            #   'gpu_platform_id': 1, 'gpu_device_id': 1
              }
num_rounds = 1500
lgb_st_model = lgb.train(parameters, train_data, num_rounds)

preds = lgb_st_model.predict(x_test)
print('>>> log loss coefficient = ' + str(log_loss(y_test, preds)))
# latest recorded run:  >>> log loss coefficient = 2.2741577849846024
# earlier recorded run: >>> log loss coefficient = 2.299090797046126
>>> log loss coefficient = 2.2741577849846024


## LGBM experimental features

In [None]:
# Feature set for the experimental LightGBM runs: the raw time / address
# columns plus the coordinates, without the harmonic columns.
lgbm_experimental_columns = ['Month', 'DayOfYear', 'Hour', 'Minute', 'weekdayNumerical', 
                            'address_encoded', 'Season', 'Year', 'X', 'Y']

In [None]:
# experimental feature set, used as is (no scaling)
x = train_df[lgbm_experimental_columns]

# Encode the target labels as numeric codes. ravel() yields a real 1-D array;
# `.flat` only returned an iterator object over the column vector.
y = train_df[target].to_numpy().reshape(-1, 1)
y = ordinal_encoder.fit_transform(y).ravel()

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=10, stratify=y)
train_data = lgb.Dataset(x_train, label=y_train)

# number of classes taken from the encoder that was just fitted on the target
parameters = {'num_class': len(ordinal_encoder.categories_[0]), 'objective': 'multiclass',
              'metric': 'multi_logloss',
              'device': 'CPU', 'boosting_type': 'gbdt', 'learning_rate': 0.03}
num_rounds = 1000
# NOTE(review): this re-binds `lgb_st_model`, replacing the model trained on
# the standardized numerical columns.
lgb_st_model = lgb.train(parameters, train_data, num_rounds)

preds = lgb_st_model.predict(x_test)
print('>>> log loss coefficient = ' + str(log_loss(y_test, preds)))
# >>> log loss coefficient = 2.3044181128810974

>>> log loss coefficient = 2.3044181128810974


## LGBM experimental, standardization

In [None]:
# experimental feature set, standardized
# NOTE(review): the scaler is fitted on the full dataset before the split.
x = train_df[lgbm_experimental_columns]
x = standard_scaler.fit_transform(x)

# Encode the target labels as numeric codes. ravel() yields a real 1-D array;
# `.flat` only returned an iterator object over the column vector.
y = train_df[target].to_numpy().reshape(-1, 1)
y = ordinal_encoder.fit_transform(y).ravel()

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=10, stratify=y)
train_data = lgb.Dataset(x_train, label=y_train)

# number of classes taken from the encoder that was just fitted on the target
parameters = {'num_class': len(ordinal_encoder.categories_[0]), 'objective': 'multiclass',
              'metric': 'multi_logloss',
              'device': 'CPU', 'boosting_type': 'gbdt', 'learning_rate': 0.03}
num_rounds = 1000
lgb_st_exp_model = lgb.train(parameters, train_data, num_rounds)

exp_preds = lgb_st_exp_model.predict(x_test)
print('>>> log loss coefficient = ' + str(log_loss(y_test, exp_preds)))


>>> log loss coefficient = 2.3183813338173564


# K-nearest neighbor

## KNN numerical minmaxXY

In [None]:
# Reduced feature set for KNN, chosen because of its heavy computation time.
# The minute harmonics ('harmonicMinuteX', 'harmonicMinuteY') are left out.
knn_numerical_columns = [
    'address_encoded', 'Year',
    'X', 'Y',
    'harmonicHourX', 'harmonicHourY',
    'harmonicWeekdayX', 'harmonicWeekdayY',
    'harmonicMonthX', 'harmonicMonthY',
]

time: 3.94 ms (started: 2021-08-03 15:24:05 +00:00)


In [None]:
knn = KNeighborsClassifier(n_neighbors=1000, metric='euclidean')

# Reduced KNN feature set with min-max scaled coordinates. An explicit copy is
# taken because assigning into a bare column selection of train_df is chained
# assignment (SettingWithCopyWarning).
x = train_df[knn_numerical_columns].copy()
x[['X', 'Y']] = minmax.fit_transform(x[['X', 'Y']])

# encode the target labels as numeric codes; the estimator expects a 1-D target
y = train_df[target].to_numpy().reshape(-1, 1)
y = ordinal_encoder.fit_transform(y).ravel()

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=10, stratify=y)

knn.fit(x_train, y_train)


KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='euclidean',
                     metric_params=None, n_jobs=None, n_neighbors=1000, p=2,
                     weights='uniform')

In [None]:
# class probabilities of `knn` for the hold-out split, kept in their own cell
predictions = knn.predict_proba(x_test)


In [None]:
# multi-class log loss of the `knn` probabilities on the hold-out split
print('>>> log loss coefficient = ' + str(log_loss(y_test, predictions)))
# recorded result: >>> log loss coefficient = 2.6214143842491984

>>> log loss coefficient = 2.6214143842491984


## KNN numerical minmaxXY, standardization

In [None]:
knn_st = KNeighborsClassifier(n_neighbors=1000, metric='euclidean')

# Reduced KNN feature set: min-max on the coordinates, then standardization.
# An explicit copy is taken because assigning into a bare column selection of
# train_df is chained assignment (SettingWithCopyWarning).
# NOTE(review): both scalers are fitted on the full dataset before the split.
x = train_df[knn_numerical_columns].copy()
x[['X', 'Y']] = minmax.fit_transform(x[['X', 'Y']])
x = standard_scaler.fit_transform(x)

# encode the target labels as numeric codes; the estimator expects a 1-D target
y = train_df[target].to_numpy().reshape(-1, 1)
y = ordinal_encoder.fit_transform(y).ravel()

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=10, stratify=y)

knn_st.fit(x_train, y_train)


KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='euclidean',
                     metric_params=None, n_jobs=None, n_neighbors=1000, p=2,
                     weights='uniform')

In [None]:
# class probabilities of `knn_st` for the hold-out split, kept in their own cell
st_predictions = knn_st.predict_proba(x_test)


In [None]:
# multi-class log loss of the `knn_st` probabilities on the hold-out split
print('>>> log loss coefficient = ' + str(log_loss(y_test, st_predictions)))
# recorded result: >>> log loss coefficient = 2.598232639007905

>>> log loss coefficient = 2.598232639007905


## KNN Manhattan numerical, minmaxXY

In [None]:
knn_man = KNeighborsClassifier(n_neighbors=1000, metric='manhattan')

# Reduced KNN feature set with min-max scaled coordinates. An explicit copy is
# taken because assigning into a bare column selection of train_df is chained
# assignment (SettingWithCopyWarning).
x = train_df[knn_numerical_columns].copy()
x[['X', 'Y']] = minmax.fit_transform(x[['X', 'Y']])

# encode the target labels as numeric codes; the estimator expects a 1-D target
y = train_df[target].to_numpy().reshape(-1, 1)
y = ordinal_encoder.fit_transform(y).ravel()

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=10, stratify=y)

knn_man.fit(x_train, y_train)


KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='manhattan',
                     metric_params=None, n_jobs=None, n_neighbors=1000, p=2,
                     weights='uniform')

In [None]:
# class probabilities of `knn_man` for the hold-out split, kept in their own cell
man_predictions = knn_man.predict_proba(x_test)


In [None]:
# multi-class log loss of the `knn_man` probabilities on the hold-out split
print('>>> log loss coefficient = ' + str(log_loss(y_test, man_predictions)))
# recorded result: >>> log loss coefficient = 2.593904040150504

>>> log loss coefficient = 2.593904040150504


## KNN Manhattan numerical minmaxXY, standardization

**THIS MODEL HAS THE BEST RESULT IN KNN**

In [None]:
knn_man_st = KNeighborsClassifier(n_neighbors=2000, metric='manhattan')

# Min-max on the coordinates, then standardization. An explicit copy is taken
# because assigning into a bare column selection of train_df is chained
# assignment (SettingWithCopyWarning).
# NOTE(review): this cell uses the full `numerical_columns` list, while the
# other KNN cells use the reduced `knn_numerical_columns` - confirm it is intended.
# NOTE(review): both scalers are fitted on the full dataset before the split.
x = train_df[numerical_columns].copy()
x[['X', 'Y']] = minmax.fit_transform(x[['X', 'Y']])
x = standard_scaler.fit_transform(x)

# encode the target labels as numeric codes; the estimator expects a 1-D target
y = train_df[target].to_numpy().reshape(-1, 1)
y = ordinal_encoder.fit_transform(y).ravel()

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=10, stratify=y)

knn_man_st.fit(x_train, y_train)


NameError: ignored

In [None]:
# class probabilities of `knn_man_st` for the hold-out split, kept in their own cell
man_st_predictions = knn_man_st.predict_proba(x_test)


In [None]:
# multi-class log loss of the `knn_man_st` probabilities on the hold-out split
print('>>> log loss coefficient = ' + str(log_loss(y_test, man_st_predictions)))
# recorded result: >>> log loss coefficient = 2.55 (this was with 1000 neighbors)