In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

In [None]:
# Load the competition train/test tables.
# Forward slashes are used because they resolve on Windows, Linux and macOS,
# whereas backslash-separated relative paths are only valid on Windows.
train_data = pd.read_csv('../kaggle_data/train.csv')
test_data = pd.read_csv('../kaggle_data/test.csv')
train_data.head()

In [None]:
# Distribution of the target column: histogram on the left, box plot on the right.
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(12, 3))
hist_ax, box_ax = ax
sns.histplot(x=train_data['target'], ax=hist_ax)
sns.boxplot(x=train_data['target'], ax=box_ax)
plt.show()

In [None]:
# Keep only the rows whose target has an absolute z-score strictly below 3.
target_zscores = stats.zscore(train_data['target'])
train_data = train_data[np.abs(target_zscores) < 3.0]

In [None]:
# Same two views of the target column, now drawn from the z-score-filtered frame.
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(12, 3))
hist_ax, box_ax = ax
sns.histplot(x=train_data['target'], ax=hist_ax)
sns.boxplot(x=train_data['target'], ax=box_ax)
plt.show()

In [None]:
# Names of the continuous feature columns: 'cont0' through 'cont13'.
numerical_features = [f'cont{i}' for i in range(14)]

# Names of the categorical feature columns: 'cat0' through 'cat9'.
categorical_features = [f'cat{i}' for i in range(10)]

def plot_numerical(x_train, x_test, ax):
    """Overlay density histograms (with KDE curves) of two samples on one axes.

    Parameters
    ----------
    x_train : values drawn first, using seaborn's default color.
    x_test : values drawn second, in green.
    ax : axes object handed to both ``sns.histplot`` calls.
    """
    # Settings shared by both layers; only the color differs between them.
    shared = dict(ax=ax, kde=True, stat='density', alpha=0.25)
    sns.histplot(x_train, **shared)
    sns.histplot(x_test, color='green', **shared)
    

def calc_frequency(d,f):
    """Return the relative frequency of each distinct value in column ``f`` of ``d``.

    The result is a two-column DataFrame: the first column, named ``f``, holds the
    distinct values; the second, named ``'count'``, holds each value's share of
    the counted rows (fractions, despite the column name).
    """
    counts = d[f].value_counts()
    total = counts.sum().astype('float')
    freq = (counts / total).reset_index()
    freq.columns = [f, 'count']
    return freq

def plot_categorical(df_train, df_test, f=None, ax=None):
    """Draw grouped bars comparing the value frequencies of column ``f`` in two frames.

    Parameters
    ----------
    df_train : DataFrame whose frequencies are labelled ``'train'``.
    df_test : DataFrame whose frequencies are labelled ``'test'``.
    f : name of the column to compare; it is passed to ``calc_frequency`` and
        used as the x-axis variable, so it must name a column of both frames.
    ax : axes object forwarded to ``sns.barplot``.
    """
    # Use the function's own parameters. The previous body read the names
    # ``train_df`` / ``test_df``, which are not defined in this function.
    df = pd.concat([
        calc_frequency(df_train, f).assign(_set='train'),
        calc_frequency(df_test, f).assign(_set='test')
    ])

    # 'count' is the frequency column produced by calc_frequency.
    sns.barplot(data=df, x=f, y='count', hue='_set', ax=ax)
    
def plot_categorical_boxplot(df, f=None, ax=None):
    """Draw a box plot of the ``target`` column of ``df`` grouped by column ``f``.

    ``ax`` is forwarded to ``sns.boxplot`` unchanged.
    """
    sns.boxplot(
        data=df,
        x=f,
        y='target',
        palette="vlag",
        width=.6,
        ax=ax,
    )

In [None]:
# One stacked panel per continuous feature, overlaying the train and test distributions.
n = len(numerical_features)
fig, axs = plt.subplots(nrows=n, ncols=1, figsize=(10, 5 * n))

for ax, f in zip(axs, numerical_features):
    plot_numerical(train_data[f], test_data[f], ax)

In [None]:
# One stacked panel per categorical feature: target box plot grouped by category value.
n = len(categorical_features)
fig, axs = plt.subplots(nrows=n, ncols=1, figsize=(10, 5 * n))

for ax, f in zip(axs, categorical_features):
    plot_categorical_boxplot(train_data, f=f, ax=ax)

In [None]:
# Ordinal encoding for single-letter category codes: 'A' -> 0.0 up to 'Z' -> 1.0,
# evenly spaced. The alphabet is generated instead of typed out so that no letter
# can be duplicated or skipped, and the divisor is derived from its length
# rather than hard-coded.
alpha = [chr(code) for code in range(ord('A'), ord('Z') + 1)]
alpha_conversion = {
    letter: index / (len(alpha) - 1)
    for index, letter in enumerate(alpha)
}
alpha_conversion

In [None]:
# Replace the letter codes in every categorical column with their numeric value.
#
# The columns are rebuilt and attached with ``assign`` instead of calling
# ``train_data[col].replace(..., inplace=True)``: that form mutates a temporary
# Series obtained by chained indexing, which triggers pandas' chained-assignment
# warnings and leaves the frame unchanged when Copy-on-Write is active.
encoded_columns = {
    # infer_objects() turns an all-float object column into a real float column
    # and leaves anything it cannot convert untouched.
    col: train_data[col].replace(alpha_conversion).infer_objects()
    for col in categorical_features
}
train_data = train_data.assign(**encoded_columns)
train_data.head()

In [None]:
# Model inputs: every column except 'target' and 'id'.
x = train_data.drop(columns=["target","id"])
# Model output: the 'target' column.
y = train_data["target"]

In [None]:
import catboost as ctb

# CatBoost regressor limited to 8750 iterations; no other constructor options are set.
regressor_params = {'iterations': 8750}
model = ctb.CatBoostRegressor(**regressor_params)
model.fit(x, y)

In [35]:
# Feature columns fed to the model, in the order cat0..cat9 followed by cont0..cont13.
feature_columns = categorical_features + numerical_features

# Build the prediction frame as a new object with the categorical letters
# replaced by their numeric codes. ``assign`` is used instead of
# ``vector_data[col].replace(..., inplace=True)`` because the latter mutates a
# temporary Series taken from a slice of ``test_data``: pandas warns about it
# and, with Copy-on-Write active, the frame is left unencoded.
vector_data = test_data[feature_columns].assign(**{
    # infer_objects() turns an all-float object column into a real float column
    # and leaves anything it cannot convert untouched.
    col: test_data[col].replace(alpha_conversion).infer_objects()
    for col in categorical_features
})

preds = model.predict(vector_data)

In [36]:
# Pair every test-row id with its prediction, as [id, prediction] rows.
# zip() walks both sequences in step, so no manual counter is needed; the
# previous counter was named ``x`` and overwrote the training feature matrix
# ``x``, which broke re-running the model-fitting cell after this one.
result = [
    [int(row_id), prediction]
    for row_id, prediction in zip(test_data['id'], preds)
]

In [37]:
from datetime import datetime

# Timestamp (day-month-year_hour-minute-second) embedded in the output file name,
# so runs started at different seconds write to different files.
dt_string = datetime.now().strftime("%d-%m-%Y_%H-%M-%S")

sub = pd.DataFrame(data=result, columns=["id", "target"])

# Forward slashes keep the relative path valid on Windows, Linux and macOS;
# a backslash-separated path only resolves on Windows.
sub.to_csv(f'../kaggle_data/{dt_string}_submission.csv', index=False)