In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder

In [None]:
# Read the competition's train and test splits from the Kaggle input directory.
train_dataset, test_dataset = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/tabular-playground-series-feb-2021/train.csv",
        "../input/tabular-playground-series-feb-2021/test.csv",
    )
)

In [None]:
# (rows, columns) of the training frame.
train_dataset.shape

In [None]:
# Preview the first rows of the training frame.
train_dataset.head()

In [None]:
# Per-column flag: does the column contain any missing value?
# Both frames are combined into one table because a notebook cell only displays
# its last expression; written as two bare statements, the train result was
# computed and then silently discarded.
# Columns present in only one frame show NaN for the other.
pd.DataFrame({
    'train': train_dataset.isnull().any(),
    'test': test_dataset.isnull().any(),
})

In [None]:
# Frequency of each category in cat0. Uses the Series method because the
# top-level `pd.value_counts` function is deprecated in recent pandas releases.
train_dataset['cat0'].value_counts()

In [None]:
def convert_to_categorical(train_dataset, test_dataset, cat_cols=None):
    """Label-encode the categorical columns of both frames with a shared mapping.

    Both frames are modified in place and also returned.

    Parameters
    ----------
    train_dataset, test_dataset : pandas.DataFrame
        Frames that both contain every column listed in ``cat_cols``.
    cat_cols : iterable of str, optional
        Columns to encode. Defaults to every column of ``train_dataset`` whose
        name starts with ``'cat'``. Selecting by name rather than by the former
        positional slice ``columns[1:11]`` keeps the result correct when the
        function is re-run after the ``id`` column has been dropped (the slice
        would then skip ``cat0`` and encode a continuous column instead).

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_dataset, test_dataset)`` with the selected columns replaced
        by integer codes.
    """
    if cat_cols is None:
        cat_cols = [col for col in train_dataset.columns if str(col).startswith('cat')]

    for col in cat_cols:
        le = LabelEncoder()
        # Fit on the labels of both frames so a category that appears only in
        # the test set does not make `transform` raise. When the test set has
        # no unseen labels the codes are the same as fitting on train alone.
        le.fit(pd.concat([train_dataset[col], test_dataset[col]], ignore_index=True))

        train_dataset[col] = le.transform(train_dataset[col])
        test_dataset[col] = le.transform(test_dataset[col])
    return train_dataset, test_dataset

In [None]:
# Replace the categorical columns of both frames with integer codes.
train_dataset, test_dataset = convert_to_categorical(train_dataset, test_dataset)

In [None]:
# List the column names; later cells select columns by position.
train_dataset.columns

In [None]:
# Drop the row identifier so it is not used as a feature.
# Reassigning with `errors='ignore'` (instead of `inplace=True`) keeps the cell
# re-runnable: the original raised KeyError on a second execution because 'id'
# had already been removed.
train_dataset = train_dataset.drop(columns=['id'], errors='ignore')
test_dataset = test_dataset.drop(columns=['id'], errors='ignore')

In [None]:
# Summary statistics for the columns of the training frame.
train_dataset.describe()

In [None]:
def scatter_plot(x,y,c=None,cmap=None):
    """Draw a titled, labelled scatter plot of two named series with pyplot.

    Parameters
    ----------
    x, y : dict
        Each must provide ``'name'`` (used in the title and axis label) and
        ``'values'`` (the data to plot).
    c : optional
        Forwarded to ``plt.scatter`` as ``c`` (point colours).
    cmap : optional
        Forwarded to ``plt.scatter`` as ``cmap``; when given, a colorbar is
        added to the plot.
    """
    # Updates the global matplotlib defaults (figure size and dpi).
    plt.rcParams.update({'figure.figsize':(10,8),'figure.dpi':100})
    # The original line ended in `cmap=cmap])`; the stray `]` was a syntax error
    # that prevented the function from being defined at all.
    plt.scatter(x['values'],y['values'],c=c,cmap=cmap)
    plt.title(f"Correlation between: {x['name']} - {y['name']}")
    plt.xlabel(f"{x['name']}")
    plt.ylabel(f"{y['name']}")
    if cmap is not None: plt.colorbar()

In [None]:
# Scatter cont0 against the target, colouring each point by its target value.
x = dict(name="cont0", values=train_dataset['cont0'])
y = dict(name="target", values=train_dataset['target'])
scatter_plot(x, y, c=train_dataset['target'], cmap="Spectral")

Before moving on further, let's see if the input variables are correlated with one another. We will plot the heatmap for all the variables.

In [None]:
from scipy.stats import pearsonr

# Pearson correlation between the cont13 and cont6 columns of the train frame.
pearsonr(train_dataset['cont13'], train_dataset['cont6'])

In [None]:
# Correlation matrices of the columns from position 10 onwards, train and test.
train_corr = train_dataset.iloc[:, 10:].corr()
test_corr = test_dataset.iloc[:, 10:].corr()

# Two stacked panels: train on top, test below.
plt.rcParams.update({'figure.figsize': (15, 20), 'figure.dpi': 100})
fig, (ax1, ax2) = plt.subplots(2, 1)
sns.heatmap(
    train_corr,
    ax=ax1,
    annot=True,
    fmt='.1f',
    cmap='coolwarm',
    robust=True,
    cbar_kws={"orientation": "horizontal"},
)
sns.heatmap(
    test_corr,
    ax=ax2,
    annot=True,
    fmt='.1f',
    cmap='coolwarm',
    robust=True,
    cbar_kws={"orientation": "horizontal"},
)

Heatmaps for both the training and testing datasets are plotted here. The behaviour looks similar in both cases, which suggests that the training and testing data follow the same distribution, so there is not much to worry about on that front.

My plan was to remove from the input set any variables whose pairwise correlation exceeds a high threshold such as 0.8. However, the highest correlation between any two variables here is only 0.7, so I don't think there is any point in removing one of them.

Most of the categorical variables have a correlation of 0 with every other variable. I think that, apart from cat5 and cat7, all the categorical variables have 0 correlation, and even cat5 and cat7 show 0 correlation with some variables. We will still pick these variables and see how they behave. I'm picking the pairs cat5 and cont12, and cat5 and cont5, as these pairs have the higher correlations of 0.7 and 0.6. Next, I'm going to plot scatter plots of these against the target variable.

In [None]:
# Joint plot of the encoded cat4 column against the target.
sns.jointplot(data=train_dataset, x='cat4', y='target')

In [None]:
import scipy.stats as stats
# Point-biserial correlation between the encoded cat5 column and cont5.
# NOTE(review): point-biserial correlation is defined for a binary variable, while
# the notes in this notebook mention cat5 taking the values 0-3 — confirm that this
# statistic is appropriate here.
stats.pointbiserialr(train_dataset.loc[:,'cat5'], train_dataset.loc[:,'cont5'])

In [None]:

# One joint plot per continuous feature against the target, coloured by cat5.
# The features are selected by name so that every `cont*` column is covered: the
# original `range(0,13)` stopped at cont12 and skipped cont13, a column this
# notebook uses elsewhere.
for feature in [col for col in train_dataset.columns if str(col).startswith('cont')]:
    sns.jointplot(data=train_dataset, x=feature, y='target', hue='cat5')

In [None]:
# Joint plots of cont12 and cont5 against the target, coloured by the cat5 code.
sns.jointplot(data=train_dataset, x='cont12', y='target', hue='cat5')
sns.jointplot(data=train_dataset, x='cont5', y='target', hue='cat5')

Here I'm plotting joint plots, i.e. a scatter plot of a continuous variable against the target variable, together with their frequency distributions. I've also added a categorical variable to the picture, so that the relation between three variables can be seen at once. The colours on the plot represent the categories. For example, the points with high cont12 values belong to categories 2 and 3 of cat5, whereas lower cont12 values belong to categories 0 or 1. This shows that there is a trend or pattern between the two variables (cat5 and cont12). A similar behaviour is found for the other pair, cont5 and cat5, plotted next.

The plots above also show the frequency distributions of the variables. For cont12, most of the values seem to lie in the range 0.2-0.4, whereas for cont5 most of the values lie in the range 0.2-0.6. Let's confirm this for cont12 by counting the rows that fall in that range.

In [None]:
# Rows whose cont12 lies strictly between 0.2 and 0.4; the author notes that this
# is almost 50% of the records in the train dataset.
cont12 = train_dataset['cont12']
in_band = (cont12 > 0.2) & (cont12 < 0.4)
train_dataset[in_band].shape

In [None]:
# Distribution of the target: density-normalised histogram with a KDE overlay.
# `sns.distplot` is deprecated; `histplot(..., kde=True, stat="density")` is the
# replacement and is drawn explicitly on the axes created here.
f, ax = plt.subplots(figsize=(10,7))
sns.histplot(train_dataset['target'], kde=True, stat="density", ax=ax)
ax.xaxis.grid(False)
ax.set(ylabel="values")
ax.set(xlabel="target")
ax.set(title="target distribution")
sns.despine(trim=True, left=True)
plt.show()

In [None]:
# Number of columns from position 10 to the end of the train frame.
len(train_dataset.columns[10:])

In [None]:
# Train-vs-test histogram (with KDE) for each column in positions 10..23, one
# panel per feature. The axes returned by `plt.subplots` are used directly
# instead of being re-created with `plt.subplot`, and grid cells left without a
# feature are hidden rather than shown as empty axes.
fig, axes = plt.subplots(5, 3, figsize=(20, 20))
panels = axes.ravel()
features = train_dataset.columns[10:24]
for panel, feature in zip(panels, features):
    sns.histplot(train_dataset[feature], color="blue", kde=True, label="train", ax=panel)
    sns.histplot(test_dataset[feature], color="red", kde=True, label="test", ax=panel)
    panel.set_xlabel(feature)
    panel.legend()
for panel in panels[len(features):]:
    panel.set_visible(False)
plt.show()

In [None]:
# Correlation matrix of the columns in positions 10..23, drawn as a square heatmap.
corr = train_dataset.iloc[:, 10:24].corr()
corr_fig, corr_ax = plt.subplots(figsize=(10, 10))
sns.heatmap(corr, ax=corr_ax, square=True, cmap="Blues")

In [None]:
# to check outliers
for col in train_dataset.columns[10:24]:
    plt.boxplot([train_dataset[col],test_dataset[col]], labels=['train','test'])
    plt.title(col)
    plt.legend()
    plt.show()

In [None]:
# Box plot of the target column.
plt.boxplot(train_dataset['target'])

In [None]:
def replace_outliers(data, columns=None, whisker=1.5):
    """Replace IQR outliers in the selected columns with the column median.

    A value is treated as an outlier when it lies below ``Q1 - whisker * IQR``
    or above ``Q3 + whisker * IQR``, where Q1/Q3 are the 25th/75th percentiles
    of that column and ``IQR = Q3 - Q1``. The quartiles and the median are
    computed from ``data`` itself, before any replacement in that column.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to clean. It is modified in place.
    columns : iterable, optional
        Column labels to process. Defaults to ``data.columns[10:]``, i.e. every
        column from position 10 onwards (the original hard-coded behaviour).
    whisker : float, optional
        Multiplier applied to the IQR to set the outlier fences (default 1.5).

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, after replacement.
    """
    if columns is None:
        columns = data.columns[10:]

    for col in columns:
        q1, q3 = data[col].quantile([0.25, 0.75])
        iqr = q3 - q1
        median_ = data[col].median()
        lower_fence = q1 - whisker * iqr
        upper_fence = q3 + whisker * iqr
        is_outlier = (data[col] < lower_fence) | (data[col] > upper_fence)
        data.loc[is_outlier, col] = median_
    return data

In [None]:
# Replace IQR outliers with the column median in every column from position 10 on.
# Each frame uses its own quartiles and median.
# NOTE(review): for the train frame this range appears to include the last column,
# which is later used as the target — confirm that altering the target is intended.
train_dataset = replace_outliers(train_dataset)
test_dataset = replace_outliers(test_dataset)

In [None]:
# to check outliers
for col in train_dataset.columns[10:24]:
    plt.boxplot([train_dataset[col],test_dataset[col]], labels=['train','test'])
    plt.title(col)
    plt.legend()
    plt.show()

In [None]:
# Train-vs-test histogram (with KDE) for each column in positions 10..23, one
# panel per feature. The axes returned by `plt.subplots` are used directly
# instead of being re-created with `plt.subplot`, and grid cells left without a
# feature are hidden rather than shown as empty axes.
fig, axes = plt.subplots(5, 3, figsize=(20, 20))
panels = axes.ravel()
features = train_dataset.columns[10:24]
for panel, feature in zip(panels, features):
    sns.histplot(train_dataset[feature], color="blue", kde=True, label="train", ax=panel)
    sns.histplot(test_dataset[feature], color="red", kde=True, label="test", ax=panel)
    panel.set_xlabel(feature)
    panel.legend()
for panel in panels[len(features):]:
    panel.set_visible(False)
plt.show()

In [None]:
# Distribution of the target: density-normalised histogram with a KDE overlay.
# `sns.distplot` is deprecated; `histplot(..., kde=True, stat="density")` is the
# replacement and is drawn explicitly on the axes created here.
f, ax = plt.subplots(figsize=(10, 8))
sns.histplot(train_dataset['target'], color="b", kde=True, stat="density", ax=ax)
ax.xaxis.grid(False)
ax.set(ylabel="Values")
ax.set(xlabel="Target")
sns.despine(trim=True, left=True)
plt.show()

In [None]:
import lightgbm as lgb

In [None]:
# Features are every column except the last; the last column is the target.
X, Y = train_dataset.iloc[:, :-1], train_dataset.iloc[:, -1]

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the rows for validation; the fixed random_state makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(X,Y,test_size=0.2, random_state=22)

In [None]:
# LightGBM regressor with fixed hyper-parameters, fitted on the training split.
# `max_dept` was a typo for `max_depth`: the misspelled name is not a LightGBM
# parameter, so the intended setting was never applied under its real name.
# NOTE(review): `min_child_samples` and `min_data_in_leaf` look like aliases of the
# same LightGBM setting but are given different values (20 vs 100) — confirm which
# one is intended.
LGB = lgb.LGBMRegressor(random_state=33, n_estimators=4800, min_data_per_group=5, boosting_type='gbdt',
 num_leaves=246, max_depth=-1, learning_rate=0.005, subsample_for_bin=200000,
 lambda_l1= 1.074622455507616e-05, lambda_l2= 2.0521330798729704e-06, n_jobs=-1, cat_smooth=1.0, 
 importance_type='split', metric='rmse', min_child_samples=20, min_gain_to_split=0.0, feature_fraction=0.5, 
 bagging_freq=6, min_sum_hessian_in_leaf=0.001, min_data_in_leaf=100, bagging_fraction=0.82063411)

LGB.fit(X_train, Y_train)

In [None]:
# Predictions for the held-out validation split.
pred_LGB = LGB.predict(X_test)

In [None]:
from sklearn.metrics import mean_squared_error

In [None]:
# NOTE: despite the variable name, this is the ROOT mean squared error (the square
# root of the MSE) on the held-out split.
mse = np.sqrt(mean_squared_error(Y_test, pred_LGB))

In [None]:
# Display the validation score (square root of the mean squared error).
mse

In [None]:
# Predictions for the competition test frame.
test_pred = LGB.predict(test_dataset)

In [None]:
# Re-read the raw test file to recover the `id` column (it was dropped from
# `test_dataset` earlier), pair each id with its prediction and write the result.
test1_dataset = pd.read_csv("../input/tabular-playground-series-feb-2021/test.csv")
output = test1_dataset[['id']].assign(target=test_pred)
output.to_csv('submission.csv', index=False)

In [None]:
# Submit submission.csv to the competition through the Kaggle command-line tool.
# NOTE(review): "Message" looks like a placeholder submission description.
!kaggle competitions submit -c tabular-playground-series-feb-2021 -f submission.csv -m "Message"