# Introduction

This notebook continues from [Let's get the party started](https://www.kaggle.com/ludovicoristori/vsb-data-prep-let-s-get-the-party-started), my previous kernel dedicated to data preparation.

Many sections of this kernel are **still in progress**. Changes from the previous version(s): **removed the time samples from the datasets**, replacing them with the amplitudes and phases of the first harmonics.

Please take this **really as is**. Any suggestions or pointers are welcome.

## Updates

What I have understood probably too late:
* Signals don't matter. Errors matter. I.e.: the main harmonics are useless.
* The three phases come together: it's better to make predictions based on id_measurement than on signal_id (for example, putting the same 0 in all three phases in the test set).

The best thing to do would be to trash this notebook and restart with a new one. But I have no time for that. Let's try to apply some partial updates to this kernel and see what happens.

# Import

In [None]:
import numpy as np
import pandas as pd 

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
plt.style.use('ggplot')

In [None]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
import gc
import numpy.fft as ft
import pyarrow.parquet as pq

In [None]:
import os
print(os.listdir("../input"))

In [None]:
import random
random.seed(12345)

In [None]:
df_train = pd.read_csv('../input/vsb-data-prep-let-s-get-the-party-started/df_train.csv')
df_train.iloc[:,0:12].head()

In [None]:
df_test = pd.read_csv('../input/vsb-data-prep-let-s-get-the-party-started/df_test.csv')
df_test.iloc[:,0:12].head()

In [None]:
sample_submission = pd.read_csv('../input/vsb-power-line-fault-detection/sample_submission.csv')
sample_submission.head()

# EDA 2.0 & Feature Engineering

Let's look at the data we have got restarting from the original data.

In [None]:
%%time
train_orig = pq.read_pandas('../input/vsb-power-line-fault-detection/train.parquet').to_pandas()

In [None]:
train_orig.iloc[:,0:12].head()

In [None]:
s0_orig=pd.DataFrame(columns=['x','y'])
s0_orig['x']=train_orig.index
s0_orig['y']=train_orig.iloc[:,0]

In [None]:
S0=ft.rfft(s0_orig['y'])
S0[0:10]

In [None]:
S0.size

In [None]:
num_harm=5
# Low-pass filter in the frequency domain: keep only the first num_harm rFFT
# bins of S0 and leave every other bin at zero.
# np.complex128 replaces the np.complex_ alias, which was removed in NumPy 2.0.
S0_Filtered=np.zeros(S0.size, dtype=np.complex128)
S0_Filtered[0:num_harm]=S0[0:num_harm]
S0_Filtered[0:5]

In [None]:
invS0_Filtered=ft.irfft(S0_Filtered)
invS0_Filtered

In [None]:
fig,ax=plt.subplots(1,1,figsize=(12,6))
ax.plot(s0_orig['y'],color='lightblue')
ax.plot(invS0_Filtered,color='blue')

Ok, let's look at what we get from df_train. Some useful functions:

In [None]:
def ComplexForm(A,p):
    """Convert a polar pair (amplitude ``A``, phase ``p`` in radians) to a complex number.

    Returns ``complex(A*cos(p), A*sin(p))``.
    """
    real_part = A*np.cos(p)
    imag_part = A*np.sin(p)
    return complex(real_part, imag_part)

In [None]:
def GetFFTCoeff(signal_id,df,num_harm,start_col):
    """Rebuild complex FFT coefficients from (amplitude, phase) column pairs.

    Reads ``int(num_harm/2)`` consecutive (amplitude, phase) pairs from one
    row of ``df``, starting at column position ``start_col``, and converts
    each pair to ``A*cos(P) + 1j*A*sin(P)``.

    Args:
        signal_id: Positional row index into ``df`` (it is used with
            ``iloc``, so it is a row position, not a label).
        df: DataFrame whose columns from ``start_col`` on alternate
            amplitude, phase, amplitude, phase, ...
        num_harm: Number of columns to consume; ``int(num_harm/2)``
            coefficients are produced.
        start_col: Column position of the first amplitude.

    Returns:
        1-D ``complex128`` array of length ``int(num_harm/2)``.

    Raises:
        IndexError: if the requested row or columns are out of bounds.
    """
    npa = int(num_harm/2)
    # Explicit positions (rather than a slice) so that an out-of-range request
    # still raises IndexError instead of being silently truncated.
    cols = start_col + np.arange(2*npa)
    row = np.asarray(df.iloc[signal_id, cols], dtype=float)
    amplitudes = row[0::2]
    phases = row[1::2]
    # np.complex128 replaces the np.complex_ alias removed in NumPy 2.0.
    Z = np.empty(npa, dtype=np.complex128)
    Z.real = amplitudes*np.cos(phases)
    Z.imag = amplitudes*np.sin(phases)
    return Z

Let's try to obtain the time-dependent signal.

In [None]:
ComplexForm(-768217,0)

In [None]:
S0_df=GetFFTCoeff(0,df_train,10,8)
S0_df.shape

In [None]:
# Full-length spectrum holding only the coefficients rebuilt from df_train;
# every other bin stays zero.
# np.complex128 replaces the np.complex_ alias, which was removed in NumPy 2.0.
S0_dff=np.zeros(S0.size, dtype=np.complex128)
S0_dff[0:num_harm]=S0_df[0:num_harm]
S0_dff[0:5]

In [None]:
invS0_df=ft.irfft(S0_dff)*100
invS0_df

In [None]:
fig,ax=plt.subplots(1,1,figsize=(12,6))
ax.plot(s0_orig['y'],color='lightblue')
ax.plot(invS0_df,color='red')

In [None]:
df_train.iloc[0,4:7]

OK, **prepared data seem consistent**. 

The second thing to point out is that our dataset is anything but balanced. We have **many healthy examples** and **very few damaged lines**.

In [None]:
df_train['target'].groupby(by=df_train['target']).count()

In [None]:
expexted_pct_damaged=100*sum(df_train['target']==1)/len(df_train['target'])
expexted_pct_damaged

Now let's apply the updates. First, we turn the three per-phase signal rows of each measurement into the columns of a single row, and we remove the main harmonics:

In [None]:
def remove_cols(df):
    """Pivot the three per-phase rows of each measurement into one wide row.

    For each phase (0, 1, 2) the rows of that phase are selected, the
    bookkeeping / main-harmonic columns are dropped, and the three resulting
    frames are inner-joined on ``id_measurement``.

    Column naming follows pandas' default merge suffixes: features of
    phase 0 end in ``_x``, phase 1 in ``_y``, and phase 2 keep their
    original name. ``id_measurement`` is kept as a column.

    Args:
        df: Frame with one row per signal, containing ``phase``,
            ``id_measurement`` and every column listed in ``col_to_delete``.

    Returns:
        A new DataFrame with one row per measurement that has all three
        phases. ``df`` itself is not modified.

    Raises:
        KeyError: if any column of ``col_to_delete`` is missing from ``df``.
    """
    col_to_delete=['phase','signal_id','ErrFun','ErrGen','Amp0','Amp1','Pha0','Pha1','target']
    # drop() without inplace returns a fresh frame, so we never mutate a
    # filtered view of df (the pattern behind SettingWithCopyWarning).
    per_phase = [df[df['phase']==phase].drop(columns=col_to_delete) for phase in (0, 1, 2)]
    df_merge = per_phase[0]
    for phase_frame in per_phase[1:]:
        df_merge = df_merge.merge(phase_frame, on='id_measurement')
    return df_merge

In [None]:
df_train_r=remove_cols(df_train)
df_test_r=remove_cols(df_test)

 ## Modelling

Preliminary operations:

In [None]:
X=df_train_r
XT=df_test_r

In [None]:
# One label per measurement: the target of the first row found for each id_measurement.
# groupby() orders its result by sorted id_measurement, whereas X keeps the row order of
# df_train; train_test_split pairs X and y by position, so y is re-ordered (and restricted)
# to exactly the measurements present in X.
y=df_train['target'].groupby(by=df_train['id_measurement']).first().reindex(X['id_measurement'])

...and some tools:

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn import metrics

Let's use one fold, just for the first attempts...

In [None]:
X_train, X_valid, Y_train, Y_valid = train_test_split(X, y, test_size=0.2, random_state=123)

In [None]:
def damaged_ratio(Y, thr):
    """Return the percentage of entries of ``Y`` that are ``>= thr``.

    ``Y`` must support element-wise comparison (NumPy array or pandas
    Series) and ``len``.
    """
    n_flagged = sum(Y>=thr)
    n_total = len(Y)
    return 100*n_flagged/n_total

In [None]:
damaged_ratio(Y_train, 0.5)

In [None]:
Y_valid.head()

In [None]:
damaged_ratio(Y_valid, 0.5)

In [None]:
def accuracy(Y_real,Y_pred):
    """Return the percentage of predictions that match ``Y_real``.

    ``Y_pred`` is rounded with ``np.round`` first, so it may hold either
    class labels or scores/probabilities.
    """
    rounded_pred = np.round(Y_pred)
    n_correct = (Y_real==rounded_pred).sum()
    return 100*n_correct/len(Y_real)

In [None]:
def mmc(y_real_int, y_calc_int):
    """Matthews correlation coefficient for binary 0/1 labels.

    Args:
        y_real_int: True labels (0 = healthy, 1 = damaged).
        y_calc_int: Predicted labels, same length as ``y_real_int``.

    Returns:
        ``np.float64`` in [-1, 1]. When the coefficient is undefined (a zero
        denominator, e.g. every prediction falls in one class) ``-1`` is
        returned, so such a degenerate prediction never scores better than a
        real one.
    """
    # labels=[0, 1] forces a 2x2 matrix even when one class is missing from
    # both inputs; without it the matrix collapses to 1x1 and indexing fails.
    # With rows = true class and columns = predicted class, ravel() yields
    # tn, fp, fn, tp in that order.
    cm = metrics.confusion_matrix(y_real_int, y_calc_int, labels=[0, 1])
    tn, fp, fn, tp = (int(v) for v in cm.ravel())
    # Python ints cannot overflow, unlike the int64 product of four counts.
    num = tp*tn-fp*fn
    den_sq = (tp+fp)*(tp+fn)*(tn+fp)*(tn+fn)
    if den_sq==0:
        mc=-1
    else:
        mc=num/(float(den_sq)**0.5)
    return np.float64(mc)

## Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
lrm = LogisticRegression()

In [None]:
md1=lrm.fit(X_train,Y_train)

In [None]:
Y_valid1=md1.predict(X_valid)

In [None]:
plt.hist(Y_valid1)

In [None]:
accuracy(Y_valid,Y_valid1)

In [None]:
metrics.confusion_matrix(Y_valid,Y_valid1)

In [None]:
damaged_ratio(Y_valid1,0.5)

It simply puts 0 everywhere, which is good in 95% of the cases, but is not good for our competition. As the confusion matrix shows, the "only" problem of the model is the false negatives, where the failure is complete.

## Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
rfc = RandomForestClassifier()

In [None]:
md1 = rfc.fit(X_train,Y_train)

In [None]:
dfi = pd.DataFrame(md1.feature_importances_, index=X.columns)
dfi.index.name = 'feature'
dfi.columns = ['Importance']
dfi.sort_values(by='Importance',ascending=False).head(10)

In [None]:
Y_valid1 = md1.predict(X_valid)

In [None]:
plt.hist(Y_valid1)

In [None]:
accuracy(Y_valid,Y_valid1)

In [None]:
metrics.confusion_matrix(Y_valid,Y_valid1)

In [None]:
damaged_ratio(Y_valid,0.5)

In [None]:
damaged_ratio(Y_valid1,0.5)

A bit better than the Logistic Regression, but we need something more.

## Light GBM Classifier

In [None]:
import lightgbm as lgb

In [None]:
# LightGBM training parameters shared by every lgb.train call below.
# The Submission section later overwrites 'early_stopping_round' and 'num_round' in place.
# NOTE(review): 'application', 'boosting', 'num_round' and 'early_stopping_round' look like
# LightGBM parameter aliases — confirm they are accepted by the installed version.
parameters = {'application': 'binary',
              'boosting': 'gbdt',
              'metric': 'auc',
              'min_data_in_leaf': 40, 
              'num_leaves': 60,
              'max_bin': 30,
              'bagging_freq': 1,
              'bagging_fraction': 0.8 ,
              'learning_rate': 0.008,
              'early_stopping_round': 100,
              'num_round': 1000,
              'verbosity':-1}
# Filled by lgb.train with the per-iteration metric history of each validation set.
evals_result={}

In [None]:
# Wrap the hold-out split in LightGBM datasets and train with the shared parameters.
# Both sets are passed as valid_sets, so the metric is reported for train and validation.
# NOTE(review): verbose_eval= and evals_result= are not accepted by every LightGBM release
# (newer ones use callbacks instead) — confirm against the installed version.
lgb_train_set = lgb.Dataset(X_train, label=Y_train)
lgb_valid_set = lgb.Dataset(X_valid, label=Y_valid)
md1 = lgb.train(parameters,
                 train_set=lgb_train_set,
                 valid_sets=[lgb_train_set,lgb_valid_set],
                 verbose_eval=100,
                 evals_result=evals_result)
# Remember the best iteration so the predict calls below can pass it as num_iteration.
best_round=md1.best_iteration

In [None]:
Y_valid1 = md1.predict(X_valid,num_iteration=best_round)

In [None]:
plt.hist(Y_valid1)

In [None]:
accuracy(Y_valid,Y_valid1)

In this case the model's output contains probability values instead of classes. We need to define a threshold to transform the former into the latter, and for this purpose we introduce this function:

In [None]:
def to_int_th(x,th):
    """Binarise scores: 1 where ``x >= th``, 0 elsewhere.

    Args:
        x: 1-D sequence of scores (list, NumPy array or pandas Series).
        th: Threshold; values equal to it map to 1.

    Returns:
        Integer NumPy array with the same length as ``x``.
    """
    # np.asarray makes the comparison purely positional; indexing a pandas
    # Series with x[i] would look up the *label* i and fail (or pick the wrong
    # row) whenever the index is not 0..n-1.
    scores = np.asarray(x)
    return (scores>=th).astype(int)

We start using the most "natural" threshold, 0.5:

In [None]:
metrics.confusion_matrix(Y_valid,to_int_th(Y_valid1,0.5))

In [None]:
damaged_ratio(Y_valid1,0.5)

The model again performs a little better than the previous one, but it's still not good at classifying the damaged cases. But is 0.5 the right threshold to distinguish between healthy and damaged? Let's introduce this function:

In [None]:
def find_thres(y_real, y_calc):
    """Grid-search the decision threshold that maximises ``mmc``.

    The grid has ``thr_ndiv`` evenly spaced thresholds starting at the
    midpoint of the predicted range, ``(min + max) / 2``, and stopping just
    before 1. Thresholds below that midpoint are never tried.

    Args:
        y_real: True 0/1 labels.
        y_calc: Predicted scores, same length as ``y_real``.

    Returns:
        The threshold with the highest ``mmc``; the starting midpoint if no
        threshold scores above -1. The result is also printed.
    """
    thr_ndiv=100
    start_thres = (np.min(y_calc)+np.max(y_calc))/2 # default, better than 0
    stop_thres = 1
    opt_thres=start_thres
    opt_mmc = -1
    # linspace always yields exactly thr_ndiv points and tolerates
    # start_thres == stop_thres; arange with a float step can produce one
    # point too many and fails outright when the step is 0.
    vec_thres = np.linspace(start_thres,stop_thres,thr_ndiv,endpoint=False)
    for thres in vec_thres:
        y_calc_int=to_int_th(y_calc,thres)
        m = mmc(y_real,y_calc_int)
        # Strict '>' keeps the lowest threshold among ties.
        if (m > opt_mmc):
            opt_mmc = m
            opt_thres = thres
    print('opt. thres={t:.5f} mmc={m:.5f}'.format(t=opt_thres,m=opt_mmc))
    return opt_thres

In [None]:
opt_thres=find_thres(Y_valid, Y_valid1)

In [None]:
Y_valid_int=to_int_th(Y_valid1,opt_thres)
metrics.confusion_matrix(Y_valid,Y_valid_int)

Again, a little better...

   ## K-Fold Classification

In [None]:
CV_STEPS=10
KF = KFold(n_splits=CV_STEPS, shuffle=True, random_state=456)

In [None]:
thr = np.zeros(CV_STEPS)
matthews = np.zeros(CV_STEPS)
dam_ratio = np.zeros(CV_STEPS)
best_iter = np.zeros(CV_STEPS)

In [None]:
# Real K-fold cross-validation: every measurement lands in the validation set
# exactly once, and the folds are reproducible because KF carries a random_state.
for k, (train_idx, valid_idx) in enumerate(KF.split(X)):
    print('CV Step {}'.format(k))
    X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
    Y_train, Y_valid = y.iloc[train_idx], y.iloc[valid_idx]
    lgb_train_set = lgb.Dataset(X_train, label=Y_train)
    lgb_valid_set = lgb.Dataset(X_valid, label=Y_valid)
    md1 = lgb.train(parameters,
                 train_set=lgb_train_set,
                 valid_sets=[lgb_train_set,lgb_valid_set],
                 verbose_eval=100,
                 evals_result=evals_result)
    # Predict with this fold's own best iteration, not the one left over
    # from the single-split model trained earlier.
    Y_valid1 = md1.predict(X_valid,num_iteration=md1.best_iteration)
    opt_thres=find_thres(Y_valid, Y_valid1)
    thr[k]=opt_thres
    Y_valid1_int=to_int_th(Y_valid1,opt_thres)
    matthews[k] = mmc(Y_valid,Y_valid1_int)
    # All per-fold statistics are stored at index k so they line up with thr/matthews.
    dam_ratio[k] = damaged_ratio(Y_valid1,opt_thres)
    best_iter[k] = md1.best_iteration

In [None]:
print('mmc: mean={m}, std={s}'.format(m=np.mean(matthews),s=np.std(matthews)))

In [None]:
mean_thr=np.mean(thr)
std_thr=np.std(thr)
print('threshold: mean={m}, std={s}'.format(m=mean_thr,s=std_thr))

In [None]:
Y_valid1_int=to_int_th(Y_valid1,mean_thr+std_thr)
mmc(Y_valid,Y_valid1_int)

In [None]:
Y_valid1_int=to_int_th(Y_valid1,mean_thr)
mmc(Y_valid,Y_valid1_int)

In [None]:
Y_valid1_int=to_int_th(Y_valid1,mean_thr-std_thr)
mmc(Y_valid,Y_valid1_int)

In [None]:
dam_ratio.mean()

In [None]:
best_iter.mean()

# Submission

In [None]:
parameters['early_stopping_round']=None

In [None]:
# The number of boosting rounds must be a positive integer; the mean over folds is a float
# (and would be 0 if no fold recorded a best iteration).
parameters['num_round']=max(1,int(round(best_iter.mean())))

In [None]:
threshold=mean_thr

In [None]:
X_train, X_valid, Y_train, Y_valid = train_test_split(X, y, test_size=0.05, random_state=456)

In [None]:
lgb_train_set = lgb.Dataset(X_train, label=Y_train)
lgb_valid_set = lgb.Dataset(X_valid, label=Y_valid)
md1 = lgb.train(parameters,
                 train_set=lgb_train_set,
                 valid_sets=[lgb_train_set,lgb_valid_set],
                 verbose_eval=100,
                 evals_result=evals_result)
best_round=md1.best_iteration
Y_valid1 = md1.predict(X_valid,num_iteration=best_round)

In [None]:
Y_valid1 = md1.predict(X,num_iteration=best_round)

In [None]:
sns.distplot(Y_valid1, color='blue')

In [None]:
Y_pred = md1.predict(XT,num_iteration=best_round)

In [None]:
sns.distplot(Y_pred, color='red')

In [None]:
Y_pred_int=to_int_th(Y_pred,threshold)

In [None]:
np.unique(Y_pred_int,return_counts=True)

In [None]:
# Build the per-measurement prediction table without adding a column to XT:
# XT is the feature matrix passed to md1.predict, so mutating it would break
# a re-run of the prediction cell.
df_pred=pd.DataFrame({'id_measurement':XT['id_measurement'].values,
                      'target':Y_pred_int})
df_pred.head()

In [None]:
df_subm=df_test[['signal_id','id_measurement']].merge(df_pred, on='id_measurement')
df_subm.drop('id_measurement',axis=1,inplace=True)
df_subm.head()

In [None]:
sum(df_subm['target'])

In [None]:
df_subm.to_csv('submission.csv', index=False)