In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import RepeatedStratifiedKFold, KFold

from sklearn import preprocessing

import os
import warnings

# Show which files are present in the input directory before loading anything.
available_inputs = os.listdir("../input")
print(available_inputs)

# Hide FutureWarning messages so they do not clutter the cell outputs.
warnings.filterwarnings(action="ignore", category=FutureWarning)

## Overfitting
> Overfitting is a modelling error, which occurs when a function is too closely fit to a limited set of data points

A model is overfitting if it matches the training data almost perfectly but does poorly on validation data and other new data.

![](https://upload.wikimedia.org/wikipedia/commons/thumb/1/19/Overfitting.svg/320px-Overfitting.svg.png)

## Detecting Overfitting

With machine learning in general, we can't know how well our model performs until we evaluate it on unseen data. So we split our dataset into train and test sets and compare the model's performance on each to check whether it is overfitting.

**If our model does much better on the training set than on the test set, then we're likely overfitting.**

## Preventing overfitting

**1. Cross validation**

**K-Fold Cross validation**

It works as follows:
* Shuffle the dataset randomly.
* Split the dataset into k groups
* For each unique group:
* * Take the group as a hold out or test data set
* * Take the remaining groups as a training data set
* * Fit a model on the training set and evaluate it on the test set
* * Retain the evaluation score and discard the model
* Summarize the skill of the model using the sample of model evaluation scores


In [None]:
# Load the three CSV files in one pass; the order of the paths matches the
# order of the names on the left-hand side.
train, test, sub = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/train.csv",
        "../input/test.csv",
        "../input/sample_submission.csv",
    )
)

In [None]:
# Preview the first five rows of the training frame.
train.head(n=5)

In [None]:
# Preview the first five rows of the sample submission.
sub.head(n=5)

In [None]:
# Preview the first five rows of the test frame.
test.head(n=5)

In [None]:
# Bar chart of how many rows fall into each target class.
train['target'].value_counts().plot(kind='bar');

In [None]:
# Label vector: the 'target' column.
y = train.loc[:, 'target']
# Feature matrix: everything except the row identifier and the label.
X = train.drop(columns=['id', 'target'])

In [None]:
# Standardize the features. The fitted scaler is kept in `scaler` so the same
# transformation can be reused later.
scaler = preprocessing.StandardScaler()
scaler.fit(X)
X = scaler.transform(X)

In [None]:
# Hold out 20% of the rows for validation; the seed fixes the split.
holdout_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = holdout_parts

In [None]:
# Sanity-check the sizes of the hold-out split.
print(*(part.shape for part in (X_train, y_train, X_val)))

In [None]:
# Plain, unshuffled 10-fold splitter; used only by cross_val_score below.
k_fold = KFold(n_splits=10, shuffle=False, random_state=None)

# Stratified 10-fold split repeated 20 times (10 * 20 = 200 partitions).
s_fold = RepeatedStratifiedKFold(n_splits=10, n_repeats=20, random_state=42)

# The splitter yields *positional* row indices. `y` is a pandas Series, where
# `y[idx]` is a label lookup that only matches positions while the index is
# the default 0..n-1. `.iloc` states the positional intent explicitly and stays
# correct if the frame is ever filtered or re-indexed upstream.
for trn, val in s_fold.split(X, y):
    x_train, y_train = X[trn], y.iloc[trn]
    x_val, y_val = X[val], y.iloc[val]

    # L1-penalised logistic regression; liblinear is a solver that supports L1.
    model = LogisticRegression(C=0.1, class_weight='balanced', penalty='l1', solver='liblinear')
    model.fit(x_train, y_train)

    preds = model.predict(x_val)

# NOTE: each iteration overwrites the previous one, so after the loop `model`,
# `x_val`, `y_val` and `preds` refer to the LAST fold only. `y_train`/`y_val`
# also replace the values produced by the earlier train_test_split cell.
# cross_val_score refits the estimator on each k_fold split of X, y.
scores = cross_val_score(model, X, y, cv=k_fold, n_jobs=1)

In [None]:
# Display the scores returned by cross_val_score in the previous cell.
scores

In [None]:
from sklearn import metrics

# roc_curve sweeps a decision threshold over a continuous score. Feeding it
# the hard 0/1 labels from `predict` leaves a single usable threshold and
# therefore a degenerate three-point curve. Use the predicted probability of
# the second class in `model.classes_` (the positive class for a 0/1 target).
# `model`, `x_val` and `y_val` are the ones left over from the last CV fold.
val_scores = model.predict_proba(x_val)[:, 1]

fpr, tpr, thresholds = metrics.roc_curve(y_val, val_scores)
auc = metrics.auc(fpr, tpr)

plt.plot(fpr, tpr, label='ROC curve (area = %.2f)'%auc)
plt.legend()
plt.title('ROC curve')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.grid(True)
    

In [None]:
# The models are fitted on features standardized by `scaler`, so the test
# features must go through the SAME fitted scaler before prediction; feeding
# raw values to a model trained on standardized ones gives meaningless output.
# `transform` (not `fit_transform`) reuses the training-set statistics.
# Assumes test has the same feature columns as train minus 'target' - verify.
test2 = scaler.transform(test.drop(['id'], axis=1))

In [None]:
# Class predictions for the test rows from the logistic-regression model.
sub_preds = model.predict(X=test2)

In [None]:
## 0.676 AUC
# Pair every test id with its prediction and write the submission file.
submission_columns = {'id': test['id'], 'target': sub_preds}
submission = pd.DataFrame(submission_columns)
submission.to_csv("submission.csv", index=False)

## Train with more data

**Using SMOTE or ADASYN to oversample the data**

In [None]:
from imblearn.over_sampling import SMOTE,ADASYN

# ADASYN oversampler; seeded so the resampled data is reproducible.
sm = ADASYN(random_state=42)

# `fit_sample` is the deprecated spelling and no longer exists in current
# imbalanced-learn releases; `fit_resample` is the supported name for the
# same operation.
X_os, y_os = sm.fit_resample(X, y)

In [None]:
from imblearn.pipeline import make_pipeline

# Stratified 10-fold split repeated 20 times (10 * 20 = 200 partitions).
s_fold = RepeatedStratifiedKFold(n_splits=10, n_repeats=20, random_state=42)

# Depending on the imbalanced-learn version, y_os may be a NumPy array or a
# pandas Series. Converting once makes the positional indexing below
# independent of the container type.
y_os_values = np.asarray(y_os)

for trn, val in s_fold.split(X_os, y_os_values):
    x_train, y_train = X_os[trn], y_os_values[trn]
    x_val, y_val = X_os[val], y_os_values[val]

    model = LogisticRegression(C=0.1, class_weight='balanced', penalty='l1', solver='liblinear')
    model.fit(x_train, y_train)

    preds = model.predict(x_val)

# NOTE: after the loop `model`, `x_train`, `y_train`, `x_val`, `y_val` and
# `preds` refer to the LAST fold of the oversampled data only.

# The original call scored a plain model on the un-resampled X, y, so the
# oversampling was never evaluated. Wrapping the sampler and the classifier
# in an imblearn pipeline makes cross_val_score oversample the training part
# of every fold only, while validating on real (non-synthetic) rows.
oversampled_model = make_pipeline(
    ADASYN(random_state=42),
    LogisticRegression(C=0.1, class_weight='balanced', penalty='l1', solver='liblinear'),
)
scores = cross_val_score(oversampled_model, X, y, cv=s_fold, n_jobs=1)

In [None]:
# roc_curve sweeps a decision threshold over a continuous score. Hard 0/1
# labels from `predict` give a degenerate three-point curve, so use the
# predicted probability of the second class in `model.classes_` (the positive
# class for a 0/1 target). `model`, `x_val` and `y_val` are the ones left over
# from the last fold of the preceding CV loop.
val_scores = model.predict_proba(x_val)[:, 1]

fpr, tpr, thresholds = metrics.roc_curve(y_val, val_scores)
auc = metrics.auc(fpr, tpr)

plt.plot(fpr, tpr, label='ROC curve (area = %.2f)'%auc)
plt.legend()
plt.title('ROC curve')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.grid(True)

In [None]:
# Class predictions for the test rows from the model left over from the
# preceding cross-validation loop.
sub_preds = model.predict(X=test2)

In [None]:
# Pair every test id with its prediction and write the submission file.
submission_columns = {'id': test['id'], 'target': sub_preds}
submission = pd.DataFrame(data=submission_columns)
submission.to_csv("submission_os.csv", index=False)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# A random forest is stochastic; without a seed every run yields a different
# model and a different submission. Seed it with 42 like every other
# randomised step in this notebook.
Model = RandomForestClassifier(max_depth=10, random_state=42)

# x_train / y_train / x_val are whatever the most recent cross-validation loop
# left behind, i.e. its last fold.
Model.fit(x_train, y_train)
y_pred = Model.predict(x_val)

In [None]:
# roc_curve sweeps a decision threshold over a continuous score. Hard 0/1
# labels from `predict` give a degenerate three-point curve, so use the
# forest's predicted probability of the second class in `Model.classes_`
# (the positive class for a 0/1 target).
val_scores = Model.predict_proba(x_val)[:, 1]

fpr, tpr, thresholds = metrics.roc_curve(y_val, val_scores)
auc = metrics.auc(fpr, tpr)

plt.plot(fpr, tpr, label='ROC curve (area = %.2f)'%auc)
plt.legend()
plt.title('ROC curve')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.grid(True)

In [None]:
# Class predictions for the test rows from the random-forest model.
sub_preds = Model.predict(X=test2)

In [None]:
# Pair every test id with its prediction and write the submission file.
submission_columns = {'id': test['id'], 'target': sub_preds}
submission = pd.DataFrame(data=submission_columns)
submission.to_csv("submission_RF.csv", index=False)