**In this competition, you’re challenged to build a model that predicts the probability that a driver will initiate an auto insurance claim in the next year. While Porto Seguro has used machine learning for the past 20 years, they’re looking to Kaggle’s machine learning community to explore new, more powerful methods. A more accurate prediction will allow them to further tailor their prices, and hopefully make auto insurance coverage more accessible to more drivers.**

**Contents**

**1. Exploration**

**2. Dimensionality Reduction Overview**

**3. Feature Extraction**

**4. Model Preparation**

**5. Submission**




In [None]:
import seaborn as sns
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

In [None]:
# Read the training and test tables from the input folder.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('../input/train.csv', '../input/test.csv')
)

In [None]:
# Keep the column labels as plain strings so the frames can be rebuilt later
# from the bare arrays returned by the imputer.
train_cols = list(map(str, train.columns))
test_cols = list(map(str, test.columns))

In [None]:
# `sklearn.preprocessing.Imputer` no longer exists in current scikit-learn;
# `SimpleImputer` is its replacement and always works column-wise, which is
# what the old `axis=0` argument selected.
from sklearn.impute import SimpleImputer

# Treat -1 as the missing-value marker and fill it with the column median.
imr = SimpleImputer(missing_values=-1, strategy='median')
imrX = imr.fit(train)
# transform() returns a NumPy array, so `train` loses its column labels here.
train = imrX.transform(train)

In [None]:
# Re-fit the same imputer object on the test table and fill its -1 markers.
# NOTE(review): the medians are re-estimated from the test set here instead of
# reusing the ones learned from train -- confirm this is intended.
imrX = imr.fit(test)  # fit() returns the imputer itself
test = imr.transform(test)


In [None]:
# Wrap the imputed arrays back into labelled DataFrames.
train = pd.DataFrame(data=train, columns=train_cols)
test = pd.DataFrame(data=test, columns=test_cols)

In [None]:
# Sanity check: (rows, columns) of each table, one per line.
print(train.shape, test.shape, sep='\n')

In [None]:
from sklearn.decomposition import PCA

# Exploratory 20-component PCA on the imputed training table.
# NOTE(review): `train` still holds every column of the CSV at this point
# (including `target`), so this is an overview, not a modelling step.
pca_settings = dict(n_components=20, svd_solver='full', random_state=1996)
pca = PCA(**pca_settings)
pca.fit(train)

In [None]:
# Report how much variance the fitted components capture, in total and per
# component. (The total used to be printed twice; once is enough.)
explained = pca.explained_variance_ratio_
print('Explained variance: %.4f' % explained.sum())
print('Individual variance contributions:')
for ratio in explained:
    print(ratio)

In [None]:
# Split the training columns into groups by the tags found in their names.
# `cat` and `bina` take every column carrying that tag; the remaining groups
# (`ind`, `car`, `calc`) only take columns that are neither *_bin nor *_cat.
# `reg` is matched on its tag alone.
cat = [name for name in train.columns if 'cat' in name]
bina = [name for name in train.columns if 'bin' in name]
reg = [name for name in train.columns if 'reg' in name]

ind = [
    name for name in train.columns
    if 'ind' in name and not ('bin' in name or 'cat' in name)
]
car = [
    name for name in train.columns
    if 'car' in name and not ('bin' in name or 'cat' in name)
]
calc = [
    name for name in train.columns
    if 'calc' in name and not ('bin' in name or 'cat' in name)
]

In [None]:
# Gather the six column groups in one list and compare the number of grouped
# columns with the full column list printed above it.
cols = [reg, bina, ind, calc, car, cat]
print(train.columns, train.shape)
print(sum(len(group) for group in cols))

In [None]:
# Start the engineered-feature tables. The training one temporarily carries
# `target` so that correlations can be computed against it.
train_data = pd.DataFrame({'target': train['target']})
test_data = pd.DataFrame()

In [None]:
# Square-root transform of every `calc` column; print the correlation with the
# target before and after so the two can be compared.
for col in calc:
    new_name = col + "_transform"
    train_data[new_name] = pd.DataFrame(np.power(train[col], .5))
    test_data[new_name] = pd.DataFrame(np.power(test[col], .5))
    raw_corr = train[[col, 'target']].corr()['target']
    new_corr = train_data[[new_name, 'target']].corr()['target']
    print(raw_corr, '\n', new_corr)

In [None]:
# ps_calc_05, ps_calc_09 and ps_calc_11 seem to lose their correlation with
# the target after the transformation, so roll it back: drop the transformed
# columns and use the untransformed values instead.
calc_rollback = ['ps_calc_05', 'ps_calc_09', 'ps_calc_11']
calc_transformed = [name + '_transform' for name in calc_rollback]

# `axis` must be given by keyword; the positional form is rejected by pandas 2.x.
train_data = train_data.drop(calc_transformed, axis=1)
test_data = test_data.drop(calc_transformed, axis=1)

for name in calc_rollback:
    train_data[name] = train[name]
for name in calc_rollback:
    test_data[name] = test[name]

In [None]:
# Same square-root experiment for the plain `car` columns: store the
# transformed values and print the target correlation before and after.
for col in car:
    new_name = col + "_transform"
    train_data[new_name] = pd.DataFrame(np.power(train[col], .5))
    test_data[new_name] = pd.DataFrame(np.power(test[col], .5))
    raw_corr = train[[col, 'target']].corr()['target']
    new_corr = train_data[[new_name, 'target']].corr()['target']
    print(raw_corr, '\n', new_corr)

In [None]:
# ps_car_15 seems to lose its correlation with the target after the
# transformation, so roll it back and use the untransformed column.
# `axis` must be given by keyword; the positional form is rejected by pandas 2.x.
train_data = train_data.drop(['ps_car_15_transform'], axis=1)
train_data['ps_car_15'] = train['ps_car_15']

test_data = test_data.drop(['ps_car_15_transform'], axis=1)
test_data['ps_car_15'] = test['ps_car_15']

In [None]:
# Square-root experiment for the `cat` columns: store the transformed values
# and print the target correlation before and after.
for col in cat:
    new_name = col + "_transform"
    train_data[new_name] = pd.DataFrame(np.power(train[col], .5))
    test_data[new_name] = pd.DataFrame(np.power(test[col], .5))
    raw_corr = train[[col, 'target']].corr()['target']
    new_corr = train_data[[new_name, 'target']].corr()['target']
    print(raw_corr, '\n', new_corr)

In [None]:
# ps_car_01_cat and ps_car_11_cat seem to lose their correlation with the
# target after the transformation, so roll it back and use the untransformed
# columns.
cat_rollback = ['ps_car_01_cat', 'ps_car_11_cat']
cat_transformed = [name + '_transform' for name in cat_rollback]

# `axis` must be given by keyword; the positional form is rejected by pandas 2.x.
train_data = train_data.drop(cat_transformed, axis=1)
test_data = test_data.drop(cat_transformed, axis=1)

for name in cat_rollback:
    train_data[name] = train[name]
for name in cat_rollback:
    test_data[name] = test[name]


In [None]:
# Square-root experiment for the plain `ind` columns: store the transformed
# values and print the target correlation before and after.
for col in ind:
    new_name = col + "_transform"
    train_data[new_name] = pd.DataFrame(np.power(train[col], .5))
    test_data[new_name] = pd.DataFrame(np.power(test[col], .5))
    raw_corr = train[[col, 'target']].corr()['target']
    new_corr = train_data[[new_name, 'target']].corr()['target']
    print(raw_corr, '\n', new_corr)

In [None]:
# ps_ind_01, ps_ind_03 and ps_ind_15 seem to lose their correlation with the
# target after the transformation, so roll it back and use the untransformed
# columns.
ind_rollback = ['ps_ind_01', 'ps_ind_03', 'ps_ind_15']
ind_transformed = [name + '_transform' for name in ind_rollback]

# `axis` must be given by keyword; the positional form is rejected by pandas 2.x.
train_data = train_data.drop(ind_transformed, axis=1)
test_data = test_data.drop(ind_transformed, axis=1)

# Each column is restored from the column of the same name. Previously
# ps_ind_15 was filled with the values of ps_ind_03 in both tables.
for name in ind_rollback:
    train_data[name] = train[name]
for name in ind_rollback:
    test_data[name] = test[name]

In [None]:
# Square-root experiment for the `reg` columns: store the transformed values
# and print the target correlation before and after.
for col in reg:
    new_name = col + "_transform"
    train_data[new_name] = pd.DataFrame(np.power(train[col], .5))
    test_data[new_name] = pd.DataFrame(np.power(test[col], .5))
    raw_corr = train[[col, 'target']].corr()['target']
    new_corr = train_data[[new_name, 'target']].corr()['target']
    print(raw_corr, '\n', new_corr)

In [None]:
# ps_reg_01 and ps_reg_02 seem to lose their correlation with the target after
# the transformation, so roll it back and use the untransformed columns.
reg_rollback = ['ps_reg_01', 'ps_reg_02']
reg_transformed = [name + '_transform' for name in reg_rollback]

# `axis` must be given by keyword; the positional form is rejected by pandas 2.x.
train_data = train_data.drop(reg_transformed, axis=1)
test_data = test_data.drop(reg_transformed, axis=1)

# Each column is restored from the column of the same name. Previously
# ps_reg_01 was filled with the values of ps_ind_01 in both tables.
for name in reg_rollback:
    train_data[name] = train[name]
for name in reg_rollback:
    test_data[name] = test[name]

In [None]:
# Binary flags are copied over unchanged into both feature tables.
for flag in bina:
    train_data[flag], test_data[flag] = train[flag], test[flag]

In [None]:
# The target was only carried along for the correlation checks; remove it so
# the training features line up with the test features.
# `axis` must be given by keyword; the positional form is rejected by pandas 2.x.
train_data = train_data.drop(['target'], axis=1)

# Both tables were built column by column in the same order, so their column
# lists must now be identical; fail loudly here rather than at prediction time.
assert list(train_data.columns) == list(test_data.columns), \
    "train/test feature columns differ"

print(train_data.shape, test_data.shape)
print(train_data.columns, test_data.columns)

In [None]:
# 45-component PCA on the engineered training features (overview only; the
# fitted object is just inspected in the next cell).
pca1_settings = dict(n_components=45, svd_solver='full', random_state=1996)
pca1 = PCA(**pca1_settings)
pca1.fit(train_data)

In [None]:
# Total and per-component share of variance captured by `pca1`.
ratios1 = pca1.explained_variance_ratio_
print('Explained variance: %.4f' % ratios1.sum())
print('Individual variance contributions:')
for component in range(pca1.n_components):
    print(ratios1[component])

**SOME MORE FEATURES FROM K-MEANS CLUSTERS (NEAREST CLUSTER CENTER)**

In [None]:
# Regroup the engineered feature columns by the same name tags as before.
# `cat_new` and `bina_new` take every column carrying that tag; `ind_new`,
# `car_new` and `calc_new` skip anything that is *_bin or *_cat; `reg_new` is
# matched on its tag alone.
cat_new = [name for name in train_data.columns if 'cat' in name]
bina_new = [name for name in train_data.columns if 'bin' in name]
reg_new = [name for name in train_data.columns if 'reg' in name]

ind_new = [
    name for name in train_data.columns
    if 'ind' in name and not ('bin' in name or 'cat' in name)
]
car_new = [
    name for name in train_data.columns
    if 'car' in name and not ('bin' in name or 'cat' in name)
]
calc_new = [
    name for name in train_data.columns
    if 'calc' in name and not ('bin' in name or 'cat' in name)
]

In [None]:
# One list per feature group, in the order the cluster columns are named later.
cols_new = [reg_new, bina_new, ind_new, calc_new, car_new, cat_new]
# Total number of grouped feature columns.
print(sum(map(len, cols_new)))

In [None]:
from sklearn.cluster import MiniBatchKMeans

In [None]:
# Names of the cluster-id features, one per feature group.
new_cols = [
    group + '_cluster'
    for group in ('reg', 'bina', 'ind', 'calc', 'car', 'cat')
]

In [None]:
# Shapes of the two feature tables, side by side.
print(*(table.shape for table in (train_data, test_data)))

In [None]:
# Build one cluster-id feature per column group.
#
# For every group a single MiniBatchKMeans model is fitted on the stacked
# train+test rows and then used to label BOTH tables. Previously a second,
# independently initialised model was fitted for the test table, so the same
# cluster id could mean different clusters in train and test.
cluster_seed = 1996  # same seed the notebook uses for PCA; makes the ids reproducible

train_clusters = pd.DataFrame()
test_clusters = pd.DataFrame()

total_data = pd.concat([train_data, test_data])
print(len(total_data) == (len(train_data) + len(test_data)))

for cluster_name, group in zip(new_cols, cols_new):
    group_cols = [str(name) for name in group]
    coords = total_data[group_cols].values

    # Shuffle the rows so the mini-batches are not ordered train-then-test.
    shuffled = np.random.RandomState(cluster_seed).permutation(len(coords))
    kmeans = MiniBatchKMeans(
        n_clusters=5, batch_size=10000, random_state=cluster_seed
    ).fit(coords[shuffled])

    train_clusters[cluster_name] = kmeans.predict(train_data[group_cols].values)
    test_clusters[cluster_name] = kmeans.predict(test_data[group_cols].values)

    # Row count and the largest/smallest cluster id for the current group.
    print(len(train_clusters[cluster_name]),
          max(train_clusters[cluster_name].values),
          min(train_clusters[cluster_name].values))
    print(len(test_clusters[cluster_name]),
          max(test_clusters[cluster_name].values),
          min(test_clusters[cluster_name].values))

print(train_clusters.shape, test_clusters.shape)

In [None]:
# Compare each cluster feature's correlation with the target before and after
# a square-root transform. The transformed column is only needed for the
# printout and is removed again straight away.
train_clusters['target'] = train['target']
cluster_features = [name for name in train_clusters.columns if name != 'target']
for name in cluster_features:
    transformed = name + "_transform"
    train_clusters[transformed] = pd.DataFrame(np.power(train_clusters[name], .5))
    print(train_clusters[[name, 'target']].corr()['target'], '\n',
          train_clusters[[transformed, 'target']].corr()['target'])
    # `axis` must be given by keyword; the positional form is rejected by pandas 2.x.
    train_clusters = train_clusters.drop([transformed], axis=1)

In [None]:
# Replace the reg/bina/car cluster ids with their sine in both tables, and drop
# the helper `target` column from the training clusters.
sin_encoded = ['reg_cluster', 'bina_cluster', 'car_cluster']

for name in sin_encoded:
    train_clusters[name + "_transform"] = pd.DataFrame(np.sin(train_clusters[name]))
# `axis` must be given by keyword; the positional form is rejected by pandas 2.x.
train_clusters = train_clusters.drop(sin_encoded + ['target'], axis=1)

for name in sin_encoded:
    test_clusters[name + "_transform"] = pd.DataFrame(np.sin(test_clusters[name]))
test_clusters = test_clusters.drop(sin_encoded, axis=1)


In [None]:
# Quick look at the cluster tables (shape and columns) and the feature tables.
cluster_tables = (train_clusters, test_clusters)
print(*(table.shape for table in cluster_tables))
print(*(table.columns for table in cluster_tables))
print(*(table.shape for table in (train_data, test_data)))

In [None]:
# Put the engineered features and the cluster features side by side.
final_train_data = pd.concat(objs=[train_data, train_clusters], axis=1)
final_test_data = pd.concat(objs=[test_data, test_clusters], axis=1)

In [None]:
# Shapes first, then column lists, of the final train/test matrices.
print(final_train_data.shape, final_test_data.shape, sep='\n')
print(final_train_data.columns, final_test_data.columns, sep='\n')

In [None]:
def gini(actual, pred, cmpcol = 0, sortcol = 1):
    """Return the (un-normalized) Gini score of `pred` against `actual`.

    Rows are ordered by descending prediction, ties broken by original
    position. The score is the sum of the cumulative share of `actual` over
    that ordering, minus ``(n + 1) / 2``, divided by ``n``.

    Parameters
    ----------
    actual : sequence of numbers
        Observed values (the notebook passes 0/1 labels).
    pred : sequence of numbers
        Predicted scores, same length as `actual`.
    cmpcol, sortcol : int
        Accepted for signature compatibility; not used by the computation.

    Returns
    -------
    float
        The Gini score. If `actual` sums to zero the division yields NaN.

    Raises
    ------
    AssertionError
        If `actual` and `pred` differ in length.
    """
    assert len(actual) == len(pred)
    n = len(actual)
    # Columns: actual, prediction, original position. `float` replaces the
    # `np.float` alias, which no longer exists in current NumPy.
    table = np.asarray(np.c_[actual, pred, np.arange(n)], dtype=float)
    # lexsort uses the LAST key as primary: descending prediction, then position.
    table = table[np.lexsort((table[:, 2], -1 * table[:, 1]))]
    total_losses = table[:, 0].sum()
    gini_sum = table[:, 0].cumsum().sum() / total_losses

    gini_sum -= (n + 1) / 2.
    return gini_sum / n
 
def gini_normalized(a, p):
    """Gini of predictions `p` against `a`, divided by the Gini of `a` against itself."""
    model_score = gini(a, p)
    perfect_score = gini(a, a)
    return model_score / perfect_score

# Adapter exposing the normalized Gini as an XGBoost evaluation function

def gini_xgb(preds, dtrain):
    """Evaluate `preds` against the labels stored in `dtrain`.

    Returns a one-element list holding the pair ``('gini', score)``.
    """
    score = gini_normalized(dtrain.get_label(), preds)
    return [('gini', score)]

In [None]:
# `sklearn.cross_validation` was removed from scikit-learn; the splitter lives
# in `sklearn.model_selection`.
from sklearn.model_selection import train_test_split


In [None]:
# Hold out 20% of the rows for validation. The split is seeded so the reported
# validation score is reproducible, and stratified on the target so both parts
# keep the same share of positive rows.
x_tr, x_ts, y_tr, y_ts = train_test_split(
    final_train_data,
    train['target'],
    test_size=.2,
    random_state=1996,
    stratify=train['target'],
)

In [None]:
import xgboost as xgb


In [None]:
# Wrap the training and validation splits (features + labels) for XGBoost.
d_train = xgb.DMatrix(x_tr, label=y_tr)
d_valid = xgb.DMatrix(x_ts, label=y_ts)

In [None]:
# Test features only -- there are no labels to attach.
d_test = xgb.DMatrix(data=final_test_data)

In [None]:
# XGBoost booster settings
params = {
    'objective': 'binary:logistic',
    'eta': 0.2,
    'silent': True,
    'max_depth': 6,
    'subsample': 0.9,
    'colsample_bytree': 0.9,
    'nthread': -1,
}
# Data sets whose metric is reported during training; the validation set comes last.
watchlist = [(d_train, 'train'), (d_valid, 'valid')]

In [None]:
# Train the booster, scoring the watchlist with the custom Gini metric
# (higher is better) and logging every 10 rounds.
# NOTE(review): only 20 boosting rounds are requested while the early-stopping
# patience is 90, so early stopping cannot trigger -- confirm the intended
# number of rounds.
model = xgb.train(
    params,
    d_train,
    num_boost_round=20,
    evals=watchlist,
    early_stopping_rounds=90,
    feval=gini_xgb,
    maximize=True,
    verbose_eval=10,
)

In [None]:
# `Booster.best_ntree_limit` is no longer available in recent XGBoost releases.
# `best_iteration` is the 0-based index of the best round, so the number of
# trees is that index plus one (one tree per round with these parameters).
best_n_trees = model.best_iteration + 1
print( "  Best N trees = ", best_n_trees )
print( "  Best gini = ", model.best_score )

In [None]:
# Score the test set with the trained booster.
p_test = model.predict(d_test)

print('Training completed')

# Create a submission file
sub = pd.DataFrame()
# `test` was rebuilt from the imputer's float output, so `id` is float here;
# cast it back to integers so the file contains 0, 1, 2, ... rather than
# 0.0, 1.0, 2.0, ...
sub['id'] = test['id'].values.astype(np.int64)
sub['target'] = p_test
sub.to_csv('xgb_02_nov.csv', index=False)

print('All done')

**If you found my effort useful, please upvote it. :)**
**More to come, stay tuned.**