In [1]:
import gc
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [2]:
path = "./data/"
train_file = "filtered_train.csv"
trainfull = pd.read_csv(train_file)

In [3]:
uniquenumber = pd.DataFrame(trainfull.nunique())
uniquenumber.sample()

Unnamed: 0,0
app_domain,143


In [4]:
train70 = uniquenumber[uniquenumber[0]<=70]
indexs = train70.index
traindrop = trainfull[indexs]
pos_trainDf = traindrop[traindrop['click'] == 1]
neg_trainDf = traindrop[traindrop['click'] == 0].sample(n=20000, random_state=2020)
trainDf = pd.concat([pos_trainDf, neg_trainDf], axis=0).sample(frac=1.0, random_state=2020)
del pos_trainDf; del neg_trainDf; gc.collect();

print(trainDf.shape, trainDf['click'].mean())

(88655, 12) 0.7744064068580452


In [5]:
trainDf, testDf, _, _ = train_test_split(trainDf, trainDf['click'], test_size=0.25, random_state=2018)

print(trainDf['click'].mean(), trainDf.shape)
print(testDf['click'].mean(), testDf.shape)

0.7741950038351055 (66491, 12)
0.7750406063887385 (22164, 12)


In [6]:
x_train = trainDf.drop(['click'], axis=1)
y_train = trainDf['click']
x_test = testDf.drop(['click'], axis=1)
y_test = testDf['click']
x_total = pd.concat([x_train, x_test]).reset_index()
cols = list(x_train.columns)

In [7]:
train_sz = x_train.shape
train_sz

(66491, 11)

In [8]:
train_sz = x_train.shape[0]


for col in x_train.columns:
    onehotret = pd.get_dummies(x_total[col], prefix=col)
    x_total = pd.concat([x_total, onehotret], axis=1)
    
x_train = x_total[:train_sz]
x_test = x_total[train_sz:]
x_train = x_train.drop(columns=cols)
x_test = x_test.drop(columns=cols)
x_train = x_train.drop(columns=['index'])
x_test = x_test.drop(columns=['index'])
x_total = x_total.drop(columns=cols)
x_total = x_total.drop(columns=['index'])

### Use Logistic Regression for classification directly

In [9]:
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.model_selection import GridSearchCV

parameters = {'alpha':[0.01,0.1,1,10,50]}
ridge = RidgeClassifier()
clf_ridge = GridSearchCV(ridge, parameters,cv=5,scoring='roc_auc')
clf_ridge.fit(x_train, y_train)
print('Ridge best score:',clf_ridge.best_score_)
print('Ridge best parameters:',clf_ridge.best_params_) 

lasso = LogisticRegression(penalty='l1',solver='liblinear')
clf_lasso = GridSearchCV(lasso,{},cv=5,scoring='roc_auc')
clf_lasso.fit(x_train, y_train)
print('Lasso best score:',clf_lasso.best_score_) 

Ridge best score: 0.67833937325081
Ridge best parameters: {'alpha': 10}
Lasso best score: 0.6796673145883362


In [10]:
from sklearn.metrics import accuracy_score,auc,log_loss,roc_curve,roc_auc_score

ridge_best = RidgeClassifier(alpha = 10)
ridge_best.fit(x_train,y_train)
predict_y = ridge_best.predict(x_test)
print('auc score on test ridge:',roc_auc_score(predict_y,y_test))

auc score on test ridge: 0.7058803611738149


In [11]:
predict_y_2 = clf_lasso.predict(x_test)
print('auc score on test lasso:',roc_auc_score(predict_y_2,y_test))

auc score on test lasso: 0.7036161819652365


### Random Forest Tree

In [12]:
from sklearn.model_selection import GridSearchCV
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(n_jobs=-1,max_features= 'sqrt' ,n_estimators=50, oob_score = True) 

param_grid = { 
    'n_estimators': [200, 700],
    'max_features': ['auto', 'sqrt', 'log2']
}

In [None]:
param_grid2 = { 
    'n_estimators': [200, 500],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth' : [4,5,6,7,8],
    'criterion' :['gini', 'entropy']
}

In [14]:
CV_rfc = GridSearchCV(estimator=rfc, param_grid=param_grid, cv= 5)
CV_rfc.fit(x_train,y_train)
print(CV_rfc.best_params_)

{'max_features': 'log2', 'n_estimators': 700}


In [16]:
rfc_best=RandomForestClassifier(max_features='log2', 
                            n_estimators= 700)
rfc_best.fit(x_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='log2',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=700,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [17]:
from sklearn.metrics import accuracy_score,auc,log_loss,roc_curve,roc_auc_score
predict_y_rf = rfc_best.predict(x_test)
predict_train_rf = rfc_best.predict(x_train)
print('auc score on test:',roc_auc_score(predict_y_rf,y_test))
print('auc score on train:',roc_auc_score(predict_train_rf,y_train))

auc score on test: 0.7006388924770206
auc score on train: 0.7563298317513317


### Gradient Boosted Tree

In [13]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score,auc,log_loss,roc_curve,roc_auc_score

gbml = GradientBoostingClassifier(n_estimators=50, random_state=10, subsample=0.6, max_depth=10, learning_rate = 0.1)
gbml.fit(x_train, y_train)
predict_y_gbm = gbml.predict(x_test)
predict_train = gbml.predict(x_train)
print('auc score on test:',roc_auc_score(predict_y_gbm,y_test))
print('auc score on train:',roc_auc_score(predict_train,y_train))

auc score on test: 0.7270987933139912
auc score on train: 0.7753534764943496


### GBDT + LR

In [15]:
from sklearn.preprocessing import OneHotEncoder

enc = OneHotEncoder()

train_new_feature = gbml.apply(x_train)
train_new_feature = train_new_feature.reshape(-1, 50)
enc.fit(train_new_feature)

train_new_feature2 = np.array(enc.transform(train_new_feature).toarray())

In [16]:
test_new_feature = gbml.apply(x_test)
test_new_feature = test_new_feature.reshape(-1, 50)
test_new_feature2 = np.array(enc.transform(test_new_feature).toarray())

In [20]:
clf_lasso.fit(train_new_feature2, y_train)

GridSearchCV(cv=5,
             estimator=LogisticRegression(penalty='l1', solver='liblinear'),
             param_grid={}, scoring='roc_auc')

In [24]:
ridge_best.fit(train_new_feature2, y_train)

RidgeClassifier(alpha=10)

In [25]:
predict_y = clf_lasso.predict(test_new_feature2)
print('auc score on test lasso:',roc_auc_score(predict_y,y_test))

predict_y = ridge_best.predict(test_new_feature2)
print('auc score on test ridge:',roc_auc_score(predict_y,y_test))

auc score on test lasso: 0.7105855624747645
auc score on test ridge: 0.7069014120229666


In [27]:
predict_y = clf_lasso.predict(train_new_feature2)
print('auc score on train lasso:',roc_auc_score(predict_y,y_train))

auc score on train lasso: 0.7425609437491809
