In [1]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.cross_validation import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.grid_search import GridSearchCV
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import accuracy_score
from sklearn.cross_validation import train_test_split

In [2]:
# Load both CSV files, using the first column of each file as the row index.
data_trn, data_tst = (
    pd.read_csv(csv_name, index_col=0) for csv_name in ('train.csv', 'test.csv')
)

In [3]:
# Show the (rows, columns) shape of the training frame, then of the test frame.
print("\n".join(str(frame.shape) for frame in (data_trn, data_tst)))

(25766, 256)
(11042, 255)


Data Engineering

In [4]:
# Read the datasets and prepare the feature frames.
#
# Produces: train / test (feature frames), y_train (target), y_id (test ids),
# promotions_train / promotions_test (the promotion columns split off for PCA).

recodecustomer = {'Old': 1, 'New': 0}
recodenew = {'Enable': 1, 'Not-Enable': 0}


def recode_categoricals(frame):
    """Replace the string-coded status/flag columns of `frame` with 0/1 codes.

    The frame is modified in place and also returned. Values that are not a
    key of the mapping (including missing values) come out as NaN.
    """
    frame['Cust_status'] = frame['Cust_status'].map(recodecustomer)
    for flag_column in ('Trans24', 'Trans25', 'Trans26', 'Trans27'):
        frame[flag_column] = frame[flag_column].map(recodenew)
    return frame


train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

# Recode BOTH frames before imputing. Previously `test` was imputed first and
# recoded afterwards, so missing values in its recoded columns were never
# filled, while the same columns in `train` were.
train = recode_categoricals(train)
test = recode_categoricals(test)
print(train['Trans24'].head(2))

# Keep the target and the test ids before any columns are dropped.
y_train = train["Active_Customer"]
y_id = test["Cust_id"]

# Fill remaining missing values with the mean of the respective column.
# NOTE(review): `test` is imputed with its own column means; consider reusing
# the training means so both frames share one imputation - confirm intent.
train = train.where(pd.notnull(train), train.mean(), axis='columns')
test = test.where(pd.notnull(test), test.mean(), axis='columns')

# Promotion columns that are removed from the feature frames and kept aside
# (they are compressed separately with PCA). Defined once and reused.
colsToDrop = [
    "Promotion5", "Promotion6", "Promotion7", "Promotion8", "Promotion9", "Promotion10",
    "Promotion11", "Promotion18", "Promotion25", "Promotion32", "Promotion39", "Promotion46",
    "Promotion12", "Promotion19", "Promotion26", "Promotion33", "Promotion40", "Promotion47",
    "Promotion13", "Promotion20", "Promotion27", "Promotion34", "Promotion41", "Promotion48",
    "Promotion14", "Promotion21", "Promotion28", "Promotion35", "Promotion42",
    "Promotion15", "Promotion22", "Promotion29", "Promotion36", "Promotion43",
    "Promotion16", "Promotion23", "Promotion30", "Promotion37", "Promotion44",
    "Promotion17", "Promotion24", "Promotion31", "Promotion38", "Promotion45",
]
promotions_train = pd.DataFrame(train, columns=colsToDrop)
promotions_test = pd.DataFrame(test, columns=colsToDrop)

# `train` loses its id, its target and the promotion columns.
# `test` loses only the promotion columns; its Cust_id column stays in place.
train = train.drop(["Cust_id", "Active_Customer"] + colsToDrop, axis=1)
test = test.drop(colsToDrop, axis=1)



0    1
1    1
Name: Trans24, dtype: int64


Performing PCA on Promotions data

In [7]:
from sklearn import decomposition
from sklearn.preprocessing import normalize

# Compress the promotion columns into a single principal component that is
# appended to the feature frames as `reduced_columns`.
pca = decomposition.PCA(n_components=1)

# Scale every row to unit L2 norm before the decomposition.
promotions_train_new = normalize(promotions_train, norm='l2', axis=1, copy=True)
promotions_test_new = normalize(promotions_test, norm='l2', axis=1, copy=True)

# Fit the projection on the training rows only and reuse it for the test rows.
# Refitting on the test rows (as before) gives a different projection, so the
# train and test values of `reduced_columns` would not be comparable.
promotions_train_reduced = pca.fit_transform(promotions_train_new)
promotions_test_reduced = pca.transform(promotions_test_new)

# The PCA output has shape (n_rows, 1); flatten it to assign a plain column.
train['reduced_columns'] = promotions_train_reduced.ravel()
test['reduced_columns'] = promotions_test_reduced.ravel()

In [9]:
# Hold out 33% of the labelled rows for evaluation; the split is seeded.
X_train, X_test, Y_train, Y_test = train_test_split(
    train, y_train, test_size=0.33, random_state=42
)

# Plain numpy views of the split for the estimators below.
# `.values` replaces `.as_matrix()`, which later pandas versions removed.
train_matrix = X_train.values
test_matrix = X_test.values
y_train_matrix = Y_train.values

# print(...) with a single argument behaves the same on Python 2 and 3.
print(train_matrix.shape)
print(test_matrix.shape)
print(y_train_matrix.shape)

(17263, 212)
(8503, 212)
(17263,)


Using Logistic Regression

In [10]:
# Class-weighted logistic regression fitted on the training matrix.
logistic = LogisticRegression(class_weight="balanced").fit(train_matrix, y_train_matrix)

# Label predictions for the training matrix and for the held-out matrix.
train_test = logistic.predict(train_matrix)
y_test = logistic.predict(test_matrix)


In [11]:
# Writing a submission file from this model is disabled; only the accuracy of
# the logistic-regression predictions against the held-out labels is shown.
accuracy_score(y_true=Y_test, y_pred=y_test)

0.66270727978360577

Using XGBoost classifier

In [13]:
# Boosted-tree classifier fitted on the training matrix.
gbm1 = xgb.XGBClassifier(
    max_depth=5,
    n_estimators=600,
    learning_rate=0.025,
)
gbm1 = gbm1.fit(train_matrix, y_train_matrix)

In [14]:
# Score the boosted model on the held-out matrix and on its own training matrix;
# the gap between the two numbers indicates how much the model overfits.
predictions = gbm1.predict(test_matrix)
train_predictions = gbm1.predict(train_matrix)

# Format into one string: passing several arguments to print(...) shows a
# tuple repr under Python 2. Also fixes the "testin" typo in the label.
print("testing accuracy: {}".format(accuracy_score(Y_test, predictions)))
print("training accuracy: {}".format(accuracy_score(Y_train, train_predictions)))


('testin accuracy', 0.67611431259555455)
('training accuracy', 0.77425708161964901)


In [99]:
# Build the submission from the rows of test.csv. `predictions` holds labels
# for the held-out part of train.csv, so it neither matches `y_id` in length
# nor describes the customers being submitted.
# Select the same columns, in the same order, that the model was fitted on
# (`test` additionally carries Cust_id).
submission_features = test[train.columns].values
submission_predictions = gbm1.predict(submission_features)

preds = pd.DataFrame({"Cust_id": y_id, "Active_Customer": submission_predictions})
preds = preds.set_index('Cust_id')
preds.to_csv('third.csv')

Using Random Forest Classifier

In [70]:
# Random forest (entropy split criterion) on the training matrix.
# RandomForestClassifier is already imported in the first cell.
# A fixed random_state makes the forest reproducible across runs, matching the
# seeded train/test split.
model_RF = RandomForestClassifier(criterion="entropy", random_state=42)
model_RF.fit(train_matrix, y_train_matrix)

# Label predictions for the held-out matrix.
predicted_randomforest = model_RF.predict(test_matrix)

Doing PCA on the full feature matrix

In [134]:
from sklearn import decomposition
from sklearn.preprocessing import normalize

# Scale every row of both split matrices to unit L2 norm.
train_matrix_nor = normalize(train_matrix, norm='l2', axis=1, copy=True)
test_matrix_nor = normalize(test_matrix, norm='l2', axis=1, copy=True)

# Keep 20 principal components. The projection is fitted on the training
# matrix and then applied unchanged to the held-out matrix.
pca = decomposition.PCA(n_components=20)
train_reduced = pca.fit_transform(train_matrix_nor)
test_reduced = pca.transform(test_matrix_nor)

In [135]:
#Import Library
from sklearn import svm
#Assumed you have, X (predictor) and Y (target) for training data set and x_test(predictor) of test_dataset
# Create SVM classification object 
model = svm.SVC() 
# there is various option associated with it, this is simple for classification. You can refer link, for mo# re detail.
# Train the model using the training sets and check score
model.fit(train_reduced,y_train_matrix)
#model.score(X, y)
#Predict Output
predicted= model.predict(test_reduced)

In [136]:
# Accuracy of the SVC predictions against the held-out labels.
accuracy_score(y_true=Y_test, y_pred=predicted)

0.64647771374808893

Using XGBoost after PCA

In [137]:
# Smaller boosted-tree model fitted on the PCA-reduced training matrix.
gbm = xgb.XGBClassifier(
    max_depth=3,
    n_estimators=300,
    learning_rate=0.05,
)
gbm = gbm.fit(train_reduced, y_train_matrix)

# Note: this rebinds `predictions`, which an earlier cell also assigns.
predictions = gbm.predict(test_reduced)
accuracy_score(y_true=Y_test, y_pred=predictions)

0.66329530753851584

In [139]:
# Class-weighted logistic regression fitted on the PCA-reduced training matrix.
logistic1 = LogisticRegression(class_weight="balanced").fit(train_reduced, y_train_matrix)

# Note: this rebinds `y_test`, which an earlier cell also assigns.
y_test = logistic1.predict(test_reduced)

accuracy_score(y_true=Y_test, y_pred=y_test)

0.65224038574620724

In [140]:
# Random forest (entropy split criterion) on the PCA-reduced training matrix.
# RandomForestClassifier is already imported in the first cell.
# A fixed random_state makes the forest, and the accuracy reported for it,
# reproducible across runs.
model1 = RandomForestClassifier(criterion="entropy", random_state=42)
model1.fit(train_reduced, y_train_matrix)

# Label predictions for the PCA-reduced held-out matrix.
predicted_randomforest = model1.predict(test_reduced)

In [141]:
# Accuracy of the random-forest predictions against the held-out labels.
accuracy_score(y_true=Y_test, y_pred=predicted_randomforest)

0.63271786428319421

Voting Classifier

In [144]:

from sklearn.ensemble import VotingClassifier

# Majority-vote ensemble of the boosted trees, the logistic regression and the
# SVC, fitted on the raw (non-PCA) training matrix.
# NOTE(review): VotingClassifier is documented to fit clones of the listed
# estimators, so the earlier fits of gbm1 / logistic / model should not carry
# over - confirm against the installed scikit-learn version.
eclf1 = VotingClassifier(
    estimators=[('GB', gbm1), ('LR', logistic), ('SVM', model)],
    voting='hard',
).fit(train_matrix, y_train_matrix)

# Ensemble label predictions for the training and the held-out matrices.
predicted_train_ensemble = eclf1.predict(train_matrix)
predicted_ensemble = eclf1.predict(test_matrix)

In [145]:
# Ensemble accuracy on the held-out labels, then on the training labels.
# print(...) with a single argument behaves the same on Python 2 and 3; the
# bare `print x` statement is a syntax error on Python 3.
print(accuracy_score(Y_test, predicted_ensemble))
print(accuracy_score(Y_train, predicted_train_ensemble))

0.659179113254
0.665295719168
