In [1]:
import scipy
import numpy as np
import pandas as pd
import sklearn as sk
from sklearn.svm import SVC
from sklearn.svm import SVR
from sklearn import preprocessing
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2,f_classif
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [61]:
class DataLoader(object):
    """Load the train/test CSV files, encode categorical columns and build feature/label sets.

    Attributes:
        rawData: training frame read from ``train.csv`` with the categorical
            columns listed in ``dataTransformMap`` replaced by numbers. It is
            never modified by ``nosplit``/``split``, so both can be called
            repeatedly.
        testData: test frame read from ``test.csv``, encoded the same way.
        dataTransformMap: ``{column: {category: number}}`` encoding table.
        features: names of the columns used as model inputs.
    """

    def __init__(self,dir='data/'):
        """Read ``train.csv`` and ``test.csv`` from ``dir`` and encode categorical columns.

        :param dir: directory prefix; it is concatenated directly with the file
            name, so it must end with a path separator. (The name shadows the
            ``dir`` builtin but is kept for backward compatibility.)
        """
        self.dataTransformMap={
            'COLLEGE':{
                'zero':0,
                'one':1
            },
            'REPORTED_SATISFACTION':{
                'very_unsat':0,
                'unsat':25,
                'avg':50,
                'sat':75,
                'very_sat':100
            },
            'REPORTED_USAGE_LEVEL':{
                'very_little':0,
                'little':1,
                'avg':2,
                'high':3,
                'very_high':4
            },
            'CONSIDERING_CHANGE_OF_PLAN':{
                'considering':0,
                'actively_looking_into_it':1,
                'perhaps':2,
                'no':3,
                'never_thought':4
            }
        }

        # The nested dict restricts each replacement to its own column, so the
        # shared 'avg' label is encoded differently per column.
        self.rawData=pd.read_csv(dir+'train.csv').replace(to_replace=self.dataTransformMap)
        self.testData=pd.read_csv(dir+'test.csv').replace(to_replace=self.dataTransformMap)

        self.features=['COLLEGE',
                     'INCOME',
                     'OVERAGE',
                     'LEFTOVER',
                     'HOUSE',
                     'HANDSET_PRICE',
                     'OVER_15MINS_CALLS_PER_MONTH',
                     'AVERAGE_CALL_DURATION',
                     'REPORTED_SATISFACTION']

    def augment(self,data,feature_to_aug=['INCOME','HOUSE'],save_path='augedTrainData.csv'):
        """Return ``data`` followed by slightly perturbed copies of every row.

        For each feature in ``feature_to_aug`` every original row contributes
        two extra rows: one with that feature multiplied by 1.0013 and one with
        it multiplied by 0.9987. The result therefore has
        ``len(data) * (1 + 2 * len(feature_to_aug))`` rows, ordered as: the
        originals, then for each feature the (up, down) pair of row 0, row 1, ...

        The input frame is not modified. The previous row-by-row ``.loc``
        implementation wrote the second feature's rows over half of the first
        feature's rows because their index ranges overlapped.

        :param data: frame to augment
        :param feature_to_aug: names of the numeric columns to perturb (the
            default list is never mutated)
        :param save_path: where to write the augmented frame as CSV; ``None``
            skips writing
        :return: new augmented DataFrame with a fresh 0..N-1 index
        """
        base=data.reset_index(drop=True)
        pieces=[base]
        for f in feature_to_aug:
            up=base.copy()
            up[f]=up[f]*1.0013
            down=base.copy()
            down[f]=down[f]*0.9987
            # Stable sort on the shared 0..n-1 index interleaves the two copies
            # so each record's up/down variants stay adjacent.
            pieces.append(pd.concat([up,down]).sort_index(kind='mergesort'))

        auged=pd.concat(pieces,ignore_index=True)
        if save_path is not None:
            auged.to_csv(save_path,index=False)
        return auged

    def normalize(self,df,mean=None,std=None):
        """Z-score ``df``: subtract the mean and divide by the standard deviation.

        :param df: Series or DataFrame of numeric values
        :param mean: statistics to subtract; defaults to ``df.mean()``
        :param std: statistics to divide by; defaults to ``df.std()``.
            Pass the training-set values to scale other data consistently.
        :return: normalized copy of ``df``
        """
        if mean is None:
            mean=df.mean()
        if std is None:
            std=df.std()
        return (df-mean)/std

    def manip(self,tx):
        """Apply the feature transformation used before fitting (currently z-score only)."""
        return self.normalize(tx)

    def nosplit(self):
        """Return the augmented full training set without holding out validation data.

        ``self.rawData`` is left untouched, so calling this twice does not
        augment already-augmented rows.

        :return: x (feature columns), y (the 'LEAVE' column)
        """
        auged=self.augment(self.rawData)
        x=auged[self.features]
        y=auged['LEAVE']

        return x,y

    def split(self,ratio=0.9,random_state=None):
        '''
        Shuffle rawData and split it into a train and a validation set.

        Only the train part is augmented; the validation part consists of
        original, unperturbed rows that do not appear in the train part.
        rawData itself is not modified.

        :param ratio: fraction of rows used for training, 0.9 by default
        :param random_state: seed forwarded to DataFrame.sample for a reproducible shuffle
        :return: trainX,trainy,valX,valy
        '''
        # sample() returns a new frame; the result must be kept, otherwise
        # nothing is shuffled.
        shuffled=self.rawData.sample(frac=1,random_state=random_state).reset_index(drop=True)
        trainNum=int(len(shuffled)*ratio)

        # Slice first, augment second: augmenting the whole frame before taking
        # head/tail puts only perturbed copies in the validation set and no
        # augmented rows in the training set.
        augedT=self.augment(shuffled.iloc[:trainNum])
        val=shuffled.iloc[trainNum:]

        tx=augedT[self.features]
        ty=augedT['LEAVE']
        vx=val[self.features]
        vy=val['LEAVE']

        return tx,ty,vx,vy

In [36]:
dl=DataLoader()
x,y=dl.nosplit()
# Normalise the features returned above. The cell previously passed `tx`
# before it was defined, which only worked with leftover kernel state.
tx=dl.manip(x)
# Preview a few rows instead of dumping the whole frame.
tx.head(10)

Unnamed: 0,COLLEGE,INCOME,OVERAGE,LEFTOVER,HOUSE,HANDSET_PRICE,OVER_15MINS_CALLS_PER_MONTH,AVERAGE_CALL_DURATION,REPORTED_SATISFACTION
0,-1.003680,-1.158746,-1.001651,-0.662601,-0.708066,-1.073888,-0.897275,-0.459340,-0.349152
1,0.996272,-1.058212,-1.001651,-0.401533,1.223074,-0.685786,-0.897275,-0.005781,-0.349152
2,0.996272,-1.270930,1.671673,-0.886373,-0.741079,-0.886851,0.894883,2.035233,-0.349152
3,-1.003680,0.953500,-0.559972,0.344375,1.174118,1.820511,-0.561245,-0.912899,-0.349152
4,0.996272,-1.224379,1.415963,2.283735,-1.059224,-0.699814,1.454932,-1.139679,-0.961434
5,-1.003680,1.280894,-0.257770,0.903805,0.558693,1.100418,-0.561245,-0.912899,-0.349152
6,-1.003680,-0.916664,1.601934,-0.886373,0.816253,-0.933610,0.222824,-0.232561,-0.961434
7,0.996272,0.106703,-1.001651,-0.140465,0.777207,-0.157406,-0.897275,-0.232561,-0.961434
8,-1.003680,-1.009695,-1.001651,-0.625305,-0.863284,-0.938286,-0.897275,-0.232561,1.487695
9,-1.003680,0.612010,1.020776,-0.215056,-1.341534,1.385650,1.902971,-0.459340,1.487695


In [54]:
class Classifier(object):
    """Train and evaluate classifiers on the data served by ``DataLoader``.

    Attributes:
        dataLoader: the ``DataLoader`` that owns the raw/test frames.
        rawTrainX: un-normalized training features (kept for chi2 selection).
        trainX / trainY: z-scored training features and their labels.
        valX / valY: z-scored validation features and labels, or ``None`` when
            the instance was built without a validation split.
        featMean / featStd: per-feature training statistics, reused to scale
            validation and test data.
        clf: the single model fitted by ``fit`` (``None`` until then).
        clfs: the models fitted by ``multiFit``.
    """

    def __init__(self,validate=False):
        """Load the data and normalise it with training-set statistics.

        :param validate: when True, hold out a validation set through
            ``DataLoader.split()`` so ``predict``/``evaluate``/``multiPredit``
            can be used; when False (default) train on everything via
            ``DataLoader.nosplit()``.
        """
        self.dataLoader=DataLoader()
        if validate:
            rawTrainX,self.trainY,rawValX,self.valY=self.dataLoader.split()
        else:
            rawTrainX,self.trainY=self.dataLoader.nosplit()
            rawValX,self.valY=None,None

        self.rawTrainX=rawTrainX
        self.featMean=rawTrainX.mean()
        # A constant column has zero std; dividing by 1 keeps it at 0 instead of NaN.
        self.featStd=rawTrainX.std().replace(0,1.0)
        self.trainX=self._scale(rawTrainX)
        self.valX=None if rawValX is None else self._scale(rawValX)
        self.clf=None
        self.clfs=[]

    def _scale(self,df):
        """Z-score ``df`` with the training mean/std so all data shares one scale."""
        return (df-self.featMean)/self.featStd

    def _requireVal(self):
        """Raise a clear error when no validation set was prepared."""
        if self.valX is None or self.valY is None:
            raise RuntimeError('no validation set: build the classifier with Classifier(validate=True)')

    def _requireFitted(self):
        """Raise a clear error when ``fit`` has not been called yet."""
        if self.clf is None:
            raise RuntimeError('no fitted model: call fit() first')

    def selectFeature(self,num=2):
        """Return the names of the ``num`` best features according to the chi2 score.

        chi2 rejects negative inputs, so it is run on the un-normalized
        features (z-scored data always contains negative values).
        NOTE(review): assumes every raw feature column is non-negative - confirm.

        :param num: number of features to keep
        :return: list of selected column names
        """
        selector=SelectKBest(chi2,k=num)
        selector.fit(self.rawTrainX,self.trainY)
        idxs_selected = selector.get_support(indices=True)
        columns=list(self.rawTrainX.columns)

        return [columns[i] for i in idxs_selected]

    def searchCV(self,testSize=0.25,random_state=0):
        """Grid-search SVC hyper-parameters for macro precision and macro recall.

        The search runs on a development set and the classification report is
        computed on a separate evaluation set: the validation split when the
        instance has one, otherwise a hold-out carved from the training data.

        :param testSize: hold-out fraction used when there is no validation set
        :param random_state: seed for that hold-out split
        :return: ``{score_name: best_params}`` for each tuned score
        """
        if self.valX is not None:
            devX,devY=self.trainX,self.trainY
            evalX,evalY=self.valX,self.valY
        else:
            # NOTE(review): trainX is augmented here, so near-duplicate rows can
            # land on both sides of this split; prefer Classifier(validate=True).
            devX,evalX,devY,evalY=train_test_split(
                self.trainX,self.trainY,test_size=testSize,random_state=random_state)

        tuned_parameters = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4],
                     'C': [1, 10, 100, 1000]},
                    {'kernel': ['linear'], 'C': [1, 10, 100, 1000]}]

        scores = ['precision', 'recall']
        best={}

        for score in scores:
            print("# Tuning hyper-parameters for %s" % score)
            print()

            clf = GridSearchCV(SVC(), tuned_parameters, cv=5,
                               scoring='%s_macro' % score)
            clf.fit(devX, devY)

            print("Best parameters set found on development set:")
            print()
            print(clf.best_params_)
            print()
            print("Grid scores on development set:")
            print()
            means = clf.cv_results_['mean_test_score']
            stds = clf.cv_results_['std_test_score']
            for mean, std, params in zip(means, stds, clf.cv_results_['params']):
                print("%0.3f (+/-%0.03f) for %r"
                      % (mean, std * 2, params))
            print()

            print("Detailed classification report:")
            print()
            print("The model is trained on the full development set.")
            print("The scores are computed on the full evaluation set.")
            print()
            y_true, y_pred = evalY, clf.predict(evalX)
            print(classification_report(y_true, y_pred))
            print()
            best[score]=clf.best_params_

        return best

    def multiFit(self):
        """Fit an SVC, a k-NN, a random forest and a decision tree on the training data."""
        # probability=True is required for SVC.predict_proba, used by multiPredit.
        self.clfs=[SVC(probability=True),
        KNeighborsClassifier(n_neighbors=200),
        RandomForestClassifier(max_depth=100, random_state=0),
        DecisionTreeClassifier(random_state=0)]
        for c in self.clfs:
            c.fit(self.trainX,self.trainY)

    def multiPredit(self):
        """Sum the class probabilities predicted by every model from ``multiFit``.

        :return: array of shape (n_validation_rows, n_classes); columns follow
            the models' shared ``classes_`` order. ``argmax`` over axis 1 gives
            the ensemble vote.
        :raises RuntimeError: without a validation set or before ``multiFit``
        """
        self._requireVal()
        if not self.clfs:
            raise RuntimeError('no fitted models: call multiFit() first')
        res=np.zeros((self.valX.shape[0],len(self.clfs[0].classes_)))
        for c in self.clfs:
            res+=c.predict_proba(self.valX)
        return res

    def fit(self,t='svc'):
        """Fit a single model on the (already normalized) training data.

        :param t: 'svc', 'knn', 'tree' or 'rf'
        :raises ValueError: for any other ``t``; the previous model is kept
        """
        if t=='svc':
            # Only the non-default settings are spelled out.
            clf=SVC(C=1, gamma='auto', kernel='rbf', probability=True)
        elif t=='knn':
            clf=KNeighborsClassifier(n_neighbors=200)
        elif t=='tree':
            clf=DecisionTreeClassifier(random_state=0)
        elif t=='rf':
            clf=RandomForestClassifier(max_depth=100, random_state=0)
        else:
            raise ValueError("unknown classifier type %r; expected 'svc', 'knn', 'tree' or 'rf'" % (t,))
        clf.fit(self.trainX,self.trainY)
        self.clf=clf

    def predict(self):
        """Predict labels for the validation set with the model from ``fit``.

        :raises RuntimeError: without a validation set or before ``fit``
        """
        self._requireVal()
        self._requireFitted()
        predY=self.clf.predict(self.valX)
        return predY

    def evaluate(self,predY):
        """Return the accuracy of ``predY`` against the validation labels.

        :raises RuntimeError: without a validation set
        """
        self._requireVal()
        acc=sk.metrics.accuracy_score(self.valY,predY)
        return acc

    def makeTest(self):
        """Predict 'LEAVE' for the test set.

        The test features are scaled with the training mean/std so the model
        sees them on the same scale it was fitted on.

        :return: DataFrame with columns 'ID' (test frame index) and 'LEAVE'
        :raises RuntimeError: before ``fit``
        """
        self._requireFitted()
        d=self._scale(self.dataLoader.testData[self.dataLoader.features])
        predLeave=self.clf.predict(d)
        res=pd.DataFrame({
            'ID':self.dataLoader.testData.index,
            'LEAVE':predLeave
        })

        return res

In [62]:
# Build the classifier (its constructor creates a DataLoader and prepares the
# training features/labels), then fit the RBF-kernel SVC variant.
c=Classifier()
c.fit('svc')

In [63]:
# Predict LEAVE for the test set and write the resulting ID/LEAVE table to
# submit.csv without the DataFrame index column.
r=c.makeTest()
r.to_csv('submit.csv',index=False)