In [100]:
import scipy
import numpy as np
import pandas as pd
import sklearn as sk
from sklearn.svm import SVC
from sklearn import preprocessing
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2,f_classif
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

In [92]:
class DataLoader(object):
    """Load the train/test CSV files and serve feature/label splits.

    Attributes:
        rawData: labelled frame read from ``train.csv``; the string categories
            listed in ``dataTransformMap`` are replaced by numbers.
        testData: frame read from ``test.csv`` with the same replacement applied.
        dataTransformMap: per-column mapping from category string to number.
        features: names of the columns handed to the model as inputs.
    """

    def __init__(self, dir='data/'):
        """Read both CSV files from ``dir`` and encode the categorical columns.

        :param dir: directory that holds ``train.csv`` and ``test.csv``;
                    a trailing slash is optional
        """
        # Function-scope import: the path join needs no new file-level import.
        import os

        self.rawData = pd.read_csv(os.path.join(dir, 'train.csv'))
        self.testData = pd.read_csv(os.path.join(dir, 'test.csv'))
        # Nested mapping {column: {old value: new value}}; the same string
        # (e.g. 'avg') may map to different numbers in different columns.
        self.dataTransformMap = {
            'COLLEGE': {
                'zero': 0,
                'one': 1
            },
            'REPORTED_SATISFACTION': {
                'very_unsat': 0,
                'unsat': 25,
                'avg': 50,
                'sat': 75,
                'very_sat': 100
            },
            'REPORTED_USAGE_LEVEL': {
                'very_little': 0,
                'little': 1,
                'avg': 2,
                'high': 3,
                'very_high': 4
            },
            'CONSIDERING_CHANGE_OF_PLAN': {
                'considering': 0,
                'actively_looking_into_it': 1,
                'perhaps': 2,
                'no': 3,
                'never_thought': 4
            }
        }

        self.rawData = self.rawData.replace(to_replace=self.dataTransformMap)
        self.testData = self.testData.replace(to_replace=self.dataTransformMap)

        # Hand-picked input columns. (An earlier variant used every column of
        # rawData except the 'LEAVE' label.)
        self.features = ['COLLEGE',
                         'INCOME',
                         'OVERAGE',
                         'LEFTOVER',
                         'HOUSE',
                         'HANDSET_PRICE',
                         'OVER_15MINS_CALLS_PER_MONTH',
                         'AVERAGE_CALL_DURATION',
                         'REPORTED_SATISFACTION']

    def augment(self, data, feature_to_aug=('INCOME', 'HOUSE'),
                save_path='augedTrainData.csv'):
        """Return ``data`` plus jittered copies of every row.

        For each column in ``feature_to_aug`` two copies of the whole frame are
        appended: one with that column scaled by 1.013 and one scaled by 0.987.
        The result therefore has ``len(data) * (1 + 2 * len(feature_to_aug))``
        rows and a fresh 0..N-1 index. ``data`` itself is not modified.

        :param data: frame to augment
        :param feature_to_aug: names of the numeric columns to jitter
        :param save_path: CSV file the augmented frame is written to;
                          pass None to skip writing
        :return: the augmented frame
        """
        pieces = [data]
        for feature in feature_to_aug:
            for factor in (1.013, 0.987):
                jittered = data.copy()
                jittered[feature] = jittered[feature] * factor
                pieces.append(jittered)

        # A single concat avoids row-by-row growth and guarantees that the
        # copies made for one feature never overwrite those of another.
        augmented = pd.concat(pieces, ignore_index=True)

        if save_path is not None:
            augmented.to_csv(save_path, index=False)
        return augmented

    def normalize(self, df):
        """Standardise ``df`` to zero mean and unit (sample) standard deviation.

        Columns with zero standard deviation are only centred, so they become
        0 instead of NaN.

        :param df: numeric DataFrame (column-wise) or Series
        :return: object of the same shape as ``df``
        """
        spread = df.std()
        if isinstance(spread, pd.Series):
            spread = spread.replace(0, 1)
        elif spread == 0:
            spread = 1
        return (df - df.mean()) / spread

    def split(self, ratio=0.9, random_state=None):
        '''
        split rawData into train and val set; the rows are shuffled on every
        call (rawData itself is left in its original order)
        :param ratio: fraction of rows that goes to the train set, 0.9 by default
        :param random_state: optional seed forwarded to DataFrame.sample for a
                             reproducible shuffle; None gives a new order each call
        :return: trainX,trainy,valX,valy
        '''
        n = len(self.rawData)
        # sample() returns a new frame, so the result has to be kept;
        # calling it without assignment leaves the data unshuffled.
        shuffled = self.rawData.sample(frac=1, random_state=random_state)
        shuffled = shuffled.reset_index(drop=True)
        trainNum = int(n * ratio)

        # Augmentation through self.augment(...) and normalisation are
        # deliberately not applied here.
        train_part = shuffled.iloc[:trainNum]
        val_part = shuffled.iloc[trainNum:]

        tx = train_part[self.features]
        ty = train_part['LEAVE']
        vx = val_part[self.features]
        vy = val_part['LEAVE']

        return tx, ty, vx, vy

# dl=DataLoader()
# dl.rawData

In [95]:
class Classifier(object):
    """SVM classifier built on a DataLoader train/validation split.

    Attributes:
        dataLoader: the DataLoader that supplies the data.
        trainX, trainY: training features and labels.
        valX, valY: validation features and labels.
        selected_feature: column names chosen by ``fit`` (set by ``fit``).
        clf: the fitted SVC (set by ``fit``).
    """

    def __init__(self):
        """Load the data and take one train/validation split."""
        self.dataLoader = DataLoader()
        self.trainX, self.trainY, self.valX, self.valY = self.dataLoader.split()
        # Keep the validation columns in the same order as the training columns.
        self.valX = self.valX[self.trainX.columns]

    def _standardize(self, df):
        """Scale ``df`` with the mean/std of the matching training columns.

        Validation and test rows are scaled with the training statistics so
        that every set is mapped into the same feature space the model was
        trained in. Zero-variance columns are only centred.
        """
        cols = list(df.columns)
        mean = self.trainX[cols].mean()
        std = self.trainX[cols].std().replace(0, 1)
        return (df - mean) / std

    def _require_fitted(self):
        """Raise a descriptive AttributeError when ``fit`` has not been run."""
        if not hasattr(self, 'clf') or not hasattr(self, 'selected_feature'):
            raise AttributeError(
                "Classifier is not fitted yet; call fit() before predict()/makeTest().")

    def selectFeature(self, num=2):
        """Pick the ``num`` best training columns by chi-squared score.

        :param num: number of columns to keep (or 'all'); values larger than
                    the number of available columns are clamped
        :return: list of the selected column names
        """
        n_available = self.trainX.shape[1]
        k = num if isinstance(num, str) else min(num, n_available)
        selector = SelectKBest(chi2, k=k)
        # Only the support mask is needed, so fit() is enough.
        selector.fit(self.trainX, self.trainY)
        idxs_selected = selector.get_support(indices=True)

        return [self.trainX.columns[i] for i in idxs_selected]

    def searchCV(self):
        """Grid-search SVC hyper-parameters and print a validation report.

        Runs one 5-fold GridSearchCV per scoring metric (macro precision and
        macro recall) on all training columns, then reports the refit best
        model on the validation split.
        """
        x = self._standardize(self.trainX)
        y = self.trainY
        # Validate on the same columns the search was trained on.
        valx = self._standardize(self.valX)

        tuned_parameters = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4],
                             'C': [1, 10, 100, 1000]},
                            {'kernel': ['linear'], 'C': [1, 10, 100, 1000]}]

        scores = ['precision', 'recall']

        for score in scores:
            print("# Tuning hyper-parameters for %s" % score)
            print()

            clf = GridSearchCV(SVC(), tuned_parameters, cv=5,
                               scoring='%s_macro' % score)
            clf.fit(x, y)

            print("Best parameters set found on development set:")
            print()
            print(clf.best_params_)
            print()
            print("Grid scores on development set:")
            print()
            means = clf.cv_results_['mean_test_score']
            stds = clf.cv_results_['std_test_score']
            for mean, std, params in zip(means, stds, clf.cv_results_['params']):
                print("%0.3f (+/-%0.03f) for %r"
                      % (mean, std * 2, params))
            print()

            print("Detailed classification report:")
            print()
            print("The model is trained on the full development set.")
            print("The scores are computed on the full evaluation set.")
            print()
            y_true, y_pred = self.valY, clf.predict(valx)
            print(classification_report(y_true, y_pred))
            print()

    def fit(self, feat_num=11):
        """Select features and train the RBF-kernel SVC on the training split.

        :param feat_num: number of features to keep; clamped to the number of
                         available columns by ``selectFeature``
        """
        self.selected_feature = self.selectFeature(feat_num)
        x = self._standardize(self.trainX[self.selected_feature])
        y = self.trainY

        self.clf = SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
                       decision_function_shape='ovr', degree=3, gamma='auto',
                       kernel='rbf', max_iter=-1, probability=True,
                       random_state=None, shrinking=True, tol=0.001,
                       verbose=False)

        self.clf.fit(x, y)

    def predict(self, x=None):
        """Predict labels with the fitted model.

        :param x: raw (un-normalised) feature frame containing at least the
                  selected columns; defaults to the validation split
        :return: array of predicted labels
        :raises AttributeError: if ``fit`` has not been called
        """
        self._require_fitted()
        if x is None:
            x = self.valX
        x = self._standardize(x[self.selected_feature])
        predY = self.clf.predict(x)
        return predY

    def evaluate(self, predY):
        """Return the accuracy of ``predY`` against the validation labels."""
        acc = sk.metrics.accuracy_score(self.valY, predY)
        return acc

    def makeTest(self):
        """Predict the test file and return a frame with ID and LEAVE columns.

        ``ID`` is the row index of the test frame.

        :raises AttributeError: if ``fit`` has not been called
        """
        self._require_fitted()
        d = self._standardize(self.dataLoader.testData[self.selected_feature])
        predLeave = self.clf.predict(d)
        res = pd.DataFrame({
            'ID': self.dataLoader.testData.index,
            'LEAVE': predLeave
        })

        return res

In [None]:
c=Classifier()
# Fit right away so that `c.clf` and `c.selected_feature` exist for the cells
# below (makeTest, selected_feature) when the notebook is run top to bottom,
# even if the long-running grid search is interrupted.
c.fit(feat_num=len(c.dataLoader.features))
c.searchCV()
# res=c.predict()
# acc=c.evaluate(res)
# print(acc)


# Tuning hyper-parameters for precision



In [96]:
# Predict the test file with the fitted classifier and write the submission CSV.
r = c.makeTest()
r.to_csv('submit4.csv', index=False)

In [91]:
# Display the feature names chosen by the most recent c.fit() call.
c.selected_feature

['COLLEGE',
 'INCOME',
 'OVERAGE',
 'LEFTOVER',
 'HOUSE',
 'HANDSET_PRICE',
 'OVER_15MINS_CALLS_PER_MONTH',
 'AVERAGE_CALL_DURATION',
 'REPORTED_SATISFACTION']