In [4]:
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
#from sklearn_pandas import DataFrameMapper
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, make_union
from sklearn.base import BaseEstimator, TransformerMixin

In [5]:
# Read the sales win/loss CSV into a DataFrame (the path is relative to the working directory).
data = pd.read_csv('../datasets/sales_loss_win_data.csv')

In [6]:
# Show the frame's dimensions as (rows, columns).
data.shape

(78025, 19)

## Dimensionality reduction using PCA
- PCA is sensitive to the scale of the data, so the continuous features must be standardized first (mean = 0, variance = 1).

In [7]:
# Preview the first rows of the raw (not yet scaled) data.
data.head()

Unnamed: 0,Opportunity Number,Supplies Subgroup,Supplies Group,Region,Route To Market,Elapsed Days In Sales Stage,Opportunity Result,Sales Stage Change Count,Total Days Identified Through Closing,Total Days Identified Through Qualified,Opportunity Amount USD,Client Size By Revenue,Client Size By Employee Count,Revenue From Client Past Two Years,Competitor Type,Ratio Days Identified To Total Days,Ratio Days Validated To Total Days,Ratio Days Qualified To Total Days,Deal Size Category
0,1641984,2,0,3,0,76,1,13,104,101,0,5,5,0,2,0.69636,0.113985,0.154215,1
1,1658010,2,0,4,2,63,0,2,163,163,0,3,5,0,2,0.0,1.0,0.0,1
2,1674737,5,2,4,2,24,1,7,82,82,7750,1,1,0,2,1.0,0.0,0.0,1
3,1675224,8,2,1,2,16,0,5,124,124,0,1,1,0,0,1.0,0.0,0.0,1
4,1689785,2,0,4,2,69,0,11,91,13,69756,1,1,0,2,0.0,0.141125,0.0,4


In [8]:
# Columns treated as continuous; these are the ones passed to StandardScaler below.
# NOTE(review): 'Opportunity Number' looks like an identifier rather than a measurement —
# confirm it really belongs in this list.
continuous_features = [
    'Opportunity Number',
    'Elapsed Days In Sales Stage',
    'Sales Stage Change Count',
    'Total Days Identified Through Closing',
    'Total Days Identified Through Qualified',
    'Opportunity Amount USD',
    'Revenue From Client Past Two Years',
    'Ratio Days Identified To Total Days',
    'Ratio Days Validated To Total Days',
    'Ratio Days Qualified To Total Days',
]

In [9]:
# Standardize the continuous columns in place (zero mean, unit variance per column);
# every other column of `data` is left as loaded.
# NOTE(review): the scaler is fit on every row of `data`, i.e. before the train/test split
# performed later in this notebook, so held-out rows contribute to the scaling statistics —
# confirm this is acceptable for the reported accuracy.
data[continuous_features] = StandardScaler().fit_transform(data[continuous_features])

In [10]:
# Preview the same rows after scaling to check that the continuous columns were transformed.
data.head()

Unnamed: 0,Opportunity Number,Supplies Subgroup,Supplies Group,Region,Route To Market,Elapsed Days In Sales Stage,Opportunity Result,Sales Stage Change Count,Total Days Identified Through Closing,Total Days Identified Through Qualified,Opportunity Amount USD,Client Size By Revenue,Client Size By Employee Count,Revenue From Client Past Two Years,Competitor Type,Ratio Days Identified To Total Days,Ratio Days Validated To Total Days,Ratio Days Qualified To Total Days,Deal Size Category
0,-5.698911,2,0,3,0,1.218888,1,6.708555,5.216636,5.113099,-0.688173,5,5,-0.326016,2,1.351564,-0.835417,-0.09061,1
1,-5.683718,2,0,4,2,0.729898,0,-0.638332,8.743344,8.856497,-0.688173,3,5,-0.326016,2,-0.556365,1.141967,-0.543809,1
2,-5.667861,5,2,4,2,-0.737071,1,2.701162,3.901593,3.965928,-0.629973,1,1,-0.326016,2,2.183495,-1.089806,-0.543809,1
3,-5.667399,8,2,1,2,-1.037988,0,1.365364,6.41213,6.501779,-0.688173,1,1,-0.326016,0,2.183495,-1.089806,-0.543809,1
4,-5.653595,2,0,4,2,0.955586,0,5.372757,4.439565,-0.200112,-0.164323,1,1,-0.326016,2,-0.556365,-0.774847,-0.543809,4


In [11]:
# Write the scaled frame to disk; index=False keeps the row index out of the CSV.
data.to_csv('../datasets/dimensionality_reduction_data.csv', index= False)

## PCA

In [12]:
# Project every column except the target onto 15 principal components.
pca = PCA(n_components=15)
x = data.loc[:, data.columns != 'Opportunity Result'].values  # feature matrix for PCA (target excluded)

principalComponents = pca.fit_transform(x)

# Name the components pc1..pcN from the actual output width instead of a hand-written list,
# so the labels cannot drift out of sync with n_components.
# Reusing data.index keeps the rows aligned with `data` when the two frames are concatenated
# column-wise, even if `data` no longer carries a default 0..n-1 index.
pcDf = pd.DataFrame(
    data=principalComponents,
    columns=['pc%d' % i for i in range(1, principalComponents.shape[1] + 1)],
    index=data.index,
)

In [13]:
# Attach the target column to the principal components side by side (axis=1 matches rows on index).
result = pd.concat([pcDf, data[['Opportunity Result']]], axis = 1)

In [14]:
# Preview the principal-component frame with the target column appended.
result.head()

Unnamed: 0,pc1,pc2,pc3,pc4,pc5,pc6,pc7,pc8,pc9,pc10,pc11,pc12,pc13,pc14,pc15,Opportunity Result
0,-2.331817,5.630652,-2.421983,8.963763,4.172707,3.43295,1.694035,-0.15695,-0.897853,-0.145197,0.688405,0.861134,2.826876,1.897395,1.008858,1
1,-2.387092,4.868848,-1.474488,9.506732,7.581599,-0.263801,2.926307,0.135487,-1.253711,1.163462,0.337783,-1.729125,-4.681807,-0.007121,1.071709,0
2,0.781691,1.107016,-0.060734,5.244845,5.077192,2.457572,2.840989,1.983594,0.137841,1.64413,-0.51898,-0.738892,1.278818,0.462194,0.949467,1
3,3.761481,2.080366,-3.08219,5.177122,7.509574,2.821723,3.748106,1.925204,-0.653684,1.604483,-0.800466,-0.785217,-0.876273,-1.786587,0.994171,0
4,-2.378475,3.067172,0.522777,3.867247,3.83391,1.332478,-0.673534,3.2819,0.406988,1.128204,0.736462,1.013347,3.322383,1.491952,-0.132656,0


## Pipeline: logistic regression model

In [15]:
# Separate the principal-component features from the label series.
xVar = result.drop(columns='Opportunity Result')
yVar = result['Opportunity Result']

In [16]:
# Hold out 20% of the rows for evaluation. The seed is fixed so that the split — and therefore
# the score computed from it — is the same on every Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(xVar, yVar, test_size=0.2, random_state=42)
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(62420, 15) (62420,)
(15605, 15) (15605,)


In [17]:
class ColumnSelector(BaseEstimator, TransformerMixin):
    """Transformer that passes through a fixed selection of DataFrame columns.

    Parameters
    ----------
    columns : list of column labels (a single label is also accepted)
        Label(s) looked up on the DataFrame handed to ``transform``.
    """

    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        """Nothing is learned; return ``self`` so the selector can sit in a pipeline."""
        return self

    def transform(self, X):
        """Return ``X[self.columns]``.

        Raises
        ------
        AssertionError
            If ``X`` is not a pandas DataFrame.
        KeyError
            If any requested column is absent. The message lists the missing
            columns once each, in the order they were requested.
        """
        assert isinstance(X, pd.DataFrame), (
            "ColumnSelector expects a pandas DataFrame, got %s" % type(X).__name__
        )

        try:
            return X[self.columns]
        except KeyError as err:
            # A lone string label must be reported whole, not split into characters.
            if isinstance(self.columns, str):
                requested = [self.columns]
            else:
                requested = list(self.columns)
            # dict.fromkeys de-duplicates while keeping request order, so the message is deterministic.
            cols_error = [col for col in dict.fromkeys(requested) if col not in X.columns]
            raise KeyError("The DataFrame does not include the columns: %s" % cols_error) from err

## previous model accuracy was 0.77

In [18]:
# Two-stage pipeline: a feature union holding a single selector that forwards every column of
# xVar, followed by a seeded logistic regression. `fit` returns the pipeline itself, so
# construction and training are chained.
pipeline = Pipeline(
    steps=[
        ("features", make_union(ColumnSelector(xVar.columns.tolist()))),
        ("model", LogisticRegression(random_state=42)),
    ]
).fit(X_train, y_train)

# Score on the held-out rows; as the last expression, this is the value the cell displays.
pipeline.score(X_test, y_test)

0.8272348606215957