In [1]:
%matplotlib inline

import numpy as np
import pandas as pd
import os, errno
import re
from sklearn import preprocessing
from sklearn import linear_model
from sklearn.externals import joblib

# Root of the project tree, relative to the notebook's working directory.
base = '../../'

# The cleaned survey responses exist both as CSV and as a Stata file.
raw_data = os.path.join(base, 'Data/Clean/Survey/ResponseData.csv')
raw_data_stata = os.path.join(base, 'Data/Clean/Survey/ResponseData.dta')

# The Stata file is the one actually loaded; the CSV could be read instead
# with pd.read_csv(raw_data). order_categoricals=True asks pandas to return
# labelled Stata variables as *ordered* categoricals.
data = pd.read_stata(
    raw_data_stata,
    order_categoricals=True,
)

In [42]:
# Outcome variable for the models below.
y = data['Finished']

# Feature columns are selected by name. The patterns are applied in this
# order, and within a pattern the columns keep their order in `data`.
# NOTE(review): '^.*Dummy' matches any column name containing "Dummy",
# which includes GenericDummy / NonProfitDummy / DropDummy once the cell
# that creates them has been run. Re-running this cell after that one
# therefore changes the feature set.
feature_patterns = [
    '^.*Dummy',
    '^LegalType_[0-9]+',
    '^CompanyProfile_[0-9]+',
    '^ConnectType_[0-9]+',
    '^PrimaryIndustry_[0-9]+',
    '^TimeGroups_[0-9]+',
    '^Strata_[0-9]+',
]

cols = []
for pattern in feature_patterns:
    cols.extend(name for name in data.columns.tolist() if re.match(pattern, name))
cols.append('LogLifetimeVolume')

X = data[cols]

# One row per feature name, in the same order as the columns of X, so that
# fitted coefficients can later be joined on by position.
CoefNames = pd.DataFrame({'Variable': cols})

# Standardise the features column-wise before fitting penalised models.
X_scaled = preprocessing.scale(X)

In [43]:
# L1-penalised logistic regression at a single, hand-picked C.
reg = linear_model.LogisticRegression(
    penalty='l1',
    C=.5,
    solver='liblinear',
    random_state=0,
)
reg.fit(X_scaled, y)

In [68]:
# L1-penalised logistic regression with C chosen by 5-fold cross-validation
# over the grid below. This rebinds `reg`.
# NOTE(review): the recorded output of `reg.C_` in the next cell is 0.01,
# which is not a member of this grid, so that output appears to come from
# an earlier version of this cell. Re-run before relying on it.
reg = linear_model.LogisticRegressionCV(
    Cs=[.1, 1, 10, 100, 1000, 10000],
    cv=5,
    penalty='l1',
    solver='liblinear',
    random_state=0,
)
reg.fit(X_scaled, y)

In [69]:
# Show the C value(s) stored on the fitted model.
reg.C_

array([0.01])

In [70]:
# In-sample predictions from whichever model `reg` currently refers to:
# class probabilities first, then the hard class labels.
y_prob = reg.predict_proba(X_scaled)
y_hat = reg.predict(X_scaled)

# Store both on the survey frame. Column 1 of predict_proba is kept as the
# "finish" probability -- presumably the Finished == 1 class; confirm
# against reg.classes_.
data['PredFinish'] = y_hat
data['FinishProb'] = y_prob[:, 1]

In [71]:
# Summary statistics of the observed outcome, for comparison with the
# predicted probabilities summarised in the next cell.
# data['FinishProb'].hist()
data['Finished'].describe()

count    3000.000000
mean        0.153333
std         0.360364
min         0.000000
25%         0.000000
50%         0.000000
75%         0.000000
max         1.000000
Name: Finished, dtype: float64

In [72]:
# Summary statistics of the predicted finish probability over all rows.
data['FinishProb'].describe()

count    3000.000000
mean        0.186667
std         0.010863
min         0.173757
25%         0.173757
50%         0.192996
75%         0.192996
max         0.202094
Name: FinishProb, dtype: float64

In [73]:
# Put the fitted coefficients next to their variable names. coef_ is
# transposed so the coefficients form a single 'Beta' column, which is then
# joined to CoefNames on the default integer index.
params = reg.coef_
coefs = pd.DataFrame(params.T, columns=['Beta'])
CoefEsts = CoefNames.join(coefs)

In [74]:
# Display the variable / coefficient table built in the previous cell.
CoefEsts

Unnamed: 0,Variable,Beta
0,InfoDummy,0.000000
1,ServiceDummy,0.000000
2,CustomerDummy,0.000000
3,SupportDummy,0.000000
4,AdminDummy,0.000000
5,ContactDummy,0.000000
6,PaymentDummy,0.000000
7,MarketingDummy,0.000000
8,BillingDummy,0.000000
9,SalesDummy,0.000000


In [66]:
# True when any of the four listed dummy columns equals 1.
data['GenericDummy'] = data[
    ['AdminDummy', 'SalesDummy', 'ContactDummy', 'SupportDummy']
].eq(1).any(axis=1)

# True when either the industry or the legal type is labelled non-profit.
# The two columns spell the label differently.
data['NonProfitDummy'] = (
    data['PrimaryIndustry'].eq("Non-profit")
    | data['LegalType'].eq("Non Profit")
)

# Rows to exclude: flagged by either dummy above, or in a time group beyond
# '2 Months'. The `>` comparison relies on TimeGroups being an ordered
# categorical -- presumably ensured by order_categoricals=True at load time;
# confirm.
data['DropDummy'] = (
    data['GenericDummy']
    | data['NonProfitDummy']
    | (data['TimeGroups'] > '2 Months')
)

data['GenericDummy'].value_counts()

False    2855
True      145
Name: GenericDummy, dtype: int64

In [67]:
# Predicted finish probability for the rows that are NOT flagged for dropping.
# NOTE(review): the recorded output below has min 0.0455, yet the recorded
# full-sample summary of FinishProb has min 0.1738. A subset cannot go below
# the full-sample minimum, so this output was produced from a different fit
# than the one summarised earlier. Re-run the notebook top to bottom.
data.loc[data['DropDummy'] != 1, 'FinishProb'].describe()

count    1644.000000
mean        0.193128
std         0.056383
min         0.045511
25%         0.157963
50%         0.185356
75%         0.228737
max         0.376349
Name: FinishProb, dtype: float64