In [87]:
%matplotlib inline

import numpy as np
import pandas as pd
import os, errno
import re
from sklearn import preprocessing
from sklearn import linear_model
from sklearn.externals import joblib

# Relative prefix that every data path below is built from.
base = '../../'

# Survey response data, referenced both as a CSV and as a Stata .dta file.
raw_data = os.path.join(base, 'Data/Clean/Survey/ResponseData.csv')
raw_data_stata = os.path.join(base, 'Data/Clean/Survey/ResponseData.dta')

# Only the Stata file is loaded. order_categoricals=True makes the converted
# categorical columns ordered, which the `TimeGroups > '2 Months'` comparison
# later in this notebook depends on.
# (Unused CSV alternative: dataalt = pd.read_csv(raw_data))
data = pd.read_stata(raw_data_stata, order_categoricals=True)

In [88]:
# Outcome column the models below are fitted against.
y = data['Finished']

# Regex patterns (anchored at the start of the column name by re.match) that
# pick the predictor columns. Columns are collected pattern by pattern, in this
# order, and within a pattern in the order they appear in `data`.
# NOTE(review): '^.*Dummy' matches ANY column whose name contains "Dummy",
# including GenericDummy / NonProfitDummy / DropDummy that a later cell adds to
# `data`. Re-running this cell after that one would change the feature set.
feature_patterns = [
    '^.*Dummy',
    '^LegalType_[0-9]+',
    '^CompanyProfile_[0-9]+',
    '^ConnectType_[0-9]+',
    '^PrimaryIndustry_[0-9]+',
    '^TimeGroups_[0-9]+',
    '^Strata_[0-9]+',
]
cols = [
    column
    for pattern in feature_patterns
    for column in data.columns
    if re.match(pattern, column)
] + ['LogLifetimeVolume']

X = data[cols]

# One-column frame of predictor names, in the same order as the columns of X,
# so that fitted coefficients can be joined onto it by position later.
CoefNames = pd.DataFrame(np.array(cols), columns=['Variable'])

# Standardise the predictors column-wise before fitting.
X_scaled = preprocessing.scale(X)

In [89]:
# L1-penalised logistic regression of y on the scaled predictors at a fixed
# regularisation strength (C = 0.5).
# NOTE(review): `reg` is reassigned by the cross-validated fit in the next cell,
# so nothing downstream uses this particular model.
reg = linear_model.LogisticRegression(
    penalty='l1',
    C=0.5,
    solver='liblinear',
    random_state=0,
).fit(X_scaled, y)

In [108]:
# L1-penalised logistic regression with C chosen by 5-fold cross-validation
# over an explicit grid. This replaces whatever `reg` held before.
# NOTE(review): the recorded output of `reg.C_` below is 0.3, the smallest value
# in this grid. Consider whether the grid should extend to smaller C.
reg = linear_model.LogisticRegressionCV(
    Cs=[0.3, 1, 10, 100, 1000, 10000],
    cv=5,
    penalty='l1',
    solver='liblinear',
    random_state=0,
).fit(X_scaled, y)

In [109]:
# Display the C value stored on the fitted cross-validated model.
reg.C_

array([0.3])

In [110]:
# In-sample predictions: the model is applied to the same scaled matrix it was
# fitted on.
y_prob = reg.predict_proba(X_scaled)   # one column of probabilities per class
y_hat = reg.predict(X_scaled)          # hard class labels

# Store both on the survey frame. Column index 1 of predict_proba is the second
# class in reg.classes_, presumably Finished == 1. TODO confirm.
data['PredFinish'] = y_hat
data['FinishProb'] = y_prob[:, 1]

In [111]:
# data['FinishProb'].hist()
# Summary statistics of the observed outcome column.
data['Finished'].describe()

count    3000.000000
mean        0.153333
std         0.360364
min         0.000000
25%         0.000000
50%         0.000000
75%         0.000000
max         1.000000
Name: Finished, dtype: float64

In [112]:
# Summary statistics of the fitted probabilities stored in data['FinishProb'].
data['FinishProb'].describe()

count    3000.000000
mean        0.154444
std         0.072744
min         0.006170
25%         0.102146
50%         0.150648
75%         0.198567
max         0.422544
Name: FinishProb, dtype: float64

In [113]:
# Fitted coefficients of the current `reg`. The single 'Beta' column below
# requires the transposed array to have exactly one column, i.e. coef_ must be
# one row wide per feature (presumably shape (1, n_features) for this binary
# outcome).
params = reg.coef_
coefs = pd.DataFrame(params.T, columns=['Beta'])

# Pair each coefficient with its variable name. The join is on the default
# integer index, so row i of CoefNames is matched with coefficient i.
CoefEsts = CoefNames.join(coefs)

In [114]:
# Display the variable-name / coefficient table.
CoefEsts

Unnamed: 0,Variable,Beta
0,InfoDummy,-0.010966
1,ServiceDummy,-0.027745
2,CustomerDummy,0.02427
3,SupportDummy,-0.030646
4,AdminDummy,-0.142225
5,ContactDummy,-0.047227
6,PaymentDummy,0.0
7,MarketingDummy,0.005678
8,BillingDummy,-0.002128
9,SalesDummy,-0.110108


In [115]:
# GenericDummy: True when at least one of these four dummy columns equals 1.
generic_flags = ['AdminDummy', 'SalesDummy', 'ContactDummy', 'SupportDummy']
data['GenericDummy'] = data[generic_flags].eq(1).any(axis=1)

# NonProfitDummy: flagged as non-profit by either the industry or the legal-type
# column. The two columns spell the label differently.
data['NonProfitDummy'] = (
    (data['PrimaryIndustry'] == "Non-profit")
    | (data['LegalType'] == "Non Profit")
)

# DropDummy: union of the two flags above and the rows whose TimeGroups category
# compares greater than '2 Months'.
data['DropDummy'] = (
    data['GenericDummy']
    | data['NonProfitDummy']
    | (data['TimeGroups'] > '2 Months')
)

# Show how many rows carry the generic flag.
data['GenericDummy'].value_counts()

False    2855
True      145
Name: GenericDummy, dtype: int64

In [116]:
# Summary of the fitted probabilities for the rows NOT flagged by DropDummy
# (DropDummy is a boolean column, so negating it selects the kept rows).
data.loc[~data['DropDummy'], 'FinishProb'].describe()

count    1644.000000
mean        0.191468
std         0.066361
min         0.025804
25%         0.149034
50%         0.183443
75%         0.233380
max         0.422544
Name: FinishProb, dtype: float64