# INITIAL MODEL EXPLORATION: CORPORATE CUSTOMER 

In [13]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

In [2]:
pd.set_option('display.max_columns', None)

In [3]:
kick_proc = pd.read_pickle('../../data/03_processed/kick_proc.pkl')

In [21]:
kick_proc.columns

Index(['currency', 'goal_original', 'category', 'country', 'blurb_word_count',
       'campaign_length', 'delta_created_launched', 'goal_usd',
       'target_dummy'],
      dtype='object')

In [4]:
kick_proc.head()

Unnamed: 0,currency,goal_original,category,country,blurb_word_count,campaign_length,delta_created_launched,goal_usd,target_dummy
0,USD,5000.0,Tabletop Games,US,9.0,32,13,5000.0,1
1,USD,3500.0,Music,US,14.0,18,1,3500.0,1
2,USD,500.0,Zines,US,17.0,15,1,500.0,1
3,USD,6800.0,Graphic Novels,US,12.0,30,6,6800.0,1
4,USD,600.0,Mixed Media,US,19.0,30,0,600.0,1


In [5]:
kick_proc.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 275642 entries, 0 to 275641
Data columns (total 9 columns):
currency                  275642 non-null object
goal_original             275642 non-null float64
category                  275642 non-null object
country                   275642 non-null object
blurb_word_count          275642 non-null float64
campaign_length           275642 non-null int64
delta_created_launched    275642 non-null int64
goal_usd                  275642 non-null float64
target_dummy              275642 non-null uint8
dtypes: float64(3), int64(2), object(3), uint8(1)
memory usage: 17.1+ MB


## Baseline Model - Decision Tree

In [6]:
y = kick_proc['target_dummy']
df_no_y = kick_proc.drop('target_dummy', axis=1)
X = pd.get_dummies(df_no_y)

In [7]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 275642 entries, 0 to 275641
Columns: 377 entries, goal_original to country_ZW
dtypes: float64(3), int64(2), uint8(372)
memory usage: 108.3 MB


In [23]:
kick_proc['country'].value_counts(normalize = True)

US    0.730310
GB    0.093353
CA    0.040397
AU    0.020302
DE    0.013126
FR    0.010822
IT    0.009904
MX    0.008344
ES    0.008282
NL    0.007339
SE    0.005627
NZ    0.004176
HK    0.003635
DK    0.003566
JP    0.002884
SG    0.002677
CH    0.002608
IE    0.002500
BE    0.002043
NO    0.001970
AT    0.001818
CN    0.001556
IN    0.000947
PL    0.000903
UA    0.000798
TW    0.000744
KR    0.000733
IL    0.000715
TH    0.000668
PR    0.000646
        ...   
MV    0.000007
TJ    0.000007
TC    0.000007
VC    0.000007
AX    0.000007
GI    0.000007
MQ    0.000007
SX    0.000007
AL    0.000007
GQ    0.000007
CK    0.000007
VA    0.000004
GA    0.000004
ME    0.000004
MR    0.000004
BH    0.000004
KY    0.000004
BJ    0.000004
MO    0.000004
SC    0.000004
CF    0.000004
KI    0.000004
BM    0.000004
PN    0.000004
TD    0.000004
CV    0.000004
SA    0.000004
DJ    0.000004
GD    0.000004
KN    0.000004
Name: country, Length: 199, dtype: float64

In [15]:
X.head()

Unnamed: 0,goal_original,blurb_word_count,campaign_length,delta_created_launched,goal_usd,currency_AUD,currency_CAD,currency_CHF,currency_DKK,currency_EUR,currency_GBP,currency_HKD,currency_JPY,currency_MXN,currency_NOK,currency_NZD,currency_SEK,currency_SGD,currency_USD,category_3D Printing,category_Academic,category_Accessories,category_Action,category_Animals,category_Animation,category_Anthologies,category_Apparel,category_Apps,category_Architecture,category_Art,category_Art Books,category_Audio,category_Bacon,category_Blues,category_Calendars,category_Camera Equipment,category_Candles,category_Ceramics,category_Children's Books,category_Childrenswear,category_Chiptune,category_Civic Design,category_Classical Music,category_Comedy,category_Comic Books,category_Comics,category_Community Gardens,category_Conceptual Art,category_Cookbooks,category_Country & Folk,category_Couture,category_Crafts,category_Crochet,category_DIY,category_DIY Electronics,category_Dance,category_Design,category_Digital Art,category_Documentary,category_Drama,category_Drinks,category_Electronic Music,category_Embroidery,category_Events,category_Experimental,category_Fabrication Tools,category_Faith,category_Family,category_Fantasy,category_Farmer's Markets,category_Farms,category_Fashion,category_Festivals,category_Fiction,category_Film & Video,category_Fine Art,category_Flight,category_Food,category_Food Trucks,category_Footwear,category_Gadgets,category_Games,category_Gaming Hardware,category_Glass,category_Graphic Design,category_Graphic Novels,category_Hardware,category_Hip-Hop,category_Horror,category_Illustration,category_Immersive,category_Indie Rock,category_Installations,category_Interactive Design,category_Jazz,category_Jewelry,category_Journalism,category_Kids,category_Knitting,category_Latin,category_Letterpress,category_Literary Journals,category_Literary Spaces,category_Live Games,category_Makerspaces,category_Metal,category_Mixed Media,category_Mobile Games,category_Movie Theaters,category_Music,category_Music Videos,category_Musical,category_Narrative Film,category_Nature,category_Nonfiction,category_Painting,category_People,category_Performance Art,category_Performances,category_Periodicals,category_Pet Fashion,category_Photo,category_Photobooks,category_Photography,category_Places,category_Playing Cards,category_Plays,category_Poetry,category_Pop,category_Pottery,category_Print,category_Printing,category_Product Design,category_Public Art,category_Publishing,category_Punk,category_Puzzles,category_Quilts,category_R&B,category_Radio & Podcasts,category_Ready-to-wear,category_Residencies,category_Restaurants,category_Robots,category_Rock,category_Romance,category_Science Fiction,category_Sculpture,category_Shorts,category_Small Batch,category_Software,category_Sound,category_Space Exploration,category_Spaces,category_Stationery,category_Tabletop Games,category_Taxidermy,category_Technology,category_Television,category_Textiles,category_Theater,category_Thrillers,category_Translations,category_Typography,category_Vegan,category_Video,category_Video Art,category_Video Games,category_Wearables,category_Weaving,category_Web,category_Webcomics,category_Webseries,category_Woodworking,category_Workshops,category_World Music,category_Young Adult,category_Zines,country_AE,country_AF,country_AG,country_AL,country_AM,country_AQ,country_AR,country_AT,country_AU,country_AX,country_AZ,country_BA,country_BB,country_BD,country_BE,country_BF,country_BG,country_BH,country_BJ,country_BM,country_BO,country_BR,country_BS,country_BT,country_BW,country_BY,country_BZ,country_CA,country_CD,country_CF,country_CG,country_CH,country_CI,country_CK,country_CL,country_CM,country_CN,country_CO,country_CR,country_CU,country_CV,country_CW,country_CY,country_CZ,country_DE,country_DJ,country_DK,country_DM,country_DO,country_DZ,country_EC,country_EE,country_EG,country_ES,country_ET,country_FI,country_FJ,country_FM,country_FO,country_FR,country_GA,country_GB,country_GD,country_GE,country_GH,country_GI,country_GL,country_GM,country_GN,country_GP,country_GQ,country_GR,country_GT,country_GU,country_GY,country_HK,country_HN,country_HR,country_HT,country_HU,country_ID,country_IE,country_IL,country_IN,country_IQ,country_IR,country_IS,country_IT,country_JM,country_JO,country_JP,country_KE,country_KG,country_KH,country_KI,country_KN,country_KP,country_KR,country_KW,country_KY,country_KZ,country_LA,country_LB,country_LC,country_LK,country_LR,country_LS,country_LT,country_LU,country_LV,country_LY,country_MA,country_MC,country_MD,country_ME,country_MG,country_MK,country_ML,country_MM,country_MN,country_MO,country_MQ,country_MR,country_MT,country_MU,country_MV,country_MW,country_MX,country_MY,country_MZ,country_NC,country_NE,country_NG,country_NI,country_NL,country_NO,country_NP,country_NZ,country_PA,country_PE,country_PF,country_PG,country_PH,country_PK,country_PL,country_PN,country_PR,country_PS,country_PT,country_PY,country_QA,country_RO,country_RS,country_RU,country_RW,country_SA,country_SC,country_SD,country_SE,country_SG,country_SI,country_SJ,country_SK,country_SL,country_SN,country_SO,country_SR,country_SS,country_SV,country_SX,country_SY,country_SZ,country_TC,country_TD,country_TH,country_TJ,country_TL,country_TN,country_TO,country_TR,country_TT,country_TW,country_TZ,country_UA,country_UG,country_US,country_UY,country_VA,country_VC,country_VE,country_VI,country_VN,country_VU,country_WS,country_XK,country_YE,country_ZA,country_ZM,country_ZW
0,5000.0,9.0,32,13,5000.0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1,3500.0,14.0,18,1,3500.0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
2,500.0,17.0,15,1,500.0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
3,6800.0,12.0,30,6,6800.0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
4,600.0,19.0,30,0,600.0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0


**The target variable is reasonably balanced. we will revisit this again later to figure out if we need to balance it** 

In [8]:
y.value_counts()

1    146584
0    129058
Name: target_dummy, dtype: int64

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y)

In [20]:
print('Train Set: \n', y_train.value_counts()/len(y_train))
print('Test Set: \n', y_test.value_counts()/len(y_test))

Train Set: 
 1    0.531793
0    0.468207
Name: target_dummy, dtype: float64
Test Set: 
 1    0.531787
0    0.468213
Name: target_dummy, dtype: float64


In [10]:
dtc = DecisionTreeClassifier()

In [11]:
dtc.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [12]:
y_pred = dtc.predict(X_test)

In [14]:
accuracy_score(y_test, y_pred)

0.7064329352353035

In [29]:
feature_importance = pd.DataFrame(X.columns, dtc.feature_importances_).reset_index()

Think about grouping the country by region (decrease the number of variables) 

In [39]:
feature_importance.sort_values(by = 'index', ascending= False)

Unnamed: 0,index,0
3,0.131690,delta_created_launched
4,0.127631,goal_usd
1,0.098501,blurb_word_count
2,0.072732,campaign_length
0,0.065301,goal_original
155,0.039273,category_Tabletop Games
132,0.038309,category_Product Design
58,0.030515,category_Documentary
148,0.022899,category_Shorts
77,0.011924,category_Food
