<a href="https://colab.research.google.com/github/charlie-may86/DSPT7-Twitoff/blob/master/Copy_of_Kickstarter_Simple_Model_DSPT7_Unit_3_Build.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Initial Imports

In [None]:
!pip install category_encoders==2.*

#Imports
import category_encoders as ce
import pandas as pd
import joblib
from joblib import dump, load
import json
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import  f1_score, roc_auc_score, roc_curve, precision_score, recall_score, accuracy_score
from xgboost import XGBClassifier

# Download data, initial examination of data

In [6]:
# Mount Google Drive (Colab only) so the dataset under /content/drive can be read below.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Load the cleaned Kickstarter CSV from the mounted Drive and preview the first rows.
df1 = pd.read_csv('/content/drive/My Drive/clean_kickstart_data.csv')
df1.head()

In [8]:
# Check how pandas typed the free-text blurb column before casting it explicitly.
df1['blurb'].dtype

dtype('O')

In [9]:
# Assign explicit datatypes to the text columns and the target
# NOTE(review): astype(str) turns missing values into the literal string 'nan' —
# confirm the cleaned data has no missing blurb/country values.
df1['blurb'] = df1['blurb'].astype(str)
df1['country'] = df1['country'].astype(str)
df1['campaign_success'] = df1['campaign_success'].astype(int)


In [None]:
# Column dtypes and non-null counts after the casts above.
df1.info()

# Train, Validate, Test Split

In [11]:
# Chronological cutoffs on launched_at (unix seconds):
#   train:      launched_at <= cutoff1
#   validation: cutoff1 < launched_at < cutoff2
#   test:       launched_at >= cutoff2
cutoff1 = 1466003000
cutoff2 = 1530120000

# Each split is selected in one pass and launched_at is dropped right away,
# because it is only used for splitting and never as a model feature.
test = df1[df1['launched_at'] >= cutoff2].drop(columns=['launched_at'])
val = df1[
    (df1['launched_at'] < cutoff2) & (df1['launched_at'] > cutoff1)
].drop(columns=['launched_at'])
train = df1[
    (df1['launched_at'] < cutoff2) & (df1['launched_at'] <= cutoff1)
].drop(columns=['launched_at'])

# One shape per line: train, validation, test
print(train.shape, val.shape, test.shape, sep='\n')

(86636, 7)
(45201, 7)
(47619, 7)


In [12]:
# Class balance of the target in the training split (share of each campaign_success value)
train['campaign_success'].value_counts(normalize=True)

1    0.55979
0    0.44021
Name: campaign_success, dtype: float64

# Target Vector / Features Matrix Split

In [13]:
# Target: the campaign_success flag (0/1)
target = 'campaign_success'

# Features: every column of the training split except the target
features = [column for column in train.columns if column != target]

In [14]:
# Split each partition into its X features matrix and y target vector
X_train, y_train = train[features], train[target]
X_val, y_val = val[features], val[target]
X_test, y_test = test[features], test[target]

In [15]:
# Preview the raw (not yet encoded) training features.
X_train.head()

Unnamed: 0,blurb,country,goal,category,subcategory,campaign_length
1,JinBucha is a new kind of Brewery in North Par...,the United States,20000.0,Food,Drinks,30.0
3,Bringing Philly cheesesteaks (and other delici...,the United States,2000.0,Food,Food Trucks,59.958333
5,"Two aerialists, 100 feet of rope, and a myth d...",the United States,1800.0,Dance,Dance,15.0
8,I've written poems I believe in. I want to pub...,the United States,1800.0,Publishing,Poetry,30.0
9,We serve Jerk Chicken and Pork all over NC at ...,the United States,3500.0,Food,Food Trucks,35.0


# Wrangle Data


In [16]:
# Create and fit the transformations
# Ordinal-encode the categorical features; the encoder is fitted on X_train only
encoder = ce.OrdinalEncoder(cols=['country', 'category', 'subcategory'])
encoder.fit(X_train)
# Presumably here so the cell does not end on the fit() call and display its return value
print('')




  elif pd.api.types.is_categorical(cols):


In [17]:
def wrangle(X, encoder):
    """Encode the features and replace the raw blurb with numeric text features.

    Args:
        X: DataFrame holding a string 'blurb' column plus whatever columns
            the encoder expects.
        encoder: Fitted object exposing ``transform(DataFrame)``.

    Returns:
        The transformed frame, re-indexed from 0, with 'blurb' dropped and
        five blurb-derived columns appended.
    """
    # Transform a re-indexed copy rather than the frame that was passed in
    frame = encoder.transform(X.reset_index(drop=True).copy())

    # Feature name -> function computing it from a single blurb string.
    # 'blub_exclamation' is misspelt, but the model is trained on that exact
    # column name, so the spelling has to stay.
    blurb_features = {
        'blurb_length': len,
        'blurb_words': lambda text: len(text.split()),
        # number of whitespace-separated tokens that are entirely upper case
        'blurb_uppers': lambda text: sum(token.isupper() for token in text.split()),
        'blurb_qmarks': lambda text: text.count("?"),
        'blub_exclamation': lambda text: text.count("!"),
    }
    for feature_name, extract in blurb_features.items():
        frame[feature_name] = frame['blurb'].apply(extract)

    return frame.drop(columns=['blurb'])

In [18]:
# Apply the same fitted encoder and blurb feature engineering to every split
X_train_e, X_val_e, X_test_e = (
    wrangle(split, encoder) for split in (X_train, X_val, X_test)
)
# One shape per line: train, validation, test
print(X_train_e.shape, X_val_e.shape, X_test_e.shape, sep='\n')


(86636, 10)
(45201, 10)
(47619, 10)


# Tracking Dataframe

In [19]:
# Create dataframe to track outcomes
# One row per evaluated model: confusion-matrix counts plus accuracy, precision and recall
columns =['Model','True -', 'False +', 'False -','True +','Accuracy', 'Precision', 'Recall']
tracker= pd.DataFrame(columns=columns)
tracker

Unnamed: 0,Model,True -,False +,False -,True +,Accuracy,Precision,Recall


In [20]:
# Function to add rows to tracking dataframe
# m_name = Model Name

def row_maker(m_name, y, pred):
  """Build one tracking-table row of binary-classification metrics.

  Args:
    m_name: Model name stored under 'Model'.
    y: True labels.
    pred: Predicted labels.

  Returns:
    dict with the tracker columns 'Model', 'True -', 'False +', 'False -',
    'True +', 'Accuracy', 'Precision' and 'Recall'. The three ratios are
    rounded to 2 decimals; a ratio whose denominator is zero is NaN.
  """
  matrix = confusion_matrix(y, pred)
  if matrix.shape == (1, 1):
    # Only one class occurs in y and pred, so the matrix collapses to 1x1 and
    # the 4-way unpack below would fail. Rebuild it as 2x2, assuming the
    # 0/1 labels this notebook uses.
    matrix = confusion_matrix(y, pred, labels=[0, 1])
  tn, fp, fn, tp = matrix.ravel()

  def ratio(numerator, denominator):
    # Explicit NaN instead of a divide-by-zero warning when a class is absent
    if denominator == 0:
      return float('nan')
    return round(numerator / denominator, 2)

  recall = ratio(tp, tp + fn)             # true positive rate
  precision = ratio(tp, tp + fp)          # positive predictive value
  accuracy = ratio(tp + tn, tp + tn + fp + fn)
  return {'Model': m_name, 'True -': tn, 'False +': fp, 'False -': fn,
          'True +': tp, 'Accuracy': accuracy, 'Precision': precision,
          'Recall': recall}

# XGBoost

In [21]:
""" Define Model"""
# First XGBoost Model: 100 trees, fixed seed so the fit is repeatable
booster= XGBClassifier(n_estimators=100, random_state=42, n_jobs=-1)

# XGB Simple: fit on the training split. Later cells predict through `booster`,
# so `xgb_simple` is only a second name for the fit() result.
xgb_simple= booster.fit(X_train_e, y_train) #fit on train

## Testing XGBoost

In [22]:
""" Run Model XGBoost Simple """
# Score the fitted model on the validation split
xs_y_pred_v = booster.predict(X_val_e)

m_name = 'XGBoost Simple'
y = y_val
pred = xs_y_pred_v

print('Classification Report:\n\n', classification_report(y, pred))

# tracking dataframe
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported way
# to add the new row.
new_row = row_maker(m_name, y, pred)
tracker = pd.concat([tracker, pd.DataFrame([new_row])], ignore_index=True)
tracker

Classification Report:

               precision    recall  f1-score   support

           0       0.72      0.65      0.68     21196
           1       0.71      0.78      0.74     24005

    accuracy                           0.72     45201
   macro avg       0.72      0.71      0.71     45201
weighted avg       0.72      0.72      0.71     45201



Unnamed: 0,Model,True -,False +,False -,True +,Accuracy,Precision,Recall
0,XGBoost Simple,13708,7488,5375,18630,0.72,0.71,0.78


In [23]:
""" Run Model"""
# Score the held-out test split with the model already fitted on the training
# data. The model is not re-fitted here: a second fit after predicting only
# repeats the training work and has no effect on these predictions.
xs_y_pred_test = booster.predict(X_test_e)

m_name = 'XGBoost Simple TEST'
y = y_test
pred = xs_y_pred_test

print('Classification Report:\n\n', classification_report(y, pred))

# tracking dataframe
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported way
# to add the new row.
new_row = row_maker(m_name, y, pred)
tracker = pd.concat([tracker, pd.DataFrame([new_row])], ignore_index=True)
tracker

Classification Report:

               precision    recall  f1-score   support

           0       0.66      0.63      0.65     15914
           1       0.82      0.84      0.83     31705

    accuracy                           0.77     47619
   macro avg       0.74      0.74      0.74     47619
weighted avg       0.77      0.77      0.77     47619



Unnamed: 0,Model,True -,False +,False -,True +,Accuracy,Precision,Recall
0,XGBoost Simple,13708,7488,5375,18630,0.72,0.71,0.78
1,XGBoost Simple TEST,10065,5849,5086,26619,0.77,0.82,0.84


In [24]:
# Select an observation to test
i=1
row = X_test_e.iloc[[i]] # double brackets keep the selection as a one-row DataFrame
a_row = y_test.iloc[[i]].values.tolist() # actual campaign success
p_row = booster.predict(row)# predicted success
pp_row = booster.predict_proba(row) # per-class probabilities for this row: [class 0, class 1]
print(row)
print('Actual:', a_row)
print('Predicted:', p_row)
print('Probility of Prediction:', pp_row)
# Same label, but only the class-1 probability, rounded to 3 decimals
print('Probility of Prediction:',round((pp_row[0][1]),3))

   country   goal  category  ...  blurb_uppers  blurb_qmarks  blub_exclamation
1      1.0  800.0        13  ...             0             0                 0

[1 rows x 10 columns]
Actual: [1]
Predicted: [1]
Probility of Prediction: [[0.05640137 0.9435986 ]]
Probility of Prediction: 0.944


# Prepare Joblib Files

In [25]:
# Persist the fitted encoder, then reload it to confirm the saved file round-trips.
dump(encoder, 'encoder2.joblib' )
encoder2 = load('encoder2.joblib')

In [26]:
# Wrangle Function to 'Pickle'

def wrangler(X, encoder):
  """Turn one request payload into the single-row feature frame the model expects.

  Args:
    X: dict with keys 'x1'..'x6' holding, in order: goal, campaign_length,
      country, category, subcategory and blurb.
    encoder: Fitted encoder exposing ``transform(DataFrame)``.

  Returns:
    One-row DataFrame with the columns country, goal, category, subcategory,
    campaign_length, blurb_length, blurb_words, blurb_uppers, blurb_qmarks,
    blub_exclamation. 'goal' and 'campaign_length' are floats; every other
    column is an int.
  """
  column_names = {'x1': 'goal',
                  'x2': 'campaign_length',
                  'x3': 'country',
                  'x4': 'category',
                  'x5': 'subcategory',
                  'x6': 'blurb'}

  # One payload -> one row, with the model's column names
  X = pd.DataFrame([X]).rename(columns=column_names)

  X = encoder.transform(X)
  # Reorder to the column order used for training. The copy makes sure the new
  # columns below are written to an independent frame, not to a slice.
  X = X[['country', 'goal', 'category', 'subcategory',
         'campaign_length', 'blurb']].copy()

  # Training casts blurb with astype(str); do the same so a non-string blurb
  # does not break the text features.
  blurb = X['blurb'].astype(str)
  X['blurb_length'] = blurb.apply(len)
  X['blurb_words'] = blurb.apply(lambda text: len(text.split()))
  X['blurb_uppers'] = blurb.apply(lambda text: sum(map(str.isupper, text.split())))
  X['blurb_qmarks'] = blurb.apply(lambda text: text.count("?"))
  # Misspelt on purpose: the model is trained on this exact column name
  X['blub_exclamation'] = blurb.apply(lambda text: text.count("!"))
  X = X.drop(columns=['blurb'])

  # goal and campaign_length are fractional in the training data (e.g. a
  # campaign_length of 59.958333). Casting the whole frame to int truncated
  # them at prediction time, so keep those two as floats.
  float_columns = ('goal', 'campaign_length')
  X = X.astype({column: (float if column in float_columns else int)
                for column in X.columns})
  return X


In [27]:

# NOTE(review): pickle-based serializers store a plain function by reference
# (module + name), not its code — confirm that whatever loads this file also
# defines `wrangler`.
dump(wrangler, 'wrangler2.joblib' )
wrangler2 = load('wrangler2.joblib')

In [28]:
# XGBoost Model to Pickle
def kick_boost(X, model=None):
  """Return the predicted probability of campaign success for one observation.

  Args:
    X: Feature frame; only its first row is scored.
    model: Optional fitted classifier exposing ``predict_proba``. When
      omitted, the notebook-level ``booster`` is used.

  Returns:
    float: class-1 probability of the first row, rounded to 3 decimals.
  """
  classifier = booster if model is None else model
  # predict_proba returns a NumPy array; cast the element to a builtin float
  # so the result can be passed straight to json.dumps.
  prediction = float(classifier.predict_proba(X)[0][1])
  return round(prediction, 3)



In [29]:
# NOTE(review): kick_boost reads the notebook-level `booster`, and pickle-based
# serializers store a plain function by reference rather than by value — confirm
# the fitted model itself is also saved for whatever loads this file.
dump(kick_boost, 'kick_boost2.joblib' )
kick_boost2=load('kick_boost2.joblib')

In [31]:
# Dummy request payload. wrangler() renames the keys:
# x1=goal, x2=campaign_length, x3=country, x4=category, x5=subcategory, x6=blurb
test_1 = {
  "x1": 10000,
  "x2": 8,
  "x3": "Canada",
  "x4": "Science",
  "x5": "Material Thread Science",
  "x6": "I am making somthing that will do something awesome!!! Answer the question 'will it be amazing?'"
}




In [33]:
# End-to-end check of the reloaded objects: payload -> feature row -> success probability
result = wrangler2(test_1, encoder2)
prediction = kick_boost2(result)
prediction

0.413

In [None]:
# Pull the same observation checked earlier (position 1 of the test split),
# raw and encoded, and show the raw row as a list of records.
row = X_test.iloc[[1]]
row_a = X_test_e.iloc[[1]]
# The orient must be spelled out in full: pandas 2.x rejects the 'record' abbreviation.
test_2 = row.to_dict(orient='records')
test_2

In [None]:
# Hand-built payload for the test observation shown above, keyed x1..x6 the
# way wrangler() expects it.
test_2 = {
  "x1": 800,
  "x2": 14,
  "x3": "the United States",
  "x4": 'Design',
  "x5": 'Product Design',
  "x6": 'a frog plushie keychain and frog butt pin for all your strange froggy needs',
}
# Use the reloaded objects defined in this notebook (wrangler2 / encoder2 /
# kick_boost2). wrangle1, encoder1 and kick_boost1 are never defined here, so
# the cell raised NameError on a fresh kernel.
Y2 = wrangler2(test_2, encoder2)
prediction2 = kick_boost2(Y2)
prediction2