<a href="https://colab.research.google.com/github/elishaaquino/KickstarterAnalysis/blob/master/Machine_Learning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Predicting Success from Past Projects**

In [0]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.compose import make_column_transformer
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MaxAbsScaler, Normalizer
from sklearn.pipeline import make_pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.model_selection import GridSearchCV

## **Finding a Good Model and Tuning Parameters**

In [0]:
df_projects = pd.read_csv(
    "/content/pastProjects.csv",
    index_col=0)
df_projects = df_projects[df_projects["state"] != "live"]
df_projects.reset_index(inplace=True)

df_projects["state"].replace("canceled", "failed", inplace=True)
df_projects["state"].replace("suspended", "failed", inplace=True)

X_train = df_projects[["goal", "Parent Category"]]
y_train = df_projects["state"]

In [0]:
classifiers = [
    KNeighborsClassifier(10),
    SVC(kernel="linear", C=0.025),
    SVC(gamma=2, C=1),
    DecisionTreeClassifier(max_depth=5),
    RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
    MLPClassifier(alpha=1, max_iter=1000),
    AdaBoostClassifier(),
    GaussianNB(),
    QuadraticDiscriminantAnalysis()]

ct = make_column_transformer(
      (OneHotEncoder(), ["Parent Category"]),
      remainder="passthrough"  
    )

from sklearn.preprocessing import FunctionTransformer
c = 0
for classifier in classifiers:
  model = make_pipeline(
      ct,
      StandardScaler(with_mean=False),
      classifier
  )
  if c > 6:
    model = make_pipeline(
      ct,
      StandardScaler(with_mean=False),
      FunctionTransformer(lambda x: x.todense(), accept_sparse=True),
      classifier
    )
  c+=1
  print(cross_val_score(model, X=X_train, y=y_train, cv=5,
                           scoring="f1_macro").mean(), str(classifier))


0.6252948222773315 KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=10, p=2,
                     weights='uniform')
0.590841363265522 SVC(C=0.025, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)
0.5976374832357292 SVC(C=1, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=2, kernel='rbf', max_iter=-1,
    probability=False, random_state=None, shrinking=True, tol=0.001,
    verbose=False)
0.6268408588321848 DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=5, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
              



0.5644454427187207 QuadraticDiscriminantAnalysis(priors=None, reg_param=0.0,
                              store_covariance=False, tol=0.0001)




**It seems that the best model for prediction is Ada Boost Classifier, KNeighbors Classifier, and Decision Tree Classifier.**

### Ada Boost Classifier

In [0]:
def get_cv_error(features):
  ct = make_column_transformer(
      (OneHotEncoder(), ["Parent Category"]),
      remainder="passthrough"  
    )
  if "blurb" in features:
    ct = make_column_transformer(
      (TfidfVectorizer(norm=None, max_features=10), "blurb"),
      remainder="passthrough"  
    )
  if "blurb" in features and "country" in features:
    ct = make_column_transformer(
      (TfidfVectorizer(norm=None, max_features=10), "blurb"),
      (OneHotEncoder(), ["country"]),
      remainder="passthrough"  
    )
  if len(features) >= 3:
    ct = make_column_transformer(
        (OneHotEncoder(), features[2:]),
        remainder="passthrough"  
      )
  if len(features) >= 3 and "blurb" in features:
    ct = make_column_transformer(
        (TfidfVectorizer(norm=None, max_features=10), "blurb"),
        (OneHotEncoder(), ["Parent Category"]),
        remainder="passthrough"  
      )
  model = make_pipeline(
    ct,
    StandardScaler(with_mean=False),
    AdaBoostClassifier()
  )
  if features[0] == "goal" and len(features) == 1:
    model = make_pipeline(
    StandardScaler(with_mean=False),
    AdaBoostClassifier()
  )

  cv_errs = cross_val_score(model, X=df_projects[features], 
                            y=y_train,
                            scoring="f1_macro", cv=10)

  return cv_errs.mean()

errs = pd.Series()
for features in [["blurb"],
                 ["blurb", "country"],
                 ["blurb", "goal"],
                 ["goal"],
                 ["goal", "Parent Category"],
                 ["goal", "Amount of Time Live", "Parent Category"],
                 ["goal", "Amount of Time Live", "Parent Category", 
                  "staff_pick"],
                 ["goal", "Amount of Time Live", "Parent Category", 
                  "staff_pick", "country"],
                 ["goal", "Amount of Time Live", "Parent Category",
                  "blurb"]]:
  errs[str(features)] = get_cv_error(features)

In [0]:
errs

['blurb']                                                                      0.500639
['blurb', 'country']                                                           0.525202
['blurb', 'goal']                                                              0.597568
['goal']                                                                       0.567671
['goal', 'Parent Category']                                                    0.631470
['goal', 'Amount of Time Live', 'Parent Category']                             0.655444
['goal', 'Amount of Time Live', 'Parent Category', 'staff_pick']               0.697752
['goal', 'Amount of Time Live', 'Parent Category', 'staff_pick', 'country']    0.700191
['goal', 'Amount of Time Live', 'Parent Category', 'blurb']                    0.663899
dtype: float64

**The best features for the Ada Boost Classifier are ['goal', 'Amount of Time Live', 'Parent Category', 'staff_pick', 'country'].**

### KNeighbors Classifier

In [0]:
def get_cv_error(features):
  ct = make_column_transformer(
      (OneHotEncoder(), ["Parent Category"]),
      remainder="passthrough"  
    )
  if "blurb" in features:
    ct = make_column_transformer(
      (TfidfVectorizer(norm=None, max_features=10), "blurb"),
      remainder="passthrough"  
    )
  if "blurb" in features and "country" in features:
    ct = make_column_transformer(
      (TfidfVectorizer(norm=None, max_features=10), "blurb"),
      (OneHotEncoder(), ["country"]),
      remainder="passthrough"  
    )
  if len(features) >= 3:
    ct = make_column_transformer(
        (OneHotEncoder(), features[2:]),
        remainder="passthrough"  
      )
  if len(features) >= 3 and "blurb" in features:
    ct = make_column_transformer(
        (TfidfVectorizer(norm=None, max_features=10), "blurb"),
        (OneHotEncoder(), ["Parent Category"]),
        remainder="passthrough"  
      )
  model = make_pipeline(
    ct,
    StandardScaler(with_mean=False),
    KNeighborsClassifier(10)
  )
  if features[0] == "goal" and len(features) == 1:
    model = make_pipeline(
    StandardScaler(with_mean=False),
    KNeighborsClassifier(10)
  )

  cv_errs = cross_val_score(model, X=df_projects[features], 
                            y=y_train,
                            scoring="f1_macro", cv=10)

  return cv_errs.mean()

errs = pd.Series()
for features in [["blurb"],
                 ["blurb", "country"],
                 ["blurb", "goal"],
                 ["goal"],
                 ["goal", "Parent Category"],
                 ["goal", "Amount of Time Live", "Parent Category"],
                 ["goal", "Amount of Time Live", "Parent Category", 
                  "staff_pick"],
                 ["goal", "Amount of Time Live", "Parent Category", 
                  "staff_pick", "country"],
                 ["goal", "Amount of Time Live", "Parent Category",
                  "blurb"]]:
  errs[str(features)] = get_cv_error(features)

In [0]:
errs

['blurb']                                                                      0.534344
['blurb', 'country']                                                           0.539241
['blurb', 'goal']                                                              0.573363
['goal']                                                                       0.565809
['goal', 'Parent Category']                                                    0.619787
['goal', 'Amount of Time Live', 'Parent Category']                             0.641554
['goal', 'Amount of Time Live', 'Parent Category', 'staff_pick']               0.673478
['goal', 'Amount of Time Live', 'Parent Category', 'staff_pick', 'country']    0.673808
['goal', 'Amount of Time Live', 'Parent Category', 'blurb']                    0.616857
dtype: float64

In [0]:
X_train = df_projects[['goal', 'Amount of Time Live', 'Parent Category',
                       'staff_pick', 'country']]
y_train = df_projects['state']

ct = make_column_transformer(
  (OneHotEncoder(), ['Parent Category',
                       'staff_pick', 'country']),
  remainder="passthrough"  
)
model = make_pipeline(
  ct,
  StandardScaler(with_mean=False),
  KNeighborsClassifier(10)
)

In [0]:
grid_search = GridSearchCV(
    model,
    param_grid={"kneighborsclassifier__n_neighbors": range(1, 20)},
    scoring="f1_macro",
    cv=10
)

grid_search.fit(X_train, y_train)
grid_search.best_params_

{'kneighborsclassifier__n_neighbors': 18}

In [0]:
ct = make_column_transformer(
  (OneHotEncoder(), ['Parent Category',
                       'staff_pick', 'country']),
  remainder="passthrough"  
)
model = make_pipeline(
  ct,
  StandardScaler(with_mean=False),
  KNeighborsClassifier(18)
)
cross_val_score(model, X=X_train, 
                            y=y_train,
                            scoring="f1_macro", cv=10).mean()

0.6782307246519049

**The best features for the KNeighbors Classifier are ['goal', 'Amount of Time Live', 'Parent Category', 'staff_pick', 'country'] with k = 18.**

### Decision Tree Classifier

In [0]:
def get_cv_error(features):
  ct = make_column_transformer(
      (OneHotEncoder(), ["Parent Category"]),
      remainder="passthrough"  
    )
  if "blurb" in features:
    ct = make_column_transformer(
      (TfidfVectorizer(norm=None, max_features=10), "blurb"),
      remainder="passthrough"  
    )
  if "blurb" in features and "country" in features:
    ct = make_column_transformer(
      (TfidfVectorizer(norm=None, max_features=10), "blurb"),
      (OneHotEncoder(), ["country"]),
      remainder="passthrough"  
    )
  if len(features) >= 3:
    ct = make_column_transformer(
        (OneHotEncoder(), features[2:]),
        remainder="passthrough"  
      )
  if len(features) >= 3 and "blurb" in features:
    ct = make_column_transformer(
        (TfidfVectorizer(norm=None, max_features=10), "blurb"),
        (OneHotEncoder(), ["Parent Category"]),
        remainder="passthrough"  
      )
  model = make_pipeline(
    ct,
    StandardScaler(with_mean=False),
    DecisionTreeClassifier(max_depth=5)
  )
  if features[0] == "goal" and len(features) == 1:
    model = make_pipeline(
    StandardScaler(with_mean=False),
    DecisionTreeClassifier(max_depth=5)
  )

  cv_errs = cross_val_score(model, X=df_projects[features], 
                            y=y_train,
                            scoring="f1_macro", cv=10)

  return cv_errs.mean()

errs = pd.Series()
for features in [["blurb"],
                 ["blurb", "country"],
                 ["blurb", "goal"],
                 ["goal"],
                 ["goal", "Parent Category"],
                 ["goal", "Amount of Time Live", "Parent Category"],
                 ["goal", "Amount of Time Live", "Parent Category", 
                  "staff_pick"],
                 ["goal", "Amount of Time Live", "Parent Category", 
                  "staff_pick", "country"],
                 ["goal", "Amount of Time Live", "Parent Category",
                  "blurb"]]:
  errs[str(features)] = get_cv_error(features)

In [0]:
errs

['blurb']                                                                      0.467923
['blurb', 'country']                                                           0.477953
['blurb', 'goal']                                                              0.585769
['goal']                                                                       0.576818
['goal', 'Parent Category']                                                    0.632335
['goal', 'Amount of Time Live', 'Parent Category']                             0.638376
['goal', 'Amount of Time Live', 'Parent Category', 'staff_pick']               0.667221
['goal', 'Amount of Time Live', 'Parent Category', 'staff_pick', 'country']    0.666979
['goal', 'Amount of Time Live', 'Parent Category', 'blurb']                    0.638782
dtype: float64

In [0]:
X_train = df_projects[['goal', 'Amount of Time Live', 'Parent Category',
                       'staff_pick']]
y_train = df_projects['state']

ct = make_column_transformer(
  (OneHotEncoder(), ['Parent Category',
                       'staff_pick']),
  remainder="passthrough"  
)
model = make_pipeline(
  ct,
  StandardScaler(with_mean=False),
  DecisionTreeClassifier(max_depth=5)
)

In [0]:
grid_search = GridSearchCV(
    model,
    param_grid={"decisiontreeclassifier__max_depth": range(1, 20)},
    scoring="f1_macro",
    cv=10
)

grid_search.fit(X_train, y_train)
grid_search.best_params_

{'decisiontreeclassifier__max_depth': 15}

In [0]:
ct = make_column_transformer(
  (OneHotEncoder(), ['Parent Category',
                       'staff_pick']),
  remainder="passthrough"  
)
model = make_pipeline(
  ct,
  StandardScaler(with_mean=False),
  DecisionTreeClassifier(max_depth=15)
)

cross_val_score(model, X=X_train, 
                            y=y_train,
                            scoring="f1_macro", cv=10).mean()

0.6879193388110117

**The best features for the Decision Tree Classifier are ['goal', 'Amount of Time Live', 'Parent Category', 'staff_pick'] with max depth = 15.**

## **Testing with other past projects to get test error**

### Ada Boost Classifier

In [0]:
df_test = pd.read_csv("/content/testProjects.csv")
df_test = df_test[df_test["state"] != "live"]
df_test.reset_index(inplace=True)

df_test["state"].replace("canceled", "failed", inplace=True)
df_test["state"].replace("suspended", "failed", inplace=True)

In [0]:
features = ['goal', 'Amount of Time Live', 'Parent Category',
                       'staff_pick', 'country']
X_train = df_projects[features]
y_train = df_projects['state']
X_test = df_test[features]

ct = make_column_transformer(
        (OneHotEncoder(), ['Parent Category', 'staff_pick', 'country']),
        remainder="passthrough"  
      )
adaboost_model = make_pipeline(
            ct,
            StandardScaler(with_mean=False),
            AdaBoostClassifier()
          )

adaboost_model.fit(X=X_train, y=y_train)
y_test = adaboost_model.predict(X_test)

In [0]:
df_results = pd.DataFrame({"y_test":y_test, "y_actual": df_test["state"]})

In [0]:
from sklearn.metrics import precision_score, recall_score

(precision_score(y_test, df_test["state"], pos_label='successful'),
 recall_score(y_test, df_test["state"], pos_label='successful'))

(0.8189539561913277, 0.7739754964089565)

In [0]:
(precision_score(y_test, df_test["state"], pos_label='failed'),
 recall_score(y_test, df_test["state"], pos_label='failed'))

(0.6071953010279001, 0.6712662337662337)

### KNeighbors Classifier

In [0]:
ct = make_column_transformer(
        (OneHotEncoder(), ['Parent Category', 'staff_pick', 'country']),
        remainder="passthrough"  
      )
kneighbor_model = make_pipeline(
            ct,
            StandardScaler(with_mean=False),
            KNeighborsClassifier(18)
          )

kneighbor_model.fit(X=X_train, y=y_train)
y_test = kneighbor_model.predict(X_test)

In [0]:
df_results = pd.DataFrame({"y_test":y_test, "y_actual": df_test["state"]})

In [0]:
(precision_score(y_test, df_test["state"], pos_label='successful'),
 recall_score(y_test, df_test["state"], pos_label='successful'))

(0.7344658024139472, 0.7648975791433892)

In [0]:
(precision_score(y_test, df_test["state"], pos_label='failed'),
 recall_score(y_test, df_test["state"], pos_label='failed'))

(0.6292217327459618, 0.5906271536871124)

### Decision Tree Classifier

In [0]:
features = ['goal', 'Amount of Time Live', 'Parent Category',
                       'staff_pick']
X_train = df_projects[features]
y_train = df_projects['state']
X_test = df_test[features]

ct = make_column_transformer(
        (OneHotEncoder(), ['Parent Category', 'staff_pick']),
        remainder="passthrough"  
      )
decisiontree_model = make_pipeline(
            ct,
            StandardScaler(with_mean=False),
            DecisionTreeClassifier(max_depth=15)
          )

decisiontree_model.fit(X=X_train, y=y_train)
y_test = decisiontree_model.predict(X_test)

In [0]:
df_results = pd.DataFrame({"y_test":y_test, "y_actual": df_test["state"]})

In [0]:
(precision_score(y_test, df_test["state"], pos_label='successful'),
 recall_score(y_test, df_test["state"], pos_label='successful'))

(0.7621814930710773, 0.7656039515042659)

In [0]:
(precision_score(y_test, df_test["state"], pos_label='failed'),
 recall_score(y_test, df_test["state"], pos_label='failed'))

(0.6167400881057269, 0.6122448979591837)

## **Results**

**Ada Boost Results:**


*   Features: ['goal', 'Amount of Time Live', 'Parent Category', 'staff_pick', 'country']
*   Precision for successful: 0.8189539561913277
*   Recall for successful: 0.7739754964089565
*   Precision for failed: 0.6071953010279001
*   Recall for failed: 0.6712662337662337



**KNeighbors Results:**


*   Features: ['goal', 'Amount of Time Live', 'Parent Category', 'staff_pick', 'country']
*   K = 18
*   Precision for successful: 0.7344658024139472
*   Recall for successful: 0.7648975791433892
*   Precision for failed: 0.6292217327459618
*   Recall for failed: 0.5906271536871124



**Decision Tree Results:**


*   Features: ['goal', 'Amount of Time Live', 'Parent Category', 'staff_pick']
*   max_depth = 15
*   Precision for successful: 0.7621814930710773 
*   Recall for successful: 0.7656039515042659
*   Precision for failed: 0.6167400881057269
*   Recall for failed: 0.6122448979591837



**Overall Results:**


*   Ada Boost Classifier has the best F1 score for both successful and recall

