In [114]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
import numpy as np
from scipy.sparse import vstack, hstack
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.dummy import DummyClassifier
from sklearn import metrics
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import OneHotEncoder

In [115]:
dataset = pd.read_csv('Output/Combined_dataset.csv')
dataset = dataset.drop('Unnamed: 0', axis=1)
dataset['duration'] = pd.to_numeric(pd.to_timedelta(dataset['duration']).dt.days, downcast='integer')
dataset['num_faq'] = dataset['num_faq'].astype(int)
dataset['n_comments'] = dataset['n_comments'].str.replace(',', '').astype(float)

dataset.columns

Index(['backers_count', 'blurb', 'category', 'converted_pledged_amount',
       'country', 'created_at', 'current_currency', 'deadline', 'fx_rate',
       'goal', 'id', 'launched_at', 'location', 'name', 'pledged', 'profile',
       'slug', 'source_url', 'spotlight', 'staff_pick', 'state',
       'state_changed_at', 'static_usd_rate', 'urls', 'usd_pledged',
       'usd_type', 'story', 'faq', 'num_faq', 'comments', 'n_comments',
       'duration'],
      dtype='object')

In [116]:
# Feature Engineering
# ['blurb', 'name', 'story', 'faq', 'comments']
dataset['blurb_name'] = dataset['blurb'] + dataset['name']
dataset['blurb_story'] = dataset['blurb'] + dataset['story']
dataset['name_faq'] = dataset['name'] + dataset['faq']
dataset['name_comments'] = dataset['name'] + dataset['comments']
dataset['name_faq_comments'] = dataset['faq'] + dataset['comments']

In [117]:
# Fill na
dataset.story = dataset.story.fillna('na')


# Decide whether to drop these columns later
# cols_to_drop = ['profile', 'category', 'created_at', 'location', 'current_currency', 'deadline', 'id', 'launched_at', 'slug', 'source_url', 'state_changed_at', 'urls', 'static_usd_rate', 'usd_pledged', 'converted_pledged_amount', 'spotlight']

# dataset = dataset.drop(cols_to_drop, axis=1)

In [124]:
# from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
vect = CountVectorizer()
# define a function that accepts a vectorizer and calculates the accuracy
def tokenize_test(vect):
    
    # create document-term matrices using the vectorizer
    X_train_dtm = vect.fit_transform(X_train)
    X_test_dtm = vect.transform(X_test)
    print(X_train_dtm.shape)
    # print the number of features that were generated
    print('Features: ', X_train_dtm.shape[1])
    
    # use Multinomial Naive Bayes to predict the star rating
    logreg = LogisticRegression()
    logreg.fit(X_train_dtm, y_train)
    y_pred_class = logreg.predict(X_test_dtm)
    # print(y_test)
    print('Accuracy: ', metrics.accuracy_score(y_test, y_pred_class))
    print('Confusion Matrix: ', metrics.confusion_matrix(y_test, y_pred_class))
    # print('Recall: ', metrics.recall_score(y_test, y_pred_class))
    # print('Precision: ', metrics.precision_score(y_test, y_pred_class))
    # print('AUC Score: ', metrics.roc_auc_score(y_test, y_pred_class))
    # print('F1 Score: ', metrics.f1_score(y_test, y_pred_class))
    print('Classification Report: ', metrics.classification_report(y_test, y_pred_class))
    # metrics.plot_precision_recall_curve(y_test, y_pred_class)

In [135]:
# X = dataset.drop('state', axis=1)
text_cols = ['blurb', 'name', 'story', 'faq', 'comments', 'blurb_name', 'blurb_story','name_faq', 'name_comments', 'name_faq_comments']

for feature in text_cols:
    print(f"Feature is {feature}")
    X = dataset[feature]
    y = dataset.state

    # split X and y into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
    tokenize_test(vect)
    print('\n')

Feature is blurb
(750, 4228)
Features:  4228
Accuracy:  0.648
Confusion Matrix:  [[ 33  66]
 [ 22 129]]
Classification Report:                precision    recall  f1-score   support

      failed       0.60      0.33      0.43        99
  successful       0.66      0.85      0.75       151

    accuracy                           0.65       250
   macro avg       0.63      0.59      0.59       250
weighted avg       0.64      0.65      0.62       250



Feature is name
(750, 2164)
Features:  2164
Accuracy:  0.584
Confusion Matrix:  [[ 12  87]
 [ 17 134]]
Classification Report:                precision    recall  f1-score   support

      failed       0.41      0.12      0.19        99
  successful       0.61      0.89      0.72       151

    accuracy                           0.58       250
   macro avg       0.51      0.50      0.45       250
weighted avg       0.53      0.58      0.51       250



Feature is story
(750, 32088)
Features:  32088
Accuracy:  0.696
Confusion Matrix:  [[ 4

In [127]:
from afinn import Afinn
afinn = Afinn()
afinn_scores = [afinn.score(text) for text in dataset.story]
dataset['afinn'] = afinn_scores

In [128]:
dataset.head()

Unnamed: 0,backers_count,blurb,category,converted_pledged_amount,country,created_at,current_currency,deadline,fx_rate,goal,...,num_faq,comments,n_comments,duration,blurb_name,blurb_story,name_faq,name_comments,name_faq_comments,afinn
0,1,With your help we will create this device that...,"{""id"":331,""name"":""3D Printing"",""slug"":""technol...",1,ES,2015-08-18 21:01,USD,2016-07-09 20:11,1.212886,15000,...,0,Only backers can post comments. Log in\nNo com...,0.0,30,With your help we will create this device that...,With your help we will create this device that...,Save water 100% liquid downloads in toilets at...,Save water 100% liquid downloads in toilets at...,[' ']Only backers can post comments. Log in\...,-10.0
1,2,We at Ormiston Primary are looking at starting...,"{""id"":309,""name"":""Farms"",""slug"":""food/farms"",""...",9,NZ,2015-08-11 18:04,USD,2015-09-11 15:55,0.723585,5000,...,0,Only backers can post comments. Log in\nNo com...,0.0,30,We at Ormiston Primary are looking at starting...,We at Ormiston Primary are looking at starting...,Ormiston Primary Community Garden[' '],Ormiston Primary Community GardenOnly backers ...,[' ']Only backers can post comments. Log in\...,15.0
2,0,Self-taught aspiring metalsmith Looking for he...,"{""id"":54,""name"":""Mixed Media"",""slug"":""art/mixe...",0,US,2015-04-28 21:14,USD,2015-05-28 21:14,1.0,10000,...,0,Only backers can post comments. Log in\nNo com...,0.0,29,Self-taught aspiring metalsmith Looking for he...,Self-taught aspiring metalsmith Looking for he...,"Aspiring metalsmith in need of better tools, a...","Aspiring metalsmith in need of better tools, a...",[' ']Only backers can post comments. Log in\...,19.0
3,0,So many women believe they are past their prim...,"{""id"":278,""name"":""People"",""slug"":""photography/...",0,US,2014-07-07 1:30,USD,2014-10-26 0:00,1.0,2000,...,0,Only backers can post comments. Log in\nNo com...,0.0,30,So many women believe they are past their prim...,So many women believe they are past their prim...,Beauty At Any Age[' '],Beauty At Any AgeOnly backers can post comment...,[' ']Only backers can post comments. Log in\...,14.0
4,10,The Horror Zine's Jeani Rector brings us anoth...,"{""id"":324,""name"":""Anthologies"",""slug"":""publish...",340,US,2014-11-04 16:30,USD,2014-12-09 9:20,1.0,2500,...,0,Only backers can post comments. Log in\nPaula ...,1.0,30,The Horror Zine's Jeani Rector brings us anoth...,The Horror Zine's Jeani Rector brings us anoth...,Shrieks and Shivers from the Horror Zine[' '],Shrieks and Shivers from the Horror ZineOnly b...,[' ']Only backers can post comments. Log in\...,2.0


In [129]:
# import spacy
# nlp = spacy.load("en_core_web_sm")

# nlp(dataset.story[0])