In [2]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
import numpy as np
from scipy.sparse import vstack, hstack
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.dummy import DummyClassifier
from sklearn import metrics
from sklearn.preprocessing import OneHotEncoder

In [8]:
dataset = pd.read_csv('Output/data_clean&processed.csv')
dataset = dataset.drop('Unnamed: 0', axis = 1)

dataset.columns

Index(['backers_count', 'blurb', 'category', 'converted_pledged_amount',
       'country', 'created_at', 'current_currency', 'deadline', 'fx_rate',
       'goal', 'id', 'launched_at', 'location', 'name', 'pledged', 'profile',
       'slug', 'source_url', 'spotlight', 'staff_pick', 'state',
       'state_changed_at', 'static_usd_rate', 'urls', 'usd_pledged',
       'usd_type', 'story', 'faq', 'num_faq', 'comments', 'n_comments',
       'duration', 'parent_category', 'category_name', 'location_name',
       'month_launched', 'year_launched', 'backers_count_log', 'goal_log',
       'pledged_log', 'duration_log', 'n_comments_log', 'num_faq_bool',
       'avg_fund_per_backer', 'comments_afinn', 'blurb_name', 'faq_comments'],
      dtype='object')

In [10]:
dataset.head()

Unnamed: 0,backers_count,blurb,category,converted_pledged_amount,country,created_at,current_currency,deadline,fx_rate,goal,...,backers_count_log,goal_log,pledged_log,duration_log,n_comments_log,num_faq_bool,avg_fund_per_backer,comments_afinn,blurb_name,faq_comments
0,1,With your help we will create this device that...,"{'id': 331, 'name': '3D Printing', 'slug': 'te...",1,ES,2015-08-18 21:01,USD,2016-07-09 20:11,1.212886,15000,...,-1.390561,0.689521,-1.915118,-0.012145,-0.710711,0,1.0,-1.0,With your help we will create this device that...,[' ']No comments yet.
1,2,We at Ormiston Primary are looking at starting...,"{'id': 309, 'name': 'Farms', 'slug': 'food/far...",9,NZ,2015-08-11 18:04,USD,2015-09-11 15:55,0.723585,5000,...,-1.16855,0.107232,-1.25713,-0.012145,-0.710711,0,7.5,-1.0,We at Ormiston Primary are looking at starting...,[' ']No comments yet.
2,0,Self-taught aspiring metalsmith Looking for he...,"{'id': 54, 'name': 'Mixed Media', 'slug': 'art...",0,US,2015-04-28 21:14,USD,2015-05-28 21:14,1.0,10000,...,-1.77009,0.474607,-2.134448,-0.099817,-0.710711,0,0.0,-1.0,Self-taught aspiring metalsmith Looking for he...,[' ']No comments yet.
3,0,So many women believe they are past their prim...,"{'id': 278, 'name': 'People', 'slug': 'photogr...",0,US,2014-07-07 1:30,USD,2014-10-26 0:00,1.0,2000,...,-1.77009,-0.378322,-2.134448,-0.012145,-0.710711,0,0.0,-1.0,So many women believe they are past their prim...,[' ']No comments yet.
4,10,The Horror Zine's Jeani Rector brings us anoth...,"{'id': 324, 'name': 'Anthologies', 'slug': 'pu...",340,US,2014-11-04 16:30,USD,2014-12-09 9:20,1.0,2500,...,-0.457135,-0.260089,-0.289093,-0.012145,-0.23295,0,34.0,2.0,The Horror Zine's Jeani Rector brings us anoth...,[' ']Paula Limbaugh\nover 6 years ago\nSo so...


In [56]:
numerical_features = ['goal','backers_count_log','goal_log','pledged_log','duration_log','n_comments_log']

In [59]:
X = dataset[numerical_features]
y = dataset.state
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

logreg = LogisticRegression()
logreg.fit(X_train, y_train)
y_pred_class = logreg.predict(X_test)
y_pred_class.sum()
print('Accuracy: ', metrics.accuracy_score(y_test, y_pred_class))
print('Confusion Matrix: \n', metrics.confusion_matrix(y_test, y_pred_class))

Accuracy:  0.4089068825910931
Confusion Matrix: 
 [[101   0]
 [146   0]]


In [11]:
from sklearn import metrics
vect = CountVectorizer()
# define a function that accepts a vectorizer and calculates the accuracy
def tokenize_test(vect):
    
    # create document-term matrices using the vectorizer
    X_train_dtm = vect.fit_transform(X_train)
    X_test_dtm = vect.transform(X_test)
    print(X_train_dtm.shape)
    # print the number of features that were generated
    print('Features: ', X_train_dtm.shape[1])
    
    # use Multinomial Naive Bayes to predict the star rating
    logreg = LogisticRegression()
    logreg.fit(X_train_dtm, y_train)
    y_pred_class = logreg.predict(X_test_dtm)
    print(logreg.coef_)
    # print(y_test)
    print('Accuracy: ', metrics.accuracy_score(y_test, y_pred_class))
    print('Confusion Matrix: ', metrics.confusion_matrix(y_test, y_pred_class))
    # print('Recall: ', metrics.recall_score(y_test, y_pred_class))
    # print('Precision: ', metrics.precision_score(y_test, y_pred_class))
    # print('AUC Score: ', metrics.roc_auc_score(y_test, y_pred_class))
    # print('F1 Score: ', metrics.f1_score(y_test, y_pred_class))
    print('Classification Report: ', metrics.classification_report(y_test, y_pred_class))
    # metrics.plot_precision_recall_curve(y_test, y_pred_class)

In [55]:
# X = dataset.drop('state', axis=1)
text_cols = ['blurb', 'name', 'story', 'faq', 'comments', 'blurb_name', 'faq_comments']

for feature in text_cols:
    print(f"Feature is {feature}")
    X = dataset[feature]
    y = dataset.state

    # split X and y into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
    tokenize_test(vect)
    print('\n')

Feature is blurb
(738, 4239)
Features:  4239
[[-0.35048018 -0.07374228 -0.48957714 ... -0.16426767  0.09614386
   0.11923515]]
Accuracy:  0.6275303643724697
Confusion Matrix:  [[ 29  72]
 [ 20 126]]
Classification Report:                precision    recall  f1-score   support

           0       0.59      0.29      0.39       101
           1       0.64      0.86      0.73       146

    accuracy                           0.63       247
   macro avg       0.61      0.58      0.56       247
weighted avg       0.62      0.63      0.59       247



Feature is name
(738, 2198)
Features:  2198
[[ 0.13820037  0.07712327 -0.17978679 ...  0.17395432 -0.27290668
  -0.32933818]]
Accuracy:  0.6072874493927125
Confusion Matrix:  [[ 20  81]
 [ 16 130]]
Classification Report:                precision    recall  f1-score   support

           0       0.56      0.20      0.29       101
           1       0.62      0.89      0.73       146

    accuracy                           0.61       247
   macro