In [1]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
import numpy as np
from scipy.sparse import vstack, hstack
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.dummy import DummyClassifier
from sklearn import metrics
from sklearn.preprocessing import OneHotEncoder

In [2]:
# Load the cleaned/processed CSV that already carries the sentiment columns.
# The path is relative to the notebook's working directory.
csv_path = 'Output/data_clean&processed_addedsentiment.csv'
dataset = pd.read_csv(csv_path)
# (If the CSV had been written with its index, an 'Unnamed: 0' column
#  would have to be dropped here.)

# Show which columns are available for modelling.
dataset.columns

Index(['backers_count', 'blurb', 'category', 'converted_pledged_amount',
       'country', 'created_at', 'current_currency', 'deadline', 'fx_rate',
       'goal', 'id', 'launched_at', 'location', 'name', 'pledged', 'profile',
       'slug', 'source_url', 'spotlight', 'staff_pick', 'state',
       'state_changed_at', 'static_usd_rate', 'urls', 'usd_pledged',
       'usd_type', 'story', 'faq', 'num_faq', 'comments', 'n_comments',
       'duration', 'parent_category', 'category_name', 'location_name',
       'month_launched', 'year_launched', 'backers_count_log', 'goal_log',
       'pledged_log', 'duration_log', 'n_comments_log', 'num_faq_bool',
       'avg_fund_per_backer', 'comments_afinn', 'blurb_name', 'faq_comments',
       'story_afinn', 'faq_comments_afinn', 'blurb_name_afinn', 'blurb_afinn',
       'name_afinn'],
      dtype='object')

In [3]:
# Preview the first five rows (the default for head) as a sanity check on the load.
dataset.head(n=5)

Unnamed: 0,backers_count,blurb,category,converted_pledged_amount,country,created_at,current_currency,deadline,fx_rate,goal,...,num_faq_bool,avg_fund_per_backer,comments_afinn,blurb_name,faq_comments,story_afinn,faq_comments_afinn,blurb_name_afinn,blurb_afinn,name_afinn
0,1,With your help we will create this device that...,"{'id': 331, 'name': '3D Printing', 'slug': 'te...",1,ES,2015-08-18 21:01,USD,2016-07-09 20:11,1.212886,15000,...,0,1.0,-1.0,With your help we will create this device that...,[' ']No comments yet.,-10.0,-1.0,4.0,2.0,2.0
1,2,We at Ormiston Primary are looking at starting...,"{'id': 309, 'name': 'Farms', 'slug': 'food/far...",9,NZ,2015-08-11 18:04,USD,2015-09-11 15:55,0.723585,5000,...,0,7.5,-1.0,We at Ormiston Primary are looking at starting...,[' ']No comments yet.,15.0,-1.0,1.0,1.0,0.0
2,0,Self-taught aspiring metalsmith Looking for he...,"{'id': 54, 'name': 'Mixed Media', 'slug': 'art...",0,US,2015-04-28 21:14,USD,2015-05-28 21:14,1.0,10000,...,0,0.0,-1.0,Self-taught aspiring metalsmith Looking for he...,[' ']No comments yet.,25.0,-1.0,4.0,2.0,2.0
3,0,So many women believe they are past their prim...,"{'id': 278, 'name': 'People', 'slug': 'photogr...",0,US,2014-07-07 1:30,USD,2014-10-26 0:00,1.0,2000,...,0,0.0,-1.0,So many women believe they are past their prim...,[' ']No comments yet.,14.0,-1.0,3.0,0.0,3.0
4,10,The Horror Zine's Jeani Rector brings us anoth...,"{'id': 324, 'name': 'Anthologies', 'slug': 'pu...",340,US,2014-11-04 16:30,USD,2014-12-09 9:20,1.0,2500,...,0,34.0,2.0,The Horror Zine's Jeani Rector brings us anoth...,[' ']Paula Limbaugh\nover 6 years ago\nSo so...,2.0,2.0,4.0,4.0,0.0


In [4]:
# Numeric columns fed to the baseline logistic regression.
numerical_features = [
    # raw amounts and counts
    'goal', 'converted_pledged_amount', 'backers_count',
    # log-transformed columns
    'backers_count_log', 'goal_log', 'pledged_log', 'duration_log', 'n_comments_log',
    # derived ratio, comment sentiment, FAQ flag
    'avg_fund_per_backer', 'comments_afinn', 'num_faq_bool',
    # AFINN sentiment scores of the text fields
    'story_afinn', 'faq_comments_afinn', 'blurb_name_afinn', 'blurb_afinn', 'name_afinn',
]

# Free-text columns, each vectorised and scored on its own further down.
text_features = [
    'blurb',
    'name',
    'story',
    'faq',
    'comments',
    'blurb_name',
    'faq_comments',
]
# categorical_features = ['country']
# ['', '', 'category', '',
#        '', 'created_at', 'current_currency', 'deadline', 'fx_rate',
#        'goal', 'id', 'launched_at', 'location', 'name', 'pledged', 'profile',
#        'slug', 'source_url', 'spotlight', 'staff_pick', 'state',
#        'state_changed_at', 'static_usd_rate', 'urls', 'usd_pledged',
#        'usd_type', 'story', 'faq', 'num_faq', 'comments', 'n_comments',
#        'duration', 'parent_category', 'category_name', 'location_name',
#        'month_launched', 'year_launched', '', '',
#        '', '', '', '',
#        '', '', '', '']

In [10]:
# Baseline model: logistic regression on the numeric columns only.
X = dataset[numerical_features]
y = dataset.state
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

# NOTE(review): several inputs ('pledged_log', 'converted_pledged_amount',
# 'backers_count', ...) look like quantities only known once a campaign has
# finished, which would leak the target `state` - confirm before trusting the
# accuracy below.
numerical_logreg = LogisticRegression()
numerical_logreg.fit(X_train, y_train)
y_pred_class = numerical_logreg.predict(X_test)
print('Accuracy: ', metrics.accuracy_score(y_test, y_pred_class))
print('Confusion Matrix: \n', metrics.confusion_matrix(y_test, y_pred_class))

# Pair every training column with its fitted coefficient (coef_[0] is the
# first coefficient row of the model) and rank by absolute magnitude.
# NOTE(review): the inputs are not standardised, so magnitudes are not
# directly comparable across columns.
pd.DataFrame(
    {
        'features': list(X_train.columns),
        'coefficients': numerical_logreg.coef_[0],
    }
).sort_values(
    by='coefficients',
    ascending=False,
    key=abs,
)

Accuracy:  0.8866396761133604
Confusion Matrix: 
 [[ 79  22]
 [  6 140]]


Unnamed: 0,features,coefficients
2,backers_count,0.048901
11,story_afinn,-0.015452
5,pledged_log,0.013878
3,backers_count_log,0.012427
4,goal_log,-0.00934
13,blurb_name_afinn,-0.008537
14,blurb_afinn,-0.008341
6,duration_log,-0.006152
9,comments_afinn,0.005985
7,n_comments_log,0.004979


In [32]:
from sklearn import metrics
vect = CountVectorizer()


def tokenize_test(vect):
    """Fit a bag-of-words logistic regression and print its test-set metrics.

    The split is NOT passed in: ``X_train``, ``X_test``, ``y_train`` and
    ``y_test`` are read from the notebook's global namespace, so they must be
    bound to the desired split before each call.

    Parameters
    ----------
    vect
        A vectorizer exposing ``fit_transform`` / ``transform`` and
        ``get_feature_names_out`` (or the older ``get_feature_names``).
        It is re-fitted on ``X_train`` on every call.

    Returns
    -------
    None
        Accuracy, confusion matrix, classification report and the shape of
        the training document-term matrix (printed twice, as before) go to
        stdout.
    """
    # create document-term matrices using the vectorizer
    X_train_dtm = vect.fit_transform(X_train)
    X_test_dtm = vect.transform(X_test)

    # Fit Log reg
    logreg = LogisticRegression()
    logreg.fit(X_train_dtm, y_train)
    y_pred_class = logreg.predict(X_test_dtm)
    print('Accuracy: ', metrics.accuracy_score(y_test, y_pred_class))
    print('Confusion Matrix: ', metrics.confusion_matrix(y_test, y_pred_class))
    print('Classification Report: ', metrics.classification_report(y_test, y_pred_class))
    print(X_train_dtm.shape)

    # get_feature_names() was removed in scikit-learn 1.2; use its replacement
    # when available and fall back for older installations.
    if hasattr(vect, 'get_feature_names_out'):
        feature_names = vect.get_feature_names_out()
    else:
        feature_names = vect.get_feature_names()

    # Report the (documents, vocabulary size) shape the dense term table would
    # have, without materialising the sparse matrix as a dense DataFrame.
    print((X_train_dtm.shape[0], len(feature_names)))

In [33]:
# Score every free-text column on its own with the bag-of-words model.
y = dataset.state  # the target does not depend on the column, so bind it once
for feature in text_features:
    print(f"Feature is {feature}")
    X = dataset[feature]

    # tokenize_test reads X_train / X_test / y_train / y_test from the
    # notebook namespace, so the split is rebound before each call.
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

    tokenize_test(vect)
    print('\n')

Feature is blurb
Accuracy:  0.6275303643724697
Confusion Matrix:  [[ 29  72]
 [ 20 126]]
Classification Report:                precision    recall  f1-score   support

           0       0.59      0.29      0.39       101
           1       0.64      0.86      0.73       146

    accuracy                           0.63       247
   macro avg       0.61      0.58      0.56       247
weighted avg       0.62      0.63      0.59       247

(738, 4239)
(738, 4239)


Feature is name
Accuracy:  0.6072874493927125
Confusion Matrix:  [[ 20  81]
 [ 16 130]]
Classification Report:                precision    recall  f1-score   support

           0       0.56      0.20      0.29       101
           1       0.62      0.89      0.73       146

    accuracy                           0.61       247
   macro avg       0.59      0.54      0.51       247
weighted avg       0.59      0.61      0.55       247

(738, 2198)
(738, 2198)


Feature is story
Accuracy:  0.6882591093117408
Confusion Matrix:  [[ 