In [13]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
import numpy as np
from scipy.sparse import vstack, hstack
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.dummy import DummyClassifier
from sklearn import metrics
from sklearn.preprocessing import OneHotEncoder

In [14]:
# Load the cleaned, sentiment-augmented project table produced by the earlier
# processing steps (path is relative to the notebook's working directory).
dataset = pd.read_csv(
    filepath_or_buffer='Output/data_clean&processed_addedsentiment.csv',
)
# NOTE(review): an earlier version dropped an 'Unnamed: 0' column here; the
# column listing shown for this cell contains no such column, so nothing is
# dropped - confirm against the current CSV export.

# Show the available columns so the feature lists below can be checked.
dataset.columns

Index(['id_row', 'backers_count', 'blurb', 'category',
       'converted_pledged_amount', 'country', 'created_at', 'current_currency',
       'deadline', 'fx_rate', 'goal', 'id', 'launched_at', 'location', 'name',
       'pledged', 'profile', 'slug', 'source_url', 'spotlight', 'staff_pick',
       'state', 'state_changed_at', 'static_usd_rate', 'urls', 'usd_pledged',
       'usd_type', 'story', 'faq', 'num_faq', 'comments', 'n_comments',
       'duration', 'parent_category', 'category_name', 'location_name',
       'month_launched', 'year_launched', 'backers_count_log', 'goal_log',
       'pledged_log', 'duration_log', 'n_comments_log', 'num_faq_bool',
       'avg_fund_per_backer', 'comments_afinn', 'blurb_name', 'faq_comments',
       'story_afinn', 'faq_comments_afinn', 'blurb_name_afinn', 'blurb_afinn',
       'name_afinn'],
      dtype='object')

In [15]:
# Numeric columns fed to the logistic-regression models below.
# NOTE(review): the pledge/backer columns look like quantities that are only
# known once a campaign has finished, which would leak the target (`state`)
# into the features - confirm this is intended before trusting the accuracy.
numerical_features = [
    'goal',
    'converted_pledged_amount',
    'backers_count',
    'backers_count_log',
    'goal_log',
    'pledged_log',
    'duration_log',
    'avg_fund_per_backer',
    'blurb_afinn',
    'name_afinn',
]

In [16]:
# Baseline model: logistic regression with scikit-learn's default settings,
# trained on the numerical features only.
X = dataset[numerical_features]
y = dataset.state
# Fixed seed keeps the train/test partition identical between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

numerical_logreg = LogisticRegression()
numerical_logreg.fit(X_train, y_train)
y_pred_class = numerical_logreg.predict(X_test)

print('Accuracy: ', metrics.accuracy_score(y_test, y_pred_class))
print('Confusion Matrix: \n', metrics.confusion_matrix(y_test, y_pred_class))

# One coefficient per training column (coef_[0] is the single row of weights
# for this two-class problem), ranked by absolute magnitude. The original
# padded the name list with extra index strings that zip() always discarded;
# pairing columns and coefficients directly yields the same table.
pd.DataFrame(
    {
        'features': list(X_train.columns),
        'coefficients': numerical_logreg.coef_[0],
    }
).sort_values(by='coefficients', ascending=False, key=abs)

Accuracy:  0.9157894736842105
Confusion Matrix: 
 [[281  31]
 [ 41 502]]


Unnamed: 0,features,coefficients
5,pledged_log,0.752619
3,backers_count_log,0.721604
4,goal_log,-0.515935
6,duration_log,-0.302962
8,blurb_afinn,-0.136756
9,name_afinn,-0.086673
2,backers_count,0.012102
7,avg_fund_per_backer,0.000727
1,converted_pledged_amount,0.000367
0,goal,-9e-06


In [17]:
# L1-regularised logistic regression on the same numerical features. The
# split is recomputed here so the cell can be run without the baseline cell.
X = dataset[numerical_features]
y = dataset.state
# Fixed seed keeps the train/test partition identical between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

# Candidate inverse-regularisation strengths. Not used in this cell: only
# C=0.01 is fitted below. Kept because other cells may read it.
c_values = [100, 10, 1.0, 0.5, 0.1, 0.01]

numerical_logreg = LogisticRegression(
    random_state=42,
    solver='liblinear',
    penalty='l1',  # L1 can drive coefficients to exactly zero
    dual=False,
    tol=0.0001,
    C=0.01,
    fit_intercept=True,
    intercept_scaling=1.0,
    class_weight=None,
)
numerical_logreg.fit(X_train, y_train)
y_pred_class = numerical_logreg.predict(X_test)

print('Accuracy: ', metrics.accuracy_score(y_test, y_pred_class))
print('Confusion Matrix: \n', metrics.confusion_matrix(y_test, y_pred_class))

# One coefficient per training column (coef_[0] is the single row of weights
# for this two-class problem), ranked by absolute magnitude. The original
# padded the name list with extra index strings that zip() always discarded;
# pairing columns and coefficients directly yields the same table.
pd.DataFrame(
    {
        'features': list(X_train.columns),
        'coefficients': numerical_logreg.coef_[0],
    }
).sort_values(by='coefficients', ascending=False, key=abs)

Accuracy:  0.9461988304093567
Confusion Matrix: 
 [[267  45]
 [  1 542]]


Unnamed: 0,features,coefficients
4,goal_log,-0.94788
3,backers_count_log,0.849272
5,pledged_log,0.795388
8,blurb_afinn,-0.024368
2,backers_count,0.023356
7,avg_fund_per_backer,0.001126
1,converted_pledged_amount,-1.1e-05
0,goal,-8e-06
6,duration_log,0.0
9,name_afinn,0.0


In [18]:
import pickle

# Persist the most recently fitted model to disk. The context manager closes
# the file even if pickling raises, which the manual open()/close() pair
# did not guarantee.
# NOTE(review): pickled scikit-learn estimators are generally tied to the
# library version that wrote them - record the version alongside the file.
with open('logreg.pckl', 'wb') as f:
    pickle.dump(numerical_logreg, f)