In [1]:
#import linear algebra and data manipulation libraries
import numpy as np
import pandas as pd

from datetime import datetime

#import standard visualization
import matplotlib.pyplot as plt
import seaborn as sns
import mlxtend
from mlxtend.plotting import plot_confusion_matrix
from mlxtend.preprocessing import  minmax_scaling

#import machine learning models
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

import xgboost as xgb
from xgboost.sklearn import XGBClassifier
from sklearn.svm import SVC as svm

from sklearn.neural_network import MLPClassifier 

from sklearn.model_selection import train_test_split #split
#import metrics
from sklearn.preprocessing import StandardScaler, MaxAbsScaler
from sklearn.metrics import accuracy_score, roc_auc_score 

  import pandas.util.testing as tm


# https://www.kaggle.com/kemical/kickstarter-projects

**Columns**<br />

**ID** - internal kickstarter id<br />
**name** - name of project - A project is a finite work with a clear goal that you’d like to bring to life. Think albums, books, or films.<br />
**category** - category<br />
**main_category** - category of campaign<br />
**currency** - currency used to support<br />
**deadline** - deadline for crowdfunding<br />
**goal** - fundraising goal - The funding goal is the amount of money that a creator needs to complete their project.<br />
**launched** - date launched<br />
**pledged** - amount pledged by "crowd"<br />
**state** - Current condition the project is in<br />
**backers** - number of backers<br />
**country** - country pledged from<br />
**usd pledged** - amount of money pledged<br />
**goal** - amount in USD<br />

In [2]:
df = pd.read_csv('kickstarter-projects/ks-projects-201801.csv', parse_dates=['deadline', 'launched'])
df.head(20)

Unnamed: 0,ID,name,category,main_category,currency,deadline,goal,launched,pledged,state,backers,country,usd pledged,usd_pledged_real,usd_goal_real
0,1000002330,The Songs of Adelaide & Abullah,Poetry,Publishing,GBP,2015-10-09,1000.0,2015-08-11 12:12:28,0.0,failed,0,GB,0.0,0.0,1533.95
1,1000003930,Greeting From Earth: ZGAC Arts Capsule For ET,Narrative Film,Film & Video,USD,2017-11-01,30000.0,2017-09-02 04:43:57,2421.0,failed,15,US,100.0,2421.0,30000.0
2,1000004038,Where is Hank?,Narrative Film,Film & Video,USD,2013-02-26,45000.0,2013-01-12 00:20:50,220.0,failed,3,US,220.0,220.0,45000.0
3,1000007540,ToshiCapital Rekordz Needs Help to Complete Album,Music,Music,USD,2012-04-16,5000.0,2012-03-17 03:24:11,1.0,failed,1,US,1.0,1.0,5000.0
4,1000011046,Community Film Project: The Art of Neighborhoo...,Film & Video,Film & Video,USD,2015-08-29,19500.0,2015-07-04 08:35:03,1283.0,canceled,14,US,1283.0,1283.0,19500.0
5,1000014025,Monarch Espresso Bar,Restaurants,Food,USD,2016-04-01,50000.0,2016-02-26 13:38:27,52375.0,successful,224,US,52375.0,52375.0,50000.0
6,1000023410,Support Solar Roasted Coffee & Green Energy! ...,Food,Food,USD,2014-12-21,1000.0,2014-12-01 18:30:44,1205.0,successful,16,US,1205.0,1205.0,1000.0
7,1000030581,Chaser Strips. Our Strips make Shots their B*tch!,Drinks,Food,USD,2016-03-17,25000.0,2016-02-01 20:05:12,453.0,failed,40,US,453.0,453.0,25000.0
8,1000034518,SPIN - Premium Retractable In-Ear Headphones w...,Product Design,Design,USD,2014-05-29,125000.0,2014-04-24 18:14:43,8233.0,canceled,58,US,8233.0,8233.0,125000.0
9,100004195,STUDIO IN THE SKY - A Documentary Feature Film...,Documentary,Film & Video,USD,2014-08-10,65000.0,2014-07-11 21:55:48,6240.57,canceled,43,US,6240.57,6240.57,65000.0


In [3]:
#Creating new feature project days 
df['project_days'] = df['deadline'] - df['launched']

df['project_days'] = pd.to_numeric(df['project_days'].dt.days, downcast='integer')

In [4]:
#drop values where project days is less than 92
df = df[df['project_days'] <=92]

In [5]:
# Keeping only two values in state column
df = df.loc[df['state'].isin(['failed','successful'])]

targ_dict = {'failed': 0,
             'successful': 1 
            }

df['state'] = df['state'].map(targ_dict)

In [6]:
#Need to drop values "N,0" in country column
df['country'].unique()
df = df[df['country'] !='N,0"']

In [7]:
#Creating new features - day and moth where project was created
df = df.assign(launched_project_day=df.launched.dt.day,
               launched_project_month=df.launched.dt.month)
               

In [8]:
#Combining values in country column and assigning values to US and Others
countries_without_US = ['GB', 'CA', 'AU', 'DE', 'FR', 'NL', 'IT', 'ES', 'SE', 'MX', 'NZ', 'DK', 'IE', 'HK', 'NO',
                        'CH', 'SG', 'BE', 'AT', 'LU', 'JP']
df.country.replace(countries_without_US, 'Others', inplace=True)
df['country_US'] = np.where(df.country=='US', 1, 0)

In [9]:
# Filtering dataframe to remove outliers
df = df[((df['usd_pledged_real'] - df['usd_pledged_real'].mean()) / df['usd_pledged_real'].std()).abs() < .099]

In [10]:
# Assigning new values to main_category column
digit_weight = {'Games': 382.5,
                'Design':274.6,
                'Technology':186.5,
                'Comics': 145.6,
                'Film & Video':72.6,
                'Fashion':68.5,
                'Publishing': 61.9,   
                'Food': 59.0,
                'Music':58.2, 
                'Theater': 49.2,
                'Art': 45.5,
                'Dance': 44.8,
                'Photography':43.3,
                'Journalism': 43.2, 
                'Crafts': 29.9}     
 
df['main_category'] = df['main_category'].map(digit_weight)

In [11]:
 #######MENI TSE NE NRAVITSYA#######
    
    
    
    
    
# digital_weight_combined = ['Film & Video', 'Fashion', 'Publishing', 'Food', 'Music', 'Theater', 'Art', 'Dance',
#                            'Photography', 'Journalism', 'Crafts']
# df.main_category.replace(digital_weight_combined, 'Others', inplace=True)

# digit_weight = {'Games': 382.5,
#                 'Design':274.6,
#                 'Technology':186.5,
#                 'Comics': 145.6,
#                 'Others': 576.1}  


 
# df['main_category'] = df['main_category'].map(digit_weight)

In [12]:
#Dropping unnecessary columns
cols_to_drop= ['ID','name','category','currency', 'deadline', 'goal',
          'launched', 'pledged', 'backers', 'usd pledged', 'usd_pledged_real', 'country']
#dropping list of columns from final_df
df.drop(cols_to_drop, axis=1, inplace=True)

In [13]:
df

Unnamed: 0,main_category,state,usd_goal_real,project_days,launched_project_day,launched_project_month,country_US
1,72.6,0,30000.00,59,2,9,1
6,59.0,1,1000.00,19,1,12,1
7,59.0,0,25000.00,44,1,2,1
11,58.2,1,12500.00,29,9,3,1
15,274.6,0,2500.00,29,29,1,1
...,...,...,...,...,...,...,...
378646,61.9,1,950.00,42,18,10,1
378647,61.9,0,4999.00,29,15,8,1
378651,58.2,1,4529.81,29,20,2,0
378652,72.6,1,2675.19,29,29,3,0


In [14]:
#Scaling data using min_max)scaling
df.usd_goal_real = minmax_scaling(df.usd_goal_real, columns = [0])
df.main_category = minmax_scaling(df.main_category, columns = [0])


In [15]:
#Saving our dataframe
df.to_csv('prepared_df.csv')

In [16]:
df.state.value_counts()

1    104561
0     61978
Name: state, dtype: int64

In [23]:
#Splitting our data to train, test, validation
X_train, X_test, y_train, y_test = train_test_split(df.drop('state', axis=1), df['state'], 
                                                    test_size=.20, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=.1, random_state=42)

In [19]:
print(f'Train size: {X_train.shape}')
print(f'Test size: {X_test.shape}')
print(f'Validation size: {X_val.shape}')


Train size: (119907, 6)
Test size: (33308, 6)
Validation size: (13324, 6)


In [20]:
%%time 
#using the time function to see how long it takes each classifier to run

# declare a logistic regression classifier
lr = LogisticRegression(penalty='l2', solver='liblinear') #liblinear supports l2 regularization

# Fit the model
fit = lr.fit(X_train, y_train)

pred_y = lr.predict(X_test)


lrs = round(lr.score(X_test, y_test)*100,2) #format accuracy score
print('\nAccuracy Percentage:', lrs)

#y_score = logreg.fit(X_res, y_res).decision_function(X_res)
#y_pred = logreg.fit(X_res, y_res).predict(X_test)


Accuracy Percentage: 64.03
Wall time: 637 ms


In [21]:
print('\n\n\n', classification_report(y_test, pred_y))




               precision    recall  f1-score   support

           0       0.53      0.22      0.31     12296
           1       0.66      0.89      0.76     21012

    accuracy                           0.64     33308
   macro avg       0.60      0.55      0.53     33308
weighted avg       0.61      0.64      0.59     33308



In [22]:
y_pred_1 = lr.predict(X_val)

probability = lr.predict_proba(X_val)

print(classification_report(y_val, y_pred_1))

ACC_valid_KNN = accuracy_score(y_val, y_pred_1)
print(ACC_valid_KNN)

print(roc_auc_score(y_val, y_pred_1))


              precision    recall  f1-score   support

           0       0.52      0.22      0.31      4894
           1       0.66      0.88      0.76      8430

    accuracy                           0.64     13324
   macro avg       0.59      0.55      0.53     13324
weighted avg       0.61      0.64      0.59     13324

0.6402731912338637
0.5518870759993233
