In [1]:
!pip install vaderSentiment

import pandas as pd
import numpy as np
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer



# Hard encode Input Dictionary

In [2]:
input = {
    "name": "terrible MegaBuster from Megaman X",
    "goal": 10000,
    "launched": "2015-08-11",
    "deadline": "2015-08-18",
    "backers":21,
    "main_category": 11,
    "username": "LoginID"
}

# Make a function that takes in input dict and converts to dataframe

In [3]:
def framemaker(web_in):
# making dataframe out of dict  
  input_frame = pd.DataFrame(web_in, index=[0])

# changing datatype of start and end to date time
# adding column length of campaign
  input_frame['deadline'] = pd.to_datetime(input_frame['deadline'])
  input_frame['launched'] = pd.to_datetime(input_frame['launched'])
  input_frame['length_of_campaign'] = (input_frame['deadline'] - input_frame['launched']).dt.days

# Using a pretrained neural network to encode title to numbers
# Adding numbers to column as sentiments
  sentiments =[] 
  analyzer = SentimentIntensityAnalyzer()
  for sentence in input_frame['name']:
    vs = analyzer.polarity_scores(sentence)
    sentiments.append(vs['compound'])
  input_frame['sentiments'] = sentiments
  
  # input_frame['goal'] = (input_frame['goal'].str.split()).apply(lambda x: float(x[0].replace(',', '')))
  # input_frame['backers']= input_frame['backers'].astype(str).astype(int)

  # Dropping unecessary username column
  input_frame = input_frame.drop('username', axis=1)
  input_frame = input_frame.drop('name', axis=1)
  input_frame = input_frame.drop('launched', axis=1)
  input_frame = input_frame.drop('deadline', axis=1)

  input_frame = input_frame[['goal', 'backers', 'length_of_campaign', 'sentiments', 'main_category']]

  userinput = input_frame.iloc[[0]]

  return userinput 

In [4]:
user_input = framemaker(input)

In [5]:
user_input

Unnamed: 0,goal,backers,length_of_campaign,sentiments,main_category
0,10000,21,7,-0.4767,11


# Make function that takes in dataframe, uses model, and can make a prediction

In [6]:
!pip install category_encoders==2.*
from sklearn.model_selection import train_test_split
import category_encoders as ce
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegressionCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from category_encoders import OneHotEncoder, OrdinalEncoder




  import pandas.util.testing as tm


In [22]:
df = pd.read_csv('cleaned_kickstarter_data.csv')

print(df.shape)
df

(999, 9)


Unnamed: 0,name,main_category,deadline,launched,goal,backers,length_of_campaign,project_success,sentiments
0,The Songs of Adelaide & Abullah,1,2015-10-09 11:36:00,2015-08-11 12:12:28,1000.0,0,58,0,0.0000
1,Where is Hank?,2,2013-02-26 00:20:50,2013-01-12 00:20:50,45000.0,3,45,0,0.0000
2,ToshiCapital Rekordz Needs Help to Complete Album,3,2012-04-16 04:24:11,2012-03-17 03:24:11,5000.0,1,30,0,0.4019
3,Community Film Project: The Art of Neighborhoo...,2,2015-08-29 01:00:00,2015-07-04 08:35:03,19500.0,14,55,0,0.0000
4,Monarch Espresso Bar,4,2016-04-01 13:38:27,2016-02-26 13:38:27,50000.0,224,35,1,0.0000
...,...,...,...,...,...,...,...,...,...
994,The 1st Motion Tracking DIY Smart Home Securit...,13,2016-12-31 03:54:32,2016-11-01 02:54:32,25000.0,397,60,1,0.6249
995,Veterans,12,2012-08-15 06:00:00,2012-07-09 05:39:06,5000.0,87,37,1,0.0000
996,MY VERY FIRST KICKSTARTER,2,2014-08-12 20:08:35,2014-07-13 20:08:35,6500.0,2,30,0,0.0000
997,This Song Is About You,3,2014-07-11 23:35:00,2014-06-11 23:35:00,40000.0,0,30,0,0.0000


In [8]:
def model_maker():
  train, test = train_test_split(df, train_size=0.80, test_size=0.20, 
                                 stratify=df['project_success'], random_state=42)
  # select our target 
  target = 'project_success'

  # make train without our target or id
  train_features = train.drop(columns=[target])

  # make numeric features
  numeric_features = train_features.select_dtypes(include='number').columns.tolist()

  # make a cardinality feature to help filter
  cardinality = train_features.select_dtypes(exclude='number').nunique()

  # get a list of relevant categorical data
  categorical_features = cardinality[cardinality <=50].index.tolist()

  # Combine the lists 
  features = numeric_features + categorical_features

  X_train = train[features]
  y_train = train[target]
  X_test = test[features]
  y_test = test[target]
  # print(features)
  # print(X_train.shape, X_test.shape)

  lrmodel = Pipeline([
                  ('ohe', OneHotEncoder(use_cat_names=True)),
                  ('scaler', StandardScaler()),  
                  ('impute', SimpleImputer()),
                  ('classifier', LogisticRegressionCV())
                  ])
  lrmodel.fit(X_train, y_train)

  return lrmodel

lrmodel = model_maker()

def predict(user_input):

  if lrmodel.predict(user_input) == 1:
    predict = {'predict': 'Your Kickstarter project is likely to succeed!'}
    input.update(predict)
    return input
  else:
    predict = {'predict':'Your Kickstarter project is likely to fail.'}
    input.update(predict)
    return input

In [9]:
predict(user_input)

{'backers': 21,
 'deadline': '2015-08-18',
 'goal': 10000,
 'launched': '2015-08-11',
 'main_category': 11,
 'name': 'terrible MegaBuster from Megaman X',
 'predict': 'Your Kickstarter project is likely to succeed!',
 'username': 'LoginID'}

In [10]:
# print(X_train)
  # print('training accuracy:', lrmodel.score(X_train, y_train))
  # print('test accuracy:', lrmodel.score(X_test, y_test))

In [11]:
# dfcat = df.filter(['main_category', 'project_success', 'goal'])


In [12]:
# suc_filt = [1]
# dfcat= dfcat[dfcat['project_success'].isin(suc_filt)]
# dfcat['project_success'] = dfcat['project_success'].replace({1: 'successful'})

In [35]:
# dfcat.describe()

Unnamed: 0,goal
count,395.0
mean,11485.037975
std,35528.057038
min,1.0
25%,1425.0
50%,4000.0
75%,10000.0
max,600000.0


In [14]:
# dfcat['main_category'].value_counts()

2     69
3     68
11    42
7     34
5     30
1     27
13    23
4     23
9     18
10    17
8     15
12    12
14     9
15     4
6      4
Name: main_category, dtype: int64

In [15]:
# dfcat['main_category'] = dfcat['main_category'].replace(1, 'Publishing')
# dfcat['main_category'] = dfcat['main_category'].replace(2, 'Film & Video')
# dfcat['main_category'] = dfcat['main_category'].replace(3, 'Music')
# dfcat['main_category'] = dfcat['main_category'].replace(4, 'Food')
# dfcat['main_category'] = dfcat['main_category'].replace(5, 'Design')
# dfcat['main_category'] = dfcat['main_category'].replace(6, 'Crafts')
# dfcat['main_category'] = dfcat['main_category'].replace(7, 'Games')
# dfcat['main_category'] = dfcat['main_category'].replace(8, 'Comics')
# dfcat['main_category'] = dfcat['main_category'].replace(9, 'Fashion')
# dfcat['main_category'] = dfcat['main_category'].replace(10, 'Theater')
# dfcat['main_category'] = dfcat['main_category'].replace(11, 'Art')
# dfcat['main_category'] = dfcat['main_category'].replace(12, 'Photography')
# dfcat['main_category'] = dfcat['main_category'].replace(13, 'Technology')
# dfcat['main_category'] = dfcat['main_category'].replace(14, 'Dance')
# dfcat['main_category'] = dfcat['main_category'].replace(15, 'Journalism')

In [16]:
# import plotly.graph_objects as go

In [17]:
# import plotly.express as px



# fig = px.bar(dfcat, x="main_category", y="goal", color="main_category", title="Long-Form Input")
# fig.show()

In [18]:
# !pip install --upgrade plotly

Requirement already up-to-date: plotly in /usr/local/lib/python3.6/dist-packages (4.10.0)


In [19]:
# import plotly.express as px
# df = dfcat
# fig = px.bar(df, x="main_category", y="goal", color='main_category')
# fig.show()

In [20]:
# import plotly.express as px
# df = px.data.tips()
# fig = px.bar(df, x="sex", y="total_bill", color='time')
# fig.show()