In [None]:
import pandas as pd
import numpy as np
import datetime as dt
import os, json
import matplotlib.pyplot as plt 
import matplotlib as mpl
%matplotlib inline 
import seaborn as sns
import scipy.stats as stats
from scipy.stats import norm
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

from sklearn.tree import export_graphviz 
import graphviz

from sklearn.metrics import confusion_matrix
from os import path
from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

In [None]:
# Converter for the ''category'' column, whose cells hold JSON text.
def CustomParser1(df):
    """Decode one raw JSON cell and return the parsed Python object.

    `df` is the cell's text (the name is historical; it is not a DataFrame).
    Invalid JSON raises the decoder's exception unchanged.
    """
    return json.loads(df)

This part loads as many files as instructed

In [None]:
def load_files (n):
    """Read the first `n` numbered Kickstarter CSV files into one DataFrame.

    Files are expected at 'Data/Kickstarter000.csv', 'Data/Kickstarter001.csv', ...
    The ''category'' column is decoded from JSON by CustomParser1 while reading.

    Parameters
    ----------
    n : int
        Number of files to load, starting from file 000.

    Returns
    -------
    pandas.DataFrame
        All rows of the loaded files with a fresh 0..N-1 index;
        an empty DataFrame when `n` is 0 or negative.
    """
    parts = []
    for filenum in range(n):
        filename = 'Data/Kickstarter%03d.csv' % filenum
        parts.append(pd.read_csv(filename, converters={'category': CustomParser1}, header=0))
    if not parts:
        # pd.concat refuses an empty list; keep the "nothing loaded" result explicit
        return pd.DataFrame()
    # concatenate once instead of re-copying the growing frame on every iteration
    return pd.concat(parts, ignore_index=True)
# Ask interactively how many of the numbered Kickstarter files to load (the prompt suggests 1..54).
Filesnum=input('How many file should I load? 1..54  ')
df=load_files (int(Filesnum))
# Keep a backup of the campaign title; the category cell copies it back into ''name'' and drops the backup.
df['campaign_name']=df['name']
# NOTE(review): this writes to 'data/' while the inputs are read from 'Data/' -
# confirm both spellings resolve to the same folder on the target file system.
df.to_csv('data/data.csv')
df.info()

In [None]:
# Expand the parsed ''category'' dicts into flat ''category.*'' columns.
# Every field is looked up by its own key, so the result does not depend on the
# order of the keys inside the JSON and no project-level column (id, name, slug,
# urls) is overwritten on the way. A field missing from a dict becomes None.
for field in ('parent_id', 'id', 'position', 'name', 'slug'):
    df['category.' + field] = df['category'].apply(lambda cat, key=field: cat.get(key))
# keep only the main category: the part of the slug before the first '/'
df['category.slug'] = df['category.slug'].apply(lambda slug: slug.split('/')[0])
# restore the campaign title from its backup column and drop the backup
df['name'] = df['campaign_name']
df = df.drop(['campaign_name'], axis=1)
df.to_csv('data/data.csv')
df.info()

In [None]:
# Converter for the ''creator'' column, whose cells hold JSON text.
# Some cells are not valid JSON: for example a nickname written with unescaped
# double quotes, like "Elad "Superman" Toister", breaks the decoder.

def CustomParser2(df2):
    """Decode one raw ''creator'' cell; return 0 when it cannot be decoded.

    `df2` is the cell's text (the name is historical; it is not a DataFrame).
    The 0 sentinel lets the caller find and count the rows it has to drop.
    """
    try:
        return json.loads(df2)
    except (TypeError, ValueError):
        # json.JSONDecodeError is a ValueError; TypeError covers non-text cells.
        # Anything else (e.g. KeyboardInterrupt) is allowed to propagate.
        return 0
            
# Re-read the saved data with the ''creator'' column decoded from JSON.
df2 = pd.read_csv('data/data.csv', converters={'creator': CustomParser2}, header=0)
# Rows whose creator could not be decoded carry the 0 sentinel; collect them so the
# "cost" of dropping is known (and a repair can be attempted later).
unparsed = df2['creator'] == 0
droped = df2.loc[unparsed, ['creator']]
df2 = df2.loc[df2['creator'] != 0]
drop_list = list(droped.index)
#df['creator'].iloc[drop_list]=df['creator'].iloc[drop_list].apply(lambda x: x.replace(' ',',')) is a start of a solution
# remove the same rows (by index label) from the main frame
df = df.drop(index=drop_list)
print('droped rows:', len(drop_list))
print(len(df))
print(len(df2))


In [None]:
# Pull the creator's name and id out of each parsed ''creator'' dict
# (done by hand: the automatic expansion used for ''category'' does not work here).
df2['creator_name'] = df2['creator'].map(lambda creator: creator['name'])
df2['creator_id'] = df2['creator'].map(lambda creator: creator['id'])
# "inject" both columns back into the original df; rows are matched on the index
for creator_col in ('creator_name', 'creator_id'):
    df[creator_col] = df2[creator_col]
df.info()

In [None]:
# cleaning func
def clean(df):
    """Return a cleaned copy of `df` restricted to the modelling columns.

    Steps:
      1. keep only the selected columns (this list decides what enters the data set;
         entries marked * were added on top of the reference example);
      2. in ''is_starred'' replace 1 with True and None with False;
      3. convert ''usd_pledged'' and the three timestamp columns to numbers,
         turning unparseable values into NaN;
      4. drop every row that has a missing value;
      5. keep only finished campaigns (failed / successful / cancelled / suspended);
      6. convert the timestamps from epoch seconds to datetimes.

    The input frame is not modified. A missing selected column raises KeyError.
    """
    selected_cols = ['creator_name',
                     'name',
                     'creator_id', #*
                     'backers_count',
                     'blurb',
                     'is_starred', #*
                     'category.id', #*
                     'category.name',
                     'category.parent_id',
                     'category.slug',
                     'country',
                     'created_at',
                     'currency',
                     'deadline',
                     'goal',
                     'launched_at',
                     'staff_pick',
                     'state',
                     'usd_pledged',
                     'usd_type']
    num_cols = ['usd_pledged',
                'deadline',
                'created_at',
                'launched_at']
    time_cols = ['created_at', 'launched_at', 'deadline']
    final_states = ["failed", "successful", "cancelled", "suspended"]

    # explicit copy: the assignments below must not act on a view of the caller's frame
    data = df[selected_cols].copy()
    data['is_starred'] = data['is_starred'].replace({1: True, None: False})
    # coerce BEFORE dropna, so values that fail the conversion are dropped with the
    # other incomplete rows instead of surviving as NaN / NaT
    data[num_cols] = data[num_cols].apply(pd.to_numeric, errors='coerce')
    data = data.dropna()
    data = data.loc[data['state'].isin(final_states)].copy()
    for col in time_cols:
        data[col] = pd.to_datetime(data[col], unit='s')
    return data

# Build the working data set from the merged frame (column selection, row filtering, timestamps).
data = clean(df)


In [None]:
# One data frame per main category (''category.slug'').
# Each frame is sorted by launch date: getElementsInRange locates date ranges with
# searchsorted, which only gives correct positions on sorted values.
categories=data['category.slug'].unique()
frames = {}
for ct in categories:
    frames[ct] = data[data['category.slug'] == ct].sort_values('launched_at')
# A progress bar tracks the row-wise passes below, as they are time consuming.
import pyprind
# one tick per row of `data` (a single pass calls pbar.update() once per row)
pbar = pyprind.ProgBar(len(data))


def getElementsInRange(cat,end,week):
    '''Count the projects of category `cat` launched in the `week` weeks before `end`.

    cat  -- key into the module-level `frames` dict (a main category slug)
    end  -- timestamp closing the window (not included in the count)
    week -- window length in weeks

    The count is the difference of two searchsorted positions, so
    frames[cat]['launched_at'] must be sorted in ascending order.
    Advances the module-level progress bar `pbar` by one tick per call.
    '''
    launches = frames[cat]['launched_at']
    start = end - pd.DateOffset(weeks = week)
    # searchsorted returns a scalar or a length-1 array for scalar input,
    # depending on the pandas version; np.ravel(...)[0] accepts both
    first = np.ravel(launches.searchsorted(start))[0]
    last = np.ravel(launches.searchsorted(end))[0]
    pbar.update()
    return last - first
# Number of projects launched in the same category during the week before each project's launch
data['Last_Week'] = data.apply(lambda x: getElementsInRange(x['category.slug'],x['launched_at'],1),axis = 1) 

In [None]:
# Number of projects launched in the same category during longer look-back windows.
# Maps each new column to its window length in weeks (month=4, year=52, 3 months=13, 6 months=26).
lookback_weeks = {'Last_Month': 4, 'Last_Year': 52, 'Last_3_Month': 13, 'Last_6_Month': 26}
for window_col, n_weeks in lookback_weeks.items():
    # fresh progress bar for every pass over the rows
    pbar = pyprind.ProgBar(331675)
    data[window_col] = data.apply(
        lambda x, w=n_weeks: getElementsInRange(x['category.slug'], x['launched_at'], w),
        axis=1,
    )
data.info()

In [None]:
def engineer_features(data):
    """Derive numeric model features from the cleaned frame and return a new frame.

    Added columns:
      state_num                   1 for 'successful', 0 for every other state
                                  (so 'suspended' counts as a failure)
      <ts>_hr                     hour of day, shifted to 1..24
      <ts>_day_in_week            day of week, 1 = Monday .. 7 = Sunday
      <ts>_day_in_month, _mo, _yr calendar parts
                                  (<ts> is launched_at, deadline and created_at)
      count                       constant 1
      launched-created            whole days between creation and launch
      deadline-launched           whole days between launch and deadline
      pledge_perc                 usd_pledged as a percentage of goal

    The original ''launched_at'', ''created_at'', ''deadline'' and ''state'' columns are
    dropped from the result. The input frame is left untouched.
    """
    # work on a copy so the caller's frame is not modified and the cell can be re-run
    data = data.copy()
    data['state_num'] = (data['state'] == 'successful').astype('int64')
    # absolute timestamps -> relative calendar parts
    for prefix in ('launched_at', 'deadline', 'created_at'):
        stamp = data[prefix].dt
        data[prefix + '_hr'] = stamp.hour + 1
        data[prefix + '_day_in_week'] = stamp.dayofweek + 1
        data[prefix + '_day_in_month'] = stamp.day
        data[prefix + '_mo'] = stamp.month
        data[prefix + '_yr'] = stamp.year
    data['count'] = 1
    data['launched-created'] = (data['launched_at'] - data['created_at']).dt.days
    data['deadline-launched'] = (data['deadline'] - data['launched_at']).dt.days
    # drop the original time columns and the textual state
    data = data.drop(['launched_at', 'created_at', 'deadline', 'state'], axis=1)
    data['pledge_perc'] = data['usd_pledged'] / data['goal'] * 100

    return data
# Replace the working frame with its feature-engineered version and list the resulting columns.
data = engineer_features(data)
data.info()


In [None]:
# NOTE(review): kk is not referenced again in the code shown here.
kk = pd.DataFrame()
# number of characters in the campaign name
data['name_len'] = data['name'].str.len()
# 1 when the LAST character of the name is a question mark
data['name_is_question'] = data['name'].str[-1].eq('?').astype(int)
# 1 when the LAST character of the name is an exclamation mark
data['name_is_exclamation'] = data['name'].str[-1].eq('!').astype(int)
# 1.0 when the name is written in upper case (str.isupper semantics)
data['name_is_upper'] = data['name'].str.isupper().astype(float)
def count_non_character(row):
    '''Return how many characters of str(row) are not alphabetic (spaces, digits, punctuation, ...).'''
    return sum(1 for ch in str(row) if not ch.isalpha())
# how many non-alphabetic characters the name contains
data['name_non_character'] = data['name'].apply(count_non_character)
# how many space-separated pieces the name splits into
data['name_number_of_word'] = data['name'].apply(lambda title: len(str(title).split(' ')))
# Feature: share of vowels among the alphabetic characters of a text
def countVowelstoLettersRatio(s):
    '''Return vowels / (letters + 1) for str(s).

    Vowels are a, e, i, o, u in either case; upper-case vowels are counted too,
    so an all-caps text no longer gets a ratio of 0.
    The denominator starts at 1, which keeps texts without letters at 0.0
    instead of dividing by zero.
    '''
    text = str(s)
    vowel_set = set('aeiou')
    letters = 1
    vowels = 0
    for ch in text:
        if ch.isalpha():
            letters += 1
            if ch.lower() in vowel_set:
                vowels += 1
    return (vowels * 1.0) / letters

# vowel ratio of every campaign name
data['name_vowel_ratio'] = data.name.apply(countVowelstoLettersRatio)

# blurb features
data['blurb_number_of_word'] = data.blurb.apply(lambda x: len(str(x).split(' ')))
# the vowel ratio must come from the ratio helper, not from the word count again
data['blurb_vowel_ratio'] = data.blurb.apply(countVowelstoLettersRatio)
data['blurb_non_character'] = data.blurb.apply(count_non_character)

# goal expressed in whole units of 1000 / 500 / 10
data['goal_1000'] = data.goal.apply(lambda x: x // 1000)
data['goal_500'] = data.goal.apply(lambda x: x // 500)
data['goal_10'] = data.goal.apply(lambda x: x // 10)
data.to_csv('Data/data.csv')

data.info(10)

# for creating a csv for the visualization:
### sdata.csv=small dataset
### bdata.csv=big dataset

### run twice and then save the dataset for further handling

In [None]:
# Start again from the data set saved by the previous cell.
data = pd.read_csv('Data/data.csv')
# NOTE(review): kk is not referenced again in the code shown here.
kk = pd.DataFrame()
# number of characters in the campaign name
data['name_len'] = data['name'].str.len()
# 1 when the LAST character of the name is a question mark
data['name_is_question'] = data['name'].str[-1].eq('?').astype(int)
# 1 when the LAST character of the name is an exclamation mark
data['name_is_exclamation'] = data['name'].str[-1].eq('!').astype(int)
# 1.0 when the name is written in upper case (str.isupper semantics)
data['name_is_upper'] = data['name'].str.isupper().astype(float)
def count_non_character(row):
    '''Return how many characters of str(row) are not alphabetic (spaces, digits, punctuation, ...).'''
    return sum(1 for ch in str(row) if not ch.isalpha())
# how many non-alphabetic characters the name contains
data['name_non_character'] = data['name'].apply(count_non_character)
# how many space-separated pieces the name splits into
data['name_number_of_word'] = data['name'].apply(lambda title: len(str(title).split(' ')))
# Feature: share of vowels among the alphabetic characters of a text
def countVowelstoLettersRatio(s):
    '''Return vowels / (letters + 1) for str(s).

    Vowels are a, e, i, o, u in either case; upper-case vowels are counted too,
    so an all-caps text no longer gets a ratio of 0.
    The denominator starts at 1, which keeps texts without letters at 0.0
    instead of dividing by zero.
    '''
    text = str(s)
    vowel_set = set('aeiou')
    letters = 1
    vowels = 0
    for ch in text:
        if ch.isalpha():
            letters += 1
            if ch.lower() in vowel_set:
                vowels += 1
    return (vowels * 1.0) / letters

# vowel ratio of every campaign name
data['name_vowel_ratio'] = data.name.apply(countVowelstoLettersRatio)

# blurb features
data['blurb_number_of_word'] = data.blurb.apply(lambda x: len(str(x).split(' ')))
# the vowel ratio must come from the ratio helper, not from the word count again
data['blurb_vowel_ratio'] = data.blurb.apply(countVowelstoLettersRatio)
data['blurb_non_character'] = data.blurb.apply(count_non_character)

# goal expressed in whole units of 1000 / 500 / 10
data['goal_1000'] = data.goal.apply(lambda x: x // 1000)
data['goal_500'] = data.goal.apply(lambda x: x // 500)
data['goal_10'] = data.goal.apply(lambda x: x // 10)
# drop the old index column that the CSV round trip added
data=data.drop(['Unnamed: 0'],axis=1)
# change the file name for the different data set sizes
data.to_csv('Data/bdata.csv')
data.info(10)

In [None]:
# Show the column names available at this point (the model column list below is chosen from them).
list(data.columns)

# for creating a ML csv:
### Sdata_for_ML.csv=small dataset
### Bdata_for_ML.csv=big dataset

### run twice and then save the dataset for further handling

In [None]:
# categorical columns that get one-hot encoded
dummies = [
    'category.name', 'category.slug', 'country', 'currency',
    'launched_at_day_in_week', 'deadline_day_in_week',
]
# columns that go into the model data set (the order is kept in the output file)
# NOTE(review): 'usd_pledged' and 'pledge_perc' presumably reveal the outcome stored in
# 'state_num' - confirm they are meant to be model inputs.
mod1_columns = [
    'is_starred', 'category.name', 'category.slug',
    'country', 'currency', 'goal', 'usd_pledged', 'pledge_perc', 'staff_pick',
    'launched_at_hr', 'launched_at_day_in_week', 'launched_at_day_in_month',
    'launched_at_mo', 'launched_at_yr',
    'deadline_hr', 'deadline_day_in_week', 'deadline_day_in_month',
    'deadline_mo', 'deadline_yr',
    'created_at_hr', 'created_at_day_in_week', 'created_at_day_in_month',
    'created_at_mo', 'created_at_yr',
    'launched-created', 'goal_1000', 'goal_500', 'goal_10',
    'Last_Week', 'Last_Month', 'Last_Year', 'Last_3_Month', 'Last_6_Month',
    'deadline-launched', 'state_num',
    'name_len', 'name_is_question', 'name_is_exclamation', 'name_is_upper',
    'name_non_character', 'name_number_of_word', 'name_vowel_ratio',
    'blurb_number_of_word', 'blurb_vowel_ratio', 'blurb_non_character',
]
data = pd.get_dummies(data[mod1_columns], columns=dummies)
# natural log of the two skewed columns
# NOTE(review): np.log gives -inf for a value of 0 - verify neither column contains zeros.
for skewed_col in ("deadline-launched", 'goal'):
    data[skewed_col] = np.log(data[skewed_col])
# change the file name for the different data set sizes
data.to_csv('data/Bdata_for_ML.csv')
data.info()


# click ''Run all above" <--

# future

In [None]:
# Print, for every main category, the mean of the weekly / monthly / yearly launch counts.
# NOTE(review): the ML export cell above one-hot encodes 'category.slug', so this lookup
# presumably needs the frame from before that step - confirm which `data` is intended.
for x in categories:
    y=data.loc[data['category.slug']==x]
    print(x,y[['Last_Week','Last_Month','Last_Year']].mean())

def getRangeMean(cat,end,week):
    '''Mean ''goal'' of the projects of category `cat` launched in the `week` weeks before `end`.

    cat  -- key into the module-level `frames` dict (a main category slug)
    end  -- timestamp closing the window (not included)
    week -- window length in weeks

    The rows are selected by searchsorted positions, so frames[cat]['launched_at']
    must be sorted in ascending order. An empty window yields NaN.
    Advances the module-level progress bar `pbar` by one tick per call.
    '''
    pob = frames[cat]
    launches = pob['launched_at']
    start = end - pd.DateOffset(weeks = week)
    # searchsorted returns a scalar or a length-1 array for scalar input,
    # depending on the pandas version; np.ravel(...)[0] accepts both
    first = np.ravel(launches.searchsorted(start))[0]
    last = np.ravel(launches.searchsorted(end))[0]
    value = pob.iloc[first:last]['goal'].mean()
    pbar.update()
    return value
pbar = pyprind.ProgBar(331675)
# Mean goal of the same category over the 4 weeks before each project's launch
data['mean_goal_in_category_last_month'] = data.apply(lambda x: getRangeMean(x['category.slug'],x['launched_at'],4),axis = 1) 

def getRangeMedian(cat,end,week):
    '''Median ''goal'' of the projects of category `cat` launched in the `week` weeks before `end`.

    cat  -- key into the module-level `frames` dict (a main category slug)
    end  -- timestamp closing the window (not included)
    week -- window length in weeks

    The rows are selected by searchsorted positions, so frames[cat]['launched_at']
    must be sorted in ascending order. An empty window yields NaN.
    Advances the module-level progress bar `pbar` by one tick per call.
    '''
    pob = frames[cat]
    launches = pob['launched_at']
    start = end - pd.DateOffset(weeks = week)
    # searchsorted returns a scalar or a length-1 array for scalar input,
    # depending on the pandas version; np.ravel(...)[0] accepts both
    first = np.ravel(launches.searchsorted(start))[0]
    last = np.ravel(launches.searchsorted(end))[0]
    value = pob.iloc[first:last]['goal'].median()
    pbar.update()
    return value

pbar = pyprind.ProgBar(331675)
# Median goal of the same category over the last month (4 weeks)
data['median_goal_in_category_last_month'] = data.apply(lambda x: getRangeMedian(x['category.slug'],x['launched_at'],4),axis = 1)

pbar = pyprind.ProgBar(331675)
# Mean goal of the same category over the last year (52 weeks)
data['mean_goal_in_category_last_year'] = data.apply(lambda x: getRangeMean(x['category.slug'],x['launched_at'],52),axis = 1) 


pbar = pyprind.ProgBar(331675)
# Median goal of the same category over the last year (52 weeks)
data['median_goal_in_category_last_year'] = data.apply(lambda x: getRangeMedian(x['category.slug'],x['launched_at'],52),axis = 1) 

# Median / mean goal over the last 6 months (26 weeks); the progress bar is not reset from here on
data['median_goal_Last_6_Month'] = data.apply(lambda x: getRangeMedian(x['category.slug'],x['launched_at'],26),axis = 1)

data['mean_goal_Last_6_Month'] = data.apply(lambda x: getRangeMean(x['category.slug'],x['launched_at'],26),axis = 1)
# Mean / median goal over the last week
data['mean_goal_Last_Week'] = data.apply(lambda x: getRangeMean(x['category.slug'],x['launched_at'],1),axis = 1)
data['median_goal_Last_Week'] = data.apply(lambda x: getRangeMedian(x['category.slug'],x['launched_at'],1),axis = 1)
data.info()