In [1]:
import pandas as pd
import numpy as np
import re
from sklearn.utils import shuffle

In [2]:
kdata = pd.read_csv('kickstarter-2019-extracted.csv')
kdata.head()

Unnamed: 0,id,name,blurb,goal,pledged,state,deadline,state_changed_at,created_at,launched_at,country,currency,backers_count,creator_id,project_id,category_id,category_name,category_slug
0,1458932991,Louli à l’école,Un livre enfant pour l’apprentissage des émoti...,600.0,631.0,successful,1566744397,1566744397,1561554849,1561560397,FR,EUR,31,469036700,3761403,46,Children's Books,publishing/children's books
1,498799566,"Strange Wit, an original graphic novel about J...","The true biography of the historical figure, w...",12000.0,14740.63,successful,1442204367,1442204368,1438740713,1439612367,US,USD,403,1695121020,2051485,252,Graphic Novels,comics/graphic novels
2,1540722298,FAM - FIND A MOTIVE MOBILE APP,FAM is the new mobile app which combines event...,100000.0,11.0,failed,1534607034,1534607034,1528401202,1529423034,GB,GBP,2,67455724,3399670,332,Apps,technology/apps
3,621087138,"Destiny, NY - FINAL HOURS!",A graphic novel about two magical ladies in love.,20000.0,21799.0,successful,1478131200,1478131200,1473150836,1475565730,US,USD,406,248241887,2667503,252,Graphic Novels,comics/graphic novels
4,1760442719,Publishing Magus Magazine,We are publishing a magazine that focuses on t...,5000.0,10.0,failed,1317747868,1317747868,1310702133,1315155868,US,USD,1,1345074053,43087,49,Periodicals,publishing/periodicals


## Kickstarter data selection
1. explore kickstarter data
2. filter the data, only including the 'failed' and 'successful' examples
3. remove duplicated examples using project_id
4. (optional) create '0'/'1' label
5. clean up description, remove '\n', '\t', '\r' in the original description (for BERT)
6. add feature: length of the description
7. select features and save files.

In [3]:
kdata['state'].unique()

array(['successful', 'failed', 'canceled', 'live', 'suspended'],
      dtype=object)

In [4]:
kdata_ = kdata.loc[(kdata['state'] == 'successful') | (kdata['state'] == 'failed') & (kdata['country'] == 'US')].copy()

In [5]:
kdata_.shape

(176288, 18)

In [6]:
len(kdata_['project_id'].unique())

151943

In [7]:
kdata_ = kdata_.sort_values('project_id')

In [8]:
kdata_.drop_duplicates(subset='project_id', keep='first', inplace=True)
kdata_.shape

(151943, 18)

In [9]:
kdata_['state'].describe()

count         151943
unique             2
top       successful
freq          100517
Name: state, dtype: object

In [10]:
kdata_['goal'].describe()

count    1.519430e+05
mean     3.680867e+04
std      9.866319e+05
min      1.000000e-02
25%      1.500000e+03
50%      5.000000e+03
75%      1.200000e+04
max      1.000000e+08
Name: goal, dtype: float64

In [11]:
kdata_.loc[kdata['goal'] == 0.01]

Unnamed: 0,id,name,blurb,goal,pledged,state,deadline,state_changed_at,created_at,launched_at,country,currency,backers_count,creator_id,project_id,category_id,category_name,category_slug
170854,620302213,LOVELAND Round 6: A Force More Powerful,$1 a square inch in Detroit + super fun excite...,0.01,100.0,successful,1259906820,1259907306,1259129231,1259132089,US,USD,6,211945026,2017,20,Conceptual Art,art/conceptual art


In [12]:
# create '0' or '1' label for BERT
klabel = []
for i in kdata_['state']:
    if i == 'failed':
        klabel.append('0')
    else:
        klabel.append('1')
kdata_['label'] = klabel

In [13]:
# remove '\n', '\t', '\r' in each description
desc = list(kdata_['blurb'])

desc_clean = []
for s in desc:
    s = str(s).replace('\n', ' ')
    s = s.replace('\t', ' ')
    s = s.replace('\r', ' ')
    desc_clean.append(s)
    
kdata_['desc_clean'] = desc_clean

In [14]:
# add one feature: length of the description
desc_len = [len(re.findall(r'\w+', str(d))) for d in desc]
kdata_['desc_len'] = desc_len
len(desc_len)

151943

In [15]:
kdata_.head()

Unnamed: 0,id,name,blurb,goal,pledged,state,deadline,state_changed_at,created_at,launched_at,...,currency,backers_count,creator_id,project_id,category_id,category_name,category_slug,label,desc_clean,desc_len
40918,1435295287,graffiti shows and festivals in California,Graff shows in Bakersfield / California with ...,5000.0,11.0,failed,1429572000,1429572000,1425914764,1426181107,...,USD,1,368593535,19,23,Painting,art/painting,0,Graff shows in Bakersfield / California with ...,9
45941,170817041,Gentlemen's Fury,Gentlemen's Fury is a dark comedy about tennis...,50000.0,18443.0,failed,1429218605,1429218608,1425915258,1426626605,...,USD,100,810901157,38,292,Comedy,film & video/comedy,0,Gentlemen's Fury is a dark comedy about tennis...,25
77276,1409304825,The Influencers Movement #JOINTIM,TIM makes influencers marketing accessible for...,10000.0,11016.0,successful,1432114351,1432114352,1425915690,1429522351,...,EUR,130,1051624225,49,342,Web,technology/web,1,TIM makes influencers marketing accessible for...,17
10787,2135528160,Fresh Perspective - Fresh Gardening,#gardening #selfsustainability,300.0,0.0,failed,1427216036,1427216038,1425915728,1425920036,...,USD,0,1823248285,54,310,Farmer's Markets,food/farmer's markets,0,#gardening #selfsustainability,2
175168,405307432,Electronola - An electronic gumbo of New Orlea...,"UPDATE: Shannon Powell, Walter Payton, Lucien ...",4000.0,4100.6,successful,1244185140,1244185224,1240366270,1241192665,...,USD,114,11,63,38,Electronic Music,music/electronic music,1,"UPDATE: Shannon Powell, Walter Payton, Lucien ...",23


In [16]:
kdata_.shape[0] - round(kdata_.shape[0] * 0.9)

15194

In [17]:
# prepare the data for BERT classifier
# create .tsv file with textual feature only
cols = ['label', 'project_id', 'project_id', 'desc_clean', 'desc_clean']
ktext_only = kdata_.loc[:, cols]

# shuffle the data and split it into train, dev, and test set.
ktext_only = shuffle(ktext_only)

n_train = round(0.9 * ktext_only.shape[0])
n_test = int((ktext_only.shape[0] - n_train) / 2)
# print(n_test, n_train)

ktext_only_train = ktext_only.iloc[:n_train]
ktext_only_dev = ktext_only.iloc[n_train:(n_train + n_test)]
ktext_only_test = ktext_only.iloc[(n_train + n_test):]

ktext_only_train.to_csv('KICK/text_only_train.tsv', sep='\t', index=None)
ktext_only_dev.to_csv('KICK/text_only_dev.tsv', sep='\t', index=None)
ktext_only_test.to_csv('KICK/text_only_test.tsv', sep='\t', index=None)

## moved all three .tsv files to bert-classifier/glue-data/KICK

In [18]:
# create train/dev/test set for baseline and other models
select_col_list = ['desc_clean', 'desc_len', 'goal', 'category_slug', 'label']

def id_generate_set(id_df, orig_df, orig_id_col_name, select_col_list):
    get_id = id_df.iloc[:,1].values
    output_df = orig_df.loc[orig_df[orig_id_col_name].isin(get_id)]
    output_df = output_df[select_col_list]
    return output_df

train_df = id_generate_set(ktext_only_train, kdata_, 'project_id', select_col_list)
dev_df = id_generate_set(ktext_only_dev, kdata_, 'project_id', select_col_list)
test_df = id_generate_set(ktext_only_test, kdata_, 'project_id', select_col_list)

train_df.to_csv('KICK/textandmeta_train.tsv', sep='\t', index=None)
dev_df.to_csv('KICK/textandmeta_dev.tsv', sep='\t', index=None)
test_df.to_csv('KICK/textandmeta_test.tsv', sep='\t', index=None)

## Indiegogo data selection
1. explore kickstarter data
2. remove the examples with 0 balance
3. convert balance from string to number and save it as new column 'goal'
4. (optional) create '0'/'1' label
5. clean up description, remove '\n', '\t', '\r' in the original description (for BERT)
6. add feature: length of the description
7. select features and save files.

In [178]:
# load indiegogo data
idata = pd.read_csv('indiegogo-2017-extracted.csv')
idata.head()

Unnamed: 0,id,title,balance,tagline,card_type,category_slug,currency_code,in_forever_funding,cached_collected_pledges_count
0,2237863,Pi SDR for radio amateurs,"$1,570",A great solution for SDR1000 and Softrock/Ense...,project,tech-innovation,USD,False,6
1,2283716,Washy,£0,"Next generation of portable dishwashers, say b...",project,home,GBP,False,0
2,2286181,Magnetic Glowing Coasters,$3,We need your support in launching our new prod...,project,productivity,USD,False,1
3,2296821,Triplync - the ultimate travel app,$0,Trip booking service based on Ai that helps tr...,project,travel-outdoors,USD,False,0
4,2267514,INSHUS Simple Way,$0,putting on and taking off shoes has an end,project,fashion-wearables,USD,False,0


In [179]:
idata_ = idata.loc[(idata['currency_code'] == 'USD') & (idata['balance'] != '$0')].copy()

In [180]:
idata_['in_forever_funding'].describe()

count     211176
unique         2
top        False
freq      201034
Name: in_forever_funding, dtype: object

In [181]:
idata_['balance'].describe()

count     211176
unique     15804
top         $500
freq         832
Name: balance, dtype: object

In [182]:
bal = list(idata_['balance'])

In [183]:
int(bal[1][1:].replace(',',''))

3

In [184]:
bal_int = [int(x[1:].replace(',','')) for x in bal]

In [185]:
idata_['goal'] = bal_int

In [186]:
idata_.head()

Unnamed: 0,id,title,balance,tagline,card_type,category_slug,currency_code,in_forever_funding,cached_collected_pledges_count,goal
0,2237863,Pi SDR for radio amateurs,"$1,570",A great solution for SDR1000 and Softrock/Ense...,project,tech-innovation,USD,False,6,1570
2,2286181,Magnetic Glowing Coasters,$3,We need your support in launching our new prod...,project,productivity,USD,False,1,3
9,2024227,TOC - GOOD VIBES,$162,TOC is a revolutionary device that helps babie...,project,travel-outdoors,USD,False,3,162
10,2278591,Selsabel Hair Care & Advanced Organic Formula,$44,"Unique Hair Care for All Hair Types, Optimizes...",project,health-fitness,USD,False,1,44
12,1902423,Dubblecup: Ceramic Double Cup,$170,"Microwave-Safe, Dishwasher-Safe, Baller-Tier D...",project,home,USD,False,6,170


In [187]:
idata_['goal'].describe()

count    2.111760e+05
mean     2.186964e+04
std      1.705807e+05
min      1.000000e+00
25%      6.710000e+02
50%      1.997500e+03
75%      6.218000e+03
max      1.328835e+07
Name: goal, dtype: float64

In [188]:
# remove '\n', '\t', '\r' in each description
desc = list(idata_['tagline'])

desc_clean = []
for s in desc:
    s = str(s).replace('\n', ' ')
    s = s.replace('\t', ' ')
    s = s.replace('\r', ' ')
    desc_clean.append(s)
    
idata_['desc_clean'] = desc_clean

In [189]:
desc_len = [len(re.findall(r'\w+', str(d))) for d in desc]
len(desc_len)

211176

In [190]:
idata_['desc_len'] = desc_len

In [191]:
label = [str(int(i)) for i in idata_['in_forever_funding']]
idata_['label'] = label
idata_.head()

Unnamed: 0,id,title,balance,tagline,card_type,category_slug,currency_code,in_forever_funding,cached_collected_pledges_count,goal,desc_clean,desc_len,label
0,2237863,Pi SDR for radio amateurs,"$1,570",A great solution for SDR1000 and Softrock/Ense...,project,tech-innovation,USD,False,6,1570,A great solution for SDR1000 and Softrock/Ense...,12,0
2,2286181,Magnetic Glowing Coasters,$3,We need your support in launching our new prod...,project,productivity,USD,False,1,3,We need your support in launching our new prod...,12,0
9,2024227,TOC - GOOD VIBES,$162,TOC is a revolutionary device that helps babie...,project,travel-outdoors,USD,False,3,162,TOC is a revolutionary device that helps babie...,15,0
10,2278591,Selsabel Hair Care & Advanced Organic Formula,$44,"Unique Hair Care for All Hair Types, Optimizes...",project,health-fitness,USD,False,1,44,"Unique Hair Care for All Hair Types, Optimizes...",16,0
12,1902423,Dubblecup: Ceramic Double Cup,$170,"Microwave-Safe, Dishwasher-Safe, Baller-Tier D...",project,home,USD,False,6,170,"Microwave-Safe, Dishwasher-Safe, Baller-Tier D...",8,0


In [192]:
# create .tsv file with textual feature only
cols = ['label', 'id', 'id', 'desc_clean', 'desc_clean']
itext_only = idata_.loc[:, cols]

# shuffle the data and split it into train, dev, and test set.
itext_only = shuffle(itext_only)

n_train = round(0.9 * itext_only.shape[0])
n_test = int((itext_only.shape[0] - n_train) / 2)
# print(n_test, n_train)

itext_only_train = itext_only.iloc[:n_train]
itext_only_dev = itext_only.iloc[n_train:(n_train + n_test)]
itext_only_test = itext_only.iloc[(n_train + n_test):]

itext_only_train.to_csv('INDI/text_only_train.tsv', sep='\t', index=None)
itext_only_dev.to_csv('INDI/text_only_dev.tsv', sep='\t', index=None)
itext_only_test.to_csv('INDI/text_only_test.tsv', sep='\t', index=None)

## moved all three .tsv files under bert-classifier/glue-data/INDI

In [193]:
# create train/dev/test set for baseline and other models
select_col_list = ['desc_clean', 'desc_len', 'goal', 'category_slug', 'label']

train_df = id_generate_set(itext_only_train, idata_, 'id', select_col_list)
dev_df = id_generate_set(itext_only_dev, idata_, 'id', select_col_list)
test_df = id_generate_set(itext_only_test, idata_, 'id', select_col_list)

train_df.to_csv('INDI/textandmeta_train.tsv', sep='\t', index=None)
dev_df.to_csv('INDI/textandmeta_dev.tsv', sep='\t', index=None)
test_df.to_csv('INDI/textandmeta_test.tsv', sep='\t', index=None)