In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import re
import datetime
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import CountVectorizer

pd.set_option('display.max_colwidth',100)
import pickle

In [2]:

train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
proj_ids = test['project_id']

In [3]:
# to check the missing value in column
train.isnull().sum(axis=0)/train.shape[0]

project_id               0.000000
name                     0.000028
desc                     0.000083
goal                     0.000000
keywords                 0.000000
disable_communication    0.000000
country                  0.000000
currency                 0.000000
deadline                 0.000000
state_changed_at         0.000000
created_at               0.000000
launched_at              0.000000
backers_count            0.000000
final_status             0.000000
dtype: float64

In [4]:

unix_cols = ['deadline','state_changed_at','launched_at','created_at']

for x in unix_cols:
    train[x] = train[x].apply(lambda k: datetime.datetime.fromtimestamp(int(k)).strftime('%Y-%m-%d %H:%M:%S'))
    test[x] = test[x].apply(lambda k: datetime.datetime.fromtimestamp(int(k)).strftime('%Y-%m-%d %H:%M:%S'))

In [5]:
cols_to_use = ['name','desc']
len_feats = ['name_len','desc_len']
count_feats = ['name_count','desc_count']

for i in np.arange(2):
    train[len_feats[i]] = train[cols_to_use[i]].apply(str).apply(len)
    test[len_feats[i]] = test[cols_to_use[i]].apply(str).apply(len)

In [6]:
train['name_count'] = train['name'].str.split().str.len()
train['desc_count'] = train['desc'].str.split().str.len()

test['name_count'] = test['name'].str.split().str.len()
test['desc_count'] = test['desc'].str.split().str.len()

In [7]:
train['keywords_len'] = train['keywords'].str.len()
train['keywords_count'] = train['keywords'].str.split('-').str.len()

test['keywords_len'] = test['keywords'].str.len()
test['keywords_count'] = test['keywords'].str.split('-').str.len()

In [8]:

unix_cols = ['deadline','state_changed_at','launched_at','created_at']

for x in unix_cols:
    train[x] = train[x].apply(lambda k: datetime.datetime.strptime(k, '%Y-%m-%d %H:%M:%S'))
    test[x] = test[x].apply(lambda k: datetime.datetime.strptime(k, '%Y-%m-%d %H:%M:%S'))

In [9]:
time1 = []
time3 = []
for i in np.arange(train.shape[0]):
    time1.append(np.round((train.loc[i, 'launched_at'] - train.loc[i, 'created_at']).total_seconds()).astype(int))
    time3.append(np.round((train.loc[i, 'deadline'] - train.loc[i, 'launched_at']).total_seconds()).astype(int))

In [10]:
train['time1'] = np.log(time1)
train['time3'] = np.log(time3)

In [11]:
time5 = []
time6 = []
for i in np.arange(test.shape[0]):
    time5.append(np.round((test.loc[i, 'launched_at'] - test.loc[i, 'created_at']).total_seconds()).astype(int))
    time6.append(np.round((test.loc[i, 'deadline'] - test.loc[i, 'launched_at']).total_seconds()).astype(int))

In [12]:
test['time1'] = np.log(time5)
test['time3'] = np.log(time6)

In [13]:
feat = ['disable_communication','country']

In [14]:
for x in feat:
    le = LabelEncoder()
    le.fit(list(train[x].values) + list(test[x].values))
    train[x] = le.transform(list(train[x]))
    test[x] = le.transform(list(test[x]))
    

In [15]:
train.head()


Unnamed: 0,project_id,name,desc,goal,keywords,disable_communication,country,currency,deadline,state_changed_at,...,backers_count,final_status,name_len,desc_len,name_count,desc_count,keywords_len,keywords_count,time1,time3
0,kkst1451568084,drawing for dollars,I like drawing pictures. and then i color them too. so i thought i would suggest something for m...,20.0,drawing-for-dollars,0,20,USD,2009-05-03 12:29:59,2009-05-03 12:30:17,...,3,1,19,134,3.0,26.0,19,3,7.703459,13.502546
1,kkst1474482071,Sponsor Dereck Blackburn (Lostwars) Artist in Residence in Kankakee Illinois,"I, Dereck Blackburn will be taking upon an incredible journey in the month of May 2009. I will b...",300.0,sponsor-dereck-blackburn-lostwars-artist-in-residence-in-kankakee-illinois,0,20,USD,2009-05-16 04:40:00,2009-05-16 05:30:18,...,2,0,76,137,10.0,25.0,74,10,9.640043,14.189422
2,kkst183622197,Mr. Squiggles,"So I saw darkpony's successfully funded drawing for dollars project and I thought """"""""""""""""""""""""""""...",30.0,mr-squiggles,0,20,USD,2009-05-23 02:56:00,2009-05-23 03:00:18,...,0,0,13,385,2.0,22.0,12,2,6.665684,13.668358
3,kkst597742710,Help me write my second novel.,Do your part to help out starving artists and help me write my second novel!\r\n\r\nI have just ...,500.0,help-me-write-my-second-novel,0,20,USD,2009-05-29 05:39:00,2009-05-29 05:45:21,...,18,1,30,131,6.0,24.0,29,6,7.984463,14.766786
4,kkst1913131122,Support casting my sculpture in bronze,"I'm nearing completion on a sculpture, currently titled """"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""...",2000.0,support-casting-my-sculpture-in-bronze,0,20,USD,2009-05-31 17:08:00,2009-05-31 17:15:17,...,1,0,38,384,6.0,18.0,38,6,7.873598,14.766913


In [16]:
train['goal'] = np.log1p(train['goal'])
test['goal'] = np.log1p(test['goal'])

In [17]:
kickdesc = pd.Series(train['desc'].tolist() + test['desc'].tolist()).astype(str)

In [18]:
def desc_clean(word):
    p1 = re.sub(pattern='(\W+)|(\d+)|(\s+)',repl=' ',string=word)
    p1 = p1.lower()
    return p1

kickdesc = kickdesc.map(desc_clean)

In [19]:
stop = set(stopwords.words('english'))
kickdesc = [[x for x in x.split() if x not in stop] for x in kickdesc]

stemmer = SnowballStemmer(language='english')
kickdesc = [[stemmer.stem(x) for x in x] for x in kickdesc]

kickdesc = [[x for x in x if len(x) > 2] for x in kickdesc]

kickdesc = [' '.join(x) for x in kickdesc]

In [20]:
cv = CountVectorizer(max_features=650)


In [21]:
alldesc = cv.fit_transform(kickdesc).todense()


In [22]:
combine = pd.DataFrame(alldesc)
combine.rename(columns= lambda x: 'variable_'+ str(x), inplace=True)

In [23]:
train_text = combine[:train.shape[0]]
test_text = combine[train.shape[0]:]

test_text.reset_index(drop=True,inplace=True)

In [24]:
cols_to_use = ['name_len','desc_len','keywords_len','name_count','desc_count','keywords_count','time1','time3','goal']

In [25]:
target = train['final_status']

In [26]:
train = train.loc[:,cols_to_use]
test = test.loc[:,cols_to_use]

In [27]:
X_train = pd.concat([train, train_text],axis=1)
X_test = pd.concat([test, test_text],axis=1)

In [28]:
print(X_train.shape)
print(X_test.shape)

(108129, 659)
(63465, 659)


In [29]:
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(train, target, test_size=0.2, random_state=0)

dtrain = lgb.Dataset(X_train, y_train)
dval = lgb.Dataset(X_test, y_test)





In [30]:
params = {
    
    'num_leaves' : 256,
    'learning_rate':0.55,
    'metric':'auc',
    'objective':'binary',
    'early_stopping_round': 40,
    'max_depth':6,
    'bagging_fraction':0.5,
    'feature_fraction':0.6,
    'bagging_seed':2017,
    'feature_fraction_seed':2017,
    'verbose' : 1
    
    
}

In [31]:

clf = lgb.train(params, dtrain,num_boost_round=1000,valid_sets=dval,verbose_eval=10)




Training until validation scores don't improve for 40 rounds.
[10]	valid_0's auc: 0.689146
[20]	valid_0's auc: 0.688159
[30]	valid_0's auc: 0.687497
[40]	valid_0's auc: 0.687418
[50]	valid_0's auc: 0.685301
Early stopping, best iteration is:
[13]	valid_0's auc: 0.690857


In [32]:

preds = clf.predict(test[cols_to_use])

In [33]:
print(preds)
correlation = train.corr(method='pearson')
print(correlation)

[ 0.26868354  0.08872348  0.16924309 ...,  0.21771954  0.07523419
  0.41097119]
                name_len  desc_len  keywords_len  name_count  desc_count  \
name_len        1.000000  0.257125      0.336503    0.345182    0.065045   
desc_len        0.257125  1.000000      0.086445    0.097597    0.320690   
keywords_len    0.336503  0.086445      1.000000    0.850484    0.110142   
name_count      0.345182  0.097597      0.850484    1.000000    0.157178   
desc_count      0.065045  0.320690      0.110142    0.157178    1.000000   
keywords_count  0.337983  0.097032      0.913181    0.917339    0.147861   
time1           0.011170 -0.004001      0.071972    0.057268   -0.033006   
time3           0.026279  0.025544      0.018365    0.028932    0.029525   
goal           -0.012078  0.000697      0.025094    0.020921   -0.012158   

                keywords_count     time1     time3      goal  
name_len              0.337983  0.011170  0.026279 -0.012078  
desc_len              0.097032 -0

In [34]:
sub = pd.DataFrame(columns=['project-id','final-status'])

In [35]:
sub['project-id'] = proj_ids
sub['final-status'] = [1 if x > 0.5 else 0 for x in preds]
#print(sub['final-status'])

In [36]:
sub.to_csv("lgb_with_python.csv",index=False) 