In [2]:
import pandas as pd
import numpy as np
import re
import datetime
from nltk.corpus import stopwords
from sklearn.preprocessing import LabelEncoder
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import CountVectorizer
import xgboost as xgb

pd.set_option('display.max_colwidth',100)
import pickle


In [3]:

# Load the raw train and test tables from CSV files in the working directory.
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
# Project ids of the TRAINING rows, saved before `train` is narrowed to feature columns.
# NOTE(review): these ids are later written to the submission file; a submission would
# normally carry the test ids instead - confirm which is intended.
proj_ids = train['project_id']

In [4]:

unix_cols = ['deadline','state_changed_at','launched_at','created_at']


def _epoch_to_text(seconds):
    """Format a Unix timestamp as a 'YYYY-MM-DD HH:MM:SS' string.

    `datetime.fromtimestamp` is called without a tz argument, so the result is
    expressed in the machine's local time zone.
    """
    moment = datetime.datetime.fromtimestamp(int(seconds))
    return moment.strftime('%Y-%m-%d %H:%M:%S')


# Rewrite every timestamp column of both frames as formatted text.
for frame in (train, test):
    for col in unix_cols:
        frame[col] = frame[col].apply(_epoch_to_text)

In [5]:
cols_to_use = ['name','desc']
len_feats = ['name_len','desc_len']
count_feats = ['name_count','desc_count']

# Character length of each text column, measured on the str() of every value.
# NOTE(review): missing values are stringified first, so a NaN contributes the
# length of 'nan' (3) rather than 0 or NaN - confirm this is acceptable.
for src_col, len_col in zip(cols_to_use, len_feats):
    train[len_col] = train[src_col].map(lambda value: len(str(value)))
    test[len_col] = test[src_col].map(lambda value: len(str(value)))

In [6]:
# Whitespace-separated word counts for the project name and description.
for frame in (train, test):
    for text_col in ('name', 'desc'):
        words = frame[text_col].str.split()
        frame[text_col + '_count'] = words.str.len()

In [7]:
# Keyword features: raw character length and number of '-'-separated parts.
for frame in (train, test):
    keywords = frame['keywords']
    frame['keywords_len'] = keywords.str.len()
    frame['keywords_count'] = keywords.str.split('-').str.len()

In [8]:

unix_cols = ['deadline','state_changed_at','launched_at','created_at']

# Parse the 'YYYY-MM-DD HH:MM:SS' strings into datetime64 columns.
# pd.to_datetime parses the whole column at once instead of calling strptime
# per row, and it accepts values that are already datetimes, so re-running
# this cell does not fail.
for x in unix_cols:
    train[x] = pd.to_datetime(train[x], format='%Y-%m-%d %H:%M:%S')
    test[x] = pd.to_datetime(test[x], format='%Y-%m-%d %H:%M:%S')

In [9]:
# Durations in whole seconds for the training rows:
#   time1 = launched_at - created_at
#   time3 = deadline    - launched_at
# Computed column-wise instead of looping over `.loc[i, ...]`, which was slow
# and silently required the index labels to be exactly 0..n-1.
# The results stay plain lists so later cells can keep calling np.log on them.
time1 = (train['launched_at'] - train['created_at']).dt.total_seconds().round().astype(int).tolist()
time3 = (train['deadline'] - train['launched_at']).dt.total_seconds().round().astype(int).tolist()

In [10]:
# Store the natural log of each training duration as a feature column.
for feature_name, seconds in (('time1', time1), ('time3', time3)):
    train[feature_name] = np.log(seconds)

In [11]:
# Durations in whole seconds for the test rows:
#   time5 = launched_at - created_at
#   time6 = deadline    - launched_at
# Computed column-wise instead of looping over `.loc[i, ...]`, which was slow
# and silently required the index labels to be exactly 0..n-1.
# The results stay plain lists so later cells can keep calling np.log on them.
time5 = (test['launched_at'] - test['created_at']).dt.total_seconds().round().astype(int).tolist()
time6 = (test['deadline'] - test['launched_at']).dt.total_seconds().round().astype(int).tolist()

In [12]:
# Store the natural log of each test duration under the same feature names as train.
for feature_name, seconds in (('time1', time5), ('time3', time6)):
    test[feature_name] = np.log(seconds)

In [13]:
feat = ['disable_communication','country']

# Integer-encode each categorical column. The encoder is fitted on the union of
# train and test values so that both frames share one label mapping.
for col in feat:
    train_values = train[col].tolist()
    test_values = test[col].tolist()
    le = LabelEncoder()
    le.fit(train_values + test_values)
    train[col] = le.transform(train_values)
    test[col] = le.transform(test_values)

In [14]:
# Replace the funding goal by log(1 + goal) in both frames.
# This overwrites the column, so running the cell twice applies the transform twice.
for frame in (train, test):
    frame['goal'] = np.log1p(frame['goal'])

In [15]:
# One string Series holding every description: all train rows first, then all test rows.
# The order matters because the text features are later split back by train row count.
kickdesc = pd.Series(train['desc'].tolist() + test['desc'].tolist()).astype(str)


In [16]:
def desc_clean(word):
    """Return `word` lower-cased with non-word runs and digit runs blanked out.

    Every run of non-word characters and every run of digits is replaced by a
    single space. Adjacent runs of different kinds each produce their own
    space, so the result may contain consecutive spaces.

    Parameters
    ----------
    word : str
        Text to clean.

    Returns
    -------
    str
        The cleaned, lower-cased text.
    """
    # Raw string: '\W', '\d' and '\s' are regex escapes, not Python string
    # escapes, and are invalid escape sequences in a non-raw literal.
    p1 = re.sub(pattern=r'(\W+)|(\d+)|(\s+)', repl=' ', string=word)
    return p1.lower()

# Clean every description element-wise with desc_clean.
kickdesc = kickdesc.map(desc_clean)

In [None]:
# English stop-word set and Snowball stemmer shared by the helper below.
stop = set(stopwords.words('english'))
stemmer = SnowballStemmer(language='english')


def _normalise_description(text):
    """Drop stop words, stem the remaining words, and keep stems longer than 2 chars.

    The length filter is applied to the stems, not to the original words.
    Returns the surviving stems joined by single spaces.
    """
    stems = [stemmer.stem(token) for token in text.split() if token not in stop]
    return ' '.join(stem for stem in stems if len(stem) > 2)


kickdesc = [_normalise_description(text) for text in kickdesc]

In [None]:
# Bag-of-words vectorizer whose vocabulary is capped at 650 terms.
cv = CountVectorizer(max_features=650)


In [None]:
# Fit the vocabulary on all descriptions (train + test) and densify the counts.
# NOTE(review): .todense() returns a numpy.matrix and materialises the full
# rows x 650 table in memory; .toarray() or keeping the matrix sparse may be preferable.
alldesc = cv.fit_transform(kickdesc).todense()


In [None]:
# Wrap the term counts in a DataFrame and label the columns 'variable_0', 'variable_1', ...
combine = pd.DataFrame(alldesc)
combine.columns = ['variable_' + str(position) for position in combine.columns]

In [None]:
# The text features were built from train rows followed by test rows, so the
# first `n_train_rows` rows belong to train and the remainder to test.
n_train_rows = train.shape[0]
train_text = combine.iloc[:n_train_rows]
# Restart the test index at 0 so it lines up with `test` when concatenated column-wise.
test_text = combine.iloc[n_train_rows:].reset_index(drop=True)

In [None]:
# Hand-built feature columns kept for modelling (this rebinds the earlier `cols_to_use`).
cols_to_use = ['name_len','desc_len','keywords_len','name_count','desc_count','keywords_count','time1','time3','goal']

In [None]:
# Keep the label before `train` is narrowed to the feature columns.
target = train['final_status']
# Pearson correlation over numeric/bool columns only. `train` still holds
# string and datetime columns at this point, and DataFrame.corr raises on
# those in pandas >= 2.0 instead of silently skipping them.
correlation = train.select_dtypes(include=['number', 'bool']).corr(method='pearson')
print(correlation)

In [None]:
# Narrow both frames to the feature columns listed in `cols_to_use`.
# Every other column (ids, label, raw text, timestamps) is dropped from `train` / `test` here.
train = train.loc[:,cols_to_use]
test = test.loc[:,cols_to_use]

In [1]:
import seaborn as sns
import matplotlib.pyplot as plt

# Heatmap of the pairwise correlations between the columns of `train`.
f, ax = plt.subplots(figsize=(10, 8))
correl = train.corr()
# The mask is all False, so no cell is hidden. The builtin `bool` replaces
# `np.bool`, an alias that was removed from NumPy in 1.24.
sns.heatmap(correl, mask=np.zeros_like(correl, dtype=bool), cmap=sns.diverging_palette(220, 10, as_cmap=True),
            square=True, ax=ax)

NameError: name 'train' is not defined

In [24]:
# Design matrices: hand-built features side by side with the bag-of-words counts.
# The column-wise concat aligns on the index, so each pair must share the same row index.
X_train = pd.concat([train, train_text],axis=1)
X_test = pd.concat([test, test_text],axis=1)

In [25]:
# Sanity check: both matrices should report the same number of columns.
for matrix in (X_train, X_test):
    print(matrix.shape)

(108129, 659)
(63465, 659)


In [26]:
# Wrap the matrices for XGBoost; only the training matrix carries labels.
dtrain = xgb.DMatrix(data=X_train, label = target)
dtest = xgb.DMatrix(data=X_test)

In [27]:
# XGBoost settings: binary classifier evaluated by classification error.
params = dict(
    objective='binary:logistic',
    eval_metric='error',
    eta=0.025,
    max_depth=6,
    subsample=0.7,
    colsample_bytree=0.7,
    min_child_weight=5,
)

In [28]:
 
# 5-fold cross-validation: up to 1000 rounds, stopping once the held-out metric has not
# improved for 40 rounds; progress is printed every 10 rounds.
bst = xgb.cv(params, dtrain, num_boost_round=1000, early_stopping_rounds=40,nfold=5,verbose_eval=10)

[0]	train-error:0.312684+0.000471964	test-error:0.31431+0.00258479
[10]	train-error:0.309494+0.00118648	test-error:0.313191+0.00288106
[20]	train-error:0.308671+0.000651604	test-error:0.312618+0.00291807
[30]	train-error:0.307885+0.000544951	test-error:0.311461+0.003443
[40]	train-error:0.30598+0.000408615	test-error:0.310185+0.00340554
[50]	train-error:0.304188+0.000703103	test-error:0.308594+0.00297464
[60]	train-error:0.302771+0.000851092	test-error:0.30791+0.00270156
[70]	train-error:0.301388+0.000997155	test-error:0.307133+0.00246657
[80]	train-error:0.299672+0.000440693	test-error:0.305986+0.00222416
[90]	train-error:0.298535+0.000318058	test-error:0.305413+0.00232997
[100]	train-error:0.297513+0.000220505	test-error:0.304414+0.00232125
[110]	train-error:0.296512+0.000314392	test-error:0.30349+0.00206628
[120]	train-error:0.295702+0.000473539	test-error:0.302685+0.00205165
[130]	train-error:0.294828+0.000549855	test-error:0.302315+0.00213257
[140]	train-error:0.293721+0.000628399

In [None]:
# Train the final model on all training rows for a fixed 1000 boosting rounds.
# NOTE(review): the early-stopping result of the cross-validation (`bst`) is not used
# to choose the round count here - confirm that 1000 rounds is intentional.
bst_train = xgb.train(params, dtrain, num_boost_round=1000)


In [62]:
# Predicted probabilities from the trained booster.
# NOTE(review): despite the name `p_test`, this scores `dtrain` (the training matrix),
# not `dtest`. It looks unintended for a submission, but switching to `dtest` also
# requires pairing the predictions with test project ids - confirm and fix together.
p_test = bst_train.predict(dtrain)
print(p_test)

[ 0.22425365  0.52926487  0.42408541 ...,  0.296397    0.10238023
  0.04680846]


In [69]:

# Submission table: one row per saved project id with its predicted probability.
sub = pd.DataFrame({'project_id': proj_ids, 'final_status': p_test})

In [70]:

# Turn probabilities into hard 0/1 labels: strictly above 0.5 becomes 1, otherwise 0.
above_half = sub['final_status'] > 0.5
sub['final_status'] = above_half.astype(int)


In [77]:
# Write the submission table to CSV without the DataFrame index column.
sub.to_csv("xgb_with_python_feats.csv",index=False) 


In [75]:
# Serialize the trained booster to disk. The `with` block closes the file handle,
# so the pickle is fully flushed before any later cell reads it back.
with open("pima.pickle.dat", "wb") as model_file:
    pickle.dump(bst_train, model_file)

In [76]:
 # Read the pickled booster back; the `with` block closes the file handle afterwards.
 # NOTE: pickle.load can execute arbitrary code - only load files this notebook wrote itself.
 with open("pima.pickle.dat", "rb") as model_file:
     loaded_model = pickle.load(model_file)