In [2]:
import numpy as np
import pandas as pd
import sklearn
import matplotlib as plt
from sklearn import preprocessing
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.utils import shuffle 

  (fname, cnt))
  (fname, cnt))


In [3]:
# One-off preparation that produced train_shuffled.csv (kept for provenance, not re-run):
# df = pd.read_csv('train_balanced.csv')
# df = shuffle(df)
# df.to_csv('train_shuffled.csv')
# NOTE(review): the to_csv call above does not pass index=False, so the shuffled index is
# presumably stored in the file and read back below as an extra column — confirm, since
# modify() does not drop such a column and it would end up among the model features.
df = pd.read_csv('train_shuffled.csv')

In [4]:
# Tally successful projects (final_status == 1) against all rows,
# then report the share of rows that are not successful.
is_success = df['final_status'] == 1
succ_num = is_success.sum()
print(succ_num)
total_num = len(df)
print(total_num)
print(1- succ_num/total_num)

34561
108129
0.6803725180108944


In [5]:
# pd.set_option('display.max_colwidth', -1)

# Data preprocessing

0. make the dataset balanced (optional).
1. encode category features.
2. encode text features.
3. modify the dataset: add 'duration', drop columns such as 'name', 'project_id', etc.
4. split the dataset into training, dev and test sets (90%, 5%, 5%).

In [6]:
# Make the dataset balance: half successful projects and half failed projects
def balance(df):
    """Return a shuffled, class-balanced copy of ``df``.

    Successful projects (``final_status == 1``) are included twice and an
    equally sized random sample of failed projects (``final_status == 0``)
    is added, so both classes contribute ``2 * n_success`` rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``final_status`` column holding 0/1 labels.

    Returns
    -------
    pandas.DataFrame
        New frame with ``4 * n_success`` rows in random order. Original index
        labels are kept, so labels of successful rows appear twice.

    Raises
    ------
    ValueError
        Raised by ``DataFrame.sample`` when ``df`` holds fewer than
        ``2 * n_success`` failed projects (sampling is without replacement).
    """
    # separate successful projects and failed projects
    df_succ = df[df['final_status'] == 1]
    df_fail = df[df['final_status'] == 0]
    # Count successes in the frame we were given rather than relying on a
    # notebook-level global computed from some other frame.
    n_succ = len(df_succ)
    # randomly select failed projects: twice the number of successful ones
    df_fail_sel = df_fail.sample(n=n_succ * 2)
    # successful projects are duplicated to match the failed sample size
    df_balance = pd.concat([df_succ, df_succ.copy(), df_fail_sel], axis=0)
    # shuffle the rows; sample(frac=1) returns every row in random order
    return df_balance.sample(frac=1)

In [7]:
# Encode 'category' features, label them with values between 0 and n_classes-1
def encoder_cat(df, col):
    """Label-encode a categorical column of ``df`` in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that is modified in place.
    col : str
        Name of the column to replace with integer class labels.

    Returns
    -------
    sklearn.preprocessing.LabelEncoder
        The fitted encoder, so labels can be mapped back to the original
        values via ``inverse_transform``.
    """
    le = preprocessing.LabelEncoder()
    # Assign the encoded ndarray directly so values are placed by position.
    # Wrapping it in pd.Series would attach a fresh 0..n-1 index, and pandas
    # aligns on index during assignment, yielding NaN or misplaced labels
    # whenever df does not carry the default RangeIndex (e.g. after shuffling).
    df[col] = le.fit_transform(df[col])
    return le

In [8]:
# encode text features
def encoder_text(df, col, min_df=10):
    """Turn a text column into a bag-of-words count matrix.

    The column is first cast to ``str`` in place on ``df``; a
    ``CountVectorizer`` with the given ``min_df`` is then fitted on the
    column and the resulting document-term matrix is returned.
    """
    df[col] = df[col].astype(str)
    # Learn the vocabulary and produce the count matrix in a single pass.
    return CountVectorizer(min_df=min_df).fit_transform(df[col])

In [9]:
def modify(df):
    """Build the tabular feature frame used by the models.

    Adds a ``duration`` column (``deadline - launched_at``), drops the
    identifier, text, timestamp, ``backers_count`` and label columns, and
    label-encodes ``country``, ``currency`` and ``disable_communication``.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw project frame. It is left untouched: the features are built on a
        derived frame instead of adding ``duration`` to the caller's object.

    Returns
    -------
    pandas.DataFrame
        New frame with the remaining columns plus ``duration``.
    """
    unused_columns = ['project_id', 'name', 'desc', 'keywords', 'deadline',
                      'state_changed_at', 'created_at', 'launched_at',
                      'backers_count', 'final_status']
    # assign() returns a new frame, so the input is not mutated as a side effect
    features = df.assign(duration=df['deadline'] - df['launched_at'])
    features = features.drop(columns=unused_columns)
    for cat_col in ('country', 'currency', 'disable_communication'):
        encoder_cat(features, cat_col)
    return features

In [10]:
def data_split(df):
    """Split ``df`` row-wise into training, dev and test parts (90% / 5% / 5%).

    Parameters
    ----------
    df : sliceable container
        DataFrame, Series, ndarray, scipy sparse matrix or any sequence that
        supports ``obj[a:b]`` slicing along its first axis.

    Returns
    -------
    tuple
        ``(training_set, dev_set, test_set)`` as consecutive slices of ``df``;
        the test set receives whatever remains after the first two parts.
    """
    # Size the split from the argument itself, not from a notebook global.
    # shape[0] is preferred over len() because scipy sparse matrices raise
    # TypeError on len(); plain sequences without .shape fall back to len().
    n = float(df.shape[0] if hasattr(df, 'shape') else len(df))
    n_train = int(n * 0.9)
    n_dev = int(n * 0.05)

    training_set = df[:n_train]
    dev_set = df[n_train : (n_train + n_dev)]
    test_set = df[(n_train + n_dev) :]
    return training_set, dev_set, test_set

# Baseline Model
1. Random Forest Classifier Model (decision tree)
2. SVM
3. Linear Logistic Regression

In [11]:
from sklearn.ensemble import RandomForestClassifier
from sklearn import linear_model, datasets
from sklearn import svm
from sklearn.metrics import accuracy_score
from scipy.sparse import hstack
from sklearn import preprocessing

In [12]:
# Tabular features, then one bag-of-words matrix per text column
# (keywords keep terms seen in >= 10 rows, descriptions in >= 20), then labels.
df_data = modify(df)
df_keywords_encoded = encoder_text(df, col='keywords', min_df=10)
df_desc_encoded = encoder_text(df, col='desc', min_df=20)
df_labels = df.loc[:, 'final_status']

In [13]:
# Apply the same 90/5/5 row split to the labels and to every feature block
# so that rows stay aligned across all of them.
training_Y, dev_Y, test_Y = data_split(df_labels)

training_data, dev_data, test_data = data_split(df_data)
training_desc, dev_desc, test_desc = data_split(df_desc_encoded)
training_kw, dev_kw, test_kw = data_split(df_keywords_encoded)

In [14]:
# Standardise the tabular features with statistics learned on the training split only.
scaler = preprocessing.StandardScaler()
scaler.fit(training_data)
training_data, dev_data, test_data = (
    scaler.transform(part) for part in (training_data, dev_data, test_data)
)

In [15]:
# Concatenate the scaled tabular block and both bag-of-words blocks
# column-wise into ONE single input X per split.
training_X, dev_X, test_X = (
    hstack(blocks)
    for blocks in (
        [training_data, training_kw, training_desc],
        [dev_data, dev_kw, dev_desc],
        [test_data, test_kw, test_desc],
    )
)

In [16]:
# Share of dev rows whose final_status is 0, i.e. the accuracy of always predicting "failed".
1 - dev_Y.sum() / len(dev_Y)

0.6742508324084351

In [18]:
# RFC
# model = RandomForestClassifier(n_estimators=20)
# %time model.fit(training_X, training_Y)
# dev_pred = model.predict(dev_X)
# print('Accuracy: %f' % accuracy_score(dev_Y, dev_pred))

In [19]:
model = RandomForestClassifier(n_estimators=100)
%time model.fit(training_X, training_Y)
dev_pred = model.predict(dev_X)
print('Accuracy: %f' % accuracy_score(dev_Y, dev_pred))

CPU times: user 5min 29s, sys: 172 ms, total: 5min 29s
Wall time: 5min 30s
Accuracy: 0.700703


In [18]:
# model = RandomForestClassifier(n_estimators=500)
# %time model.fit(training_X, training_Y)
# dev_pred = model.predict(dev_X)
# print('Accuracy: %f' % accuracy_score(dev_Y, dev_pred))

In [19]:
# SVM
model = svm.LinearSVC()
%time model.fit(training_X, training_Y)
dev_pred = model.predict(dev_X)
print('accuracy is: %s' %accuracy_score(dev_Y, dev_pred))

CPU times: user 21.8 s, sys: 0 ns, total: 21.8 s
Wall time: 21.8 s
accuracy is: 0.6712911579726231


In [24]:
# Linear Logistic Regression
model = linear_model.LogisticRegression()
%time model.fit(training_X, training_Y)
dev_pred = model.predict(dev_X)
print('accuracy is: %s' %accuracy_score(dev_Y, dev_pred))

CPU times: user 3.34 s, sys: 4 ms, total: 3.34 s
Wall time: 3.34 s
accuracy is: 0.6727709951905291


# Neural Network
1. Shallow NN Model
2. RNN Model

In [27]:
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Merge
from keras.layers import Input
from keras.layers.embeddings import Embedding
from keras import Model
from keras import regularizers

In [28]:
# Raw text columns that feed the embedding branches of the network.
desc, kw = df['desc'], df['keywords']

In [29]:
# Integer-encode every description and keyword string with Keras' one_hot,
# using a shared vocabulary size of 10000.
vocab_size = 10000
encoded_kw = [one_hot(text, vocab_size) for text in kw]
encoded_desc = [one_hot(text, vocab_size) for text in desc]

In [30]:
# Bring every encoded document to a fixed length (5 tokens for keywords,
# 20 for descriptions); padding is appended at the end ('post').
padded_kw = pad_sequences(encoded_kw, maxlen=5, padding='post')
padded_desc = pad_sequences(encoded_desc, maxlen=20, padding='post')

In [31]:
# Re-split the padded sequences; these names replace the bag-of-words
# splits that the baseline models used earlier in the notebook.
training_kw, dev_kw, test_kw = data_split(padded_kw)
training_desc, dev_desc, test_desc = data_split(padded_desc)

In [38]:
# Fit a scaler on the training split and apply it to all three splits.
# NOTE(review): the same three names were already standardised by the baseline
# section's scaling cell, so this pass runs on scaled data — confirm it is intended.
scaler = preprocessing.StandardScaler().fit(training_data)
training_data, dev_data, test_data = [
    scaler.transform(split) for split in (training_data, dev_data, test_data)
]

In [36]:
# Description branch: 20 padded tokens -> 50-d embeddings -> flat vector of 20*50 values.
desc_model = Sequential()
desc_model.add(Embedding(vocab_size, 50, input_length=20))
desc_model.add(Flatten())

# Keyword branch: 5 padded tokens -> 50-d embeddings -> flat vector of 5*50 values.
kw_model = Sequential()
kw_model.add(Embedding(vocab_size, 50, input_length=5))
kw_model.add(Flatten())

# Tabular branch, built with the functional API: 6 input features -> Dense(64).
# No activation is passed to Dense, so this layer is a linear projection.
inputs = Input(shape=(6,))
# a layer instance is callable on a tensor, and returns a tensor
x = Dense(64)(inputs)
data_model = Model(inputs=inputs, outputs=x)

# Concatenate the three branches: 20*50 + 5*50 + 64 = 1314 values per sample.
model = Sequential()
model.add(Merge([desc_model, kw_model, data_model], mode='concat'))

model.add(Dense(32, activation='relu', kernel_regularizer=regularizers.l2(0.01)))
model.add(Dense(1, activation='sigmoid'))

# compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
# summarize the model; summary() prints the table itself and returns None,
# so wrapping it in print() only adds a stray "None" line to the output
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
merge_2 (Merge)              (None, 1314)              0         
_________________________________________________________________
dense_5 (Dense)              (None, 32)                42080     
_________________________________________________________________
dense_6 (Dense)              (None, 1)                 33        
Total params: 1,042,561
Trainable params: 1,042,561
Non-trainable params: 0
_________________________________________________________________
None




In [39]:
# Train for 10 epochs, one at a time, and evaluate on the dev split after each epoch.
for _ in range(10):
    model.fit([training_desc, training_kw, training_data], training_Y, epochs=1, verbose=1)
    # Evaluate with dev_data, the dev counterpart of the training_data used above;
    # the previously referenced norm_dev_data is not defined anywhere in this notebook.
    loss, accuracy = model.evaluate([dev_desc, dev_kw, dev_data], dev_Y, verbose=1)
    print('Accuracy: %f' % (accuracy*100))
    

Epoch 1/1
Accuracy: 69.348872
Epoch 1/1
Accuracy: 69.348872
Epoch 1/1
Accuracy: 67.258602
Epoch 1/1
Accuracy: 66.407695
Epoch 1/1
Accuracy: 64.002960
Epoch 1/1
Accuracy: 64.428413
Epoch 1/1
Accuracy: 64.816870
Epoch 1/1
Accuracy: 62.763596
Epoch 1/1
Accuracy: 63.706992
Epoch 1/1
Accuracy: 62.301147
