In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json
import os
import seaborn as sns
from collections import Counter

In [2]:
comp = pd.read_csv("meta_start_companies_filtered_labeled.csv")

In [3]:
print(comp.shape)
comp.head(10)

(32698, 9)


Unnamed: 0,category_list,funding_total_usd,country_code,funding_rounds,Num_of_investor,funding_duration,first_funding_at_UTC,last_funding_at_UTC,label
0,Curated Web,2000000.0,CHN,1,1,0,13957,13957,0
1,Games,41250.0,HKG,1,1,0,16252,16252,0
2,Analytics,33600000.0,USA,4,14,1040,15708,16748,0
3,Mobile,1150050.0,USA,3,13,931,15175,16106,0
4,E-Commerce,40000.0,CHL,1,1,0,15754,15754,0
5,Apps|Cable|Distribution|Software,5000000.0,USA,1,1,0,16511,16511,1
6,Search,40000.0,USA,1,1,0,15209,15209,0
7,Art|E-Commerce|Marketplaces,500000.0,USA,1,3,0,14379,14379,1
8,Curated Web,2535000.0,USA,2,13,411,14610,15021,1
9,Curated Web,6795451.0,USA,9,13,2221,14531,16752,0


In [4]:
comp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32698 entries, 0 to 32697
Data columns (total 9 columns):
category_list           32698 non-null object
funding_total_usd       32698 non-null float64
country_code            32698 non-null object
funding_rounds          32698 non-null int64
Num_of_investor         32698 non-null int64
funding_duration        32698 non-null int64
first_funding_at_UTC    32698 non-null int64
last_funding_at_UTC     32698 non-null int64
label                   32698 non-null int64
dtypes: float64(1), int64(6), object(2)
memory usage: 2.2+ MB


In [5]:
# comp.groupby('label').count()

In [6]:
comp.groupby('label')['country_code'].count()

label
0    27800
1     4898
Name: country_code, dtype: int64

In [7]:
comp.loc[5,'label']

1

# create subdataframe for each category & delete the category with less 6 examples (but do not delete the examples)
1. create/load comp and remian_comp_idx_set
2. create cat_comp_dict
3. calculate the number of examples for each cat, only keep the categories with enough examples: create sorted remian_cat_stat_df and store the deleted cat_dict in a dictionary called 'comp_num_less_than'. (exclude the categories with less 6 companies but do not delete the examples)
4. use the remain categories to choose test dataset (the last 100 categories)
5. calculate the test_cat_set, test_companies_idx_set, test_cat_comp_dict and create test sub_dataset
6. update remian_comp_idx_set and cat_comp_dict: a) remove test_companies_idx_set from remian_comp_idx_set, b) delete test_companies_idx_set from cat_comp_dict's values, only keep valid categories in cat_comp_dict's keys.

7. repeat 3-6 to for dev set and training set

In [8]:
# step 1
total_comp_idx_set = set(comp.index)
comp.shape

(32698, 9)

In [9]:
len(total_comp_idx_set)

32698

In [10]:
# for idx, s in enumerate(comp['category_list']):
#     print(idx)
#     print(s)
#     break

In [11]:
# dict(comp['category_list'])

In [12]:
# step 2
def create_cat_comp_dict(df, col='category_list'):
    cat_comp_dict = {}
    comp_cat_dict = {}
    l_maxlen = 0
    
    cat_list_to_dict = dict(df[col])
    for idx, s in cat_list_to_dict.items():
        if type(s) != str:
            continue
        l = s.split("|")
        comp_cat_dict[idx] = l
        if len(l) > l_maxlen:
            l_maxlen = len(l)
            l_max_show = l
    #         print(l_max_show)
        for i in l:
            if i not in cat_comp_dict.keys():
                cat_comp_dict[i] = []
            cat_comp_dict[i].append(idx)
    
    return cat_comp_dict, comp_cat_dict, l_maxlen, l_max_show

In [13]:
cat_comp_dict_all, com_cat_dict_all, l_maxlen_all, l_max_show_all = create_cat_comp_dict(comp)

In [14]:
available_cat_set_all = set(cat_comp_dict_all.keys())
len(available_cat_set_all)

833

In [15]:
print(l_maxlen_all, l_max_show_all)

23 ['Advertising', 'Android', 'Apps', 'Discounts', 'Gps', 'Indoor Positioning', 'Information Technology', 'iPhone', 'Local Advertising', 'Local Search', 'Location Based Services', 'Mobile', 'Mobile Advertising', 'Navigation', 'Personalization', 'Private Social Networking', 'Reviews and Recommendations', 'Services', 'Social Media', 'Social Network Media', 'Technology', 'Windows Phone 7', 'Wireless']


## Choose the minimum number of positive examples for every category

In [55]:
MIN_POS_NUM = 3

In [56]:
# step 3
# we only take count of the category with more than X positive companies.
# tot_comp_num = len(total_comp_idx_set)

def calculate_valid_cat_stat(cat_comnp_dict, comp_df, at_least_pos_num):
    valid_cat_stat_dict = {}
    
    for k, v in cat_comnp_dict.items():
        pos_num = 0
        for i in v:
            if comp_df.loc[i,'label'] == 1:
                pos_num += 1
        if pos_num >= at_least_pos_num:
            valid_cat_stat_dict[k] = {'num_comp': len(v), 'num_positive_example': pos_num}
    valid_cat_stat_df = pd.DataFrame.from_dict(valid_cat_stat_dict, orient='index')
    valid_cat_stat_df = valid_cat_stat_df.sort_values('num_comp',ascending=False)
    
    valid_cat_set = set(valid_cat_stat_df.index)
    return valid_cat_stat_dict, valid_cat_stat_df, valid_cat_set

In [57]:
def calculate_comp_idx_set(cat_set, cat_comp_dict):
    comp_idx_set = set()
    for k in cat_set:
        for i in cat_comp_dict[k]:
            comp_idx_set.add(i)
    return comp_idx_set

In [58]:
valid_cat_stat_all_dict, valid_cat_stat_all_df, valid_cat_set = calculate_valid_cat_stat(cat_comp_dict_all,
                                                                             comp, 
                                                                             at_least_pos_num=MIN_POS_NUM)

In [59]:
len(valid_cat_set)

409

# step 4 & 5: 
create test set (select the last 100 categories (with least examples) as test dataset)

In [60]:
def split_test_dev_dataset(num_cat, remain_cat_stat_df, cat_comp_dict, comp_idx_set):
    test_stat_df = remain_cat_stat_df.tail(num_cat)
    test_cat_set = set(test_stat_df.index)
    
    test_cat_comp_dict = {}
    for cat in test_cat_set:
        comp_ids = cat_comp_dict[cat]
        test_cat_comp_dict[cat] = comp_ids
    
    test_comp_idx_set = calculate_comp_idx_set(test_cat_set, cat_comp_dict)
    
    remain_cat_set = set(remain_cat_stat_df.index) - test_cat_set
    remain_comp_idx_set = comp_idx_set - test_comp_idx_set
    
    return test_cat_set, test_cat_comp_dict, test_comp_idx_set, remain_cat_set, remain_comp_idx_set

In [61]:
(test_cat_set, test_cat_comp_dict, test_comp_idx_set, 
 aftertest_remain_cat_set, aftertest_remain_comp_idx_set) = split_test_dev_dataset(50, valid_cat_stat_all_df,
                                                                                 cat_comp_dict_all,
                                                                                 total_comp_idx_set)

In [62]:
len(aftertest_remain_cat_set)

359

In [63]:
len(test_comp_idx_set)

874

In [64]:
## step 6
def update_cat_comp_dict(old_cat_comp_dict, remain_cat_set, remain_comp_idx_set):
    new_cat_comp_dict = {}
    for cat in remain_cat_set:
        old_comp_idx_list = old_cat_comp_dict[cat]
        new_cat_comp_dict[cat] = []
        for idx in old_comp_idx_list:
            if idx in remain_comp_idx_set:
                new_cat_comp_dict[cat].append(idx)
    return new_cat_comp_dict

In [65]:
len(aftertest_remain_comp_idx_set)

31824

In [66]:
32698 - len(test_comp_idx_set)

31824

In [67]:
aftertest_cat_comp_dict = update_cat_comp_dict(cat_comp_dict_all, aftertest_remain_cat_set,
                                               aftertest_remain_comp_idx_set)

In [68]:
# updated_cat_comp_dict_aft

## Repeat step 3-6 for dev set

In [69]:
# step 3
aftertest_cat_stat_dict, aftertest_cat_stat_df, aftertest_valid_cat_set = calculate_valid_cat_stat(aftertest_cat_comp_dict,
                                                                           comp,
                                                                           MIN_POS_NUM)

In [70]:
len(aftertest_valid_cat_set)

345

In [71]:
# step 4 & 5
aftertest_cat_set = aftertest_cat_stat_dict.keys()
len(aftertest_cat_set)

345

In [72]:
# step 4 & 5
(dev_cat_set, dev_cat_comp_dict, dev_comp_idx_set,
 afterdev_remain_cat_set, afterdev_remain_comp_idx_set) = split_test_dev_dataset(50,
                                                                          aftertest_cat_stat_df,
                                                                          aftertest_cat_comp_dict,
                                                                          aftertest_remain_comp_idx_set)

In [73]:
len(afterdev_remain_cat_set)

295

In [74]:
len(dev_comp_idx_set)

1378

In [75]:
len(afterdev_remain_comp_idx_set)

30446

In [76]:
len(aftertest_remain_comp_idx_set) - len(dev_comp_idx_set)

30446

In [77]:
# step 6
afterdev_cat_comp_dict = update_cat_comp_dict(aftertest_cat_comp_dict, afterdev_remain_cat_set,
                                               afterdev_remain_comp_idx_set)

In [78]:
len(afterdev_cat_comp_dict)

295

## the remaining cat_comp_dict is training part!

In [79]:
train_cat_stat_dict, train_cat_stat_df, train_cat_set = calculate_valid_cat_stat(afterdev_cat_comp_dict,
                                                                                        comp,
                                                                                        MIN_POS_NUM)

In [80]:
len(train_cat_set)

289

In [81]:
train_cat_comp_dict = {}
for cat in train_cat_set:
    train_cat_comp_dict[cat] = afterdev_cat_comp_dict[cat]
    

In [82]:
train_comp_idx_set = calculate_comp_idx_set(train_cat_set, train_cat_comp_dict)

In [83]:
len(train_comp_idx_set)

30035

In [84]:
len(train_comp_idx_set) + len(dev_comp_idx_set) + len(test_comp_idx_set)

32287

## Create the data file after split

In [85]:
from encoding_data import encode_dataset, separate_input_output_cols
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [86]:
total_used_comp_idx_list = []
total_used_comp_idx_list.extend(list(train_comp_idx_set))
total_used_comp_idx_list.extend(list(dev_comp_idx_set))
total_used_comp_idx_list.extend(list(test_comp_idx_set))

In [87]:
used_comp_df = comp.loc[total_used_comp_idx_list]
used_comp_df = used_comp_df.drop(columns=['category_list'])

In [88]:
used_comp_dict = used_comp_df.to_dict('index')

In [89]:
preprared_data = {'train': train_cat_comp_dict,
                  'dev': dev_cat_comp_dict,
                  'test': test_cat_comp_dict,
                  'companies': used_comp_dict}

In [90]:
data_dir = '/Users/cicipan/projects/Predict-Success-of-Startups/data/preprocess_data'
file_name = 'splitted_data_pos_{}.json'.format(MIN_POS_NUM)

In [91]:
with open(os.path.join(data_dir, file_name), 'w') as f:
    json.dump(preprared_data, f)

In [92]:
used_comp_df.head(10)

Unnamed: 0,funding_total_usd,country_code,funding_rounds,Num_of_investor,funding_duration,first_funding_at_UTC,last_funding_at_UTC,label
0,2000000.0,CHN,1,1,0,13957,13957,0
1,41250.0,HKG,1,1,0,16252,16252,0
2,33600000.0,USA,4,14,1040,15708,16748,0
3,1150050.0,USA,3,13,931,15175,16106,0
4,40000.0,CHL,1,1,0,15754,15754,0
6,40000.0,USA,1,1,0,15209,15209,0
7,500000.0,USA,1,3,0,14379,14379,1
8,2535000.0,USA,2,13,411,14610,15021,1
9,6795451.0,USA,9,13,2221,14531,16752,0
10,10661770.0,FRA,5,17,1054,15689,16743,0


In [93]:
used_comp_dict[0]

{'funding_total_usd': 2000000.0,
 'country_code': 'CHN',
 'funding_rounds': 1,
 'Num_of_investor': 1,
 'funding_duration': 0,
 'first_funding_at_UTC': 13957,
 'last_funding_at_UTC': 13957,
 'label': 0}

## Don't use below part to prepare meta-learning data

## preprocess examples: encode country_code, normalize numerical data

In [111]:
data_dir = '/Users/cicipan/projects/Predict-Success-of-Startups/data'
with open(os.path.join(data_dir, 'preprocess_data/splitted_data.json')) as f:
    data = json.load(f)
    

In [141]:
train_cat_comp_dict = data['train']
dev_cat_comp_dict = data['dev']
test_cat_comp_dict = data['test']
comp_idx_dict_str_key = data['companies']


In [142]:
# change the keys of data['companies'] from string to integer
comp_idx_dict = {}
for strk, v in comp_idx_dict_str_key.items():
    k = int(strk)
    comp_idx_dict[k] = v

In [143]:
comp_idx_dict[0]

{'funding_total_usd': 2000000.0,
 'country_code': 'CHN',
 'funding_rounds': 1,
 'Num_of_investor': 1,
 'funding_duration': 0,
 'first_funding_at_UTC': 13957,
 'last_funding_at_UTC': 13957,
 'label': 0}

In [145]:
comp_df = pd.DataFrame.from_dict(comp_idx_dict, orient='index')
comp_df.head()

Unnamed: 0,funding_total_usd,country_code,funding_rounds,Num_of_investor,funding_duration,first_funding_at_UTC,last_funding_at_UTC,label
0,2000000.0,CHN,1,1,0,13957,13957,0
1,41250.0,HKG,1,1,0,16252,16252,0
2,33600000.0,USA,4,14,1040,15708,16748,0
3,1150050.0,USA,3,13,931,15175,16106,0
4,40000.0,CHL,1,1,0,15754,15754,0


In [146]:
train_comp_idx_set = calculate_comp_idx_set(train_cat_comp_dict.keys(),train_cat_comp_dict)
dev_comp_idx_set = calculate_comp_idx_set(dev_cat_comp_dict.keys(),dev_cat_comp_dict)
test_comp_idx_set = calculate_comp_idx_set(test_cat_comp_dict.keys(),test_cat_comp_dict)
len(test_comp_idx_set)


759

In [149]:
train_examples_df = comp_df.loc[list(train_comp_idx_set)]
dev_examples_df = comp_df.loc[list(dev_comp_idx_set)]
test_examples_df = comp_df.loc[list(test_comp_idx_set)]
test_examples_df.head()

Unnamed: 0,funding_total_usd,country_code,funding_rounds,Num_of_investor,funding_duration,first_funding_at_UTC,last_funding_at_UTC,label
26627,4115591.0,USA,3,1,1148,15562,16710,0
5,5000000.0,USA,1,1,0,16511,16511,1
20486,50000.0,RUS,2,1,299,15650,15949,0
18437,300000.0,RUS,1,2,0,15793,15793,0
22536,5564999.0,DEU,3,1,555,15670,16225,0


In [163]:
metadata = {'output_label': ['label'],
            'input_categorical': ['country_code'],
            'input_int': ['funding_rounds', 'Num_of_investor', 'funding_duration', 'first_funding_at_UTC',
                          'last_funding_at_UTC'],
            'input_float': ['funding_total_usd'],
            'input_text': [],
            'input_datetime': [],
            'input_bool': []
            }

In [164]:
ytrain, Xtrain, dv, scaler, _, cols_name = encode_dataset(train_examples_df, metadata)
ydev, Xdev, _, _, _, _ = encode_dataset(dev_examples_df, metadata, dv=dv, scaler=scaler)
ytest, Xtest, _, _, _, _ = encode_dataset(test_examples_df, metadata, dv=dv, scaler=scaler)

Starting to encode inputs...
Except categorical and text input data after encoding, the shape is (30639, 6)
we have 6 columns.
Non-text input data after encoding, the shape is (30639, 123)
We have 123 columns.
Starting to encode inputs...
Except categorical and text input data after encoding, the shape is (1273, 6)
we have 6 columns.
Non-text input data after encoding, the shape is (1273, 123)
We have 123 columns.
Starting to encode inputs...
Except categorical and text input data after encoding, the shape is (759, 6)
we have 6 columns.
Non-text input data after encoding, the shape is (759, 123)
We have 123 columns.


## Baseline Models, including logistic regression, random forests, knn, neural network

In [176]:
import sklearn
from sklearn.linear_model import LogisticRegression
from sklearn import metrics 
from sklearn import ensemble
from sklearn.metrics import accuracy_score, f1_score

In [173]:
1-np.mean(ydev)

0.8892380204241949

In [180]:
####  Random Forest  ####
model_RF = sklearn.ensemble.RandomForestClassifier(n_estimators=100) 
model_RF.fit(Xtrain, ytrain)

y_pred = model_RF.predict(Xdev)
acc = accuracy_score(ydev, y_pred)
f1 = f1_score(ydev, y_pred)
print('accuracy: {}'.format(acc))
print('f1 score: {}'.format(f1))

  This is separate from the ipykernel package so we can avoid doing imports until


accuracy: 0.8923802042419482
f1 score: 0.35071090047393366


In [181]:
# Logistic Regression
model_LR = LogisticRegression()
model_LR.fit(Xtrain, ytrain)

y_pred = model_LR.predict(Xdev)
accuracy_score(ydev, y_pred)
acc = accuracy_score(ydev, y_pred)
f1 = f1_score(ydev, y_pred)
print('accuracy: {}'.format(acc))
print('f1 score: {}'.format(f1))

accuracy: 0.8782403770620582
f1 score: 0.20512820512820515


  y = column_or_1d(y, warn=True)


In [182]:
# KNN 
model_knn = sklearn.neighbors.KNeighborsClassifier(n_neighbors=5)
model_knn.fit(Xtrain, ytrain)

y_pred = model_knn.predict(Xdev)
accuracy_score(ydev, y_pred)
f1 = f1_score(ydev, y_pred)
print('accuracy: {}'.format(acc))
print('f1 score: {}'.format(f1))

  This is separate from the ipykernel package so we can avoid doing imports until


accuracy: 0.8782403770620582
f1 score: 0.34513274336283184


## NN model

In [193]:
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras import optimizers
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.callbacks import Callback
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score
from keras import backend as K

In [184]:
n_features = Xtrain.shape[1]

hyp_params = {
    'fc_hidden_size': 64,
    'n_fc_layers': 4,
    'lr': 0.001
}

In [194]:
def recall_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    recall = true_positives / (possible_positives + K.epsilon())
    return recall

def precision_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    return precision

def f1_m(y_true, y_pred):
    precision = precision_m(y_true, y_pred)
    recall = recall_m(y_true, y_pred)
    return 2*((precision*recall)/(precision+recall+K.epsilon()))

In [195]:
inputs = keras.Input(shape=(n_features,), name='input_features')
x = layers.Dense(hyp_params['fc_hidden_size'], activation='relu')(inputs)

for _ in range(hyp_params['n_fc_layers']-1):
    x = layers.Dense(hyp_params['fc_hidden_size'], activation='relu')(x)
    # x = layers.Dropout(hyp_params['dropout_rate'])(x)

outputs = layers.Dense(1, activation='sigmoid')(x)

model = keras.Model(inputs=inputs, outputs=outputs)

opt = optimizers.Adam(lr=hyp_params['lr'])

model.compile(loss='binary_crossentropy',
optimizer=opt,
metrics=['acc',f1_m,precision_m, recall_m])

# history = model.fit(X_train_con, y_train, validation_data=(X_dev_con, y_dev), epochs=200, verbose=1)
model.fit(Xtrain, ytrain, validation_data=(Xdev, ydev), 
          epochs=100, 
          verbose=2
         )

Train on 30639 samples, validate on 1273 samples
Epoch 1/100
 - 2s - loss: 0.3461 - acc: 0.8517 - f1_m: 0.2081 - precision_m: 0.3727 - recall_m: 0.1641 - val_loss: 0.2577 - val_acc: 0.8892 - val_f1_m: 0.1402 - val_precision_m: 0.2849 - val_recall_m: 0.0972
Epoch 2/100
 - 1s - loss: 0.3320 - acc: 0.8540 - f1_m: 0.2504 - precision_m: 0.4539 - recall_m: 0.1924 - val_loss: 0.2666 - val_acc: 0.8885 - val_f1_m: 0.2638 - val_precision_m: 0.4457 - val_recall_m: 0.2064
Epoch 3/100
 - 1s - loss: 0.3296 - acc: 0.8561 - f1_m: 0.2691 - precision_m: 0.4731 - recall_m: 0.2107 - val_loss: 0.2623 - val_acc: 0.8892 - val_f1_m: 0.1455 - val_precision_m: 0.3394 - val_recall_m: 0.1001
Epoch 4/100
 - 1s - loss: 0.3280 - acc: 0.8553 - f1_m: 0.2591 - precision_m: 0.4691 - recall_m: 0.1987 - val_loss: 0.2576 - val_acc: 0.8908 - val_f1_m: 0.2960 - val_precision_m: 0.4679 - val_recall_m: 0.2492
Epoch 5/100
 - 1s - loss: 0.3266 - acc: 0.8569 - f1_m: 0.2735 - precision_m: 0.4885 - recall_m: 0.2109 - val_loss: 0.25

 - 1s - loss: 0.2954 - acc: 0.8688 - f1_m: 0.3955 - precision_m: 0.6044 - recall_m: 0.3264 - val_loss: 0.2803 - val_acc: 0.8853 - val_f1_m: 0.2592 - val_precision_m: 0.3972 - val_recall_m: 0.2200
Epoch 41/100
 - 1s - loss: 0.2939 - acc: 0.8694 - f1_m: 0.3910 - precision_m: 0.6032 - recall_m: 0.3181 - val_loss: 0.2779 - val_acc: 0.8916 - val_f1_m: 0.2698 - val_precision_m: 0.4047 - val_recall_m: 0.2338
Epoch 42/100
 - 1s - loss: 0.2931 - acc: 0.8695 - f1_m: 0.3889 - precision_m: 0.5911 - recall_m: 0.3202 - val_loss: 0.2748 - val_acc: 0.8892 - val_f1_m: 0.1937 - val_precision_m: 0.3163 - val_recall_m: 0.1554
Epoch 43/100
 - 1s - loss: 0.2931 - acc: 0.8698 - f1_m: 0.3849 - precision_m: 0.5854 - recall_m: 0.3159 - val_loss: 0.2824 - val_acc: 0.8853 - val_f1_m: 0.1780 - val_precision_m: 0.3163 - val_recall_m: 0.1387
Epoch 44/100
 - 1s - loss: 0.2921 - acc: 0.8697 - f1_m: 0.3794 - precision_m: 0.5833 - recall_m: 0.3120 - val_loss: 0.2723 - val_acc: 0.8908 - val_f1_m: 0.2175 - val_precision_m

Epoch 80/100
 - 2s - loss: 0.2706 - acc: 0.8796 - f1_m: 0.4518 - precision_m: 0.6579 - recall_m: 0.3788 - val_loss: 0.3325 - val_acc: 0.8822 - val_f1_m: 0.2941 - val_precision_m: 0.4039 - val_recall_m: 0.2610
Epoch 81/100
 - 1s - loss: 0.2694 - acc: 0.8795 - f1_m: 0.4422 - precision_m: 0.6572 - recall_m: 0.3667 - val_loss: 0.3167 - val_acc: 0.8822 - val_f1_m: 0.2584 - val_precision_m: 0.4114 - val_recall_m: 0.2128
Epoch 82/100
 - 2s - loss: 0.2669 - acc: 0.8803 - f1_m: 0.4522 - precision_m: 0.6445 - recall_m: 0.3829 - val_loss: 0.3381 - val_acc: 0.8885 - val_f1_m: 0.2486 - val_precision_m: 0.3813 - val_recall_m: 0.2024
Epoch 83/100
 - 1s - loss: 0.2666 - acc: 0.8800 - f1_m: 0.4507 - precision_m: 0.6638 - recall_m: 0.3748 - val_loss: 0.3286 - val_acc: 0.8885 - val_f1_m: 0.2662 - val_precision_m: 0.3611 - val_recall_m: 0.2317
Epoch 84/100
 - 1s - loss: 0.2667 - acc: 0.8815 - f1_m: 0.4575 - precision_m: 0.6591 - recall_m: 0.3856 - val_loss: 0.3421 - val_acc: 0.8853 - val_f1_m: 0.2347 - va

<tensorflow.python.keras.callbacks.History at 0x138957400>

In [196]:
# evaluate the model
_, train_acc, train_f1_score, train_precision, train_recall = model.evaluate(Xtrain, ytrain, verbose=1)
_, dev_acc, dev_f1_score, dev_precision, dev_recall = model.evaluate(Xdev, ydev, verbose=1)
print('train_acc: %.3f, train_f1_score: %.3f' % (train_acc, train_f1_score))
print('dev_acc: %.3f, dev_f1_score: %.3f' % (dev_acc, dev_f1_score))

train_acc: 0.888, train_f1_score: 0.494
dev_acc: 0.895, dev_f1_score: 0.309
