In [2]:
import numpy as np
import pandas as pd
import pickle
import numpy as np

In [3]:
# Load the cleaned and trimmed biology questions (a pickled DataFrame).
# NOTE(review): unpickling can execute arbitrary code - only read files from a trusted source.
cleaned_questions_path = "./cleaned_trim/biology_trim_clean.dat"
df_questions = pd.read_pickle(cleaned_questions_path)

In [4]:
# Turn the space-separated tag string of every question into a list of tags.
# The isinstance guard keeps this cell re-runnable: once the column already
# holds lists, a second run copies them instead of failing on `.split`.
split_list = [
    tags.split(" ") if isinstance(tags, str) else list(tags)
    for tags in df_questions['tags']
]
df_questions['tags'] = split_list

Unnamed: 0,text,tags
0,rnase contamination rna base experiment preven...,"[rna, biochemistry]"
1,lymphocyte size cluster two group tortora writ...,"[immunology, cell-biology, hematology]"


### reduce tags

In [5]:
# Flatten the per-question tag lists into one long list (one entry per tag occurrence).
split = []
for question_tags in split_list:
    split.extend(question_tags)

# Single-column frame of tag occurrences, used for the frequency counts.
df_tags = pd.DataFrame(split)
df_tags.columns = ['Tag']

In [6]:
# Count how often each tag occurs across all questions.
# `sort` only accepts a boolean (whether group keys are sorted); the previous
# value 'count' was merely truthy, so dropping it keeps the default behaviour.
grouped_tags = df_tags.groupby("Tag").size().reset_index(name='count')
# One row per distinct tag, so the summary's count/unique is the number of distinct tags.
grouped_tags.Tag.describe()

count         655
unique        655
top       species
freq            1
Name: Tag, dtype: object

In [7]:
# Number of most frequent tags to keep.
num_classes = 100
grouped_tags = df_tags.groupby("Tag").size().reset_index(name='count')
most_common_tags = grouped_tags.nlargest(num_classes, columns="count")
# Keep only occurrences of those tags. A boolean mask selects the same rows as
# the former apply -> None -> dropna round trip, without first overwriting the
# Tag column of the original frame.
df_tags = df_tags[df_tags.Tag.isin(most_common_tags.Tag)].copy()

In [8]:
# only keep the tags in the top 100

# distinct tags that survived the top-tag filter
unique = list(set(df_tags['Tag']))

# spot-check the tag list of one question before filtering
tmp = df_questions['tags'][5]
print(tmp)

def tags_in_100(i, unique):
    """Keep only the tags of one question that also appear in the allowed tags.

    i      -- iterable of tags for a single question
    unique -- collection of allowed tags (the top-100 list)

    Returns a new list holding the entries of `i` found in `unique`, in their
    original order (duplicates in `i` are preserved).
    """
    kept = []
    for tag in i:
        if tag in unique:
            kept.append(tag)
    return kept

['evolution', 'mitochondria', 'chloroplasts']


In [9]:
# Filter every question's tag list down to the top-100 tags.
only_top_100 = [tags_in_100(i, unique) for i in df_questions.tags]
# Side-by-side preview of original vs. filtered tags. Plain lists are passed
# instead of np.array(...): the tag lists have different lengths, and
# NumPy >= 1.24 raises ValueError on such ragged input (and would build a
# 2-D array if every list happened to have the same length).
pd.DataFrame({"original": list(df_questions.tags), "top100": only_top_100}).head(10)

Unnamed: 0,original,top100
0,"[rna, biochemistry]","[rna, biochemistry]"
1,"[immunology, cell-biology, hematology]","[immunology, cell-biology, hematology]"
2,"[dna, biochemistry, molecular-biology]","[dna, biochemistry, molecular-biology]"
3,"[neuroscience, synapses]",[neuroscience]
4,"[molecular-genetics, gene-expression, experime...","[molecular-genetics, gene-expression, experime..."
5,"[evolution, mitochondria, chloroplasts]",[evolution]
6,"[molecular-biology, synthetic-biology]","[molecular-biology, synthetic-biology]"
7,"[bioinformatics, homework]","[bioinformatics, homework]"
8,"[neuroscience, immunology]","[neuroscience, immunology]"
9,"[microbiology, virology, influenza]","[microbiology, virology]"


In [10]:
# Change question df to only include top 100
# (overwrites the `tags` column in place, so the unfiltered tag lists are no
# longer available from df_questions after this cell runs)
df_questions['tags'] = only_top_100
df_questions.head()

Unnamed: 0,text,tags
0,rnase contamination rna base experiment preven...,"[rna, biochemistry]"
1,lymphocyte size cluster two group tortora writ...,"[immunology, cell-biology, hematology]"
2,avoid digest dna interested sequence analyze b...,"[dna, biochemistry, molecular-biology]"
3,condition dendritic spine form look resource i...,[neuroscience]
4,reason behind choose reporter gene experiment ...,"[molecular-genetics, gene-expression, experime..."


### tokenize text & train/test split

In [11]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import MultiLabelBinarizer

maxlen = 180        # sequence length handed to pad_sequences in get_features
max_words = 5000    # passed to the Tokenizer as num_words

# Fit the tag binarizer on the filtered tag lists; `labels` holds its classes.
multilabel_binarizer = MultiLabelBinarizer().fit(df_questions['tags'])
labels = multilabel_binarizer.classes_

# Build the word index from every question text.
tokenizer = Tokenizer(lower=True, num_words=max_words)
tokenizer.fit_on_texts(df_questions['text'])

def get_features(text_series):
    """
    Turn texts into padded integer sequences for the model.

    Relies on the module-level `tokenizer` (already fitted) and `maxlen`.
    Returns the result of `pad_sequences` applied to the tokenized texts.
    """
    return pad_sequences(tokenizer.texts_to_sequences(text_series), maxlen=maxlen)


def prediction_to_label(prediction):
    """Map one prediction vector to a {tag: probability} dict, most probable tag first.

    `prediction` must offer `.tolist()`; position i is paired with the
    module-level `labels[i]`.
    """
    probabilities = prediction.tolist()
    # Stable descending sort of the positions, so ties keep their original order.
    ranked_positions = sorted(
        range(len(probabilities)),
        key=lambda position: probabilities[position],
        reverse=True,
    )
    return {labels[position]: probabilities[position] for position in ranked_positions}

Using TensorFlow backend.


In [12]:
from sklearn.model_selection import train_test_split

# Encode the targets (multi-hot tag matrix) and the inputs (padded token ids).
y = multilabel_binarizer.transform(df_questions.tags)
x = get_features(df_questions.text)
print(x.shape)

# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    random_state=9000,
    test_size=0.3,
)

(10408, 180)


### imbalanced classes

In [13]:
# Inverse-frequency weight per tag: number of kept tag occurrences divided by the tag's own count.
most_common_tags['class_weight'] = len(df_tags) / most_common_tags['count']

# Weights keyed by the tag's position in `labels`.
class_weight = {
    index: most_common_tags.loc[most_common_tags['Tag'] == label, 'class_weight'].values[0]
    for index, label in enumerate(labels)
}

most_common_tags.head()

Unnamed: 0,Tag,count,class_weight
298,human-biology,1448,14.783149
253,genetics,1229,17.417413
209,evolution,1159,18.46937
51,biochemistry,984,21.754065
398,molecular-biology,863,24.804171


### build 1-d convolutional neural network

In [14]:
from keras.models import Sequential
from keras.layers import Dense, Activation, Embedding, Flatten, GlobalMaxPool1D, Dropout, Conv1D
from keras.callbacks import ReduceLROnPlateau, EarlyStopping, ModelCheckpoint
from keras.losses import binary_crossentropy
from keras.optimizers import Adam

# NOTE(review): despite its name, this value is passed as the number of Conv1D filters;
# the kernel size is the literal 3 below.
filter_length = 300

# Embedding -> dropout -> 1-D convolution -> global max pool -> one sigmoid output per tag.
model = Sequential([
    Embedding(max_words, 20, input_length=maxlen),
    Dropout(0.1),
    Conv1D(filter_length, 3, strides=1, padding='valid', activation='relu'),
    GlobalMaxPool1D(),
    Dense(num_classes),
    Activation('sigmoid'),
])

# NOTE(review): categorical_accuracy is reported for a multi-label sigmoid output -
# confirm this is the metric you want to track.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['categorical_accuracy'])
model.summary()

# Lower the learning rate on plateaus, stop early after 4 stagnant epochs,
# and keep only the best checkpoint on disk.
callbacks = [
    ReduceLROnPlateau(),
    EarlyStopping(patience=4),
    ModelCheckpoint(filepath='model-conv1d.h5', save_best_only=True),
]

history = model.fit(
    x_train,
    y_train,
    batch_size=32,
    epochs=20,
    validation_split=0.1,
    class_weight=class_weight,
    callbacks=callbacks,
)

W0609 13:32:13.273573 4606662080 deprecation_wrapper.py:118] From /Users/christinachang/anaconda3/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:74: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

W0609 13:32:13.291162 4606662080 deprecation_wrapper.py:118] From /Users/christinachang/anaconda3/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W0609 13:32:13.294455 4606662080 deprecation_wrapper.py:118] From /Users/christinachang/anaconda3/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:4138: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.

W0609 13:32:13.308912 4606662080 deprecation_wrapper.py:118] From /Users/christinachang/anaconda3/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:133: The name tf.placeholder_with_default is deprecated. Please use tf.com

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 180, 20)           100000    
_________________________________________________________________
dropout_1 (Dropout)          (None, 180, 20)           0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 178, 300)          18300     
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 300)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 100)               30100     
_________________________________________________________________
activation_1 (Activation)    (None, 100)               0         
Total params: 148,400
Trainable params: 148,400
Non-trainable params: 0
_________________________________________________________________
Trai

In [16]:
import keras.models 
# Reload the best checkpoint written by ModelCheckpoint and score it on the held-out split.
cnn_model = keras.models.load_model('model-conv1d.h5')
metrics = cnn_model.evaluate(x_test, y_test)
# Take the metric names from the model that was actually evaluated, so this
# cell does not depend on the in-memory training `model` still existing.
for metric_name, metric_value in zip(cnn_model.metrics_names, metrics):
    print("{}: {}".format(metric_name, metric_value))

loss: 0.07445679103291802
categorical_accuracy: 0.28081972459513066


In [18]:
def top_keys(q, threshold=0.1, max_tags=5):
    """Return the most probable tags the CNN predicts for one question.

    Parameters
    ----------
    q : str
        Question text; it is wrapped in a one-element list for `get_features`.
    threshold : float, default 0.1
        Minimum predicted probability for a tag to be kept.
    max_tags : int, default 5
        Upper bound on the number of tags returned.

    Returns
    -------
    list
        Tags with probability >= `threshold`, most probable first, cut to
        `max_tags`. If no tag reaches the threshold, the single most probable
        tag is returned instead.
    """
    features = get_features([q])
    # prediction_to_label already orders tags from most to least probable, so
    # slicing is enough - no need to build a DataFrame and call nlargest for
    # every question.
    ranked = prediction_to_label(cnn_model.predict(features)[0])
    keys = [tag for tag, prob in ranked.items() if prob >= threshold][:max_tags]

    if not keys:
        # Nothing cleared the threshold: fall back to the single best guess.
        keys = list(ranked)[:1]

    return keys

In [19]:
# Add CNN keys to dataframe
cnn_keys = [top_keys(i) for i in df_questions['text']]
df_questions["cnn_keys"] = cnn_keys

In [21]:
# Collect every distinct tag that still occurs in the (filtered) questions.
full_tags = df_questions['tags'].tolist()

split = []
for question_tags in full_tags:
    split.extend(question_tags)

unique = list(set(split))

In [25]:
def dfs(data, unique, col):
    """Build a 0/1 indicator frame (one column per tag) from a column of tag lists.

    Parameters
    ----------
    data : pandas.Series or pandas.DataFrame
        Must expose a column named `col` once wrapped in a DataFrame; each
        entry of that column is an iterable of tags.
    unique : sequence
        Tags that become the columns of the result, in this order.
    col : str
        Name of the column in `data` holding the tag lists.

    Returns
    -------
    pandas.DataFrame
        Same index as `data`, columns `unique`; a cell is 1 when the row's tag
        list contains that tag and 0 otherwise. Tags not present in `unique`
        are ignored.
    """
    frame = pd.DataFrame(data)
    tag_lists = frame[col]
    columns = list(unique)
    column_of = {tag: position for position, tag in enumerate(columns)}

    # Fill a plain array and wrap it once. The former chained assignment
    # `tags_only.iloc[i].loc[...] = 1` may write to a temporary copy (and is a
    # silent no-op under pandas copy-on-write); it also used the positional
    # counter as an index label, which only works for a default RangeIndex.
    indicator = np.zeros((len(frame), len(columns)), dtype=int)
    for row, tags in enumerate(tag_lists):
        for tag in tags:
            position = column_of.get(tag)
            if position is not None:
                indicator[row, position] = 1

    return pd.DataFrame(indicator, index=frame.index, columns=columns)

In [29]:
# Indicator matrix of the true (top-100 filtered) tags, one column per entry of `unique`.
ohe_original = dfs(df_questions['tags'], unique, col = 'tags')

In [30]:
# Indicator matrix of the CNN-predicted tags, using the same `unique` columns as the true-tag matrix.
ohe_cnn = dfs(df_questions['cnn_keys'], unique, col = 'cnn_keys')

In [32]:
from sklearn.metrics import f1_score

# Weighted F1 across tags, comparing the true indicator matrix with the CNN one.
# NOTE(review): both matrices cover every row of df_questions, including rows
# that were used for training - confirm this is the intended evaluation set.
f1_score(ohe_original, ohe_cnn, average = 'weighted')

  'precision', 'predicted', average, warn_for)


0.35444947830460954