In [1]:
import numpy as np
import pandas as pd
import pickle
#import re
#import nltk
#import nltk.corpus
import numpy as np
#from nltk.corpus import wordnet
#import matplotlib.pyplot as plt
#from sklearn.feature_extraction.text import TfidfVectorizer
#from sklearn.metrics.pairwise import cosine_similarity
#from nltk.stem.snowball import SnowballStemmer

In [506]:
# read the data for bio
df_questions = pd.read_pickle("./sta-141c-classify/cleaned_trim/biology_trim_clean.dat")

In [507]:
# make tags column into list of tags
split_list = [tags.split(" ") for tags in list(df_questions['tags'])]
df_questions['tags'] = split_list
df_questions.head(2)

Unnamed: 0,text,tags
0,rnase contamination rna base experiment prevent anyone suggestion prevent rnase contamination work rna tend issue degradation regardless whether use depc treat rnase free water filter pipette tip,"[rna, biochemistry]"
1,lymphocyte size cluster two group tortora write principle anatomy physiology lymphocyte may small μm diameter large μm range quite close others take mean lymphocytes size cluster two group way say lymphocyte μm,"[immunology, cell-biology, hematology]"


### reduce tags

In [496]:
#split_list = [tags.split(" ") for tags in list(df_questions['tags'])]
split = [item for sublist in split_list for item in sublist]
#unique = list(set(split))
df_tags = pd.DataFrame(split)
df_tags.columns = ['Tag']

In [499]:
grouped_tags = df_tags.groupby("Tag", sort='count').size().reset_index(name='count')
grouped_tags.Tag.describe()

count             100
unique            100
top       ornithology
freq                1
Name: Tag, dtype: object

In [498]:
#unique = list(set(split))
#len(unique)
#-------

num_classes = 100
grouped_tags = df_tags.groupby("Tag").size().reset_index(name='count')
most_common_tags = grouped_tags.nlargest(num_classes, columns="count")
df_tags.Tag = df_tags.Tag.apply(lambda tag : tag if tag in most_common_tags.Tag.values else None)
df_tags = df_tags.dropna()

In [514]:
# only keep the tags in the top 100

# list of unique tags
unique = list(set(df_tags.Tag))
tmp = df_questions.tags[5]
print(tmp)

def tags_in_100(i, unique):
    "Input list of tags and top 100 tags. Return words in the original list that are also in top 100."
    values = [x for x in i if x in unique]
    return values

['evolution', 'mitochondria', 'chloroplasts']


list

In [515]:
only_top_100 = [tags_in_100(i, unique) for i in df_questions.tags]
pd.DataFrame({"original" :np.array(df_questions.tags), "top100":np.array(only_top_100)}).head()

Unnamed: 0,original,top100
0,"[rna, biochemistry]","[rna, biochemistry]"
1,"[immunology, cell-biology, hematology]","[immunology, cell-biology, hematology]"
2,"[dna, biochemistry, molecular-biology]","[dna, biochemistry, molecular-biology]"
3,"[neuroscience, synapses]",[neuroscience]
4,"[molecular-genetics, gene-expression, experimental-design]","[molecular-genetics, gene-expression, experimental-design]"


In [516]:
# Change question df to only include top 100
df_questions['tags'] = only_top_100
df_questions.head()

Unnamed: 0,text,tags
0,rnase contamination rna base experiment prevent anyone suggestion prevent rnase contamination work rna tend issue degradation regardless whether use depc treat rnase free water filter pipette tip,"[rna, biochemistry]"
1,lymphocyte size cluster two group tortora write principle anatomy physiology lymphocyte may small μm diameter large μm range quite close others take mean lymphocytes size cluster two group way say lymphocyte μm,"[immunology, cell-biology, hematology]"
2,avoid digest dna interested sequence analyze bound dna minimize amount unbound dna get sequence digest dna unbound dna digest way maximize amount unbound dna digest,"[dna, biochemistry, molecular-biology]"
3,condition dendritic spine form look resource information formation dendritic spine synaptogenesis especially relation new connection form daily electrotonic signalling along axon spine cause new connection make base kind spatial condition maybe electrical chemical attraction large heuristic,[neuroscience]
4,reason behind choose reporter gene experiment gene interest notice within example experiment class different reporter gene choose insert near gene interest prove whether gene express example may insert gene fluorescence next gene interest know transcribe whether organism cell fluorescent degree fluoresce notice experiment multiple version one case use fluorescent gene next different gene examp...,"[molecular-genetics, gene-expression, experimental-design]"


### tokenize text & split/train

In [517]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import MultiLabelBinarizer

multilabel_binarizer = MultiLabelBinarizer()
multilabel_binarizer.fit(df_questions.tags)
labels = multilabel_binarizer.classes_

maxlen = 180
max_words = 5000
tokenizer = Tokenizer(num_words=max_words, lower=True)
tokenizer.fit_on_texts(df_questions.text)

def get_features(text_series):
    """
    transforms text data to feature_vectors that can be used in the ml model.
    tokenizer must be available.
    """
    sequences = tokenizer.texts_to_sequences(text_series)
    return pad_sequences(sequences, maxlen=maxlen)


def prediction_to_label(prediction):
    tag_prob = [(labels[i], prob) for i, prob in enumerate(prediction.tolist())]
    return dict(sorted(tag_prob, key=lambda kv: kv[1], reverse=True))

In [518]:
len(list(set(labels)))

100

In [519]:
from sklearn.model_selection import train_test_split

x = get_features(df_questions.text)
y = multilabel_binarizer.transform(df_questions.tags)
print(x.shape)

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=9000)

(10408, 180)


### imbalanced classes

In [522]:
most_common_tags['class_weight'] = len(df_tags) / most_common_tags['count']
class_weight = {}
for index, label in enumerate(labels): 
    class_weight[index] = most_common_tags[most_common_tags['Tag'] == label]['class_weight'].values[0]
    
most_common_tags.head()

Unnamed: 0,Tag,count,class_weight
298,human-biology,1448,14.783149
253,genetics,1229,17.417413
209,evolution,1159,18.46937
51,biochemistry,984,21.754065
398,molecular-biology,863,24.804171


### build 1-d convolutional neural network

In [524]:
from keras.models import Sequential
from keras.layers import Dense, Activation, Embedding, Flatten, GlobalMaxPool1D, Dropout, Conv1D
from keras.callbacks import ReduceLROnPlateau, EarlyStopping, ModelCheckpoint
from keras.losses import binary_crossentropy
from keras.optimizers import Adam

filter_length = 300

model = Sequential()
model.add(Embedding(max_words, 20, input_length=maxlen))
model.add(Dropout(0.1))
model.add(Conv1D(filter_length, 3, padding='valid', activation='relu', strides=1))
model.add(GlobalMaxPool1D())
model.add(Dense(num_classes))
model.add(Activation('sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['categorical_accuracy'])
model.summary()

callbacks = [
    ReduceLROnPlateau(), 
    EarlyStopping(patience=4), 
    ModelCheckpoint(filepath='model-conv1d.h5', save_best_only=True)
]

history = model.fit(x_train, y_train,
                    class_weight=class_weight,
                    epochs=20,
                    batch_size=32,
                    validation_split=0.1,
                    callbacks=callbacks)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_8 (Embedding)      (None, 180, 20)           100000    
_________________________________________________________________
dropout_8 (Dropout)          (None, 180, 20)           0         
_________________________________________________________________
conv1d_8 (Conv1D)            (None, 178, 300)          18300     
_________________________________________________________________
global_max_pooling1d_8 (Glob (None, 300)               0         
_________________________________________________________________
dense_8 (Dense)              (None, 100)               30100     
_________________________________________________________________
activation_8 (Activation)    (None, 100)               0         
Total params: 148,400
Trainable params: 148,400
Non-trainable params: 0
_________________________________________________________________
Trai

In [526]:
import keras.models 
cnn_model = keras.models.load_model('model-conv1d.h5')
metrics = cnn_model.evaluate(x_test, y_test)
print("{}: {}".format(model.metrics_names[0], metrics[0]))
print("{}: {}".format(model.metrics_names[1], metrics[1]))

loss: 0.07273883121091129
categorical_accuracy: 0.3105987832403473


In [398]:
df_questions = pd.read_csv('../../Downloads/statsquestions/Questions.csv', encoding='iso-8859-1')
df_tags = pd.read_csv('../../Downloads/statsquestions/Tags.csv', encoding='iso-8859-1')
df_questions.head(n=2)

Unnamed: 0,Id,OwnerUserId,CreationDate,Score,Title,Body
0,6,5.0,2010-07-19T19:14:44Z,272,The Two Cultures: statistics vs. machine learning?,"<p>Last year, I read a blog post from <a href=""http://anyall.org/"">Brendan O'Connor</a> entitled <a href=""http://anyall.org/blog/2008/12/statistics-vs-machine-learning-fight/"">""Statistics vs. Machine Learning, fight!""</a> that discussed some of the differences between the two fields. <a href=""http://andrewgelman.com/2008/12/machine_learnin/"">Andrew Gelman responded favorably to this</a>:</p>\..."
1,21,59.0,2010-07-19T19:24:36Z,4,Forecasting demographic census,"<p>What are some of the ways to forecast demographic census with some validation and calibration techniques?</p>\n\n<p>Some of the concerns:</p>\n\n<ul>\n<li>Census blocks vary in sizes as rural\nareas are a lot larger than condensed\nurban areas. Is there a need to account for the area size difference?</li>\n<li>if let's say I have census data\ndating back to 4 - 5 census periods,\nhow far ca..."


In [374]:
print(np.shape(df_questions), np.shape(df_tags))
len(set(df_tags.Tag))

(85085, 6) (244228, 2)


1315

In [399]:
grouped_tags = df_tags.groupby("Tag", sort='count').size().reset_index(name='count')
grouped_tags.Tag.describe()

count     1315
unique    1315
top       mase
freq         1
Name: Tag, dtype: object

In [400]:
num_classes = 100
grouped_tags = df_tags.groupby("Tag").size().reset_index(name='count')
most_common_tags = grouped_tags.nlargest(num_classes, columns="count")
df_tags.Tag = df_tags.Tag.apply(lambda tag : tag if tag in most_common_tags.Tag.values else None)
df_tags = df_tags.dropna()

In [379]:
len(df_tags)
len(set(df_tags.Tag))

100

In [401]:
import re 

def strip_html_tags(body):
    regex = re.compile('<.*?>')
    return re.sub(regex, '', body)

df_questions['Body'] = df_questions['Body'].apply(strip_html_tags)
df_questions['Text'] = df_questions['Title'] + ' ' + df_questions['Body']

In [None]:
# denormalize tables

def tags_for_question(question_id):
    return df_tags[df_tags['Id'] == question_id].Tag.values

def add_tags_column(row):
    row['Tags'] = tags_for_question(row['Id'])
    return row

df_questions1 = df_questions.apply(add_tags_column, axis=1)

In [358]:
np.shape(df_questions1)

(85085, 8)

In [403]:
pd.set_option('display.max_colwidth', 400)
df_questions1[['Id', 'Text', 'Tags']].head(2)
#pd.DataFrame(df_questions.tag, df_questions1.Tags)
df_questions1.Tags

0                                                                               [machine-learning]
1                                                                                    [forecasting]
2                                                                                       [bayesian]
3                                            [hypothesis-testing, t-test, p-value, interpretation]
4                                                                                    [correlation]
5                                                                        [nonparametric, survival]
6                                                                                    [time-series]
7                                                                 [data-visualization, references]
8                                                                               [machine-learning]
9                                                                                     [references]
10        

In [250]:
df_questions.head(2)

Unnamed: 0,Id,OwnerUserId,CreationDate,Score,Title,Body,Text,Tags
0,6,5.0,2010-07-19T19:14:44Z,272,The Two Cultures: statistics vs. machine learning?,"Last year, I read a blog post from Brendan O'Connor entitled ""Statistics vs. Machine Learning, fight!"" that discussed some of the differences between the two fields. Andrew Gelman responded favorably to this:\n\nSimon Blomberg: \n\n\n From R's fortunes\n package: To paraphrase provocatively,\n 'machine learning is statistics minus\n any checking of models and\n assumptions'.\n -- Brian ...","The Two Cultures: statistics vs. machine learning? Last year, I read a blog post from Brendan O'Connor entitled ""Statistics vs. Machine Learning, fight!"" that discussed some of the differences between the two fields. Andrew Gelman responded favorably to this:\n\nSimon Blomberg: \n\n\n From R's fortunes\n package: To paraphrase provocatively,\n 'machine learning is statistics minus\n any c...",[machine-learning]
1,21,59.0,2010-07-19T19:24:36Z,4,Forecasting demographic census,"What are some of the ways to forecast demographic census with some validation and calibration techniques?\n\nSome of the concerns:\n\n\nCensus blocks vary in sizes as rural\nareas are a lot larger than condensed\nurban areas. Is there a need to account for the area size difference?\nif let's say I have census data\ndating back to 4 - 5 census periods,\nhow far can i forecast it into the\nfutur...","Forecasting demographic census What are some of the ways to forecast demographic census with some validation and calibration techniques?\n\nSome of the concerns:\n\n\nCensus blocks vary in sizes as rural\nareas are a lot larger than condensed\nurban areas. Is there a need to account for the area size difference?\nif let's say I have census data\ndating back to 4 - 5 census periods,\nhow far ca...",[forecasting]


In [255]:
df_questions['Tags'][3]

array(['hypothesis-testing', 't-test', 'p-value', 'interpretation'],
      dtype=object)

In [333]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import MultiLabelBinarizer

multilabel_binarizer = MultiLabelBinarizer()
multilabel_binarizer.fit(df_questions.Tags)
labels = multilabel_binarizer.classes_

maxlen = 180
max_words = 5000
tokenizer = Tokenizer(num_words=max_words, lower=True)
tokenizer.fit_on_texts(df_questions.Text)

def get_features(text_series):
    """
    transforms text data to feature_vectors that can be used in the ml model.
    tokenizer must be available.
    """
    sequences = tokenizer.texts_to_sequences(text_series)
    return pad_sequences(sequences, maxlen=maxlen)


def prediction_to_label(prediction):
    tag_prob = [(labels[i], prob) for i, prob in enumerate(prediction.tolist())]
    return dict(sorted(tag_prob, key=lambda kv: kv[1], reverse=True))

In [276]:
len(df_questions.Tags)
#df_questions.Tags.ravel()
df_questions.Tags.values.flatten()
len(np.unique(np.concatenate(list(df_questions.Tags))))

100

In [261]:
labels

array(['algorithms', 'anova', 'arima', 'autocorrelation', 'bayesian',
       'binary-data', 'binomial', 'bootstrap', 'cart', 'categorical-data',
       'chi-squared', 'classification', 'clustering',
       'conditional-probability', 'confidence-interval', 'correlation',
       'covariance', 'cox-model', 'cross-validation', 'data-mining',
       'data-transformation', 'data-visualization', 'dataset',
       'deep-learning', 'distributions', 'econometrics', 'estimation',
       'expected-value', 'experiment-design', 'factor-analysis',
       'feature-selection', 'forecasting', 'generalized-linear-model',
       'goodness-of-fit', 'hypothesis-testing', 'inference',
       'interaction', 'interpretation', 'least-squares', 'linear-model',
       'logistic', 'machine-learning', 'mathematical-statistics',
       'matlab', 'maximum-likelihood', 'mcmc', 'mean', 'missing-data',
       'mixed-model', 'model', 'model-selection', 'modeling',
       'monte-carlo', 'multilevel-analysis', 'multiple-co

In [277]:
from sklearn.model_selection import train_test_split

x = get_features(df_questions.Text)
y = multilabel_binarizer.transform(df_questions.Tags)
print(x.shape)

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=9000)

(85085, 180)


In [278]:
most_common_tags['class_weight'] = len(df_tags) / most_common_tags['count']
class_weight = {}
for index, label in enumerate(labels):
    class_weight[index] = most_common_tags[most_common_tags['Tag'] == label]['class_weight'].values[0]
    
most_common_tags.head()

Unnamed: 0,Tag,count,class_weight
986,r,13236,11.552811
1020,regression,10959,13.953189
669,machine-learning,6089,25.112991
1220,time-series,5559,27.507285
946,probability,4217,36.261086


In [279]:
from keras.models import Sequential
from keras.layers import Dense, Activation, Embedding, Flatten, GlobalMaxPool1D, Dropout, Conv1D
from keras.callbacks import ReduceLROnPlateau, EarlyStopping, ModelCheckpoint
from keras.losses import binary_crossentropy
from keras.optimizers import Adam

filter_length = 300

model = Sequential()
model.add(Embedding(max_words, 20, input_length=maxlen))
model.add(Dropout(0.1))
model.add(Conv1D(filter_length, 3, padding='valid', activation='relu', strides=1))
model.add(GlobalMaxPool1D())
model.add(Dense(num_classes))
model.add(Activation('sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['categorical_accuracy'])
model.summary()

callbacks = [
    ReduceLROnPlateau(), 
    EarlyStopping(patience=4), 
    ModelCheckpoint(filepath='model-conv1d.h5', save_best_only=True)
]

history = model.fit(x_train, y_train,
                    class_weight=class_weight,
                    epochs=20,
                    batch_size=32,
                    validation_split=0.1,
                    callbacks=callbacks)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 180, 20)           100000    
_________________________________________________________________
dropout_5 (Dropout)          (None, 180, 20)           0         
_________________________________________________________________
conv1d_5 (Conv1D)            (None, 178, 300)          18300     
_________________________________________________________________
global_max_pooling1d_5 (Glob (None, 300)               0         
_________________________________________________________________
dense_5 (Dense)              (None, 100)               30100     
_________________________________________________________________
activation_5 (Activation)    (None, 100)               0         
Total params: 148,400
Trainable params: 148,400
Non-trainable params: 0
_________________________________________________________________
Inst

KeyboardInterrupt: 