In [157]:
import numpy as np
import pandas as pd
import pickle
import numpy as np

In [None]:
# Code adapted from https://blog.mimacom.com/text-classification/

In [158]:
# Input: the cleaned question dataframe. Exactly one dataset is active at a
# time; uncomment a different line (and comment the active one) to switch.
# NOTE(review): unpickling executes code embedded in the file -- only load
# .dat files that come from a trusted source.
#df_questions = pd.read_pickle("./cleaned_trim/biology_trim_clean.dat")
#df_questions = pd.read_pickle("./cleaned_trim/cooking_trim_clean.dat")
#df_questions = pd.read_pickle("./cleaned_trim/diy_trim_clean.dat")
df_questions = pd.read_pickle("./cleaned_trim/robotics_trim_clean.dat")

In [159]:
# Turn each space-separated tag string into a list of individual tags.
# Not idempotent: after this cell runs, the column holds lists, so running it
# a second time fails because lists have no .split().
split_list = [tag_string.split(" ") for tag_string in df_questions['tags']]
df_questions['tags'] = split_list

### reduce tags

In [160]:
# Flatten the per-question tag lists into one long list (one entry per tag
# occurrence) and wrap it in a single-column frame named 'Tag'.
split = [tag for question_tags in split_list for tag in question_tags]
df_tags = pd.DataFrame(split)
df_tags.columns = ['Tag']

In [161]:
# Occurrences per tag. `groupby(sort=...)` is a boolean flag that controls
# whether group keys are ordered; it is not a column to sort by, so the
# default is used here. Ranking by frequency is done with nlargest elsewhere.
grouped_tags = df_tags.groupby("Tag").size().reset_index(name='count')
# Summary of the distinct tag names (count == number of unique tags).
grouped_tags.Tag.describe()

count         212
unique        212
top       labview
freq            1
Name: Tag, dtype: object

In [162]:
# Upper bound on how many distinct tags are kept as prediction targets.
num_classes = 100
# Frequency table: one row per tag with its occurrence count.
grouped_tags = df_tags.groupby("Tag").size().reset_index(name='count')
# The `num_classes` most frequent tags (fewer if the dataset has fewer tags).
most_common_tags = grouped_tags.nlargest(num_classes, columns="count")
# Keep only occurrences of those tags. A single vectorised isin() replaces a
# per-row Python membership test followed by dropna() on a None sentinel.
df_tags = df_tags[df_tags["Tag"].isin(most_common_tags["Tag"])]

In [163]:
# Keep only the tags that survived the most-common-tags filter.

# Distinct tag names remaining in df_tags (order is arbitrary: built from a set).
unique = list(set(df_tags["Tag"]))
# Peek at the tag list of the question labelled 5 as a quick sanity check.
tmp = df_questions["tags"][5]
print(tmp)

def tags_in_100(i, unique):
    """Return the tags from ``i`` that also occur in ``unique``.

    Parameters
    ----------
    i : iterable of hashable
        Tags of a single question, in their original order.
    unique : iterable of hashable
        Allowed tags (e.g. the most common tags). May be a list or a set.

    Returns
    -------
    list
        Elements of ``i`` that are members of ``unique``; order and
        duplicates of ``i`` are preserved.
    """
    # Hash-based membership avoids rescanning the whole allowed list for
    # every tag; reuse the container when it is already a set.
    allowed = unique if isinstance(unique, (set, frozenset)) else set(unique)
    return [tag for tag in i if tag in allowed]

['localization', 'mobile-robot']


In [164]:
# Per-question tag lists restricted to the allowed tags in `unique`.
only_top_100 = [tags_in_100(question_tags, unique) for question_tags in df_questions.tags]
# Side-by-side view of the original and filtered tags. The lists are passed
# as plain Python lists: wrapping ragged lists in np.array() raises on recent
# NumPy versions (and would build a 2-D array if all lists had equal length).
pd.DataFrame({"original": list(df_questions.tags), "top100": only_top_100}).head(10)

Unnamed: 0,original,top100
0,"[soccer, control]",[control]
1,"[control, rcservo]","[control, rcservo]"
2,"[microcontroller, arduino, raspberry-pi]","[microcontroller, arduino, raspberry-pi]"
3,"[odometry, localization, kalman-filter]","[odometry, localization, kalman-filter]"
4,[quadcopter],[quadcopter]
5,"[localization, mobile-robot]","[localization, mobile-robot]"
6,"[control, gyroscope, balance, two-wheeled]","[control, gyroscope, balance, two-wheeled]"
7,"[localization, gps]","[localization, gps]"
8,"[slam, localization, gps, mapping, acoustic-ra...","[slam, localization, gps, mapping]"
9,"[sensors, failure, motor]","[sensors, motor]"


In [165]:
# Overwrite the question tags with the filtered lists (only allowed tags remain).
# This mutates df_questions in place, so the unfiltered tags are no longer
# available from this frame afterwards.
df_questions['tags'] = only_top_100
df_questions.head()

Unnamed: 0,text,tags
0,right approach write spin controller soccer ro...,[control]
1,modify low cost hobby servo run get hobby serv...,"[control, rcservo]"
2,good robotics project look starting point proj...,"[microcontroller, arduino, raspberry-pi]"
3,good method tune process noise kalman filter o...,"[odometry, localization, kalman-filter]"
4,choose right combination quadcopter many site ...,[quadcopter]


### tokenize text & train/test split

In [166]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import MultiLabelBinarizer

# Label side: learn the tag vocabulary from every question's tag list.
# `labels` holds the class names in the column order used by transform().
multilabel_binarizer = MultiLabelBinarizer().fit(df_questions.tags)
labels = multilabel_binarizer.classes_

# Text side: sequences are padded/truncated to `maxlen` tokens and the
# tokenizer keeps at most `max_words` word indices.
maxlen = 180
max_words = 5000
tokenizer = Tokenizer(num_words=max_words, lower=True)
# The tokenizer vocabulary is fitted on the text of all questions.
tokenizer.fit_on_texts(df_questions.text)

def get_features(text_series):
    """
    Convert raw texts into fixed-length integer sequences for the model.

    Relies on the module-level ``tokenizer`` (already fitted) and ``maxlen``.
    Each text is mapped to its token-index sequence and then padded or
    truncated to ``maxlen`` entries by ``pad_sequences``.
    """
    return pad_sequences(tokenizer.texts_to_sequences(text_series), maxlen=maxlen)


def prediction_to_label(prediction, label_names=None):
    """Map a vector of per-class scores to ``{label: score}``, best first.

    Parameters
    ----------
    prediction : array-like
        One score per class, in the same order as the label names.
    label_names : sequence, optional
        Class names indexed like ``prediction``. Defaults to the
        module-level ``labels`` so existing single-argument calls still work.

    Returns
    -------
    dict
        Labels mapped to their scores, inserted in descending score order
        (ties keep their original class order because the sort is stable).
    """
    if label_names is None:
        label_names = labels
    # np.asarray lets plain lists through as well as numpy arrays.
    scores = np.asarray(prediction).tolist()
    tag_prob = [(label_names[i], prob) for i, prob in enumerate(scores)]
    return dict(sorted(tag_prob, key=lambda kv: kv[1], reverse=True))

In [201]:
from sklearn.model_selection import train_test_split

# Feature matrix (padded token sequences) and multi-hot label matrix.
# A later cell repeats this split and additionally tracks row indices.
x = get_features(df_questions["text"])
y = multilabel_binarizer.transform(df_questions["tags"])
print(x.shape)

# 70/30 split with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=9000
)

(1732, 180)


In [207]:
from sklearn.model_selection import train_test_split

# Feature matrix (padded token sequences) and multi-hot label matrix.
x = get_features(df_questions["text"])
y = multilabel_binarizer.transform(df_questions["tags"])
print(x.shape)

# Row positions are split alongside x and y so that the held-out rows can be
# looked up in other per-question frames afterwards.
indices = range(len(x))
(x_train, x_test,
 y_train, y_test,
 indices_train, indices_test) = train_test_split(
    x, y, indices, test_size=0.3, random_state=9000
)

(1732, 180)


### imbalanced classes

In [199]:
# Inverse-frequency weight per tag: total kept tag occurrences divided by the
# tag's own count, so rarer tags receive larger weights.
most_common_tags['class_weight'] = len(df_tags) / most_common_tags['count']

# Build the {class index -> weight} mapping with one dictionary lookup per
# label instead of re-filtering the whole frame for every label. A label
# missing from most_common_tags raises a KeyError that names the label.
weight_by_tag = dict(zip(most_common_tags['Tag'], most_common_tags['class_weight']))
class_weight = {index: weight_by_tag[label] for index, label in enumerate(labels)}

most_common_tags.head()

Unnamed: 0,Tag,count,class_weight
146,quadcopter,306,13.852941
114,mobile-robot,295,14.369492
7,arduino,282,15.031915
34,control,255,16.623529
117,motor,239,17.736402


### build 1-d convolutional neural network

In [200]:
from keras.models import Sequential
from keras.layers import Dense, Activation, Embedding, Flatten, GlobalMaxPool1D, Dropout, Conv1D
from keras.callbacks import ReduceLROnPlateau, EarlyStopping, ModelCheckpoint
from keras.losses import binary_crossentropy
from keras.optimizers import Adam

# Number of convolution filters (despite the name this is not a kernel
# length; the kernel size is the literal 3 passed to Conv1D below).
filter_length = 300

model = Sequential()
# Token indices -> 20-dimensional embeddings over sequences of `maxlen` tokens.
model.add(Embedding(max_words, 20, input_length=maxlen))
model.add(Dropout(0.1))
model.add(Conv1D(filter_length, 3, padding='valid', activation='relu', strides=1))
model.add(GlobalMaxPool1D())
# One output unit per binarized tag. Sized from `labels` rather than the
# requested `num_classes` so the output width always matches the columns of
# the multi-hot target matrix, even when a dataset has fewer distinct tags.
model.add(Dense(len(labels)))
# Sigmoid (not softmax): each tag is an independent yes/no decision.
model.add(Activation('sigmoid'))

# NOTE(review): 'categorical_accuracy' is designed for single-label targets;
# confirm it is the intended metric for this multi-label setup.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['categorical_accuracy'])
model.summary()

callbacks = [
    ReduceLROnPlateau(), 
    EarlyStopping(patience=4), 
    # Keeps only the best checkpoint on disk; it is reloaded for evaluation.
    ModelCheckpoint(filepath='model-conv1d.h5', save_best_only=True)
]

history = model.fit(x_train, y_train,
                    class_weight=class_weight,
                    epochs=20,
                    batch_size=32,
                    validation_split=0.1,
                    callbacks=callbacks)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_9 (Embedding)      (None, 180, 20)           100000    
_________________________________________________________________
dropout_9 (Dropout)          (None, 180, 20)           0         
_________________________________________________________________
conv1d_9 (Conv1D)            (None, 178, 300)          18300     
_________________________________________________________________
global_max_pooling1d_9 (Glob (None, 300)               0         
_________________________________________________________________
dense_9 (Dense)              (None, 100)               30100     
_________________________________________________________________
activation_9 (Activation)    (None, 100)               0         
Total params: 148,400
Trainable params: 148,400
Non-trainable params: 0
_________________________________________________________________
Trai

In [170]:
import keras.models 
# Reload the best checkpoint written during training and score it on the
# held-out split.
cnn_model = keras.models.load_model('model-conv1d.h5')
metrics = cnn_model.evaluate(x_test, y_test)
# Take the metric names from the model that was actually evaluated, so this
# cell does not depend on the in-memory training `model` still existing.
for metric_name, metric_value in zip(cnn_model.metrics_names, metrics):
    print("{}: {}".format(metric_name, metric_value))

loss: 0.09486494660377502
categorical_accuracy: 0.17115384615384616


In [171]:
def top_keys(q, threshold=0.1, max_keys=5):
    """Return the predicted tags for one question text.

    Parameters
    ----------
    q : str
        Question text; it is featurised with ``get_features`` and scored by
        the module-level ``cnn_model``.
    threshold : float, optional
        Minimum predicted probability for a tag to be selected (default 0.1).
    max_keys : int, optional
        Maximum number of tags returned (default 5).

    Returns
    -------
    list
        Tags whose probability is at least ``threshold``, most probable
        first, capped at ``max_keys``. If no tag reaches the threshold, the
        single most probable tag is returned instead.
    """
    features = get_features([q])
    # prediction_to_label returns tags ordered by descending probability, so
    # plain slicing yields the top-k without building a DataFrame.
    ranked = prediction_to_label(cnn_model.predict(features)[0])
    selected = [tag for tag, prob in ranked.items() if prob >= threshold]

    if not selected:
        # Nothing cleared the threshold: fall back to the best single guess.
        return list(ranked)[:1]

    return selected[:max_keys]

In [172]:
# Predict tags for every question text and store them next to the true tags.
# top_keys is called once per row.
cnn_keys = list(map(top_keys, df_questions['text']))
df_questions["cnn_keys"] = cnn_keys

In [180]:
# Show the first rows with true tags and CNN-predicted tags side by side.
df_questions.head(15)

Unnamed: 0,text,tags,cnn_keys
0,right approach write spin controller soccer ro...,[control],"[quadcopter, pid, motor, control]"
1,modify low cost hobby servo run get hobby serv...,"[control, rcservo]","[mobile-robot, robotic-arm, sensors, control, ..."
2,good robotics project look starting point proj...,"[microcontroller, arduino, raspberry-pi]","[mobile-robot, robotic-arm, sensors, ros, cont..."
3,good method tune process noise kalman filter o...,"[odometry, localization, kalman-filter]","[mobile-robot, robotic-arm, sensors, ros, cont..."
4,choose right combination quadcopter many site ...,[quadcopter],"[quadcopter, pid, control, motor]"
5,mobile robot localization known map want local...,"[localization, mobile-robot]","[mobile-robot, robotic-arm, sensors, localizat..."
6,algorithm use balance two wheel robot use gyro...,"[control, gyroscope, balance, two-wheeled]","[mobile-robot, robotic-arm, sensors, control, ..."
7,accurate way obtain locational fix use gps obv...,"[localization, gps]","[mobile-robot, robotic-arm, sensors, ros, cont..."
8,algorithm use construct map explored area use ...,"[slam, localization, gps, mapping]","[sensors, mobile-robot, robotic-arm, localizat..."
9,detect dc motor robot start fail characteristi...,"[sensors, motor]","[motor, robotic-arm, control]"


In [175]:
# Collect the distinct tags that occur in the (filtered) question tags.
full_tags = list(df_questions['tags'])

# Flatten to one entry per tag occurrence, then de-duplicate. The resulting
# order is arbitrary because it comes from a set.
split = [tag for question_tags in full_tags for tag in question_tags]
unique = list(set(split))

In [176]:
def dfs(data, unique, col):
    """One-hot (multi-hot) encode a column of tag lists.

    Parameters
    ----------
    data : pandas.Series or pandas.DataFrame
        Holds the tag lists. A Series is wrapped into a one-column frame, so
        its name must equal ``col``.
    unique : sequence of hashable
        Tag vocabulary; becomes the columns of the result, in this order.
    col : str
        Name of the column in ``data`` that contains the tag lists.

    Returns
    -------
    pandas.DataFrame
        Integer frame with the same index as ``data`` and one column per
        entry of ``unique``; a cell is 1 when that row's list contains the
        tag and 0 otherwise.

    Raises
    ------
    KeyError
        If a row contains a tag that is not in ``unique``.
    """
    frame = pd.DataFrame(data)
    tag_lists = frame[col]
    columns = list(unique)
    column_of = {tag: position for position, tag in enumerate(columns)}

    # Fill a plain integer matrix positionally. This avoids chained
    # `.iloc[i].loc[...] = 1` assignment (which only works when pandas hands
    # back a view) and label-based row lookups that break on a non-default index.
    encoded = np.zeros((len(frame), len(columns)), dtype=int)
    for row, tags in enumerate(tag_lists):
        for tag in tags:
            encoded[row, column_of[tag]] = 1

    return pd.DataFrame(encoded, index=frame.index, columns=columns)

In [177]:
# Multi-hot encoding of the true tags, one column per tag in `unique`.
ohe_original = dfs(df_questions["tags"], unique, col="tags")

In [178]:
# Multi-hot encoding of the CNN-predicted tags, using the same `unique` columns.
ohe_cnn = dfs(df_questions["cnn_keys"], unique, col="cnn_keys")

In [179]:
from sklearn.metrics import f1_score

# Weighted F1 over every question in df_questions (both encodings were built
# from the full frame, so this is not restricted to the held-out rows).
f1_score(y_true=ohe_original, y_pred=ohe_cnn, average="weighted")

  'precision', 'predicted', average, warn_for)


0.19822162500004403

In [210]:
# Weighted F1 restricted to the rows selected by `indices_test`.
tmp1 = ohe_original.iloc[indices_test]
tmp2 = ohe_cnn.iloc[indices_test]
f1_score(y_true=tmp1, y_pred=tmp2, average="weighted")

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


0.18144504922896626