In [44]:
##Structured Prediction using conditional random field

Import necessary libraries. CRF is implemented in sklearn_crfsuite

In [50]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_val_score
import scipy
import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV
import pickle

A function to convert the text data into a format suitable for structured prediction using conditional random field. First, text data is converted into feature vector using 
tfidf vectorizer. Then feature vectors corresponding to messages within same threads are grouped together.

In [51]:
def create_features_for_structured_prediction(df, text_data_column_name, group_by_column_name, label_available=True, tfidf_transformer=None):

    list_feature_dict = []
    for text in df[text_data_column_name]:
        feature_dict = tfidf_transformer.transform([text]).todok()
        feature_dict_n = {}
        for key,value in feature_dict.items():
            feature_dict_n[str(key)] = value
        list_feature_dict.append(feature_dict_n)
    df['tfidf_features'] = list_feature_dict
    df_group_by_thread = df.groupby(group_by_column_name)
    X_all = [] # Each element is a list of features corresponding to messages in a thread
    if label_available : y_all = [] # Each element is a list of labels assigned to messages in a thread

    for name, group in df_group_by_thread:
    #print(name)
        X_cur = []
        if label_available : y_cur = []
        for ind, row in group.iterrows():
            X_cur.append(row['tfidf_features'])
            if label_available : y_cur.append(row['majority_type'])   
        X_all.append(X_cur)
        if label_available : y_all.append(y_cur)
    if label_available : return X_all, y_all
    return X_all

Load the reddit thread discussion dataset with discourse act labels. The dataset is used for training a model and hyperparamter selection.

In [52]:
reddit_df = pd.read_json('coarse_discourse_reddit_filtered.json')
#reddit_df.sort_values(by=['title'], inplace=True)

In [53]:
labels = list(reddit_df.majority_type.unique())

converting text data to feature vectors. Same transformer is used in prediction phase.

In [54]:
tfidf_vectorizer = TfidfVectorizer(stop_words='english', token_pattern=r"(?u)\b\w\w+\b|!|\?", max_features=1000) #token pattern because we want to preserve ? and !
tfidf_transformer = tfidf_vectorizer.fit(reddit_df['body'])

In [None]:
Load the github dataset to make prediction on

In [55]:
git_df = pd.read_csv("repository_messages_all.csv",error_bad_lines=False,delimiter=',')
git_df.sort_values(by=['thread_id'], inplace=True)

In [56]:
X_all, y_all = create_features_for_structured_prediction(reddit_df,'body','title', True, tfidf_transformer)

Split the dataset into training set and validation set

In [57]:
X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size=0.2)

Train the crf model with lbfgs algorithm. The hyperparameter search is perfomed to select the best model using 3 fold cross validation 

In [58]:

crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    max_iterations=100,
    all_possible_transitions=True
)
params_space = {
    'c1': scipy.stats.expon(scale=0.5),
    'c2': scipy.stats.expon(scale=0.05),
}



# use the same metric for evaluation
f1_scorer = make_scorer(metrics.flat_f1_score,
                        average='weighted', labels=labels)

# search
crf_models = RandomizedSearchCV(crf, params_space,
                        cv=3,
                        verbose=1,
                        n_jobs=-1,
                        n_iter=50,
                        scoring=f1_scorer)
crf_models.fit(X_train, y_train)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 3 folds for each of 50 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  8.5min
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed: 34.9min finished


RandomizedSearchCV(cv=3,
                   estimator=CRF(algorithm='lbfgs',
                                 all_possible_transitions=True,
                                 keep_tempfiles=None, max_iterations=100),
                   n_iter=50, n_jobs=-1,
                   param_distributions={'c1': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000000029080E20>,
                                        'c2': <scipy.stats._distn_infrastructure.rv_frozen object at 0x00000000272F2190>},
                   scoring=make_scorer(flat_f1_score, average=weighted, labels=['announcement', 'elaboration', 'appreciation', 'question', 'answer', 'agreement', 'negativereaction', 'disagreement', 'humor', 'other']),
                   verbose=1)

In [59]:
print('best params:', crf_models.best_params_)
print('best CV score:', crf_models.best_score_)
crf = crf_models.best_estimator_

best params: {'c1': 0.2880636867671778, 'c2': 0.010811969362834552}
best CV score: 0.5318877377286336


In [60]:
y_pred = crf.predict(X_test)
print(metrics.flat_classification_report(
    y_test, y_pred, labels=labels, digits=3
))



                  precision    recall  f1-score   support

    announcement      0.571     0.384     0.460       385
     elaboration      0.366     0.148     0.211      3663
    appreciation      0.743     0.599     0.663      1679
        question      0.776     0.783     0.779      3520
          answer      0.551     0.881     0.678      7870
       agreement      0.592     0.147     0.236      1005
negativereaction      0.263     0.030     0.054       332
    disagreement      0.161     0.023     0.040       651
           humor      0.141     0.052     0.076       424
           other      0.160     0.045     0.070       334

        accuracy                          0.584     19863
       macro avg      0.432     0.309     0.327     19863
    weighted avg      0.542     0.584     0.528     19863



prepare the feature set for github data

In [61]:
tfidf_transformer = tfidf_vectorizer.fit(reddit_df['body'])
X_git = create_features_for_structured_prediction(git_df,'msg_text','thread_id', False, tfidf_transformer)

using the best crf model selected in model selection phase, make prediction on data. Since the output is list of lists, we need to flatten the final prediction vector

In [62]:
y_pred_git = crf.predict(X_git)
y_pred_git_flat = [label for group in y_pred_git for label in group]
git_df = git_df.drop(columns=['tfidf_features'])
git_df['discourse_act'] = y_pred_git_flat

In [63]:
git_df

Unnamed: 0,repo_group_id,repo_id,repo_git,repo_name,thread_id,msg_text,thread_title,msg_id,discourse_act
403297,25179,27239,https://github.com/pivotal-cf/cf-redis-release,cf-redis-release,209024,We've recently added spiff templates for Vsphe...,Template examples for spiff,1691707,elaboration
94328,25179,27239,https://github.com/pivotal-cf/cf-redis-release,cf-redis-release,209025,Merged into develop. Will appear in master onc...,Bosh lite manifest update,1691709,question
74559,25179,27239,https://github.com/pivotal-cf/cf-redis-release,cf-redis-release,209025,We have created an issue in Pivotal Tracker to...,Bosh lite manifest update,1691708,answer
17489,25179,27239,https://github.com/pivotal-cf/cf-redis-release,cf-redis-release,209026,We have created an issue in Pivotal Tracker to...,Add IAM policy for backup user,1691710,question
59301,25179,27239,https://github.com/pivotal-cf/cf-redis-release,cf-redis-release,209026,We're going to find a less open policy to reco...,Add IAM policy for backup user,1691711,answer
...,...,...,...,...,...,...,...,...,...
139873,25160,25942,https://github.com/spring-guides/gs-spring-boo...,gs-spring-boot-docker,559367,"You could use the ""shell"" form of the `ENTRYPO...",Update this guide with Boot 2.3 features,2119054,elaboration
166534,25160,25942,https://github.com/spring-guides/gs-spring-boo...,gs-spring-boot-docker,559367,"Thanks anyway, in fact, I wanna know why don't...",Update this guide with Boot 2.3 features,2119055,question
382475,25160,25942,https://github.com/spring-guides/gs-spring-boo...,gs-spring-boot-docker,559367,"Yes, I know you can use the ""shell"" form like...",Update this guide with Boot 2.3 features,2119058,answer
62129,25160,25942,https://github.com/spring-guides/gs-spring-boo...,gs-spring-boot-docker,559367,For reasons explained in the docker documentat...,Update this guide with Boot 2.3 features,2119056,answer
