In [3]:
import markdown2
from markdown2 import Markdown
from bs4 import BeautifulSoup, NavigableString, Tag
import nltk
from nltk import word_tokenize          
from nltk.stem.porter import PorterStemmer
import re
import html2text
import pandas as pd

## New + Original data

In [4]:
df_all = pd.read_csv('all_data.csv')

In [5]:
df_all.head()

Unnamed: 0.1,Unnamed: 0,file_id,section_id,url,heading_text,content_text_w_o_tags,abstracted_heading_plus_content,section_code
0,0,3,20,https://github.com/sethm/symon,1.0 About,Symon is a general purpose simulator for syste...,@abstr_number . @abstr_number About Symon is ...,1146
1,1,3,21,https://github.com/sethm/symon,2.0 Requirements,Java @abstr_number . @abstr_number or higher ...,@abstr_number . @abstr_number Requirements J...,3
2,2,3,22,https://github.com/sethm/symon,3.0 Features,Symon can simulate multiple @abstr_number base...,@abstr_number . @abstr_number Features Symon ...,1
3,3,3,23,https://github.com/sethm/symon,3.1 Memory Maps,,@abstr_number . @abstr_number Memory Maps,1
4,4,3,24,https://github.com/sethm/symon,3.1.1 Symon Memory Map,$ @abstr_number--$ @abstr_number FFF: @abstr_...,@abstr_number . @abstr_number . @abstr_number ...,1


### Derive statistical features

In [190]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(ngram_range=(1,1), analyzer='word', stop_words='english')
tfidfX = tfidf.fit_transform(df_all["abstracted_heading_plus_content"])
        


In [191]:
tfidfX.shape

(5340, 18705)

In [192]:
 features_tfidf = pandas.DataFrame(tfidfX.todense())
        # Assign column names to make it easier to print most useful features later
features_tfidf.columns = tfidf.get_feature_names()


### Derive heuristic features

In [193]:
'''
Module for heuristic functions.
Naming convention:
heur_<x>_<y>_<number>
<x> = c if meant for use in content text, h if meant for use with header
<y> = k if uses keyword matching, s if uses text statistics (e.g. word count), c if combination
'''
import nltk
import re
import pandas
from pandas import DataFrame

def heur_c_k_001(input_text):
    if ('report bugs' in input_text.lower()) or ('reporting bugs' in input_text.lower()):
        return 1
    else:
        return 0

def heur_c_k_002(input_text):
    if ' is a ' in input_text.lower():
        return 1
    else:
        return 0

def heur_c_k_003(input_text):
    if '@abstr_code_section' in input_text.lower():
        return 1
    else:
        return 0 
    
def heur_c_k_004(input_text):
    if 'attempts to' in input_text.lower():
        return 1
    else:
        return 0 

def heur_c_k_005(input_text):
    if 'inspired by' in input_text.lower():
        return 1
    else:
        return 0 

def heur_c_k_006(input_text):
    if 'install ' in input_text.lower():
        return 1
    else:
        return 0 

def heur_c_k_007(input_text):
    if 'reasons' in input_text.lower():
        return 1
    else:
        return 0

# Do not use lower() because we want to capture "Added" at beginning of sentence
def heur_c_k_008(input_text):
    if 'Added ' in input_text:
        return 1
    else:
        return 0

def heur_c_k_009(input_text):
    if 'copyright' in input_text.lower():
        return 1
    else:
        return 0

def heur_c_k_010(input_text):
    if '@abstr_mailto' in input_text.lower():
        return 1
    else:
        return 0

def heur_c_k_011(input_text):
    if 'you can ' in input_text.lower():
        return 1
    else:
        return 0

# Check if the text comprises solely of @abstr_code_section
def heur_c_k_012(input_text):
    if '@abstr_code_section' == input_text.lower().strip():
        return 1
    else:
        return 0

def heur_c_k_013(input_text):
    if 'About' in input_text:
        return 1
    else:
        return 0
    
def heur_c_k_014(input_text):
    if 'be sure to' in input_text.lower():
        return 1
    else:
        return 0    
    
def heur_c_k_015(input_text):
    if 'Download' in input_text:
        return 1
    else:
        return 0    
    
def heur_c_k_016(input_text):
    if 'overview' in input_text.lower():
        return 1
    else:
        return 0    
    
def heur_c_k_017(input_text):
    if 'get started' in input_text.lower():
        return 1
    else:
        return 0    
    
def heur_c_k_018(input_text):
    if 'reasons' in input_text.lower():
        return 1
    else:
        return 0   
    
def heur_c_k_019(input_text):
    if 'dependenc' in input_text.lower():
        return 1
    else:
        return 0   
    
def heur_c_k_020(input_text):
    if 'rerun' in input_text.lower():
        return 1
    else:
        return 0   
    
def heur_c_k_021(input_text):
    if 'you''ll be able' in input_text.lower():
        return 1
    else:
        return 0   
    
def heur_c_k_022(input_text):
    if 'you must' in input_text.lower():
        return 1
    else:
        return 0   
    
def heur_c_k_023(input_text):
    if 'previous version' in input_text.lower():
        return 1
    else:
        return 0   

def heur_h_k_001(input_text):
    if 'configur' in input_text.lower():
        return 1
    else:
        return 0 
     
def heur_h_k_002(input_text):
    if 'what' in input_text.lower():
        return 1
    else:
        return 0 
     
def heur_h_k_003(input_text):
    if 'why' in input_text.lower():
        return 1
    else:
        return 0   

def heur_h_k_004(input_text):
    if 'approach' in input_text.lower():
        return 1
    else:
        return 0   

def heur_h_k_005(input_text):
    if 'bugs' in input_text.lower():
        return 1
    else:
        return 0  
    
def heur_h_k_006(input_text):
    if 'contrib' in input_text.lower():
        return 1
    else:
        return 0         
    
def heur_h_k_007(input_text):
    if 'credit' in input_text.lower():
        return 1
    else:
        return 0            
    
def heur_h_k_008(input_text):
    if 'feature' in input_text.lower():
        return 1
    else:
        return 0              
    
def heur_h_k_009(input_text):
    if 'install' in input_text.lower():
        return 1
    else:
        return 0               
    
def heur_h_k_010(input_text):
    if 'intro' in input_text.lower():
        return 1
    else:
        return 0                 
    
def heur_h_k_011(input_text):
    if 'licen' in input_text.lower():
        return 1
    else:
        return 0             
    
def heur_h_k_012(input_text):
    if 'objective' in input_text.lower():
        return 1
    else:
        return 0      
    
def heur_h_k_013(input_text):
    if 'request' in input_text.lower():
        return 1
    else:
        return 0     
    
def heur_h_k_014(input_text):
    if 'requirement' in input_text.lower():
        return 1
    else:
        return 0     
    
def heur_h_k_015(input_text):
    if 'resource' in input_text.lower():
        return 1
    else:
        return 0     
    
def heur_h_k_016(input_text):
    if 'setting' in input_text.lower():
        return 1
    else:
        return 0     
    
def heur_h_k_017(input_text):
    if 'setup' in input_text.lower():
        return 1
    else:
        return 0     
    
def heur_h_k_018(input_text):
    if 'started' in input_text.lower():
        return 1
    else:
        return 0     
    
def heur_h_k_019(input_text):
    if 'usage' in input_text.lower():
        return 1
    else:
        return 0     
    
def heur_h_k_020(input_text):
    if 'version' in input_text.lower():
        return 1
    else:
        return 0    
    
def heur_h_k_021(input_text):
    if 'welcome' in input_text.lower():
        return 1
    else:
        return 0    
    
def heur_h_k_022(input_text):
    if 'what is' in input_text.lower():
        return 1
    else:
        return 0     
    
def heur_h_k_023(input_text):
    if 'overview' in input_text.lower():
        return 1
    else:
        return 0     
    
def heur_h_k_024(input_text):
    if 'basic' in input_text.lower():
        return 1
    else:
        return 0      
    
def heur_h_k_025(input_text):
    if 'roadmap' in input_text.lower():
        return 1
    else:
        return 0      
    
def heur_h_k_026(input_text):
    if 'todo' in input_text.lower():
        return 1
    else:
        return 0        
    
def heur_h_k_027(input_text):
    if 'example' in input_text.lower():
        return 1
    else:
        return 0         
    
def heur_h_k_028(input_text):
    if 'about' in input_text.lower():
        return 1
    else:
        return 0     
    
def heur_h_k_029(input_text):
    if 'reference' in input_text.lower():
        return 1
    else:
        return 0   
    
# Return 1 if string is single non-English word
# WARNING: For speed, instantiate the word set outside and pass it as parameter,
# so that instantiation will be done once for each program run
# instead of once for each row this heuristic is applied to
def heur_h_c_001(input_text, words=None):
    lcase_input_text = input_text.lower()
    s = lcase_input_text.split(' ')
    if len(s) != 1: 
        return 0
    else:        
        if words is None:
            words = set(nltk.corpus.words.words())
        if s[0] in words: 
            return 0
        else:
            return 1
    return 0

# Returns 1 if a word in heading text is in repo name, or the other way around
def heur_h_c_002(heading_text, repo_url):
    heading_words = heading_text.lower().split(' ')
    repo_name = re.sub(r"http(s)*:\/\/www\.github\.com\/.+\/",'',repo_url)
    repo_name = repo_name.replace('.',' ').replace('-',' ')
    repo_name_words = repo_name.lower().split(' ')
    match = [val for val in heading_words if val in repo_name_words]
    if len(match) >0:
        return 1
    else:
        return 0
    
# See whether text comprises entirely of ASCII characters    
def heur_c_s_001(input_text):
    try:
        input_text.encode(encoding='utf-8').decode('ascii')
    except UnicodeDecodeError:
        return 0
    else:
        return 1   

# Generate DataFrame of derived features using DataFrames of some initial features    
def derive_features_using_heuristics(url_corpus, heading_text_corpus, content_corpus):
    derived_features = DataFrame()
    derived_features['heur_c_k_001'] = [heur_c_k_001(x) for x in content_corpus]
    derived_features['heur_c_k_002'] = [heur_c_k_002(x) for x in content_corpus]
    derived_features['heur_c_k_003'] = [heur_c_k_003(x) for x in content_corpus]
    derived_features['heur_c_k_004'] = [heur_c_k_004(x) for x in content_corpus]
    derived_features['heur_c_k_005'] = [heur_c_k_005(x) for x in content_corpus]
    derived_features['heur_c_k_006'] = [heur_c_k_006(x) for x in content_corpus]
    derived_features['heur_c_k_007'] = [heur_c_k_007(x) for x in content_corpus]
    derived_features['heur_h_k_001'] = [heur_h_k_001(x) for x in heading_text_corpus]
    derived_features['heur_h_k_002'] = [heur_h_k_002(x) for x in heading_text_corpus]
    derived_features['heur_h_k_003'] = [heur_h_k_003(x) for x in heading_text_corpus]
    derived_features['heur_h_k_004'] = [heur_h_k_004(x) for x in heading_text_corpus]
    derived_features['heur_h_k_005'] = [heur_h_k_005(x) for x in heading_text_corpus]
    derived_features['heur_h_k_006'] = [heur_h_k_006(x) for x in heading_text_corpus]
    derived_features['heur_h_k_007'] = [heur_h_k_007(x) for x in heading_text_corpus]
    derived_features['heur_h_k_008'] = [heur_h_k_008(x) for x in heading_text_corpus]
    derived_features['heur_h_k_009'] = [heur_h_k_009(x) for x in heading_text_corpus]
    derived_features['heur_h_k_010'] = [heur_h_k_010(x) for x in heading_text_corpus]
    derived_features['heur_h_k_011'] = [heur_h_k_011(x) for x in heading_text_corpus]
    derived_features['heur_h_k_012'] = [heur_h_k_012(x) for x in heading_text_corpus]
    derived_features['heur_h_k_013'] = [heur_h_k_013(x) for x in heading_text_corpus]
    derived_features['heur_h_k_014'] = [heur_h_k_014(x) for x in heading_text_corpus]
    derived_features['heur_h_k_015'] = [heur_h_k_015(x) for x in heading_text_corpus]
    derived_features['heur_h_k_016'] = [heur_h_k_016(x) for x in heading_text_corpus]
    derived_features['heur_h_k_017'] = [heur_h_k_017(x) for x in heading_text_corpus]
    derived_features['heur_h_k_018'] = [heur_h_k_018(x) for x in heading_text_corpus]
    derived_features['heur_h_k_019'] = [heur_h_k_019(x) for x in heading_text_corpus]
    derived_features['heur_h_k_020'] = [heur_h_k_020(x) for x in heading_text_corpus]
    derived_features['heur_h_k_021'] = [heur_h_k_021(x) for x in heading_text_corpus]
    derived_features['heur_h_k_022'] = [heur_h_k_022(x) for x in heading_text_corpus]
    derived_features['heur_c_s_001'] = [heur_c_s_001(x) for x in heading_text_corpus]
    
    # Batch 02
    derived_features['heur_c_k_008'] = [heur_c_k_008(x) for x in content_corpus]
    derived_features['heur_c_k_009'] = [heur_c_k_009(x) for x in content_corpus]
    derived_features['heur_c_k_010'] = [heur_c_k_010(x) for x in content_corpus]
    derived_features['heur_c_k_011'] = [heur_c_k_011(x) for x in content_corpus]
    derived_features['heur_c_k_012'] = [heur_c_k_012(x) for x in content_corpus]
    derived_features['heur_c_k_013'] = [heur_c_k_013(x) for x in content_corpus]
    derived_features['heur_h_k_023'] = [heur_h_k_023(x) for x in heading_text_corpus]
    derived_features['heur_h_k_024'] = [heur_h_k_024(x) for x in heading_text_corpus]
    derived_features['heur_h_k_025'] = [heur_h_k_025(x) for x in heading_text_corpus]
    derived_features['heur_h_k_026'] = [heur_h_k_026(x) for x in heading_text_corpus]
    derived_features['heur_h_k_027'] = [heur_h_k_027(x) for x in heading_text_corpus]
    words = set(nltk.corpus.words.words())
    derived_features['heur_h_c_001'] = [heur_h_c_001(x, words) for x in heading_text_corpus]
    derived_features['heur_h_c_002'] = [heur_h_c_002(x,y) for x,y in zip(heading_text_corpus, url_corpus)]

    # Batch 03
    derived_features['heur_c_k_014'] = [heur_c_k_014(x) for x in content_corpus]
    derived_features['heur_c_k_015'] = [heur_c_k_015(x) for x in content_corpus]
    derived_features['heur_c_k_016'] = [heur_c_k_016(x) for x in content_corpus]
    derived_features['heur_c_k_017'] = [heur_c_k_017(x) for x in content_corpus]
    derived_features['heur_c_k_018'] = [heur_c_k_018(x) for x in content_corpus]
    derived_features['heur_c_k_019'] = [heur_c_k_019(x) for x in content_corpus]
    derived_features['heur_c_k_020'] = [heur_c_k_020(x) for x in content_corpus]
    derived_features['heur_c_k_021'] = [heur_c_k_021(x) for x in content_corpus]
    derived_features['heur_c_k_022'] = [heur_c_k_022(x) for x in content_corpus]
    derived_features['heur_c_k_023'] = [heur_c_k_023(x) for x in content_corpus]
    derived_features['heur_h_k_028'] = [heur_h_k_028(x) for x in heading_text_corpus]
    derived_features['heur_h_k_029'] = [heur_h_k_029(x) for x in heading_text_corpus]
    
    return derived_features

In [194]:
df_all.columns

Index(['Unnamed: 0', 'file_id', 'section_id', 'url', 'heading_text',
       'content_text_w_o_tags', 'abstracted_heading_plus_content',
       'section_code'],
      dtype='object')

In [195]:
df_heuristics = derive_features_using_heuristics(df_all["url"],df_all["heading_text"] , df_all["content_text_w_o_tags"])

In [196]:
df_heuristics.head()

Unnamed: 0,heur_c_k_001,heur_c_k_002,heur_c_k_003,heur_c_k_004,heur_c_k_005,heur_c_k_006,heur_c_k_007,heur_h_k_001,heur_h_k_002,heur_h_k_003,...,heur_c_k_016,heur_c_k_017,heur_c_k_018,heur_c_k_019,heur_c_k_020,heur_c_k_021,heur_c_k_022,heur_c_k_023,heur_h_k_028,heur_h_k_029
0,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### combining the features

In [197]:
features_combined = pandas.concat([features_tfidf, df_heuristics], axis=1)

In [198]:
features_combined.head()

Unnamed: 0,10,2gb,32fbc617fdbf4085ec47f8d7847a7e1d,3d,__,___,____,_____,______,_______________________________,...,heur_c_k_016,heur_c_k_017,heur_c_k_018,heur_c_k_019,heur_c_k_020,heur_c_k_021,heur_c_k_022,heur_c_k_023,heur_h_k_028,heur_h_k_029
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,1,0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0


### label set

In [199]:
from sklearn.preprocessing import MultiLabelBinarizer
# Class '2' has been merged into class '1'
label_set = ['-','1','3','4','5','6','7','8']
labels = [str(x).split(',') for x in df_all['section_code']]
mlb = MultiLabelBinarizer(classes=label_set)
labels_matrix = mlb.fit_transform(labels)

  .format(sorted(unknown, key=str)))


In [200]:
# labels 

### split into train and test set

In [201]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(features_combined, \
                                                    labels_matrix, test_size = 0.25, random_state=42)

### model

In [38]:
import numpy as np
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import LabelBinarizer
from joblib import Parallel
from joblib import delayed
from sklearn.utils import resample
from sklearn.utils.validation import check_is_fitted
from sklearn.base import BaseEstimator, clone
import warnings


class _ConstantPredictor(BaseEstimator):

    def fit(self, X, y):
        self.y_ = y
        return self

    def predict(self, X):
        check_is_fitted(self, 'y_')

        return np.repeat(self.y_, X.shape[0])

    def decision_function(self, X):
        check_is_fitted(self, 'y_')

        return np.repeat(self.y_, X.shape[0])

    def predict_proba(self, X):
        check_is_fitted(self, 'y_')

        return np.repeat([np.hstack([1 - self.y_, self.y_])],
                         X.shape[0], axis=0)

def _fit_binary(estimator, X, y, classes=None):
    """Fit a single binary estimator."""
    unique_y = np.unique(y)
    if len(unique_y) == 1:
        if classes is not None:
            if y[0] == -1:
                c = 0
            else:
                c = y[0]
            warnings.warn("Label %s is present in all training examples." %
                          str(classes[c]))
        estimator = _ConstantPredictor().fit(X, unique_y)
    else:
        estimator = clone(estimator)
        estimator.fit(X, y)
    return estimator

class OneVsRestClassifierBalance(OneVsRestClassifier):
    
    def fit(self, X, y):
        self.label_binarizer_ = LabelBinarizer(sparse_output=True)
        Y = self.label_binarizer_.fit_transform(y)
        Y = Y.tocsc()
        self.classes_ = self.label_binarizer_.classes_
        totalIns = Y.shape[0]
        XBal = []
        YBal = []
        for i in range(len(self.label_binarizer_.classes_)):
            if len(y.shape)>1:
                # Matrix
                curIdxs = Y[:,i].nonzero()[0]
            else:
                curIdxs = Y.nonzero()[0]
            baseX = X[curIdxs,:]
            if len(y.shape)>1:
                # Matrix
                baseY = y[curIdxs,:]
            else:
                # array, e.g. due to testing classifier performance for single label prediction
                baseY = y[curIdxs]
            tempX = X
            tempY = y
            imbalancedIns = baseX.shape[0]
            numDup = totalIns/imbalancedIns - 1
            for j in range(int(numDup)):
                tempX = np.vstack((tempX,baseX))
                if len(y.shape)>1:
                    tempY = np.vstack((tempY,baseY))
                else:
                    tempY = np.concatenate((tempY, baseY))
            numAdd = totalIns%imbalancedIns
            tempX = np.vstack((tempX,resample(baseX,n_samples=numAdd,random_state=0)))
            if len(y.shape)>1:
                tempY = np.vstack((tempY,resample(baseY,n_samples=numAdd,random_state=0)))
            else:
                tempY = np.concatenate((tempY,resample(baseY,n_samples=numAdd,random_state=0)))
            XBal.append(tempX)
            if len(y.shape)>1:
                YBal.append(tempY[:,i])
            else:
                YBal.append(tempY)
        self.estimators_ = Parallel(n_jobs=self.n_jobs)(delayed(_fit_binary)(
            self.estimator, XBal[i], YBal[i], classes=[
                "not %s" % self.label_binarizer_.classes_[i],
                self.label_binarizer_.classes_[i]])
            for i in range(len(YBal)))
            #for i, column in enumerate(columns))
        return self

### Linear SVM

In [40]:
from sklearn.svm import LinearSVC
svm_object = LinearSVC() 
# clf = CalibratedClassifierCV(svm_object) 
classifier = OneVsRestClassifierBalance(svm_object)

In [45]:
from sklearn.model_selection import cross_val_predict,cross_val_score
y_pred = cross_val_predict(classifier, features_combined.values, labels_matrix, cv=10)

In [43]:
 scores_f1 = cross_val_score(classifier, features_combined.values, labels_matrix, cv=10, scoring='f1_weighted').mean()

In [44]:
 scores_f1

0.6584836959272007

In [49]:
from sklearn.metrics import classification_report
# classification_report(labels_matrix, y_pred, digits=3)
print(classification_report(labels_matrix, y_pred,digits=3))

              precision    recall  f1-score   support

           0      0.474     0.646     0.546       610
           1      0.601     0.555     0.577       806
           2      0.775     0.817     0.796      2731
           3      0.608     0.638     0.623       185
           4      0.820     0.732     0.774       362
           5      0.425     0.365     0.393       945
           6      0.855     0.752     0.800       133
           7      0.116     0.057     0.077        87

   micro avg      0.657     0.666     0.662      5859
   macro avg      0.584     0.570     0.573      5859
weighted avg      0.653     0.666     0.657      5859
 samples avg      0.645     0.684     0.649      5859



### Grid Search for Linear_SVC

In [69]:
from sklearn.model_selection import GridSearchCV
svm_object = LinearSVC(max_iter =10000)
classifier = OneVsRestClassifierBalance(svm_object)
param_grid = {'estimator__C': [0.01, 0.1, 1, 10,50, 100]
              }

grid = GridSearchCV(classifier, param_grid=param_grid, cv=5,return_train_score=True, n_jobs =4)
grid.fit(X_train.values, y_train)
print("Best cross-validation accuracy: {:.2f}".format(grid.best_score_))
print("Test set score: {:.2f}".format(grid.score(X_test, y_test)))
print("Best parameters: {}".format(grid.best_params_))


Best cross-validation accuracy: 0.60
Test set score: 0.62
Best parameters: {'estimator__C': 1}


In [71]:
y_pred =  grid.best_estimator_.predict(X_test)

In [72]:
print(classification_report(y_test, y_pred,digits=3))

              precision    recall  f1-score   support

           0      0.655     0.770     0.708       148
           1      0.610     0.545     0.576       209
           2      0.822     0.862     0.842       696
           3      0.574     0.587     0.581        46
           4      0.857     0.674     0.755        89
           5      0.550     0.527     0.538       239
           6      0.688     0.710     0.698        31
           7      0.267     0.286     0.276        14

   micro avg      0.719     0.725     0.722      1472
   macro avg      0.628     0.620     0.622      1472
weighted avg      0.717     0.725     0.719      1472
 samples avg      0.716     0.745     0.717      1472



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### Logistic regression

In [125]:
lr_object = LogisticRegression(max_iter =10000) 
classifier = OneVsRestClassifierBalance(lr_object)

In [126]:
y_pred = cross_val_predict(classifier, features_combined.values, labels_matrix, cv=10)
scores_f1 = cross_val_score(classifier, features_combined.values,\
                             labels_matrix, cv=10, scoring='f1_weighted').mean()

In [127]:
from sklearn.metrics import classification_report
# classification_report(labels_matrix, y_pred, digits=3)
print(classification_report(labels_matrix, y_pred,digits=3))

              precision    recall  f1-score   support

           0      0.612     0.543     0.575       610
           1      0.524     0.684     0.593       806
           2      0.731     0.906     0.809      2731
           3      0.492     0.708     0.581       185
           4      0.703     0.757     0.729       362
           5      0.417     0.521     0.463       945
           6      0.748     0.805     0.775       133
           7      0.096     0.057     0.072        87

   micro avg      0.623     0.745     0.678      5859
   macro avg      0.540     0.622     0.575      5859
weighted avg      0.621     0.745     0.675      5859
 samples avg      0.673     0.758     0.693      5859



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### Logistic regression Grid Search

In [63]:
from sklearn.linear_model import LogisticRegression
lr_object = LogisticRegression(max_iter =10000)
# clf = CalibratedClassifierCV(svm_object) 
classifier = OneVsRestClassifierBalance(lr_object)

param_grid = {'estimator__C': [0.001, 0.01, 0.1, 1, 10, 100]
              }
              

grid_lr = GridSearchCV(classifier, param_grid=param_grid, cv=5,return_train_score=True, n_jobs =4)
grid_lr.fit(X_train.values, y_train)
print("Best cross-validation accuracy: {:.2f}".format(grid_lr.best_score_))
print("Test set score: {:.2f}".format(grid_lr.score(X_test, y_test)))
print("Best parameters: {}".format(grid_lr.best_params_))

AttributeError: 'GridSearchCV' object has no attribute 'best_score_'

In [66]:
print("Best cross-validation accuracy: {:.2f}".format(grid_lr.best_score_))
print("Test set score: {:.2f}".format(grid_lr.score(X_test, y_test)))

Best cross-validation accuracy: 0.61
Test set score: 0.62


In [65]:
print("Best parameters: {}".format(grid_lr.best_params_))

Best parameters: {'estimator__C': 10}


In [78]:
y_pred_lr =  grid_lr.best_estimator_.predict(X_test)

In [79]:
print(classification_report(y_test, y_pred_lr,digits=3))

              precision    recall  f1-score   support

           0      0.672     0.791     0.727       148
           1      0.630     0.603     0.616       209
           2      0.819     0.871     0.844       696
           3      0.556     0.652     0.600        46
           4      0.845     0.674     0.750        89
           5      0.549     0.536     0.542       239
           6      0.676     0.742     0.708        31
           7      0.200     0.286     0.235        14

   micro avg      0.717     0.743     0.730      1472
   macro avg      0.618     0.644     0.628      1472
weighted avg      0.718     0.743     0.729      1472
 samples avg      0.727     0.761     0.729      1472



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### Random Forest

In [73]:
from sklearn.ensemble import RandomForestClassifier
rf_object = RandomForestClassifier() 
classifier = OneVsRestClassifierBalance(rf_object)

In [74]:

y_pred = cross_val_predict(classifier, features_combined.values, labels_matrix, cv=10, n_jobs=4)

In [75]:
scores_f1 = cross_val_score(classifier, features_combined.values, labels_matrix, cv=10, scoring='f1_weighted',n_jobs=4).mean()

In [76]:
scores_f1

0.6355205835542334

In [77]:
from sklearn.metrics import classification_report
# classification_report(labels_matrix, y_pred, digits=3)
print(classification_report(labels_matrix, y_pred,digits=3))

              precision    recall  f1-score   support

           0      0.452     0.675     0.542       610
           1      0.744     0.432     0.546       806
           2      0.772     0.814     0.792      2731
           3      0.802     0.416     0.548       185
           4      0.890     0.624     0.734       362
           5      0.541     0.276     0.366       945
           6      0.936     0.549     0.692       133
           7      0.089     0.425     0.147        87

   micro avg      0.655     0.624     0.639      5859
   macro avg      0.653     0.526     0.546      5859
weighted avg      0.699     0.624     0.640      5859
 samples avg      0.634     0.647     0.629      5859



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Original Data

In [128]:
df_original = pd.read_csv("dataTable.csv")
df_original.head()

Unnamed: 0.1,Unnamed: 0,file_id,section_id,url,heading_text,content_text_w_o_tags,abstracted_heading_plus_content,section_code
0,0,1,1,https://github.com/xiaobai557/wechat,Easy WeChat,可能是目前最优雅的微信公众平台 SDK 了。 > 不支持企业号，也不打算支持，原因？微信的...,@abstr_hyperlink 可能是目前最优雅的微信公众平台 SDK 了。 > 不支...,-
1,1,1,2,https://github.com/xiaobai557/wechat,特点,命名不那么乱七八糟； * 隐藏开发者不需要关注的细节； * 方法使用更优雅，不必再...,特点 命名不那么乱七八糟； * 隐藏开发者不需要关注的细节； * 方法使用更优雅...,-
2,2,1,3,https://github.com/xiaobai557/wechat,环境要求,@abstr_number . PHP >= @abstr_number . @abstr_...,环境要求 @abstr_number . PHP >= @abstr_number . @...,-
3,3,1,4,https://github.com/xiaobai557/wechat,安装,@abstr_code_section,安装 @abstr_code_section,-
4,4,1,5,https://github.com/xiaobai557/wechat,使用,基本使用（以服务端为例）: @abstr_code_section 更多请参考 @abs...,使用 基本使用（以服务端为例）: @abstr_code_section 更多请参考 ...,-


In [130]:
df_original = df_original.iloc[19:, :]

In [132]:
df_original.columns

Index(['Unnamed: 0', 'file_id', 'section_id', 'url', 'heading_text',
       'content_text_w_o_tags', 'abstracted_heading_plus_content',
       'section_code'],
      dtype='object')

### Derive statistical features

In [144]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(ngram_range=(1,1), analyzer='word', stop_words='english')
tfidfX = tfidf.fit_transform(df_original["abstracted_heading_plus_content"])
        


In [145]:
tfidfX.shape

(4835, 17625)

In [146]:
features_tfidf = pandas.DataFrame(tfidfX.todense())
        # Assign column names to make it easier to print most useful features later
features_tfidf.columns = tfidf.get_feature_names()


### Derive heuristic features

In [147]:
df_heuristics = derive_features_using_heuristics(df_original["url"],df_original["heading_text"] , df_original["content_text_w_o_tags"])

In [148]:
df_heuristics.head()

Unnamed: 0,heur_c_k_001,heur_c_k_002,heur_c_k_003,heur_c_k_004,heur_c_k_005,heur_c_k_006,heur_c_k_007,heur_h_k_001,heur_h_k_002,heur_h_k_003,...,heur_c_k_016,heur_c_k_017,heur_c_k_018,heur_c_k_019,heur_c_k_020,heur_c_k_021,heur_c_k_022,heur_c_k_023,heur_h_k_028,heur_h_k_029
0,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### combining the features

In [149]:
features_combined = pandas.concat([features_tfidf, df_heuristics], axis=1)

In [150]:
features_combined.head()

Unnamed: 0,32fbc617fdbf4085ec47f8d7847a7e1d,__,___,____,_____,______,_______________________________,__activerecord__,__any__,__apache_module__,...,heur_c_k_016,heur_c_k_017,heur_c_k_018,heur_c_k_019,heur_c_k_020,heur_c_k_021,heur_c_k_022,heur_c_k_023,heur_h_k_028,heur_h_k_029
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,1,0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0


### label set

In [151]:
from sklearn.preprocessing import MultiLabelBinarizer
# Class '2' has been merged into class '1'
label_set = ['-','1','3','4','5','6','7','8']
labels = [str(x).split(',') for x in df_original['section_code']]
mlb = MultiLabelBinarizer(classes=label_set)
labels_matrix = mlb.fit_transform(labels)

In [152]:
# labels 

### split into train and test set

In [153]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(features_combined, \
                                                    labels_matrix, test_size = 0.25, random_state=42)

#### Linear_SVC()

In [154]:
from sklearn.svm import LinearSVC
svm_object = LinearSVC() 
# clf = CalibratedClassifierCV(svm_object) 
classifier = OneVsRestClassifierBalance(svm_object)


y_pred_svc = cross_val_predict(classifier, features_combined.values, labels_matrix, cv=10,n_jobs = 4)
scores_f1 = cross_val_score(classifier, features_combined.values, labels_matrix,\
                            cv=10, scoring='f1_weighted',n_jobs = 4).mean()
print("f1 score is: ",scores_f1)

# classification_report(labels_matrix, y_pred, digits=3)
print(classification_report(labels_matrix, y_pred_svc,digits=3))

f1 score is:  0.6787907245006773
              precision    recall  f1-score   support

           0      0.512     0.643     0.570       610
           1      0.606     0.565     0.585       734
           2      0.796     0.821     0.808      2467
           3      0.656     0.670     0.663       179
           4      0.819     0.733     0.774       322
           5      0.472     0.379     0.420       858
           6      0.855     0.770     0.810       122
           7      0.062     0.103     0.078        58

   micro avg      0.674     0.676     0.675      5350
   macro avg      0.597     0.586     0.589      5350
weighted avg      0.676     0.676     0.673      5350
 samples avg      0.662     0.696     0.664      5350



  _warn_prf(average, modifier, msg_start, len(result))


#### Random Forest Classifier

In [156]:
rf_object = RandomForestClassifier() 
# clf = CalibratedClassifierCV(svm_object) 
classifier = OneVsRestClassifierBalance(rf_object)


y_pred_rf = cross_val_predict(classifier, features_combined.values, labels_matrix, cv=10,n_jobs = 4)
scores_f1 = cross_val_score(classifier, features_combined.values,\
                            labels_matrix, cv=10, scoring='f1_weighted',n_jobs=4).mean()
print("f1 score is: ",scores_f1)

# classification_report(labels_matrix, y_pred, digits=3)
print(classification_report(labels_matrix, y_pred_rf,digits=3))

f1 score is:  0.6539402835762509
              precision    recall  f1-score   support

           0      0.477     0.693     0.566       610
           1      0.775     0.460     0.578       734
           2      0.793     0.807     0.800      2467
           3      0.810     0.285     0.421       179
           4      0.914     0.630     0.746       322
           5      0.588     0.318     0.413       858
           6      0.959     0.582     0.724       122
           7      0.063     0.431     0.110        58

   micro avg      0.668     0.631     0.649      5350
   macro avg      0.672     0.526     0.545      5350
weighted avg      0.725     0.631     0.656      5350
 samples avg      0.648     0.655     0.640      5350



  _warn_prf(average, modifier, msg_start, len(result))


#### Logistic Regression

In [None]:
lr_object = LogisticRegression(max_iter =10000)
# clf = CalibratedClassifierCV(svm_object) 
classifier = OneVsRestClassifierBalance(lr_object)


y_pred_lr = cross_val_predict(classifier, features_combined.values, labels_matrix, cv=10,n_jobs = 4)
scores_f1 = cross_val_score(classifier, features_combined.values,\
                            labels_matrix, cv=10, scoring='f1_weighted',n_jobs=4).mean()
print("f1 score is: ",scores_f1)

# classification_report(labels_matrix, y_pred, digits=3)
print(classification_report(labels_matrix, y_pred_lr,digits=3))

### Misclassified Records

In [202]:
X_test

Unnamed: 0,10,2gb,32fbc617fdbf4085ec47f8d7847a7e1d,3d,__,___,____,_____,______,_______________________________,...,heur_c_k_016,heur_c_k_017,heur_c_k_018,heur_c_k_019,heur_c_k_020,heur_c_k_021,heur_c_k_022,heur_c_k_023,heur_h_k_028,heur_h_k_029
3365,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
1945,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
5335,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
4245,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
168,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4774,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
4523,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
1595,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
4320,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0


In [203]:
df_X_test= pd.merge(df_all, X_test, left_index=True, right_index=True)

In [204]:
df_X_test["prediction"] = y_pred_lr.tolist()

In [205]:
df_X_test["actual_label"] = y_test.tolist()

In [206]:
df_X_test.columns

Index(['Unnamed: 0', 'file_id', 'section_id', 'url_x', 'heading_text',
       'content_text_w_o_tags', 'abstracted_heading_plus_content',
       'section_code', '10', '2gb',
       ...
       'heur_c_k_017', 'heur_c_k_018', 'heur_c_k_019', 'heur_c_k_020',
       'heur_c_k_021', 'heur_c_k_022', 'heur_c_k_023', 'heur_h_k_028',
       'heur_h_k_029', 'actual_label'],
      dtype='object', length=18769)

In [207]:
df_X_test = df_X_test[["file_id", "section_id","heading_text","content_text_w_o_tags","section_code","actual_label","prediction"]]

In [208]:
df_X_test.head()

Unnamed: 0,file_id,section_id,heading_text,content_text_w_o_tags,section_code,actual_label,prediction
8,3,28,3.3 ROM Loading,@abstr_image Symon can load any appropriately...,1,"[0, 0, 1, 0, 0, 0, 0, 0]","[0, 0, 1, 0, 0, 0, 0, 0]"
12,3,32,3.7 Breakpoints,@abstr_image Breakpoints can be set and remov...,1,"[0, 1, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 1, 0, 0]"
15,3,35,4.0 Usage,,3,"[0, 0, 0, 1, 0, 0, 0, 0]","[0, 0, 0, 1, 0, 0, 0, 0]"
17,3,37,4.2 ROM images,The simulator requires a ROM image loaded into...,3,"[0, 0, 0, 0, 1, 0, 0, 0]","[0, 0, 0, 0, 1, 0, 0, 0]"
19,3,39,4.4 Running,"After loading a program or ROM image, clicking...",3,"[0, 0, 1, 0, 0, 0, 0, 0]","[0, 0, 1, 0, 0, 0, 0, 0]"


In [209]:
df_miss_class= df_X_test[df_X_test["actual_label"] != df_X_test["prediction"]]

In [217]:
df_miss_class =df_miss_class.drop("actual_label", axis = 1)


In [218]:
df_miss_class

Unnamed: 0,file_id,section_id,heading_text,content_text_w_o_tags,section_code,prediction
8,3,28,3.3 ROM Loading,@abstr_image Symon can load any appropriately...,1,"[0, 0, 1, 0, 0, 0, 0, 0]"
12,3,32,3.7 Breakpoints,@abstr_image Breakpoints can be set and remov...,1,"[0, 0, 0, 0, 0, 1, 0, 0]"
15,3,35,4.0 Usage,,3,"[0, 0, 0, 1, 0, 0, 0, 0]"
17,3,37,4.2 ROM images,The simulator requires a ROM image loaded into...,3,"[0, 0, 0, 0, 1, 0, 0, 0]"
19,3,39,4.4 Running,"After loading a program or ROM image, clicking...",3,"[0, 0, 1, 0, 0, 0, 0, 0]"
23,3,43,8.0 Copyright and Acknowledgements,Copyright (c) @abstr_number Seth J. Morabito w...,5,"[0, 0, 1, 0, 0, 0, 0, 0]"
26,4,46,Objectives,@abstr_number . Learn about inheritance in obj...,11,"[0, 0, 1, 0, 0, 0, 0, 0]"
29,4,49,Code Along: Basic Inheritance,"In this domain model, we have class Vehicle th...",3,"[1, 0, 1, 0, 0, 0, 0, 0]"
33,4,53,Method Look-Up in Ruby,"How does our above example work? Well, when yo...",3,"[1, 0, 0, 0, 0, 0, 0, 0]"
43,5,63,PHP,@abstr_hyperlink (Zend Framework @abstr_numbe...,6,"[0, 0, 1, 0, 0, 0, 0, 0]"
