In [1]:
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'
import os
import re
from tqdm import tqdm
from joblib import Parallel, delayed
import csv

In [2]:
def clean_mp_df(mp_df):
    # Clean dataset
    # Remove '.1.html' and '.1p.html' and other extensions from commands
    idx = 0
    for command in tqdm(mp_df['command']):
        if '.html' in command:
            clean_command = 'N/A'
            if '.1.html' in command:
                clean_command = mp_df['command'][idx].split('.1.html')[0] 
            elif '.1p.html' in command:
                clean_command = mp_df['command'][idx].split('.1p.html')[0]
            elif '.1posix.html' in command:
                clean_command = mp_df['command'][idx].split('.1posix.html')[0]
            elif '.1e.html' in command:
                clean_command = mp_df['command'][idx].split('.1e.html')[0]
            elif '.1sr.html' in command:
                clean_command = mp_df['command'][idx].split('.1sr.html')[0]
            elif '.1sr.html' in command:
                clean_command = mp_df['command'][idx].split('.1x.html')[0]
            mp_df['command'][idx] = clean_command
        idx+=1
    # Remove un-caught extra commands
    mp_df = mp_df[mp_df['command']!='N/A']
    # Remove duplicate descriptions & commands
    mp_df.drop_duplicates(inplace=True, subset='command')
    mp_df.drop(['description', 'flags'], axis=1, inplace=True)
    mp_df = mp_df.reset_index().drop(columns='index')
    return mp_df

# generate dataframe of Ubuntu commands/programs for which a man page exists
mp_df = clean_mp_df(pd.read_csv('manpages-db.csv')) 

100%|███████████████████████████████████| 36141/36141 [00:10<00:00, 3582.33it/s]


In [3]:
#Display raw dataframes
print("RAW MANPAGES DATAFRAME:")
print(mp_df)

RAW MANPAGES DATAFRAME:
                 command                                            options
0               0desktop                                                NaN
1               0install  <pre>\n       The first non-option argument to...
2                0launch  <pre>\n       0launch takes the same options a...
3      0store-secure-add                                                NaN
4                 0store  <pre>\n       <b>-h</b>, <b>--help</b>\n      ...
...                  ...                                                ...
34896               zzuf  <pre>\n   <b>Generic</b> <b>program</b> <b>inf...
34897           zzxorcat  <pre>\n       <u>zziplib-bin</u> tools accept ...
34898          zzxorcopy  <pre>\n       <u>zziplib-bin</u> tools accept ...
34899           zzxordir  <pre>\n       <u>zziplib-bin</u> tools accept ...
34900                %5b                                                NaN

[34901 rows x 2 columns]


In [4]:
def get_bold_segments(in_str):
    if pd.isnull(in_str):
        return []
    segments = []
    code_tag_flag = False # becomes true when opening code tag has been found
    split_str = re.split('<b>|</b>', in_str)
    for i in range(len(split_str)):
        if i%2 == 1 and split_str[i][0]=='-':
            split_str[i] = split_str[i].replace(',','')
            segments.append(split_str[i])
    extra_segments = []
    for idx in range(len(segments)):
        if '=' in segments[idx]:
            segments[idx] = segments[idx].split('=')[0]
        if '/' in segments[idx]:
            temp = segments[idx].split('/')
            segments[idx] = temp[0]
            for a in range(1, len(temp)):
                extra_segments.append(temp[a])
    for e in extra_segments:
        segments.append(e)
    return segments

# i = "<code><p>Star destroyers!</p></code><code>grep -f A B</code> <p>Four score</p> <code>I'm gonna cummm!</code> I am a menace <code>I FCUCKIG LOVE STARR WARSSSS! </code>"
# print(get_code_segments(i))

In [5]:
def find_flags(answer):
    code_segments = get_code_segments(answer)
    for c in mp_df['command']:
        # add spaces to avoid accidentally finding 'commands' within other words
        for segment in code_segments:
            #print(segment.split())
            for word in segment.split():
                if c == word:
                    return c
    return 'NOT_FOUND'
# i = "<code><p>Star destroyers!</p></code><code>grep -f A B</code><p>Four score</p> <code>I'm gonna cummm!</code> I am a menace <code>I FCUCKIG LOVE STARR WARSSSS! </code>"
# print(find_command(i))

In [6]:
# remove HTML tags from web-scraped questions
def remove_html_tags(dfd):
    # add 'command' column to data
    dfd['command'] = {}
    for i in range(len(dfd['question'])):
        dfd['command'][i]=(find_command(dfd['answer'][i]))
        q_temp = re.sub('<[^>]+>', '', dfd['question'][i])
        a_temp = re.sub('<[^>]+>', '', dfd['answer'][i])
        dfd['question'][i] = q_temp
        dfd['answer'][i] = a_temp
    return pd.DataFrame.from_dict(dfd)

In [7]:
mp_df['flags'] = [None]*len(mp_df['options'])
for idx in range(len(mp_df['options'])):
    mp_df['flags'][idx] = get_bold_segments(mp_df['options'][idx])
print(mp_df)

                 command                                            options  \
0               0desktop                                                NaN   
1               0install  <pre>\n       The first non-option argument to...   
2                0launch  <pre>\n       0launch takes the same options a...   
3      0store-secure-add                                                NaN   
4                 0store  <pre>\n       <b>-h</b>, <b>--help</b>\n      ...   
...                  ...                                                ...   
34896               zzuf  <pre>\n   <b>Generic</b> <b>program</b> <b>inf...   
34897           zzxorcat  <pre>\n       <u>zziplib-bin</u> tools accept ...   
34898          zzxorcopy  <pre>\n       <u>zziplib-bin</u> tools accept ...   
34899           zzxordir  <pre>\n       <u>zziplib-bin</u> tools accept ...   
34900                %5b                                                NaN   

                                                   

In [8]:
mp_df.to_pickle('manpages-db-clean.pkl')