# A Deep Learning Domain Focused Language Model
## Workflow for Mathew Garland  
This Notebook must be executed as Trusted for the programmatically generated Markdown headings to display properly;
refer to this [Jupyter Issue](https://github.com/jupyter/nbconvert/issues/145). 

If this Notebook is not executed as Trusted, some programmatically generated headings will be displayed as __<IPython.core.display.Markdown object>__.  

### Package Imports

In [1]:
import pandas as pd
# IPython is required for outputting text as Markdown
from IPython.display import display, Markdown
import re
import nltk

### Download nltk word tokenizer data

In [2]:
# Fetch the NLTK data packages requested by this notebook, one after another.
# NOTE(review): 'punkt' presumably backs word_tokenize(); no use of 'wordnet'
# is visible in this part of the notebook - confirm it is still required.
for nltk_package in ('punkt', 'wordnet'):
    nltk.download(nltk_package)
from nltk.tokenize import word_tokenize



[nltk_data] Downloading package punkt to /home/matgarland/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/matgarland/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


### Local Variables

In [3]:
# csv file the raw text data is read from
IN_DATA_FILE = "stackexchange_812k.csv"
# csv file the wrangled text data is written to
OUT_DATA_FILE = "Manning_Submission_1.4.csv"
# how many DataFrame rows to show whenever data is previewed
DISPLAY_ROWS = 5

### Workflow Step 1: Load the dataset into a pandas dataframe.

In [4]:
# Display settings: the intent is that cell text is shown in full (no
# truncation) and that wide frames are not wrapped across several blocks.
# NOTE(review): the pandas options documentation names None as the
# "unlimited" value for display.max_colwidth - confirm that 0 behaves as
# intended on the installed pandas version.
pd.set_option('display.expand_frame_repr', False)
pd.set_option('max_colwidth', 0)
# Read the raw csv text data into a DataFrame
df = pd.read_csv(IN_DATA_FILE)

### Get the number of rows in the CSV data

In [5]:
# Number of rows and columns in the original csv data
rows, cols = df.shape
# Render the summary as a Markdown heading. The Notebook must be executed as
# Trusted for this to display properly,
# refer: https://github.com/jupyter/nbconvert/issues/145
display(Markdown('### The data file, {}, consists of {} rows and {} columns.'.format(IN_DATA_FILE, rows, cols)))
# Missing-value count for each id column. A notebook cell only echoes its
# last expression, so these counts must be passed to display() explicitly;
# as bare expressions in the middle of the cell they were silently discarded.
display(df[['parent_id', 'post_id', 'comment_id']].isna().sum())
# Relative frequency of each category (echoed as the cell's last expression)
df['category'].value_counts(normalize=True)

### The data file, stackexchange_812k.csv, consists of 812132 rows and 5 columns.

comment    0.681017
post       0.206006
title      0.112977
Name: category, dtype: float64

### Example data structure

In [6]:
# Preview the first DISPLAY_ROWS rows of the loaded csv data.
df.head(n=DISPLAY_ROWS)

Unnamed: 0,post_id,parent_id,comment_id,text,category
0,1,,,Eliciting priors from experts,title
1,2,,,What is normality?,title
2,3,,,What are some valuable Statistical Analysis open source projects?,title
3,4,,,Assessing the significance of differences in distributions,title
4,6,,,The Two Cultures: statistics vs. machine learning?,title


### Data Wrangling

In [None]:
def tokenize(t):
    """Return *t* rewritten as its word tokens separated by single spaces.

    The text is split with NLTK's word_tokenize() and the resulting tokens
    are joined back into one string.
    """
    tokens = word_tokenize(t)
    return ' '.join(tokens)

# English contractions (lower case) mapped to their expanded form.
# Maybe not all of them, but certainly a good representation.
_CONTRACTIONS = {
    "i'd": 'I would',
    "i'll": 'I will',
    "i've": 'I have',
    "i'm": 'I am',
    "you'd": 'you would',
    "you're": 'you are',
    "you'll": 'you will',
    "you've": 'you have',
    "isn't": 'is not',
    "it's": 'it is',
    "it'll": 'it will',
    # was 'it had'; every other "'d" entry expands to 'would'
    "it'd": 'it would',
    "we'd": 'we would',
    "we're": 'we are',
    "we'll": 'we will',
    "wasn't": 'was not',
    "weren't": 'were not',
    "aren't": 'are not',
    "they'd": 'they would',
    "they'll": 'they will',
    "they're": 'they are',
    "they've": 'they have',
    "let's": 'let us',
    "doesn't": 'does not',
    "can't": 'can not',
    "won't": 'will not',
    "wouldn't": 'would not',
    "who's": 'who is',
    "what's": 'what is',
    "when's": 'when is',
    "where's": 'where is',
    # was 'that has'; the other "'s" entries (it's, who's, ...) expand to 'is'
    "that's": 'that is',
    "that'd": 'that would',
    "that'll": 'that will',
    "now's": 'now is',
    "how's": 'how is',
    "how'll": 'how will',
    "how'd": 'how would',
}

# Longest contractions first so that a shorter alternative can never win over
# a longer one that starts at the same position.
_CONTRACTION_ITEMS = sorted(
    _CONTRACTIONS.items(), key=lambda item: len(item[0]), reverse=True
)
# Expansion for capture group i + 1 of _CONTRACTION_RE.
_EXPANSIONS = tuple(expanded for _, expanded in _CONTRACTION_ITEMS)
# A single alternation, compiled once, with one capture group per contraction.
# The apostrophe may be the ASCII "'" or the typographic U+2019, and the \b
# anchors stop a contraction from matching inside a longer word (the "it's"
# in "bit's", for example).
_CONTRACTION_RE = re.compile(
    r'\b(?:'
    + '|'.join(
        '(' + "['\u2019]".join(re.escape(part) for part in key.split("'")) + ')'
        for key, _ in _CONTRACTION_ITEMS
    )
    + r')\b',
    re.IGNORECASE,
)


def expand_contraction(t):
    """Return *t* with the known English contractions expanded.

    *t* is converted with str() first, so a non-string value (a NaN from an
    empty csv cell, for instance) is returned as its string form. Matching is
    case-insensitive and only whole words are replaced; the replacement text
    is always the fixed expansion from the table, whatever the case of the
    matched contraction. Text without a known contraction is returned
    unchanged.
    """
    # m.lastindex is the number of the one capture group that matched, which
    # identifies the contraction without re-normalising the matched text.
    return _CONTRACTION_RE.sub(lambda m: _EXPANSIONS[m.lastindex - 1], str(t))
 
# Normalise the raw text into a temporary 'normalized' column, tokenize it
# into 'tokenized', then save the result.
#
# Every pattern below is a raw-string regex handed to pandas as-is. The
# patterns must NOT be wrapped in re.escape(): that escapes every regex
# metacharacter, turning the pattern into a literal string that never matches
# real text.

# (1) Replace LaTeX expressions with a space character. Some are delimited by
#     a double '$', hence '\$+' rather than a single '\$' on each side.
df['normalized'] = df['text'].replace(r'\$+[^$]+\$+', ' ', regex=True)
# (2) Expand all contractions using the local function expand_contraction(),
#     which also converts every value to str.
df['normalized'] = df['normalized'].apply(expand_contraction)

# (3) Remove rows containing either of:
#     A) <code> xyz blah ...</code>
#     B) <pre> xyz blah ...</pre>
#     '[\s\S]*?' matches any text, including dots and newlines, up to the
#     first closing tag.
has_code_block = df['normalized'].str.contains(
    r'<code>[\s\S]*?</code>|<pre>[\s\S]*?</pre>',
    flags=re.IGNORECASE,
    na=False,
)
# .copy() so the column assignments below write to an independent frame
# rather than to a slice of the original one.
df = df[~has_code_block].copy()

# Ordered (pattern, replacement) pairs; each one is applied to the whole
# column before the next.
substitutions = [
    # (4) Remove math spans. This has to run before the quotes are stripped
    #     in (7), because the pattern relies on the double quotes around the
    #     class name.
    #     NOTE(review): the original pattern matched the class name
    #     'math.container' only; 'math-container' is accepted as well on the
    #     assumption that this is what the data contains - confirm.
    (r'<span class="math[.-]container">[^<]*</span>', ''),
    # (5) Remove backslashes ( \ or \\ or \\\ ... )
    (r'\\+', ''),
    # (6) Delete any @name symbol, optionally followed by one whitespace,
    #     colon or comma. The name characters are listed explicitly: a range
    #     such as 'A-z' would also take in the punctuation between 'Z' and 'a'.
    (r'@\s?[A-Za-z0-9_.]+[\s:,]?', ''),
    # (7) Remove single quotes, double quotes and back ticks
    ('[\'"`]+', ''),
    # (8) Replace occurrences of * or ** with a space
    (r'\*+', ' '),
    # (9) Remove any numbers, optionally in parentheses, e.g. 1, 1), (1, or (1)
    (r'[+(]?\d[,0-9]?\)?', ''),
    # (10) Remove bare html node tags, e.g. <p>, </p>, <strong>, </strong>
    (r'</?\w*>', ''),
    # (11) Replace 2 or more whitespace characters with a single space
    (r'\s{2,}', ' '),
    # (12) Replace leading/trailing carriage returns and line feeds, and every
    #      period, with a space
    (r'^[\r\n]+|\.|[\r\n]+$', ' '),
]
for pattern, replacement in substitutions:
    df['normalized'] = df['normalized'].replace(pattern, replacement, regex=True)

# (13) Tokenize the string data and lower-case it
df['tokenized'] = df['normalized'].apply(tokenize).str.lower()
# Drop the intermediate normalized column
df.drop(labels='normalized', axis=1, inplace=True)
# Display the wrangled data. display() is needed because only the last
# expression of a cell is echoed automatically.
display(df.head(DISPLAY_ROWS))
# Save wrangled data to a local csv data file
df.to_csv(OUT_DATA_FILE, index=False)
