# A Deep Learning Domain Focused Language Model
## Workflow for Mathew Garland  
This Notebook must be Trusted for the programmatically generated Markdown headings to display properly,
refer: [Jupyter Issue](https://github.com/jupyter/nbconvert/issues/145). 

If this Notebook is not executed as Trusted some programmatically generated headings may be displayed as: __<IPython.core.display.Markdown object>__.  

### Package Imports

In [31]:
import pandas as pd
# IPython is required for outputting text as Markdown
from IPython.display import display, Markdown
import re
import nltk

### Download nltk word tokenizer data

In [32]:
nltk.download('punkt')
nltk.download('wordnet')
from nltk.tokenize import word_tokenize



[nltk_data] Downloading package punkt to /home/matgarland/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/matgarland/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


### Local Variables

In [33]:
# number of csv rows to display
DISPLAY_ROWS = 5
# file names of csv text data
IN_DATA_FILE = "stackexchange_812k.csv"
OUT_DATA_FILE = "test_out.csv" # "Manning_Submission_1.4.csv"

### Workflow Step 1: Load the dataset into a pandas dataframe.

In [34]:
# import the csv file that holds our data
df = pd.read_csv( IN_DATA_FILE )
# Ensure the display of data wraps the DataFrame so all csv data in the cell is visible, i.e. no cell data truncation
pd.set_option('max_colwidth', 0)
pd.set_option('display.expand_frame_repr', False)

### Get the data dimensionality of the CSV data

In [35]:
# get the number of rows and columns in the original csv data
(rows, cols) = df.shape
display( Markdown( '### Data file: {}, consists of {} rows and {} columns.'.format( IN_DATA_FILE, rows, cols ) ) )
df['parent_id'].isna().sum()
df['post_id'].isna().sum()
df['comment_id'].isna().sum()

df['category'].value_counts(normalize=True)

### Data file: stackexchange_812k.csv, consists of 812132 rows and 5 columns.

comment    0.681017
post       0.206006
title      0.112977
Name: category, dtype: float64

### CSV data structure

In [36]:
# display the first 5 rows of data in the csv file.
df.head(DISPLAY_ROWS)

Unnamed: 0,post_id,parent_id,comment_id,text,category
0,1,,,Eliciting priors from experts,title
1,2,,,What is normality?,title
2,3,,,What are some valuable Statistical Analysis open source projects?,title
3,4,,,Assessing the significance of differences in distributions,title
4,6,,,The Two Cultures: statistics vs. machine learning?,title


### Data Wrangling

In [37]:
# tokenize(t)
# Tokenize the text
def tokenize(t):
    # word_tokenize() uses TreebankWordTokenizer internally
    return ' '.join( word_tokenize(t) )

# expand_contractions(t)
# A function to expand all of the english contractions into their expanded form
def expand_contraction(t):
    # a dictionary of english contractions mapped to the corresponding expanded form
    # possibly not all of them, but certainly a good representation.
    dict_contraction = {
        'i\'d'      :   'I would',
        'i\'ll'     :   'I will',
        'i\'ve'     :   'I have',
        'i\'m'      :   'I am',
        'you\'d'    :   'you would',
        'you\'re'   :   'you are',
        'you\'ll'   :   'you will',
        'you\'ve'   :   'you have',
        'isn\'t'    :   'is not',
        'it\'s'     :   'it is',
        'it\'ll'    :   'it will',
        'it\'d'     :   'it had',
        'we\'d'     :   'we would',
        'we\'re'    :   'we are',
        'we\'ll'    :   'we will',
        'wasn\'t'   :   'was not',
        'weren\'t'  :   'were not',
        'aren\'t'   :   'are not',
        'they\'d'   :   'they would',
        'they\'ll'  :   'they will',
        'they\'re'  :   'they are',
        'they\'ve'  :   'they have',
        'let\'s'    :   'let us',
        'doesn\'t'  :   'does not',
        'can\'t'    :   'can not',
        'won\'t'    :   'will not',
        'wouldn\'t' :   'would not',
        'who\'s'    :   'who is',
        'what\'s'   :   'what is',
        'when\'s'   :   'when is',
        'where\'s'  :   'where is',
        'that\'s'   :   'that has',
        'that\'d'   :   'that would',
        'that\'ll'  :   'that will',
        'now\'s'    :   'now is',
        'how\'s'    :   'how is',
        'how\'ll'   :   'how will',
        'how\'d'    :   'how would'
    }
    # Expand all contractions into their corresponding expanded form
    for k,v in dict_contraction.items():
        t = re.sub( re.escape(k), v, str(t), 0, re.IGNORECASE | re.MULTILINE )
    return ''.join(t) 
# ****************************************************************************
# Regular Expressions and function application

# (1) expand all contractions using the local function, expand_contraction()
df['normalized']        = df['text'].apply( expand_contraction )

# (2) Remove lines containing Latex expressions, some latex expressions have double '$' hence used regex is a modified version of \$([^$]*)\$.
remove_rows = df['normalized'].str.contains( '\$+([^$]+)\$+', flags = re.MULTILINE | re.IGNORECASE, regex = True )
df = df[~remove_rows]

# (3) remove data from within the row, found between any of the patterns contained below 
  # A) Remove rows containing <code> xyz blah ...</code>
  # B) Remove rows containing <pre> xyz blah ...</pre>
df['normalized'] = df['normalized'].replace( to_replace = '(<code>[^.]*<\/code>)|(<pre>[\s\S]*?<\/pre>)', value = ' ', flags = re.MULTILINE | re.IGNORECASE, regex = True )

# (4) Delete any @name symbol from within the csv row data, optionally match @name[space| ,| :]
df['normalized'] = df['normalized'].replace( to_replace = '@[\s]?[aA-zZ0-9.]+[\s:,]?', value = ' ', flags = re.MULTILINE | re.IGNORECASE, regex = True )

# (5) Replace backslashes
df['normalized'] = df['normalized'].replace( to_replace = '[\\]+', value = ' ', flags = re.MULTILINE | re.IGNORECASE, regex = True )        

# (6) Remove single quotes, double quotes or back ticks
df['normalized'] = df['normalized'].replace( to_replace = '[\'\"\`]+', value = '', flags = re.MULTILINE | re.IGNORECASE, regex=True )

# (7) Replace occurrences of * OR ** with space
df['normalized'] = df['normalized'].replace( to_replace = '(\*)+', value = ' ', flags = re.MULTILINE | re.IGNORECASE, regex=True )

# (8) Remove any numbers optionally in parenthesis e.g 1, 1), (1,), or (1) (must trail markdown url removal)
df['normalized'] = df['normalized'].replace( to_replace = '[+\(]?[\d][,0-9]?[\)]?', value = '', flags = re.MULTILINE | re.IGNORECASE, regex = True )

# (9) Remove text span
df['normalized'] = df['normalized'].replace( to_replace = '<span class="math\.container">[^<span]*<\/span>', value = '', flags = re.MULTILINE | re.IGNORECASE, regex = True )

# (10) remove html5 node tags e.g. <p><bold>blah ...</bold><strong>blah blah ...</strong></strong></p>
df['normalized'] = df['normalized'].replace( to_replace = '(\<(\/)?(\w)*(\d)?\>)', value = '', flags = re.MULTILINE | re.IGNORECASE, regex = True )

# (11) Replace >= 2 whitespace characters with a single whitespace
df['normalized'] = df['normalized'].replace( to_replace = '[\s]{2,}', value = ' ', flags = re.MULTILINE | re.IGNORECASE, regex = True )

# (12) Remove leading and trailing carriage returns and line feeds, if any
df['normalized'] = df['normalized'].replace( to_replace = '^[\r\n]+|\.|[\r\n]+$', value = ' ', flags = re.MULTILINE | re.IGNORECASE, regex = True )

# (13) tokenize string data using the nltk word tokenizer, including applying lowercase to all text, As 'Tokenized' column data
df['tokenized'] = df['normalized'].apply( tokenize ).str.lower()

# Drop the normalized column
df.drop( labels = 'normalized', axis = 1, inplace = True)

  return func(self, *args, **kwargs)


### Resulting csv data structure

In [38]:
# display wrangled data
df.head(DISPLAY_ROWS)

Unnamed: 0,post_id,parent_id,comment_id,text,category,normalized
0,1,,,Eliciting priors from experts,title,Eliciting priors from experts
1,2,,,What is normality?,title,What is normality?
2,3,,,What are some valuable Statistical Analysis open source projects?,title,What are some valuable Statistical Analysis open source projects?
3,4,,,Assessing the significance of differences in distributions,title,Assessing the significance of differences in distributions
4,6,,,The Two Cultures: statistics vs. machine learning?,title,The Two Cultures: statistics vs. machine learning?


### Write the data structure out to a new data file.

In [39]:
# Save wrangled data to a local csv data file
df.to_csv( OUT_DATA_FILE, index=False )