### Clean Stack Exchange post text

In [25]:
# Import packages.
import numpy as np
import pandas as pd

import re
import nltk
from nltk.corpus import wordnet

In [44]:
def tokenize_text(doc):
    """Combine the strings in the "response" column of dataframe df into one long string. Then, tokenize the
    string and make all words lowercase."""
    
    # Combine the col into a string.
    #doc = ' '.join(map(str,df[col].tolist()))

    # Tokenize and make lowercase.
    words = nltk.word_tokenize(doc)
    words = [w.lower() for w in words]
    
    return words


def wordnet_pos(tag):
    """Map a Brown POS tag to a WordNet POS tag."""
    
    table = {"N": wordnet.NOUN, "V": wordnet.VERB, "R": wordnet.ADV, "J": wordnet.ADJ}
    
    # Default to a noun.
    return table.get(tag[0], wordnet.NOUN)


def lemmatize_text(words):
    """Lemmatize words to get the base words. The input 'words' is a list of of words."""
    
    lemmatizer = nltk.WordNetLemmatizer()
    word_tags = nltk.pos_tag(words)
    words = [lemmatizer.lemmatize(w, wordnet_pos(t)) for (w, t) in word_tags]
    
    return words


def remove_stopwords(words):
    """Remove stopwords from a string."""
    
    stopwords = nltk.corpus.stopwords.words("english")
    words = [w for w in words if w not in stopwords]
    
    return words

In [97]:
def clean_text(doc):    
    # Tokenize, lemmatize, and remove stopwords for the text of all articles.
    words = re.sub("< ?/?[a-z]+ ?>|\n", "", doc)
    words = tokenize_text(words)
    words = lemmatize_text(words)
    words = remove_stopwords(words)
    doc = [w for w in words if w.isalnum()]
    doc = ' '.join(doc)
    
    return doc

In [76]:
test = pd.read_csv("/Users/christinachang/Documents/STA141C/sta-141c-classify/data/biology.csv")

In [81]:
test['text'] = test['title'] + " " + test['content']

In [103]:
test['text'][1000]

"Text/resource with information on all skeletal muscles and their motor units <p>Something analogous to an encyclopedia on baseball players with a list of all thier stats would be ideal. </p>\n\n<p>I'm not looking for just generic muscle names, locations and illustrations.</p>\n\n<p>Good answers would include links to resources, textbooks etc. which include stats for most or all of the following of every skeletal muscle in the human anatomy:</p>\n\n<ul>\n<li>Language of origin/LOO definition</li>\n<li>Labeled by ? shape, location, size etc.</li>\n<li>Type of Shape (i.e., fusiform, pennate, bipennate, multipennate, convergent, parrellel etc)</li>\n<li>Angle(s) of pennation (if applicable)</li>\n<li># of motor neuron innervations</li>\n<li># of proprioceptors in muscle</li>\n<li># of fibers</li>\n<li>Length of muscle</li>\n<li>Length of fibers</li>\n<li>Typical fuel system reliance (determined off of biomechanical placement and architecture..CPr, oxidative, etc.)</li>\n<li>Biomechanical 

In [102]:
clean_text(test['text'][1000])

'information skeletal muscle motor unit something analogous encyclopedia baseball player list thier stats would ideal look generic muscle name location answer would include link resource textbook etc include stats following every skeletal muscle human anatomy language definitionlabeled shape location size shape fusiform pennate bipennate multipennate convergent parrellel etc angle pennation applicable motor neuron innervation proprioceptor muscle fiberslength musclelength fiberstypical fuel system reliance determine biomechanical placement oxidative etc biomechanical function structure correspond anchor correspond movement manipulation'

In [70]:
test = "rnase contamination rna base experiment prevent ? < p > anyone suggestion prevent rnase contamination work' rna ? < /p > < p > tend issue degradation regardless whether use depc treat / rnase free water filter pipette tips. < /p >\n"
re.sub("< ?/?[a-z]+ ?>|\n", "", test)



"rnase contamination rna base experiment prevent ?  anyone suggestion prevent rnase contamination work' rna ?   tend issue degradation regardless whether use depc treat / rnase free water filter pipette tips. "