In [139]:
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
import pandas as pd
import nltk, glob, os, re, string

# Load the test set into its own dataframe (path is relative to the directory
# the notebook was started from, so this must run before the chdir below).
test = pd.read_csv('test_set/test.csv')

# Load multiple csv's and concatenate them into one dataframe.
# NOTE(review): this absolute path is machine-specific and the chdir changes
# how every later relative path (including the output cell) is resolved;
# consider replacing it with a configurable data directory.
os.chdir('/Users/daniellee/Desktop/Kaggle/jobs_code_demo/data/stackoverflow_data/')
# glob returns files in arbitrary, filesystem-dependent order; sorting makes
# the row order of the combined frame reproducible across runs and machines.
# ignore_index=True gives the result a fresh 0..n-1 index in the same step.
df = pd.concat(map(pd.read_csv, sorted(glob.glob("*.csv"))), ignore_index=True)
df.head()

Unnamed: 0,id,title,content,tags
0,1,What is the criticality of the ribosome bindin...,"<p>In prokaryotic translation, how critical fo...",ribosome binding-sites translation synthetic-b...
1,2,How is RNAse contamination in RNA based experi...,<p>Does anyone have any suggestions to prevent...,rna biochemistry
2,3,Are lymphocyte sizes clustered in two groups?,<p>Tortora writes in <em>Principles of Anatomy...,immunology cell-biology hematology
3,4,How long does antibiotic-dosed LB maintain goo...,<p>Various people in our lab will prepare a li...,cell-culture
4,5,Is exon order always preserved in splicing?,<p>Are there any cases in which the splicing m...,splicing mrna spliceosome introns exons


In [140]:
# Pre-processing and Cleaning

# Case-insensitive pattern used to find URIs embedded in free text.
# A match must start with "http://" / "https://", a "www." style prefix
# (optionally with up to three digits, e.g. "www2."), or a bare
# "domain.tld/" prefix. The body may contain parenthesised groups nested up
# to two levels deep, and the match may not end on whitespace or on common
# trailing punctuation / quote characters (so a sentence-final "." or ")"
# is not swallowed into the URI).
uri_re = r'(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'".,<>?«»“”‘’]))'


def stripTagsAndUris(x):
    """Return the plain text of an HTML fragment with code and URIs removed.

    Every ``<code>`` element is dropped together with its content, the
    remaining markup is reduced to its text, and anything matching the
    module-level ``uri_re`` pattern is deleted.

    Parameters
    ----------
    x : str or missing value
        HTML fragment to clean. ``None``, NaN and any other falsy value
        (e.g. the empty string) are treated as "no content".

    Returns
    -------
    str
        The cleaned text, or ``""`` when there was no content.
    """
    # pandas represents missing cells as float NaN, which is truthy; without
    # this check it would be handed to BeautifulSoup and raise.
    is_missing = x is None or (isinstance(x, float) and x != x)
    if is_missing or not x:
        return ""

    soup = BeautifulSoup(x, "html.parser")

    # Remove *every* <code> element, not just the first one. Re-querying after
    # each removal (rather than iterating a precomputed list) avoids touching
    # a nested <code> that was already destroyed along with its parent.
    code_tag = soup.find("code")
    while code_tag is not None:
        code_tag.decompose()
        code_tag = soup.find("code")

    # Get all the text out of the html, then strip out all URIs.
    text = soup.get_text()
    return re.sub(uri_re, "", text)
    
    
def removePunctuation(x):
    """Lowercase a string and blank out non-ASCII and punctuation characters.

    Parameters
    ----------
    x : str
        Text to normalise.

    Returns
    -------
    str
        ``x`` lowercased, with every non-ASCII character and every character
        in ``string.punctuation`` replaced by a single space (characters are
        replaced, not removed, so word boundaries are preserved).
    """
    # Lowercasing all words
    x = x.lower()
    # Replacing non-ASCII chars with spaces
    x = re.sub(r'[^\x00-\x7f]', r' ', x)
    # string.punctuation contains regex metacharacters (\ ] ^ -). Spliced raw
    # into a character class, its "\]" sequence is read as an escaped "]", so
    # the backslash itself was never matched. Escaping makes every punctuation
    # character a literal member of the class.
    punctuation_class = "[" + re.escape(string.punctuation) + "]"
    return re.sub(punctuation_class, " ", x)


stops = set(stopwords.words("english"))
def removeStopwords(x):
    """Return ``x`` with every English stopword token dropped.

    The input is split on whitespace; tokens found in the module-level
    ``stops`` set are discarded and the survivors are re-joined with single
    spaces.
    """
    kept = []
    for token in x.split():
        if token in stops:
            continue
        kept.append(token)
    return " ".join(kept)

# Replace missing values BEFORE cleaning: the cleaners operate on strings,
# and a NaN cell would otherwise be passed straight into them.
df = df.fillna('')
test = test.fillna('')

# Map pre-processing functions onto the text columns of the df and test
# dataframes. Each frame is cleaned from its OWN columns (the test columns
# were previously overwritten with the cleaned training data from df).
for cleanerDef in [stripTagsAndUris, removePunctuation, removeStopwords]:
    df['content'] = df['content'].map(cleanerDef)
    df['title'] = df['title'].map(cleanerDef)

    test['content'] = test['content'].map(cleanerDef)
    test['title'] = test['title'].map(cleanerDef)

# Tokenize tags in the tags column (whitespace-separated string -> list)
df["tags"] = df["tags"].map(lambda x: x.split())

In [142]:
# Output to CSV

# to_csv does not create missing parent directories, so make sure the output
# folder exists first (a no-op when it is already there). The path is
# relative to the current working directory.
os.makedirs('cleaned', exist_ok=True)
df.to_csv('cleaned/df_cleaned.csv')
test.to_csv('cleaned/test_cleaned.csv')