# Stack Data

In [2]:
import xml.etree.ElementTree as ET
import sys, os
import pandas as pd
import numpy as np

def xml2pd(xml_file):
    '''Load an XML file as a Pandas DataFrame.

    Every direct child of the root element becomes one row and every XML
    attribute of those children becomes one column.

    Parameters
    ----------
    xml_file : str or file object
        Path or file object accepted by ``xml.etree.ElementTree.parse``.

    Returns
    -------
    pandas.DataFrame
        One row per child element. The columns are the union of all
        attribute names in order of first appearance; an attribute missing
        from a given row is NaN. A root without children yields an empty
        DataFrame.
    '''
    root = ET.parse(xml_file).getroot()
    rows = [child.attrib for child in root]

    # Take the columns from every row, not only the first one: optional
    # attributes that first show up on a later element would otherwise be
    # dropped, and an empty root would raise IndexError on root[0].
    # dict.fromkeys de-duplicates while keeping first-seen order.
    cols = list(dict.fromkeys(key for row in rows for key in row))
    return pd.DataFrame(rows, columns=cols)

## NLP

In [5]:
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize, RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from bs4 import BeautifulSoup
import re, string

In [25]:
# English stop words from NLTK, held in a set for fast membership tests
stopWords = {word for word in stopwords.words('english')}

# Single WordNet lemmatizer instance reused by lemmatizeText
lemmatizer = WordNetLemmatizer()

def removeHTML(text):
    ''' Parse the given text as HTML (lxml parser) and return only its text content '''
    return BeautifulSoup(text, 'lxml').get_text()

def removeURLs(text):
    ''' Replace every URL found in the given text with a single space '''
    url_pattern = r'''(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’]))'''
    return re.compile(url_pattern).sub(" ", text)

def removePunctuation(text):
    ''' Strip every character listed in string.punctuation from the given text '''
    kept = []
    for char in text:
        if char in string.punctuation:
            continue
        kept.append(char)
    return "".join(kept)

def removeStopWords(text):
    ''' Return the tokens of the given sequence that are not in stopWords '''
    return list(filter(lambda token: token not in stopWords, text))

def lemmatizeText(text):
    ''' Return a list with the lemmatized form of every token in the given sequence '''
    return list(map(lemmatizer.lemmatize, text))

In [6]:
# Load the posts dump: one DataFrame row per child element of the XML root
posts = xml2pd(xml_file='Posts.xml')

In [16]:
# NOTE(review): the body is raw HTML, so stripping only punctuation leaves the
# tag names behind (the stray 'p' characters in the output below).
removePunctuation(posts.loc[5, 'Body'])

'pIve heard that the quality of a monte carlo ray tracer based on path tracing algorithms is much more realistic than a distributed stochastic engine I try to understand why but Im just at the beginning p\n\npIn order to dive into this topic and understand the basics can someone point me into the right direction What part of the algorithm leads into a more realistic render result p\n'

## Analysis with tokens

In [19]:
# Tokens are maximal runs of word characters; everything else acts as a separator
tokenizer = RegexpTokenizer(r'\w+')

# NOTE(review): the raw HTML body is tokenized here, so tag and attribute
# names such as 'p' and 'href' end up as tokens (visible in the output below).
# PostTypeId 1 = Question, 2 = Answer
dfmini = pd.DataFrame(
    {
        "body": posts['Body'].apply(lambda body: tokenizer.tokenize(body.lower())),
        "points": posts['Score'],
        "postType": posts["PostTypeId"],
    },
    columns=["body", "points", "postType"],
)

dfmini.head(20)

Unnamed: 0,body,points,postType
0,"[p, i, saw, a, a, href, https, www, youtube, c...",8,1
1,"[p, the, convention, in, graphics, is, that, p...",11,1
2,"[p, sometimes, i, use, vector, graphics, simpl...",20,1
3,"[p, i, just, implemented, some, interpolated, ...",11,1
4,"[p, i, have, an, opengl, application, which, u...",11,1
5,"[p, i, ve, heard, that, the, quality, of, a, m...",30,1
6,"[p, there, might, be, p, p, less, technical, a...",8,2
7,"[p, a, few, years, ago, i, tried, to, implemen...",17,1
8,"[p, as, thebuzzsaw, said, it, does, depend, on...",15,2
9,"[p, updating, an, area, of, memory, in, the, g...",11,2


## Stop Word Removal

In [24]:
# Filter the stop words out of every token list (overwrites dfmini['body'])
dfmini['body'] = dfmini['body'].apply(removeStopWords)
dfmini['body'].head(n=20)

0     [p, saw, href, https, www, youtube, com, watch...
1     [p, convention, graphics, performing, fewer, s...
2     [p, sometimes, use, vector, graphics, simply, ...
3     [p, implemented, interpolated, texture, sampli...
4     [p, opengl, application, uses, stencil, tests,...
5     [p, heard, quality, monte, carlo, ray, tracer,...
6     [p, might, p, p, less, technical, answer, p, p...
7     [p, years, ago, tried, implement, href, http, ...
8     [p, thebuzzsaw, said, depend, lots, things, in...
9     [p, updating, area, memory, graphics, device, ...
10    [p, wikipedia, href, https, en, wikipedia, org...
11    [p, strong, distributed, ray, tracing, strong,...
12    [p, really, way, p, p, geometry, shader, invoc...
13    [p, 1978, edwin, catmull, jim, clark, defined,...
14    [blockquote, p, stencil, buffer, contains, per...
15    [p, term, distributed, ray, tracing, originall...
16    [p, href, https, en, wikipedia, org, wiki, per...
17    [p, stencil, buffer, unsigned, integer, bu

## Lemmatization

In [31]:
# Lemmatize every token of every post (overwrites dfmini['body'])
dfmini['body'] = dfmini['body'].apply(lemmatizeText)

In [32]:
# Preview the first five token lists
dfmini['body'].head(n=5)

0    [p, saw, href, http, www, youtube, com, watch,...
1    [p, convention, graphic, performing, fewer, st...
2    [p, sometimes, use, vector, graphic, simply, l...
3    [p, implemented, interpolated, texture, sampli...
4    [p, opengl, application, us, stencil, test, qu...
Name: body, dtype: object

## TF-IDF

In [43]:
from sklearn.feature_extraction.text import TfidfVectorizer

# De-duplicate the post bodies while keeping their original order (iterating a
# set of strings gives a different document order after a kernel restart),
# then strip the HTML markup from each one.
data = [removeHTML(body) for body in posts['Body'].drop_duplicates()]

# The stop-word list must be passed through the `stop_words` keyword: the
# first parameter of TfidfVectorizer is `input`, so TfidfVectorizer(stopwords)
# never filtered anything (and recent scikit-learn rejects positional args).
tfid_vectorizer = TfidfVectorizer(stop_words=stopwords.words('english'))

# Keep the document-term matrix instead of discarding it
tfidf_matrix = tfid_vectorizer.fit_transform(data)

# get_feature_names() was removed in scikit-learn 1.2; use its replacement when
# available and fall back to the old name on older releases. tolist() keeps
# `features` a plain list of str, as before.
if hasattr(tfid_vectorizer, "get_feature_names_out"):
    features = tfid_vectorizer.get_feature_names_out().tolist()
else:
    features = tfid_vectorizer.get_feature_names()

In [46]:
# Preview the first ten vocabulary entries
features[0:10]

['00',
 '000',
 '0000',
 '00000',
 '000000',
 '000000000',
 '00000000001',
 '0000001192092896',
 '0000001f',
 '000001']