#  Data cleaning and processing
<hr>
Analysis, cleaning and processing of the original data files are done here.

In [4]:
import pandas as pd
import numpy as np
import spacy

##  Load data
<hr>

In [15]:
en = spacy.load('en_core_web_sm')

# Read the five raw CSV parts of the dataset.

df1 = pd.read_csv('data/original_data1.csv')
df2 = pd.read_csv('data/original_data2.csv')
df3 = pd.read_csv('data/original_data3.csv')
df4 = pd.read_csv('data/original_data4.csv')
df5 = pd.read_csv('data/original_data5.csv')

# DataFrame.append was removed in pandas 2.0; pd.concat is the supported way to
# stack frames. ignore_index=True renumbers the rows 0..n-1, which replaces the
# separate in-place reset_index call.
# .iloc[:, 1:] drops the first column of the files
# (presumably an index column written when the CSVs were saved - TODO confirm).
df = pd.concat([df1, df2, df3, df4, df5], ignore_index=True).iloc[:, 1:]
df.head()

Unnamed: 0,id,title,body,tags,answers,score
0,62766758,"How to fix Python error ""...failed to map segm...",<p>I've recently started to use Google Cloud P...,python-3.x|pandas|shell|numpy|google-cloud-pla...,<p>Container-Optimized OS (COS) has several li...,1
1,62766758,"How to fix Python error ""...failed to map segm...",<p>I've recently started to use Google Cloud P...,python-3.x|pandas|shell|numpy|google-cloud-pla...,<p>I'm pretty sure that in case of Python libr...,0
2,62742938,Wrapper for 'python -m' command,<p>I have a package with following structure:<...,python|modulenotfounderror,"<p>The standard library has a module <a href=""...",1
3,62741826,How can I improve the speed of pandas rows ope...,<p>I have a large .csv file that has 11'000'00...,python|pandas|performance|data-science,<p>Hello and welcome to StackOverflow.</p>\n<p...,1
4,62741826,How can I improve the speed of pandas rows ope...,<p>I have a large .csv file that has 11'000'00...,python|pandas|performance|data-science,<p>I guess you want to <code>groupby</code> an...,0


In [16]:
# Report the (rows, columns) shape of the combined frame.
print(f'Datebase shape:{df.shape}')

Datebase shape:(100000, 6)


##  Missing values check
<hr>

In [17]:
# Count missing (NaN) values in each column of the combined frame.
df.isna().sum()

id         0
title      0
body       0
tags       0
answers    0
score      0
dtype: int64

##  Text processing
<hr>
Tokenization, lowercasing, and removal of punctuation and stopwords.

In [31]:
import re
import nltk
import inflect
from nltk.corpus import stopwords

import nltk
# Make sure the NLTK stop-word corpus used by remove_stopwords is available locally.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Bhavana\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [32]:
#tokenizer

def tokenize(text):
    """Tokenize `text` with the spaCy tokenizer held in `en`.

    Returns a list with the lower-cased text of every token, skipping tokens
    that spaCy marks as whitespace.
    """
    lowered = []
    for tok in en.tokenizer(text):
        if tok.is_space:
            continue
        lowered.append(tok.text.lower())
    return lowered

In [33]:
# Sanity check: tokenizer output for a sample question.
print(tokenize("What is the difference between these two dataframes?"))

['what', 'is', 'the', 'difference', 'between', 'these', 'two', 'dataframes', '?']


In [34]:
#remove punctuations

def remove_punctuation(words):
    """Strip punctuation characters from each token.

    Every character that is neither a word character (``\\w``) nor whitespace
    is deleted from each token; tokens that end up empty are dropped.
    Returns a new list, the input is not modified.
    """
    stripped = (re.sub(r'[^\w\s]', '', token) for token in words)
    return [token for token in stripped if token != '']

In [35]:
# Sanity check: the trailing '?' token should disappear.
print(remove_punctuation(tokenize("What is the difference between these two dataframes?")))

['what', 'is', 'the', 'difference', 'between', 'these', 'two', 'dataframes']


In [36]:
#remove stop words

# Stop-word sets already loaded from NLTK, keyed by language name.
_STOPWORDS_CACHE = {}


def _get_stopwords(language):
    """Return the NLTK stop words for `language` as a frozenset, loading them once."""
    if language not in _STOPWORDS_CACHE:
        _STOPWORDS_CACHE[language] = frozenset(stopwords.words(language))
    return _STOPWORDS_CACHE[language]


def remove_stopwords(words):
    """Return the tokens of `words` that are not English stop words.

    Parameters
    ----------
    words : iterable of str
        Tokens to filter (compared as-is, so they should already be lower-cased).

    Returns
    -------
    list of str
        The tokens that are not in NLTK's English stop-word list, in input order.
    """
    # The original called stopwords.words('english') once per token, which
    # re-reads the corpus and does a linear list search each time. Loading it
    # once into a set gives the same result with O(1) membership tests.
    stop_set = _get_stopwords('english')
    return [word for word in words if word not in stop_set]

In [37]:
# Sanity check: tokenize, strip punctuation, then drop stop words.
print(remove_stopwords(remove_punctuation(tokenize("What is the difference between these two dataframes?"))))

['difference', 'two', 'dataframes']


In [38]:
def normalize(words):
    """Clean a token list: strip punctuation first, then drop stop words."""
    return remove_stopwords(remove_punctuation(words))

In [41]:
def process_text(text):
    """Run the full cleaning pipeline on `text`.

    The text is tokenized, normalized (punctuation and stop words removed) and
    the surviving tokens are joined back into one space-separated string.
    """
    tokens = tokenize(text)
    cleaned = normalize(tokens)
    return ' '.join(cleaned)

In [42]:
# Sanity check of the complete text-processing pipeline.
print(process_text("What is the difference between these two dataframes?"))

difference two dataframes


## Database manipulation
<hr>
In the current dataset, a question with multiple answers appears as a separate row for each answer. To simplify, all rows belonging to the same question are merged into one: the answers are concatenated (separated by newlines) and their scores are summed.

In [15]:
# How the per-answer columns are collapsed when rows of one question are merged:
# answer texts are joined with newlines, answer scores are added up.
aggregations = dict(
    answers=lambda parts: '\n'.join(parts),
    score='sum',
)

# Rows sharing id, title, body and tags belong to the same question.
df_grouped = (
    df
    .groupby(['id', 'title', 'body', 'tags'], as_index=False)
    .agg(aggregations)
)
dfm = pd.DataFrame(df_grouped)

In [16]:
# Preview the merged frame (one row per question).
dfm.head()

Unnamed: 0,id,title,body,tags,answers,score
0,21454,Specifying a mySQL ENUM in a Django model,<p>How do I go about specifying and using an E...,python|mysql|django|django-models|enums,"<p>From the <a href=""https://docs.djangoprojec...",205
1,22059,"How do content discovery engines, like Zemanta...",<p>I was wondering how as semantic service lik...,python|ruby|semantics|zemanta,<p>Michal Finkelstein from OpenCalais here.</p...,16
2,29856,Install Python to match directory layout in OS...,<p>The default Python install on OS X 10.5 is ...,python|macos|64-bit,"<p>Hyposaurus,</p>\n\n<p>It is possible to hav...",3
3,35569,Why does Python's iter() on a mapping return i...,<p>It seems like if you want to get the keys o...,python|mapping|iteration,"<p>Check out <a href=""http://mail.python.org/p...",11
4,39960,Javascript equivalent of Python's locals()?,<p>In Python one can get a dictionary of all l...,javascript|python,"<p>Well, I don't think that there is something...",23


In [17]:
# Shape of the frame after merging the answer rows.
str(dfm.shape)

'(58887, 6)'

## Removing HTML tags and Feature engineering
 <hr>

In [18]:
from bs4 import BeautifulSoup
from textblob import TextBlob
import lxml

def _first_text_without_code(html):
    """Extract plain text from an HTML fragment.

    The fragment is parsed with BeautifulSoup (lxml parser), the first <code>
    element is removed, and the text of the first <p> followed by the text of
    the first <pre> is returned ('' when neither tag exists).

    NOTE(review): soup.code / soup.p / soup.pre only address the FIRST matching
    tag, so later paragraphs are ignored and later <code> blocks are not
    removed. If the intent is to keep all paragraphs and strip all code, use
    find_all() instead - left unchanged here so the generated dataset stays
    the same.
    """
    soup = BeautifulSoup(html, 'lxml')
    if soup.code:
        soup.code.decompose()     # remove the code section
    text = ''
    if soup.p:
        text = text + soup.p.get_text()
    if soup.pre:
        text = text + soup.pre.get_text()
    return text


id_list = []

title_list = []
content_list = []
answer_list = []

# Sentiment analysis estimates the emotion of the text
# (positive / negative / neutral).
# TextBlob's sentiment result has two properties: polarity and subjectivity.

sentiment_polarity_list = []
sentiment_subjectivity_list = []

score_list = []
corpus_list = []

# itertuples is considerably faster than iterrows and gives the same
# attribute-style access to the columns.
for row in dfm.itertuples(index=False):

    question_text = _first_text_without_code(row.body)
    answer_text = _first_text_without_code(row.answers)

    id_list.append(row.id)
    title_list.append(row.title)
    content_list.append(row.title + ' ' + question_text)
    answer_list.append(answer_text)
    score_list.append(row.score)

    # Question title, question body and answers together form the corpus column.
    corpus_list.append(content_list[-1] + ' ' + answer_list[-1])

    # Sentiment is computed on the extracted answer text only
    # (the question text is not included).
    sentiment = TextBlob(answer_text).sentiment
    sentiment_polarity_list.append(sentiment.polarity)
    sentiment_subjectivity_list.append(sentiment.subjectivity)

In [19]:
# Assemble the engineered features into one frame, one row per question.
dff = pd.DataFrame(dict(
    id=id_list,
    title=title_list,
    corpus=corpus_list,
    score=score_list,
    polarity=sentiment_polarity_list,
    subjectivity=sentiment_subjectivity_list,
))

# Save the feature-engineered dataset (to_csv also writes the row index by default).
dff.to_csv('data/feature_eng_data_model1.csv')

In [20]:
# Preview the first ten rows of the feature-engineered frame.
dff.head(10)

Unnamed: 0,id,title,corpus,score,polarity,subjectivity
0,21454,Specifying a mySQL ENUM in a Django model,Specifying a mySQL ENUM in a Django model How ...,205,0.0,0.0
1,22059,"How do content discovery engines, like Zemanta...","How do content discovery engines, like Zemanta...",16,0.0,0.0
2,29856,Install Python to match directory layout in OS...,Install Python to match directory layout in OS...,3,0.0,0.0
3,35569,Why does Python's iter() on a mapping return i...,Why does Python's iter() on a mapping return i...,11,-0.2,0.85
4,39960,Javascript equivalent of Python's locals()?,Javascript equivalent of Python's locals()? In...,23,0.0,0.0
5,40586,cx_Oracle: how do I get the ORA-xxxxx error nu...,cx_Oracle: how do I get the ORA-xxxxx error nu...,10,0.0,0.1
6,49926,Open source alternative to MATLAB's fmincon fu...,Open source alternative to MATLAB's fmincon fu...,73,0.375,0.75
7,55056,What's the best Django search app?,What's the best Django search app? I'm buildin...,179,0.0,0.1
8,61894,What's a good way to find relative paths in Go...,What's a good way to find relative paths in Go...,5,0.05,0.325
9,79454,Testing GUI code: should I use a mocking library?,Testing GUI code: should I use a mocking libra...,3,-0.388889,0.694444


##  Data normalization
<hr>

In [21]:
import spacy
import pandas as pd
import numpy as np
import swifter
en = spacy.load('en_core_web_sm')

# Run the text-processing pipeline over the corpus and the titles.
# The cleaned corpus overwrites the raw one; titles go to a new column.

dff['corpus'] = dff['corpus'].swifter.apply(process_text)
dff['processed_title'] = dff['title'].swifter.apply(process_text)

# Mean normalisation of the score: subtract the mean, divide by (max - min).
dff['score'] = dff['score'].sub(dff['score'].mean()).div(dff['score'].max() - dff['score'].min())

Pandas Apply:   0%|          | 0/58887 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/58887 [00:00<?, ?it/s]

In [23]:
# Save the processed dataset; index=False leaves the row index out of the file.
dff.to_csv('data/processed_data_model1.csv', index=False)

In [23]:
# Check whether question ids are unique in the raw frame (one row per answer).
df.id.is_unique

False

In [24]:
# Check whether question ids are unique in the final, merged frame.
dff.id.is_unique

True