##  Data cleaning and processing
<hr>
Analysis, cleaning and processing of the original data files are done here.

In [1]:
import pandas as pd
import numpy as np
import spacy

##  Load data
<hr>

In [2]:
# spaCy English pipeline; tokenize() uses its tokenizer.
en = spacy.load('en_core_web_sm')

df1 = pd.read_csv('data/original_data1.csv')
df2 = pd.read_csv('data/original_data2.csv')
df3 = pd.read_csv('data/original_data3.csv')
df4 = pd.read_csv('data/original_data4.csv')
df5 = pd.read_csv('data/original_data5.csv')

# DataFrame.append was removed in pandas 2.0; pd.concat is the supported way
# to stack frames. ignore_index=True renumbers the rows 0..n-1, which replaces
# the separate in-place reset_index call.
# .iloc[:, 1:] drops the first column of the files
# (presumably a saved row index -- TODO confirm against the CSVs).
df = pd.concat([df1, df2, df3, df4, df5], ignore_index=True).iloc[:, 1:]
df.head()

Unnamed: 0,id,title,body,tags,answers,score
0,62766758,"How to fix Python error ""...failed to map segm...",<p>I've recently started to use Google Cloud P...,python-3.x|pandas|shell|numpy|google-cloud-pla...,<p>Container-Optimized OS (COS) has several li...,1
1,62766758,"How to fix Python error ""...failed to map segm...",<p>I've recently started to use Google Cloud P...,python-3.x|pandas|shell|numpy|google-cloud-pla...,<p>I'm pretty sure that in case of Python libr...,0
2,62742938,Wrapper for 'python -m' command,<p>I have a package with following structure:<...,python|modulenotfounderror,"<p>The standard library has a module <a href=""...",1
3,62741826,How can I improve the speed of pandas rows ope...,<p>I have a large .csv file that has 11'000'00...,python|pandas|performance|data-science,<p>Hello and welcome to StackOverflow.</p>\n<p...,1
4,62741826,How can I improve the speed of pandas rows ope...,<p>I have a large .csv file that has 11'000'00...,python|pandas|performance|data-science,<p>I guess you want to <code>groupby</code> an...,0


In [3]:
print('Datebase shape:' + str(df.shape))

Datebase shape:(100000, 6)


##  Missing values check
<hr>

In [4]:
df.isna().sum()

id         0
title      0
body       0
tags       0
answers    0
score      0
dtype: int64

##  Text processing
<hr>

In [5]:
import re
import nltk
import inflect
from nltk.corpus import stopwords

import nltk
# Fetch the NLTK 'stopwords' corpus that remove_stopwords() reads via stopwords.words('english')
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Bhavana\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [6]:
def tokenize(text):
    """Split `text` with the spaCy tokenizer and return lower-cased token strings.

    Tokens that spaCy flags as `is_space` are left out of the result.
    """
    lowered = []
    for token in en.tokenizer(text):
        if token.is_space:
            continue
        lowered.append(token.text.lower())
    return lowered

In [7]:
print(tokenize("What is the difference between these two dataframes?"))

['what', 'is', 'the', 'difference', 'between', 'these', 'two', 'dataframes', '?']


In [8]:
def remove_punctuation(words):
    """Strip punctuation from every token and drop tokens that become empty.

    A character is removed when it is neither a word character (`\\w`) nor
    whitespace (`\\s`). Returns a new list; `words` is not modified.
    """
    stripped = (re.sub(r'[^\w\s]', '', token) for token in words)
    return [token for token in stripped if token != '']

In [9]:
print(remove_punctuation(tokenize("What is the difference between these two dataframes?")))

['what', 'is', 'the', 'difference', 'between', 'these', 'two', 'dataframes']


In [10]:
def remove_stopwords(words):
    """Return the tokens of `words` that are not English stop words.

    Parameters
    ----------
    words : iterable of str
        Tokens to filter; they are compared as-is against NLTK's English
        stop-word list, so callers should lower-case them first.

    Returns
    -------
    list of str
        The tokens that are not in the stop-word list, in their original order.
    """
    # Build the lookup once per call. The original evaluated
    # stopwords.words('english') again for every single token and then
    # searched the returned list linearly; a set gives the same membership
    # answer with constant-time lookups.
    english_stopwords = set(stopwords.words('english'))
    return [word for word in words if word not in english_stopwords]

In [11]:
print(remove_stopwords(remove_punctuation(tokenize("What is the difference between these two dataframes?"))))

['difference', 'two', 'dataframes']


In [12]:
def normalize(words):
    """Clean a token list: strip punctuation first, then filter out stop words."""
    return remove_stopwords(remove_punctuation(words))

In [42]:
def process_text(text):
    """Tokenize and normalize `text` and return the result as one string.

    The text is tokenized with tokenize(), cleaned with normalize(), and the
    remaining tokens are joined with single spaces.

    Parameters
    ----------
    text : str
        Raw text to process.

    Returns
    -------
    str
        Space-separated normalized tokens ('' when nothing remains).
    """
    # Deliberately no global call counter: a module-level `i` is rebound by any
    # `for i, ... in ...` loop in another cell, and printing one line per
    # processed document floods the notebook output on the full dataset.
    return ' '.join(normalize(tokenize(text)))

In [41]:
print(process_text("What is the difference between these two dataframes?"))

1
difference two dataframes


## Database manipulation
<hr>
In the current database, a question with multiple answers is listed once per answer, each as a separate row. To simplify, all rows belonging to the same question are merged: the answers are concatenated and all the scores are summed up.

In [15]:
def _join_answers(answer_group):
    """Concatenate the answers of one question, separated by a newline."""
    return '\n'.join(answer_group)


# How the per-answer columns are collapsed when rows of one question are merged.
aggregations = {'answers': _join_answers, 'score': 'sum'}

# Rows that agree on all of these columns are treated as the same question.
question_keys = ['id', 'title', 'body', 'tags']

df_grouped = df.groupby(question_keys, as_index=False).agg(aggregations)
dfm = pd.DataFrame(df_grouped)

In [16]:
dfm.head()

Unnamed: 0,id,title,body,tags,answers,score
0,21454,Specifying a mySQL ENUM in a Django model,<p>How do I go about specifying and using an E...,python|mysql|django|django-models|enums,"<p>From the <a href=""https://docs.djangoprojec...",205
1,22059,"How do content discovery engines, like Zemanta...",<p>I was wondering how as semantic service lik...,python|ruby|semantics|zemanta,<p>Michal Finkelstein from OpenCalais here.</p...,16
2,29856,Install Python to match directory layout in OS...,<p>The default Python install on OS X 10.5 is ...,python|macos|64-bit,"<p>Hyposaurus,</p>\n\n<p>It is possible to hav...",3
3,35569,Why does Python's iter() on a mapping return i...,<p>It seems like if you want to get the keys o...,python|mapping|iteration,"<p>Check out <a href=""http://mail.python.org/p...",11
4,39960,Javascript equivalent of Python's locals()?,<p>In Python one can get a dictionary of all l...,javascript|python,"<p>Well, I don't think that there is something...",23


In [17]:
str(dfm.shape)

'(58887, 6)'

##  Removing HTML tags and Feature engineering

In [18]:
! pip install bs4 textblob lxml



In [19]:
from bs4 import BeautifulSoup
from textblob import TextBlob
import lxml


def _extract_text(html):
    """Return the text of the first <p> and the first <pre> of an HTML fragment.

    All <code> elements are removed before the text is read, so source-code
    snippets do not end up in the result. Returns '' when the fragment has
    neither a <p> nor a <pre> element.
    """
    soup = BeautifulSoup(html, 'lxml')
    # `soup.code` is only the FIRST <code> element; keep removing until none is
    # left, otherwise later snippets survive and leak into the extracted text.
    while soup.code:
        soup.code.decompose()
    parts = []
    for tag in (soup.p, soup.pre):
        if tag:
            parts.append(tag.get_text())
    return ''.join(parts)


title_list = []
content_list = []
answer_list = []

# sentiment analysis determines the emotion of the text
# positive/negative/neutral
# the sentiment property of TextBlob exposes two values: polarity and subjectivity

sentiment_polarity_list = []
sentiment_subjectivity_list = []

score_list = []
tag_list = []
corpus_list = []

# itertuples avoids building a Series per row and does not bind a loop index
# (the former `for i, row in ...` silently overwrote the global name `i`).
for row in dfm.itertuples(index=False):
    question_text = _extract_text(row.body)
    answer_text = _extract_text(row.answers)

    title_list.append(row.title)
    tag_list.append(row.tags)
    score_list.append(row.score)

    # content = title + question text; corpus = content + answer text
    content_list.append(row.title + ' ' + question_text)
    answer_list.append(answer_text)
    corpus_list.append(content_list[-1] + ' ' + answer_list[-1])

    # Sentiment is computed on the extracted answer text only
    sentiment = TextBlob(answer_text).sentiment
    sentiment_polarity_list.append(sentiment.polarity)
    sentiment_subjectivity_list.append(sentiment.subjectivity)

In [20]:
# Assemble the per-question feature table; keyword order fixes the column order.
dff = pd.DataFrame(dict(
    title=title_list,
    corpus=corpus_list,
    content=content_list,
    tags=tag_list,
    score=score_list,
    answers=answer_list,
    polarity=sentiment_polarity_list,
    subjectivity=sentiment_subjectivity_list,
))

In [21]:
dff.head(10)

Unnamed: 0,title,corpus,content,tags,score,answers,polarity,subjectivity
0,Specifying a mySQL ENUM in a Django model,Specifying a mySQL ENUM in a Django model How ...,Specifying a mySQL ENUM in a Django model How ...,python|mysql|django|django-models|enums,205,From the Django documentation:,0.0,0.0
1,"How do content discovery engines, like Zemanta...","How do content discovery engines, like Zemanta...","How do content discovery engines, like Zemanta...",python|ruby|semantics|zemanta,16,Michal Finkelstein from OpenCalais here.,0.0,0.0
2,Install Python to match directory layout in OS...,Install Python to match directory layout in OS...,Install Python to match directory layout in OS...,python|macos|64-bit,3,"Hyposaurus,",0.0,0.0
3,Why does Python's iter() on a mapping return i...,Why does Python's iter() on a mapping return i...,Why does Python's iter() on a mapping return i...,python|mapping|iteration,11,Check out this thread for a discussion on the ...,-0.2,0.85
4,Javascript equivalent of Python's locals()?,Javascript equivalent of Python's locals()? In...,Javascript equivalent of Python's locals()? In...,javascript|python,23,"Well, I don't think that there is something li...",0.0,0.0
5,cx_Oracle: how do I get the ORA-xxxxx error nu...,cx_Oracle: how do I get the ORA-xxxxx error nu...,cx_Oracle: how do I get the ORA-xxxxx error nu...,python|oracle|cx-oracle,10,This results in the following output:,0.0,0.1
6,Open source alternative to MATLAB's fmincon fu...,Open source alternative to MATLAB's fmincon fu...,Open source alternative to MATLAB's fmincon fu...,python|matlab|numpy|numerical|scientific-compu...,73,For numerical optimization in Python you may t...,0.375,0.75
7,What's the best Django search app?,What's the best Django search app? I'm buildin...,What's the best Django search app? I'm buildin...,python|django|search|search-engine,179,I found Djoosh which relies on the pure-python...,0.0,0.1
8,What's a good way to find relative paths in Go...,What's a good way to find relative paths in Go...,What's a good way to find relative paths in Go...,python|google-app-engine,5,"The function returns an absolute path, use re...",0.05,0.325
9,Testing GUI code: should I use a mocking library?,Testing GUI code: should I use a mocking libra...,Testing GUI code: should I use a mocking libra...,python|unit-testing|user-interface|tdd,3,Please remember that TDD is not a panaceum. It...,-0.388889,0.694444


##  Data normalization

In [None]:
! pip install swifter

Collecting swifter
  Downloading swifter-1.0.7.tar.gz (633 kB)
Collecting psutil>=5.6.6
  Downloading psutil-5.8.0-cp38-cp38-win_amd64.whl (245 kB)
Collecting dask[dataframe]>=2.10.0
  Downloading dask-2021.3.0-py3-none-any.whl (925 kB)
Collecting ipywidgets>=7.0.0cloudpickle>=0.2.2
  Downloading ipywidgets-8.0.0a4-py3-none-any.whl (125 kB)
Collecting modin[ray]>=0.8.1.1
  Downloading modin-0.9.1-py3-none-win_amd64.whl (579 kB)
Collecting fsspec>=0.6.0
  Downloading fsspec-0.8.7-py3-none-any.whl (103 kB)
Collecting toolz>=0.8.2
  Downloading toolz-0.11.1-py3-none-any.whl (55 kB)
Collecting partd>=0.3.10
  Downloading partd-1.1.0-py3-none-any.whl (19 kB)
Collecting widgetsnbextension~=4.0a0
  Downloading widgetsnbextension-4.0.0a2-py3-none-any.whl (1.6 MB)
Collecting jupyterlab-widgets~=2.0a0
  Downloading jupyterlab_widgets-2.0.0a1-py3-none-any.whl (259 kB)
Collecting pandas>=1.0.0
  Downloading pandas-1.2.3-cp38-cp38-win_amd64.whl (9.3 MB)
Collecting numpy>=1.15.1
  Downloading numpy-

ERROR: Could not install packages due to an OSError: [WinError 5] Access is denied: 'C:\\Users\\Bhavana\\AppData\\Local\\Programs\\Python\\Python38\\Lib\\site-packages\\~umpy\\.libs\\libopenblas.JPIJNSWNNAN3CE6LLI5FWSPHUT2VXMTH.gfortran-win_amd64.dll'
Consider using the `--user` option or check the permissions.



In [None]:
import spacy
import swifter
en = spacy.load('en_core_web_sm')

# Replace the raw question text with its tokenized / normalized form.
# process_text already takes a single string, so it is passed directly
# instead of being wrapped in a lambda.
dff['content'] = dff['content'].swifter.apply(process_text)
#dff.corpus = dff.corpus.apply(lambda x: process_text(x))
#dff['processed_title'] = dff.title.apply(lambda x: process_text(x))

# Mean-normalize the score: subtract the mean and divide by the value range.
# A constant column would give 0/0 (NaN), so it is mapped to 0.0 instead.
score_range = dff['score'].max() - dff['score'].min()
if score_range == 0:
    dff['score'] = 0.0
else:
    dff['score'] = (dff['score'] - dff['score'].mean()) / score_range

# NOTE(review): the original cell ended with three bare calls --
# dff.drop(['answers'], axis=1), dff.drop(['content'], axis=1) and
# dff.drop(['tags'], axis=1). DataFrame.drop returns a new frame, and the
# results were never assigned, so no column was ever removed and the saved CSV
# contains all columns. That effective behavior is kept here. If those columns
# really must be excluded, use
#     dff = dff.drop(columns=['answers', 'content', 'tags'])
# and check whatever reads the CSV first.
dff.head()

In [None]:
# Write the processed feature table to disk; index=False leaves the row index out of the file
dff.to_csv('data/processed_data_model1.csv', index=False)