## Data cleaning and processing
<hr>
Analysis, cleaning, and processing of the original data files are done here.

In [4]:
import pandas as pd
import numpy as np
import spacy

## Load data
<hr>

In [15]:
# spaCy English pipeline; its tokenizer is what tokenize() uses further down.
en = spacy.load('en_core_web_sm')

# The raw data is split across five CSV files.
df1 = pd.read_csv('data/original_data1.csv')
df2 = pd.read_csv('data/original_data2.csv')
df3 = pd.read_csv('data/original_data3.csv')
df4 = pd.read_csv('data/original_data4.csv')
df5 = pd.read_csv('data/original_data5.csv')

# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat is
# the supported replacement. Row labels are kept as-is (no ignore_index) so the
# result matches what append produced.
# .iloc[:, 1:] drops the first column of the files
# (presumably an index saved with the CSV -- confirm against the raw files).
df = pd.concat([df1, df2, df3, df4, df5]).iloc[:, 1:]
df.head()

Unnamed: 0,id,title,body,tags,answers,score
0,62766758,"How to fix Python error ""...failed to map segm...",<p>I've recently started to use Google Cloud P...,python-3.x|pandas|shell|numpy|google-cloud-pla...,<p>Container-Optimized OS (COS) has several li...,1
1,62766758,"How to fix Python error ""...failed to map segm...",<p>I've recently started to use Google Cloud P...,python-3.x|pandas|shell|numpy|google-cloud-pla...,<p>I'm pretty sure that in case of Python libr...,0
2,62742938,Wrapper for 'python -m' command,<p>I have a package with following structure:<...,python|modulenotfounderror,"<p>The standard library has a module <a href=""...",1
3,62741826,How can I improve the speed of pandas rows ope...,<p>I have a large .csv file that has 11'000'00...,python|pandas|performance|data-science,<p>Hello and welcome to StackOverflow.</p>\n<p...,1
4,62741826,How can I improve the speed of pandas rows ope...,<p>I have a large .csv file that has 11'000'00...,python|pandas|performance|data-science,<p>I guess you want to <code>groupby</code> an...,0


In [16]:
# Report (rows, columns) of the combined frame.
print(f'Datebase shape:{df.shape}')

Datebase shape:(100000, 6)


## Missing Values

In [17]:
# Number of missing entries in each column.
df.isna().sum(axis=0)

id         0
title      0
body       0
tags       0
answers    0
score      0
dtype: int64

## Text processing
<hr>

In [31]:
import re
import nltk
import inflect
from nltk.corpus import stopwords

# Make sure the NLTK stop-word corpus read by remove_stopwords() is available locally.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Bhavana\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [32]:
def tokenize(text):
    """Split ``text`` with the spaCy tokenizer and return lower-cased token strings.

    Tokens that spaCy flags as whitespace (``is_space``) are left out.
    """
    lowered = []
    for tok in en.tokenizer(text):
        if tok.is_space:
            continue
        lowered.append(tok.text.lower())
    return lowered

In [33]:
# Quick check of tokenize() on a sample question.
sample_question = "What is the difference between these two dataframes?"
print(tokenize(sample_question))

['what', 'is', 'the', 'difference', 'between', 'these', 'two', 'dataframes', '?']


In [34]:
def remove_punctuation(words):
    """Strip punctuation from each word and drop words that end up empty.

    Every character that is neither a word character (``\\w``) nor whitespace
    is removed; the order of the surviving words is preserved.
    """
    stripped = (re.sub(r'[^\w\s]', '', word) for word in words)
    return [cleaned for cleaned in stripped if cleaned != '']

In [35]:
# Quick check: tokenize a sample question, then strip punctuation tokens.
sample_tokens = tokenize("What is the difference between these two dataframes?")
print(remove_punctuation(sample_tokens))

['what', 'is', 'the', 'difference', 'between', 'these', 'two', 'dataframes']


In [36]:
def remove_stopwords(words):
    """Return the words that are not in NLTK's English stop-word list.

    Parameters
    ----------
    words : iterable of str
        Tokens to filter. They are compared to the stop-word list as-is,
        so callers should lower-case them first (tokenize() does).

    Returns
    -------
    list of str
        The non-stop-word tokens, in their original order.
    """
    # Fetch the list once per call instead of once per word, and use a set
    # so each membership test is constant-time rather than a list scan.
    english_stopwords = frozenset(stopwords.words('english'))
    return [word for word in words if word not in english_stopwords]

In [37]:
# Quick check: tokenize, strip punctuation, then drop stop words.
sample_tokens = tokenize("What is the difference between these two dataframes?")
sample_words = remove_punctuation(sample_tokens)
print(remove_stopwords(sample_words))

['difference', 'two', 'dataframes']


In [38]:
def normalize(words):
    """Clean a token list: punctuation is stripped first, then stop words are dropped."""
    return remove_stopwords(remove_punctuation(words))

In [41]:
def process_text(text):
    """Tokenize ``text``, normalize the tokens, and join them back with single spaces."""
    tokens = tokenize(text)
    cleaned = normalize(tokens)
    return ' '.join(cleaned)

In [42]:
# Quick check of the full pipeline on a sample question.
sample_question = "What is the difference between these two dataframes?"
print(process_text(sample_question))

difference two dataframes
