## This document is made for data cleaning
#### and that's just what it'll do

In [0]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import re

# Build the English stop-word set. On a fresh environment the NLTK corpus is
# not installed yet and the lookup raises LookupError, so fetch it on demand
# and retry instead of relying on a later cell having been run first.
try:
    stop_words = set(stopwords.words('english'))
except LookupError:
    nltk.download('stopwords')
    stop_words = set(stopwords.words('english'))

In [0]:
# Fetch the NLTK 'stopwords' corpus that stopwords.words('english') reads.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [0]:
# Read the raw training pairs, their labels and the raw test pairs, in that order.
train_df, train_labels, test_df = (
    pd.read_csv(csv_name)
    for csv_name in ('raw_train_data.csv', 'train_labels.csv', 'raw_test_data.csv')
)

In [0]:
# Copy the target column onto the training frame. pandas aligns the assignment
# on the row index, so this assumes both CSVs list the pairs in the same
# order -- TODO confirm against the files.
train_df['is_duplicate'] = train_labels['is_duplicate']

In [0]:
# Discard every training row that has a missing value in any column.
train_df = train_df.dropna(axis=0, how='any')

In [0]:
# Built once and shared by every clean_text call.
_STEMMER = PorterStemmer()

# Ordered (pattern, replacement) rewrites applied by clean_text. Order matters:
# later rules rely on the spacing produced by earlier ones.
_CLEAN_RULES = tuple(
    (re.compile(pattern), replacement)
    for pattern, replacement in (
        # Blank out everything except letters, digits and the punctuation
        # handled below. The hyphen is escaped so that "+", "-" and "=" are
        # three literal characters instead of the range "+".."=", which let
        # ";" and "<" slip through. ":" is listed explicitly because a later
        # rule pads it.
        (r"[^A-Za-z0-9^,!.\/'+\-=:]", " "),
        # Expand contractions ("can't" must come before the generic "n't").
        (r"what's", "what is "),
        (r"\'s", " "),
        (r"\'ve", " have "),
        (r"can't", "cannot "),
        (r"n't", " not "),
        (r"i'm", "i am "),
        (r"\'re", " are "),
        (r"\'d", " would "),
        (r"\'ll", " will "),
        # Drop separators; pad operators so they become standalone tokens.
        (r",", " "),
        (r"\.", " "),
        (r"!", " ! "),
        (r"\/", " "),
        (r"\^", " ^ "),
        (r"\+", " + "),
        (r"\-", " - "),
        (r"\=", " = "),
        (r"'", " "),
        # "5k" -> "5000"; the word boundary leaves units such as "5kg" alone.
        (r"(\d+)k\b", r"\g<1>000"),
        (r":", " : "),
        # Re-join abbreviations that the punctuation rules above split apart.
        # Word boundaries (rather than literal spaces) also match at the very
        # start or end of the text and never inside a longer word.
        (r"\be g\b", " eg "),
        (r"\bb g\b", " bg "),
        (r"\bu s\b", " american "),
        # "1990s" -> "1990". (A regex "\0" is the NUL character, not a zero.)
        (r"0s\b", "0"),
        # Padded so the neighbouring words are not glued onto "911".
        (r"\b9 11\b", " 911 "),
        (r"\be - mail\b", "email"),
        (r"\bj k\b", "jk"),
        (r"\s{2,}", " "),
    )
)


def clean_text(text):
    """Normalise a question into space-separated, stemmed tokens.

    The text is lower-cased, rewritten rule by rule with ``_CLEAN_RULES``,
    stripped of the words in the module-level ``stop_words`` set and finally
    Porter-stemmed.

    Args:
        text: Raw question text. Anything that is not a ``str`` (for example
            the NaN pandas uses for a missing cell) is treated as an empty
            question instead of raising.

    Returns:
        The remaining tokens joined by single spaces, or ``''`` when nothing
        is left after cleaning.
    """
    if not isinstance(text, str):
        return ''

    text = text.lower()
    for pattern, replacement in _CLEAN_RULES:
        text = pattern.sub(replacement, text)

    # Stop words are filtered before stemming so they are compared in their
    # original (unstemmed) form.
    tokens = [_STEMMER.stem(word) for word in text.split() if word not in stop_words]
    return ' '.join(tokens)

In [0]:
# Clean both question columns, first in the training frame and then in the test frame.
for frame in (train_df, test_df):
    for column in ('question1', 'question2'):
        frame[column] = frame[column].apply(clean_text)

In [0]:
# Keep only training pairs where neither question is '' or a single space.
train_df = train_df[
    ~train_df[['question1', 'question2']].isin(['', ' ']).any(axis=1)
]

In [0]:
# Print the training frame's columns, non-null counts, dtypes and memory use.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 323053 entries, 0 to 323163
Data columns (total 4 columns):
id              323053 non-null int64
question1       323053 non-null object
question2       323053 non-null object
is_duplicate    323053 non-null int64
dtypes: int64(2), object(2)
memory usage: 12.3+ MB


In [0]:
# Print the test frame's columns, non-null counts, dtypes and memory use.
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 81126 entries, 0 to 81125
Data columns (total 3 columns):
test_id      81126 non-null int64
question1    81126 non-null object
question2    81126 non-null object
dtypes: int64(1), object(2)
memory usage: 1.9+ MB


In [0]:
# Preview the first cleaned training rows.
train_df.head()

Unnamed: 0,id,question1,question2,is_duplicate
0,0,step step guid invest share market india,step step guid invest share market,0
1,1,stori kohinoor koh - - noor diamond,would happen indian govern stole kohinoor koh ...,0
2,2,increas speed internet connect use vpn,internet speed increas hack dn,0
3,3,mental lone solv,find remaind math 23 ^ 24 math divid 24 23,0
4,4,one dissolv water quikli sugar salt methan car...,fish would surviv salt water,0


In [0]:
# Preview the first cleaned test rows.
test_df.head()

Unnamed: 0,test_id,question1,question2
0,15,would trump presid mean current intern master ...,trump presid affect student present us plan st...
1,20,rocket look white,rocket booster paint white
2,21,caus someon jealou,avoid jealou someon
3,23,much 30 kv hp,find convers chart cc horsepow
4,34,best travel websit spain,best travel websit


In [0]:
# Write both cleaned frames to CSV, leaving the row index out of the files.
for frame, out_path in (
    (train_df, 'cleaned_train_data.csv'),
    (test_df, 'cleaned_test_data.csv'),
):
    frame.to_csv(out_path, index=False)