In [1]:
%pylab inline
import timeit
import pandas as pd
import re

from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer


Populating the interactive namespace from numpy and matplotlib


In [2]:
def read_data(path='../input/questions.csv'):
    """Load the question-pair CSV and print a short summary of it.

    Parameters
    ----------
    path : str, optional
        Location of the CSV file. Defaults to the path this notebook has
        always read, so existing ``read_data()`` calls behave as before.

    Returns
    -------
    pandas.DataFrame
        The frame exactly as parsed by ``pd.read_csv`` (UTF-8 decoded).

    Raises
    ------
    KeyError
        If the file has no ``is_duplicate`` column (needed for the summary).
    """
    data = pd.read_csv(path, encoding="utf-8")

    # Class balance: number of rows per is_duplicate value.
    print(data.groupby('is_duplicate').size())

    # Show the first record as a sanity check; an empty frame has no label 0,
    # so skip the lookup instead of failing with a KeyError.
    if len(data) > 0:
        print(data.loc[0, :])

    return data

In [None]:
# Ordered (compiled pattern, replacement) pairs used by clean_text to expand
# English contractions. Suffix rules require a letter directly before them so
# that an opening quote ("'deep learning'") is not mistaken for a contraction.
_CONTRACTION_RULES = [
    (re.compile(r"(?<=[A-Za-z])'s\b", re.IGNORECASE), " "),
    (re.compile(r"(?<=[A-Za-z])'ve\b", re.IGNORECASE), " have "),
    (re.compile(r"\bcan't\b", re.IGNORECASE), "cannot "),
    (re.compile(r"(?<=[A-Za-z])n't\b", re.IGNORECASE), " not "),
    (re.compile(r"\bi'm\b", re.IGNORECASE), "i am "),
    (re.compile(r"(?<=[A-Za-z])'re\b", re.IGNORECASE), " are "),
    (re.compile(r"(?<=[A-Za-z])'d\b", re.IGNORECASE), " would "),
    (re.compile(r"(?<=[A-Za-z])'ll\b", re.IGNORECASE), " will "),
]


def clean_text(text):
    """Normalise one question string for tokenisation.

    Steps, in order:
      1. Replace every character outside the allowed ASCII set with a space.
      2. Expand contractions ("can't" -> "cannot", "we've" -> "we have", ...).
      3. Replace ``,`` ``.`` ``;`` and any remaining ``'`` with spaces.
      4. Drop ``/`` and pad ``! ^ + - = :`` with spaces so they split off as
         separate tokens.

    Parameters
    ----------
    text : str or None
        Raw question text. ``None`` and float NaN yield ``''``; bytes are
        decoded as UTF-8; any other non-string value is converted with
        ``str()``.

    Returns
    -------
    str
        The cleaned text (ASCII characters only). Whitespace is not collapsed.
    """
    if text is None:
        return ''

    # A missing value read by pandas arrives as float NaN (NaN != NaN).
    if isinstance(text, float) and text != text:
        return ''

    # On Python 3 bytes is not str; on Python 2 bytes *is* str, so skip there.
    if isinstance(text, bytes) and not isinstance(text, str):
        text = text.decode('utf-8', 'ignore')

    # type(u'') is unicode on Python 2 and str on Python 3.
    if not isinstance(text, (str, type(u''))):
        text = str(text)

    # Keep only the allowed characters. NOTE: "+-=" inside the class is a
    # range (0x2B-0x3D), so it also lets ":", ";" and "<" through. Everything
    # that survives is ASCII, so no separate ASCII-encoding step is needed.
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)

    # Contractions must be expanded while the apostrophes are still present;
    # stripping apostrophes first would make every rule below unreachable.
    for pattern, replacement in _CONTRACTION_RULES:
        text = pattern.sub(replacement, text)

    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r";", "  ", text)
    text = re.sub(r"'", " ", text)

    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r"\+", " + ", text)
    text = re.sub(r"\-", " - ", text)
    text = re.sub(r"\=", " = ", text)

    text = re.sub(r":", " : ", text)

    return text

def _question_features(text, number, stops=None, stemmer=None):
    """Return ordered (column name, value) pairs derived from one question.

    ``number`` is '1' or '2' and selects the column-name family. ``stops`` and
    ``stemmer`` are ``None`` when the corresponding stage is disabled. Each
    stage works on the output of the previous enabled stage.
    """
    current = clean_text(text)
    features = [
        ('wordlist_' + number + '_clean', current),
        ('len_q' + number + '_clean', len(current)),
    ]

    # remove stopwords
    if stops is not None:
        try:
            current = ' '.join(w for w in current.lower().split() if w not in stops)
        except Exception:
            # Best effort: a value that cannot be processed becomes empty.
            current = ''
        features.append(('wordlist_' + number + '_stopwords', current))
        features.append(('len_q' + number + '_stopwords', len(current)))

    # stem the data
    if stemmer is not None:
        try:
            current = ' '.join(stemmer.stem(w) for w in current.lower().split())
        except Exception:
            # Best effort: a value that cannot be processed becomes empty.
            current = ''
        features.append(('wordlist_' + number + '_stem', current))

    return features


def data_cleaning(data, stopwords_removal = False,  stem = False):
    """Clean both question columns and store the derived columns on ``data``.

    For every row, ``question1`` and ``question2`` are passed through
    ``clean_text``; optionally English stopwords are removed and the remaining
    words are Snowball-stemmed. Results are written to these columns
    (``N`` is 1 or 2):

      * ``wordlist_N_clean`` / ``len_qN_clean`` - always
      * ``wordlist_N_stopwords`` / ``len_qN_stopwords`` - if ``stopwords_removal``
      * ``wordlist_N_stem`` - if ``stem``

    The ``len_*`` values are character counts of the corresponding string.

    Side effects: ``data`` is modified in place, one progress line per row is
    written to ``data_cleaning.txt``, and the final frame is saved to
    ``../input/data_clean.csv``.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``question1`` and ``question2`` columns.
    stopwords_removal : bool, optional
        Remove NLTK English stopwords (after lower-casing).
    stem : bool, optional
        Apply the English Snowball stemmer (after lower-casing).

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object that was passed in.
    """
    stops = set(stopwords.words("english")) if stopwords_removal else None
    stemmer = SnowballStemmer('english') if stem else None

    start = timeit.default_timer()

    # The context manager closes the log even if a row raises.
    with open('data_cleaning.txt', 'w') as f:
        for index, row in data.iterrows():
            for number in ('1', '2'):
                features = _question_features(row['question' + number], number, stops, stemmer)
                for column, value in features:
                    data.loc[index, column] = value

            f.write('Time to clean data (n = ' + str(index) + ') [s]= '
                    + str(timeit.default_timer() - start) + '\n')

    data.to_csv('../input/data_clean.csv', encoding="utf-8")

    print('data saved')

    return data


In [None]:
def main(n=100):
    """Run the read -> clean pipeline and print wall-clock timings.

    Parameters
    ----------
    n : int, optional
        Only used as a label in the two timing messages. Row subsetting is
        disabled (see the commented-out lines below), so every row returned
        by ``read_data`` is processed regardless of this value.
    """
    t0 = timeit.default_timer()

    # Missing questions become empty strings before cleaning.
    data = read_data().fillna('')

    # Optional subsetting for quick experiments (currently disabled):
    #    data = data.head(n=n)
    #    data = data[(data.id >= 105790) & (data.id <= 105890)]
    for preview in (data.head(n=10), data.tail(n=10)):
        print(preview)

    elapsed = timeit.default_timer() - t0
    print('Time to read data (n = ', n, ') [s]= ', elapsed)

    data = data_cleaning(data, stopwords_removal = True, stem = True)

    # Measured from the very start, so this includes the read time as well.
    elapsed = timeit.default_timer() - t0
    print('Time to clean data (n = ', n, ') [s]= ', elapsed)
    

# Execute the whole pipeline with the default arguments.
main()

is_duplicate
0    255045
1    149306
dtype: int64
id                                                              0
qid1                                                            1
qid2                                                            2
question1       What is the step by step guide to invest in sh...
question2       What is the step by step guide to invest in sh...
is_duplicate                                                    0
Name: 0, dtype: object
   id  qid1  qid2                                          question1  \
0   0     1     2  What is the step by step guide to invest in sh...   
1   1     3     4  What is the story of Kohinoor (Koh-i-Noor) Dia...   
2   2     5     6  How can I increase the speed of my internet co...   
3   3     7     8  Why am I mentally very lonely? How can I solve...   
4   4     9    10  Which one dissolve in water quikly sugar, salt...   
5   5    11    12  Astrology: I am a Capricorn Sun Cap moon and c...   
6   6    13    14          