In [1]:
import pandas as pd
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt

#word processing packages
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import io

In [2]:
# Read the training CSV (UTF-8 encoded) from the working directory into a DataFrame.
train = pd.read_csv(
    filepath_or_buffer="Quora_train.csv",
    encoding="utf-8",
)

In [3]:
# Preview the first rows of the freshly loaded training frame.
train.head()

Unnamed: 0,qid,question_text,target
0,00002165364db923c7e6,How did Quebec nationalists see their province...,0
1,000032939017120e6e44,"Do you have an adopted dog, how would you enco...",0
2,0000412ca6e4628ce2cf,Why does velocity affect time? Does velocity a...,0
3,000042bf85aa498cd78e,How did Otto von Guericke used the Magdeburg h...,0
4,0000455dfa3e01eae3af,Can I convert montra helicon D to a mountain b...,0


In [4]:
# Preview the last rows of the training frame.
train.tail()

Unnamed: 0,qid,question_text,target
1306117,ffffcc4e2331aaf1e41e,What other technical skills do you need as a c...,0
1306118,ffffd431801e5a2f4861,Does MS in ECE have good job prospects in USA ...,0
1306119,ffffd48fb36b63db010c,Is foam insulation toxic?,0
1306120,ffffec519fa37cf60c78,How can one start a research project based on ...,0
1306121,ffffed09fedb5088744a,Who wins in a battle between a Wolverine and a...,0


In [5]:
# (number of rows, number of columns) of the training frame.
train.shape

(1306122, 3)

In [6]:
# Number of distinct values per column.
train.nunique()

qid              1306122
question_text    1306122
target                 2
dtype: int64

In [7]:
# How many rows carry each value of the `target` label.
train['target'].value_counts()

0    1225312
1      80810
Name: target, dtype: int64

In [8]:
# Summary of column dtypes, non-null counts and memory usage.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1306122 entries, 0 to 1306121
Data columns (total 3 columns):
qid              1306122 non-null object
question_text    1306122 non-null object
target           1306122 non-null int64
dtypes: int64(1), object(2)
memory usage: 29.9+ MB


In [9]:
def _word_count(question):
    """Return the number of whitespace-separated tokens in `question`."""
    return len(question.split())


# Store each question's token count in the `freq` column, then preview the result.
train['freq'] = train['question_text'].apply(_word_count)
train.head()

Unnamed: 0,qid,question_text,target,freq
0,00002165364db923c7e6,How did Quebec nationalists see their province...,0,13
1,000032939017120e6e44,"Do you have an adopted dog, how would you enco...",0,16
2,0000412ca6e4628ce2cf,Why does velocity affect time? Does velocity a...,0,10
3,000042bf85aa498cd78e,How did Otto von Guericke used the Magdeburg h...,0,9
4,0000455dfa3e01eae3af,Can I convert montra helicon D to a mountain b...,0,15


In [10]:
# Work on a 1% random subset of the training rows.
# random_state pins the draw so that a fresh kernel run selects the same rows
# and every downstream cell reproduces the same output.
train_sample = train.sample(frac=0.01, random_state=42)

In [11]:
# Summary of the sampled frame: dtypes, non-null counts and memory usage.
train_sample.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 13061 entries, 1244508 to 1015835
Data columns (total 4 columns):
qid              13061 non-null object
question_text    13061 non-null object
target           13061 non-null int64
freq             13061 non-null int64
dtypes: int64(2), object(2)
memory usage: 510.2+ KB


In [12]:
# Raw question strings of the sampled rows (a Series selected from train_sample).
sentences = train_sample['question_text']

In [13]:
import re

# Patterns are compiled once because they are applied to every sampled question.
# Raw strings keep `\w` / `\s` intact for the regex engine instead of leaving
# them to Python's string-escape handling.
# Matches every character that is neither a word character (letters, digits,
# underscore) nor whitespace, i.e. punctuation and symbols.
_NON_WORD_RE = re.compile(r'[^\w\s]')
# Matches any run of one or more whitespace characters.
_WHITESPACE_RE = re.compile(r'\s+')


def _normalize_sentence(text):
    """Lowercase `text`, drop punctuation/symbols, and collapse whitespace runs.

    Returns the cleaned string; leading/trailing whitespace is reduced to a
    single space rather than stripped.
    """
    without_symbols = _NON_WORD_RE.sub('', text.lower())
    return _WHITESPACE_RE.sub(' ', without_symbols)


# One cleaned string per entry of `sentences`, in the same order.
new_sentences = [_normalize_sentence(h) for h in sentences]

In [14]:
# Attach the cleaned strings to the sampled frame (positional, one per row).
train_sample['new_sentences'] = new_sentences

In [15]:
# Preview the sampled frame with the new `new_sentences` column.
train_sample.head()

Unnamed: 0,qid,question_text,target,freq,new_sentences
1244508,f3e1ed25bedb236d6c69,Why are so many Indian citizens settling in th...,0,10,why are so many indian citizens settling in th...
1289653,fcc18992f06ce38efb8d,Is mosquito really important in our ecosystem?,0,7,is mosquito really important in our ecosystem
408629,50121955e9d87ff128aa,Is there anything called as forensic dentistry?,0,7,is there anything called as forensic dentistry
940450,b84f889e8df24a359b63,Is it bad if one of my breasts is bigger than ...,0,15,is it bad if one of my breasts is bigger than ...
909101,b222019c54f87dc0b004,Why is Jimmy Fallon music room so small?,0,8,why is jimmy fallon music room so small


In [16]:
# English stop words as a set for O(1) membership tests.
# The corpus file id is lowercase ('english'); a capitalised id only resolves
# on case-insensitive filesystems and fails elsewhere.
stop_words = set(stopwords.words('english'))

In [17]:
def _drop_stop_words(sentence):
    """Split `sentence` on whitespace and keep the tokens not in `stop_words`."""
    return [token for token in sentence.split() if token not in stop_words]


# Replace each cleaned string with its list of non-stop-word tokens.
train_sample['new_sentences'] = train_sample['new_sentences'].apply(_drop_stop_words)

In [18]:
# Preview the sampled frame after stop-word removal (new_sentences now holds token lists).
train_sample.head()

Unnamed: 0,qid,question_text,target,freq,new_sentences
1244508,f3e1ed25bedb236d6c69,Why are so many Indian citizens settling in th...,0,10,"[many, indian, citizens, settling, uk]"
1289653,fcc18992f06ce38efb8d,Is mosquito really important in our ecosystem?,0,7,"[mosquito, really, important, ecosystem]"
408629,50121955e9d87ff128aa,Is there anything called as forensic dentistry?,0,7,"[anything, called, forensic, dentistry]"
940450,b84f889e8df24a359b63,Is it bad if one of my breasts is bigger than ...,0,15,"[bad, one, breasts, bigger, im, male]"
909101,b222019c54f87dc0b004,Why is Jimmy Fallon music room so small?,0,8,"[jimmy, fallon, music, room, small]"


In [19]:
# Number of tokens left in each question after stop-word removal.
token_lists = train_sample['new_sentences']
train_sample['new_freq'] = token_lists.apply(len)
train_sample.head()

Unnamed: 0,qid,question_text,target,freq,new_sentences,new_freq
1244508,f3e1ed25bedb236d6c69,Why are so many Indian citizens settling in th...,0,10,"[many, indian, citizens, settling, uk]",5
1289653,fcc18992f06ce38efb8d,Is mosquito really important in our ecosystem?,0,7,"[mosquito, really, important, ecosystem]",4
408629,50121955e9d87ff128aa,Is there anything called as forensic dentistry?,0,7,"[anything, called, forensic, dentistry]",4
940450,b84f889e8df24a359b63,Is it bad if one of my breasts is bigger than ...,0,15,"[bad, one, breasts, bigger, im, male]",6
909101,b222019c54f87dc0b004,Why is Jimmy Fallon music room so small?,0,8,"[jimmy, fallon, music, room, small]",5
