In [1]:
import pandas as pd
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt

#word processing packages
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import io

In [2]:
train = pd.read_csv("Quora_train.csv", encoding="utf-8")

In [39]:
test = pd.read_csv("Quora_test.csv", encoding="utf-8")

In [40]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 56370 entries, 0 to 56369
Data columns (total 2 columns):
qid              56370 non-null object
question_text    56370 non-null object
dtypes: object(2)
memory usage: 880.9+ KB


In [41]:
test.head()

Unnamed: 0,qid,question_text
0,00014894849d00ba98a9,My voice range is A2-C5. My chest voice goes u...
1,000156468431f09b3cae,How much does a tutor earn in Bangalore?
2,000227734433360e1aae,What are the best made pocket knives under $20...
3,0005e06fbe3045bd2a92,Why would they add a hypothetical scenario tha...
4,00068a0f7f41f50fc399,What is the dresscode for Techmahindra freshers?


In [3]:
train.head()

Unnamed: 0,qid,question_text,target
0,00002165364db923c7e6,How did Quebec nationalists see their province...,0
1,000032939017120e6e44,"Do you have an adopted dog, how would you enco...",0
2,0000412ca6e4628ce2cf,Why does velocity affect time? Does velocity a...,0
3,000042bf85aa498cd78e,How did Otto von Guericke used the Magdeburg h...,0
4,0000455dfa3e01eae3af,Can I convert montra helicon D to a mountain b...,0


In [4]:
train.tail()

Unnamed: 0,qid,question_text,target
1306117,ffffcc4e2331aaf1e41e,What other technical skills do you need as a c...,0
1306118,ffffd431801e5a2f4861,Does MS in ECE have good job prospects in USA ...,0
1306119,ffffd48fb36b63db010c,Is foam insulation toxic?,0
1306120,ffffec519fa37cf60c78,How can one start a research project based on ...,0
1306121,ffffed09fedb5088744a,Who wins in a battle between a Wolverine and a...,0


In [5]:
train.shape

(1306122, 3)

In [6]:
train.nunique()

qid              1306122
question_text    1306122
target                 2
dtype: int64

In [7]:
train.target.value_counts()

0    1225312
1      80810
Name: target, dtype: int64

In [8]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1306122 entries, 0 to 1306121
Data columns (total 3 columns):
qid              1306122 non-null object
question_text    1306122 non-null object
target           1306122 non-null int64
dtypes: int64(1), object(2)
memory usage: 29.9+ MB


In [9]:
# Creating a word count of text in new column freq
train['freq'] = train['question_text'].apply(lambda x: len(x.split()))
train.head()

Unnamed: 0,qid,question_text,target,freq
0,00002165364db923c7e6,How did Quebec nationalists see their province...,0,13
1,000032939017120e6e44,"Do you have an adopted dog, how would you enco...",0,16
2,0000412ca6e4628ce2cf,Why does velocity affect time? Does velocity a...,0,10
3,000042bf85aa498cd78e,How did Otto von Guericke used the Magdeburg h...,0,9
4,0000455dfa3e01eae3af,Can I convert montra helicon D to a mountain b...,0,15


In [10]:
train_sample = train.sample(frac=0.01)

In [11]:
train_sample.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 13061 entries, 587359 to 320733
Data columns (total 4 columns):
qid              13061 non-null object
question_text    13061 non-null object
target           13061 non-null int64
freq             13061 non-null int64
dtypes: int64(2), object(2)
memory usage: 510.2+ KB


In [12]:
sentences = train_sample['question_text']

In [13]:
import re
# Lowercase, then replace any non-letter, space, or digit character in the sentences.
new_sentences = [re.sub(r'[^\w\s\d]','',h.lower()) for h in sentences]
# Replace sequences of whitespace with a space character.
new_sentences = [re.sub("\s+", " ", h) for h in new_sentences]

In [14]:
train_sample['new_sentences']=new_sentences

In [15]:
train_sample.head()

Unnamed: 0,qid,question_text,target,freq,new_sentences
587359,7310fac797f9d70d6877,Did GPU manufacturers shoot themselves in the ...,1,35,did gpu manufacturers shoot themselves in the ...
311670,3d0ea761fc29bab5afbc,How do we remove swallon face issue caused by ...,0,13,how do we remove swallon face issue caused by ...
46479,091a5807e358355509be,How much does LSAT matter for top Law School a...,0,28,how much does lsat matter for top law school a...
938895,b80070ff903d92818838,What are the simple home ways to minimize itch...,0,9,what are the simple home ways to minimize itching
1118636,db36c7a021307084e43e,Can I copy a thesis statement?,0,6,can i copy a thesis statement


In [16]:
stop_words = set(stopwords.words('English'))

In [17]:
train_sample['new_sentences'] = train_sample['new_sentences'].apply(lambda x: [item for item in str.split(x) if item not in stop_words])

In [18]:
train_sample.head()

Unnamed: 0,qid,question_text,target,freq,new_sentences
587359,7310fac797f9d70d6877,Did GPU manufacturers shoot themselves in the ...,1,35,"[gpu, manufacturers, shoot, foot, allowing, gp..."
311670,3d0ea761fc29bab5afbc,How do we remove swallon face issue caused by ...,0,13,"[remove, swallon, face, issue, caused, beer, a..."
46479,091a5807e358355509be,How much does LSAT matter for top Law School a...,0,28,"[much, lsat, matter, top, law, school, compare..."
938895,b80070ff903d92818838,What are the simple home ways to minimize itch...,0,9,"[simple, home, ways, minimize, itching]"
1118636,db36c7a021307084e43e,Can I copy a thesis statement?,0,6,"[copy, thesis, statement]"


In [19]:
train_sample['new_freq'] = train_sample['new_sentences'].apply(lambda x: len(x))
train_sample.head()

Unnamed: 0,qid,question_text,target,freq,new_sentences,new_freq
587359,7310fac797f9d70d6877,Did GPU manufacturers shoot themselves in the ...,1,35,"[gpu, manufacturers, shoot, foot, allowing, gp...",20
311670,3d0ea761fc29bab5afbc,How do we remove swallon face issue caused by ...,0,13,"[remove, swallon, face, issue, caused, beer, a...",8
46479,091a5807e358355509be,How much does LSAT matter for top Law School a...,0,28,"[much, lsat, matter, top, law, school, compare...",17
938895,b80070ff903d92818838,What are the simple home ways to minimize itch...,0,9,"[simple, home, ways, minimize, itching]",5
1118636,db36c7a021307084e43e,Can I copy a thesis statement?,0,6,"[copy, thesis, statement]",3


In [20]:
train_sample.target.value_counts()

0    12309
1      752
Name: target, dtype: int64

In [21]:
# Find all the unique words in the new_sentences.

from collections import Counter

unique_words = set(" ".join(new_sentences).split(" "))
def make_matrix(new_sentences, vocab):
    matrix = []
    for word in new_sentences:
        # Count each word in the sentence, and make a dictionary.
        counter = Counter(word)
        # Turn the dictionary into a matrix row using the vocab.
        row = [counter.get(w, 0) for w in vocab]
        matrix.append(row)
    df = pd.DataFrame(matrix)
    df.columns = unique_words
    return df

In [22]:
df = make_matrix(new_sentences, unique_words)

In [23]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13061 entries, 0 to 13060
Columns: 18268 entries, luxury to ye
dtypes: int64(18268)
memory usage: 1.8 GB


# Machine Learning Phase

In [24]:
from sklearn.feature_extraction.text import CountVectorizer

In [27]:
vectorizer = CountVectorizer(
    analyzer = 'word',
    lowercase = False,
)
features = vectorizer.fit_transform(
    new_sentences
)
features_nd = features.toarray() # for easy usage

In [29]:
data_labels = train_sample['target']

In [31]:
from sklearn.cross_validation import train_test_split

X_train, X_test, y_train, y_test  = train_test_split(
        features_nd, 
        data_labels,
        train_size=0.80, 
        random_state=1234)

## Logistic Regression

In [33]:
from sklearn.linear_model import LogisticRegression
log_model = LogisticRegression()

In [34]:
log_model = log_model.fit(X=X_train, y=y_train)

In [35]:
y_pred = log_model.predict(X_test)

## Accuracy

In [37]:
import random
j = random.randint(0,len(X_test)-7)
for i in range(j,j+7):
    print(y_pred[0])
    ind = features_nd.tolist().index(X_test[i].tolist())
    print(new_sentences[ind].strip())

0
why does james altucher say that having bad credit is not a big deal is he wrong
0
is becoming a doctor for the sake of receiving a title worth it
0
was hinduism originated first before other religions
0
has the communist party of china ever failed in implementing its 5year plan
0
who would win in a fist fight between kanye west and van lathan
0
what are epidermal cells
0
how do i live a meaningful life


In [38]:
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test, y_pred))

0.9491006505931879
