# Intro

I have seen many insincere and inaccurate questions on Quora; sometimes even the askers themselves are not sure what they are asking. In this notebook I explore the question text and apply text mining to build an NLP model that classifies insincere questions apart from sincere ones.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

plt.style.use('ggplot')

from wordcloud import WordCloud, ImageColorGenerator
from PIL import Image

import datetime
from string import punctuation

In [None]:
# Load the labelled training questions from the competition input folder.
train = pd.read_csv(
    '../input/quora-insincere-questions-classification/train.csv'
)

In [None]:
# Display the first rows of the freshly loaded training frame.
train.head()

In [None]:
# Share of rows with target == 1; for a 0/1 column the mean equals sum / row count.
train['target'].mean()

### Only 6.18% of the questions have target 1

In [None]:
# The question id is not used in the analysis below, so drop that column.
train = train.drop(columns='qid')

In [None]:
# Normalise the question text for the word-level analysis below:
# lower-case everything, then remove ASCII punctuation.
# A translation table deletes every punctuation character in a single pass
# per string instead of testing each character in a Python-level loop.
# (The vectorised .str accessor propagates missing values instead of raising.)
_strip_punctuation = str.maketrans('', '', punctuation)
train['question_text'] = (
    train['question_text'].str.lower().str.translate(_strip_punctuation)
)

In [None]:
# Number of characters in each cleaned question.
train['question_length'] = train['question_text'].map(len)

In [None]:
# Summary statistics of the question_length column.
print('The max length of question is:',train.question_length.max())
print('The minimum length of question is:',train.question_length.min())
print('The mean length of question is:',train.question_length.mean())
# The label used to read "max standard deviation", but the value printed is
# simply the standard deviation of the lengths.
print('The standard deviation of question length is:',train.question_length.std())

In [None]:
# Histogram of question lengths with the mean marked by a yellow vertical line.
length_axes = train['question_length'].hist(bins=50)
length_axes.set_title('The distribution of length of questions')
length_axes.axvline(train['question_length'].mean(), color='y')

In [None]:
# Log-transform the length; the +1 keeps a zero-length question finite (log(1) == 0).
train['question_length_scaled'] = train['question_length'].map(
    lambda n_chars: np.log(n_chars + 1)
)

In [None]:
# Descriptive statistics of the numeric columns, including the new length features.
train.describe()

In [None]:
# Histogram of the log-scaled question length.
train.question_length_scaled.hist(bins=50)

In [None]:
# Compare the log-scaled question length between the two target classes.
# The data vectors are passed by keyword: seaborn deprecated positional x/y
# arguments and newer releases reject them.
sns.boxplot(x=train.target, y=train.question_length_scaled)
plt.title('The distribution of length of target or not')

In [None]:
# Read the Quora logo into a pixel array; it is used as the word-cloud mask.
with Image.open('../input/quora-logo1/quora-logo-rubber-stamp.png') as logo_image:
    Q = np.array(logo_image)

In [None]:
# Seed NumPy's global RNG and enlarge the default figure size for the plots below.
np.random.seed(321)
sns.set(rc={'figure.figsize':(14,8)})
# WordCloud.generate takes one long text, so concatenate every question.
reviews = ' '.join(train['question_text'].tolist())

# The word layout is driven by WordCloud's own random_state rather than by
# NumPy's global seed, so pass it explicitly to get a repeatable picture.
wordcloud = WordCloud(mask=Q, background_color="white", random_state=321).generate(reviews)
plt.figure()
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.margins(x=0, y=0)
plt.title('Questions',size=20)
plt.show()

I'll split the data into target 1 and target 0 to see what's different in their word clouds.

In [None]:
# Separate frames for the two classes: rows with target 1 and rows with target 0.
train_1 = train.loc[train['target'] == 1]
train_0 = train.loc[train['target'] == 0]

In [None]:
# Word cloud restricted to the questions labelled target 1.
reviews = ' '.join(train_1['question_text'].tolist())

# A fixed random_state makes the word layout repeatable between runs; the
# layout is not controlled by NumPy's global seed.
wordcloud = WordCloud(mask=Q, background_color="white", random_state=321).generate(reviews)
plt.figure()
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.margins(x=0, y=0)
plt.title('Target 1',size=20)
plt.show()

What I saw is:

* Donald Trump
* liberal
* india
* muslim
* american
* christian
* conservative

In [None]:
# Word cloud restricted to the questions labelled target 0.
reviews = ' '.join(train_0['question_text'].tolist())

# A fixed random_state makes the word layout repeatable between runs; the
# layout is not controlled by NumPy's global seed.
wordcloud = WordCloud(mask=Q, background_color="white", random_state=321).generate(reviews)
plt.figure()
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.margins(x=0, y=0)
plt.title('Target 0',size=20)
plt.show()

One thing that catches my eye is that both word clouds contain "india", although in different proportions. The target 1 cloud additionally features "trump", "liberal", "muslim", and "american". I'll use a `Counter` to dig deeper.

In [None]:
from collections import Counter

# Word-frequency statistics over every question in `train`.
question_texts = train['question_text'].tolist()
text = ' '.join(question_texts)
# str.split() without an argument already splits on any run of whitespace, so
# the former split(' ') / join / split() round trip was redundant.
words = text.split()

# Rank the vocabulary by descending frequency and give each word an id starting at 1.
counts = Counter(words)
vocab = sorted(counts, key=counts.get, reverse=True)
vocab_to_int = {word: ii for ii, word in enumerate(vocab, 1)}

# Encode each question as a list of word ids. The previous loop iterated over
# single words, producing one tiny list per word instead of one list per question.
questions_ints = [
    [vocab_to_int[word] for word in question.split()]
    for question in question_texts
]

In [None]:
# Vocabulary size: one dictionary entry per distinct word.
n_unique_words = len(vocab_to_int)
print('Unique words: ', n_unique_words)

In [None]:
# Show the 20 most frequent words in the Counter built by the previous cell.
counts.most_common(20)

In [None]:
# Word-frequency statistics over the target 0 questions.
question_texts = train_0['question_text'].tolist()
text = ' '.join(question_texts)
# str.split() without an argument already splits on any run of whitespace, so
# the former split(' ') / join / split() round trip was redundant.
words = text.split()

# Rank the vocabulary by descending frequency and give each word an id starting at 1.
counts = Counter(words)
vocab = sorted(counts, key=counts.get, reverse=True)
vocab_to_int = {word: ii for ii, word in enumerate(vocab, 1)}

# Encode each question as a list of word ids. The previous loop iterated over
# single words, producing one tiny list per word instead of one list per question.
questions_ints = [
    [vocab_to_int[word] for word in question.split()]
    for question in question_texts
]

In [None]:
# Show the 20 most frequent words in the Counter built by the previous cell.
counts.most_common(20)

In [None]:
# Word-frequency statistics over the target 1 questions.
question_texts = train_1['question_text'].tolist()
text = ' '.join(question_texts)
# str.split() without an argument already splits on any run of whitespace, so
# the former split(' ') / join / split() round trip was redundant.
words = text.split()

# Rank the vocabulary by descending frequency and give each word an id starting at 1.
counts = Counter(words)
vocab = sorted(counts, key=counts.get, reverse=True)
vocab_to_int = {word: ii for ii, word in enumerate(vocab, 1)}

# Encode each question as a list of word ids. The previous loop iterated over
# single words, producing one tiny list per word instead of one list per question.
questions_ints = [
    [vocab_to_int[word] for word in question.split()]
    for question in question_texts
]

In [None]:
# Show the 20 most frequent words in the Counter built by the previous cell.
counts.most_common(20)

In [None]:
# Re-inspect the frame before stop-word removal.
train.head()

As you can see, most of these words are not really meaningful to us.

In [None]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# English stop words held in a set so each membership test is a hash lookup.
stop_words = set(stopwords.words('english'))

# Tokenise every question, then keep only the tokens that are not stop words.
train['question_text'] = train['question_text'].map(word_tokenize)
train['question_text'] = train['question_text'].map(
    lambda tokens: [token for token in tokens if token not in stop_words]
)

In [None]:
# question_text now holds a list of tokens per row.
train.head()

In [None]:
# Turn each token list back into a single space-separated string.
train['question_text'] = train['question_text'].map(' '.join)

In [None]:
# question_text is a plain string again, now without stop words.
train.head()

In [None]:
# Word-frequency statistics over every question in `train` after stop-word removal.
question_texts = train['question_text'].tolist()
text = ' '.join(question_texts)
# str.split() without an argument already splits on any run of whitespace, so
# the former split(' ') / join / split() round trip was redundant.
words = text.split()

# Rank the vocabulary by descending frequency and give each word an id starting at 1.
counts = Counter(words)
vocab = sorted(counts, key=counts.get, reverse=True)
vocab_to_int = {word: ii for ii, word in enumerate(vocab, 1)}

# Encode each question as a list of word ids. The previous loop iterated over
# single words, producing one tiny list per word instead of one list per question.
questions_ints = [
    [vocab_to_int[word] for word in question.split()]
    for question in question_texts
]

In [None]:
# Vocabulary size: one dictionary entry per distinct word.
n_unique_words = len(vocab_to_int)
print('Unique words: ', n_unique_words)

In [None]:
# Show the 20 most frequent words in the Counter built by the previous cell.
counts.most_common(20)

In [None]:
# Delete the typographic apostrophe, which is not part of string.punctuation.
# regex=False treats the pattern as a literal character.
train['question_text'] = train['question_text'].str.replace('’', "", regex=False)

In [None]:
# Rebuild the per-class frames so they pick up the cleaned question_text.
train_1 = train.loc[train['target'] == 1]
train_0 = train.loc[train['target'] == 0]

In [None]:
# Word-frequency statistics over the target 0 questions after stop-word removal.
question_texts = train_0['question_text'].tolist()
text = ' '.join(question_texts)
# str.split() without an argument already splits on any run of whitespace, so
# the former split(' ') / join / split() round trip was redundant.
words = text.split()

# Rank the vocabulary by descending frequency and give each word an id starting at 1.
counts = Counter(words)
vocab = sorted(counts, key=counts.get, reverse=True)
vocab_to_int = {word: ii for ii, word in enumerate(vocab, 1)}

# Encode each question as a list of word ids. The previous loop iterated over
# single words, producing one tiny list per word instead of one list per question.
questions_ints = [
    [vocab_to_int[word] for word in question.split()]
    for question in question_texts
]

In [None]:
# Show the 20 most frequent words in the Counter built by the previous cell.
counts.most_common(20)

In [None]:
# Word-frequency statistics over the target 1 questions after stop-word removal.
question_texts = train_1['question_text'].tolist()
text = ' '.join(question_texts)
# str.split() without an argument already splits on any run of whitespace, so
# the former split(' ') / join / split() round trip was redundant.
words = text.split()

# Rank the vocabulary by descending frequency and give each word an id starting at 1.
counts = Counter(words)
vocab = sorted(counts, key=counts.get, reverse=True)
vocab_to_int = {word: ii for ii, word in enumerate(vocab, 1)}

# Encode each question as a list of word ids. The previous loop iterated over
# single words, producing one tiny list per word instead of one list per question.
questions_ints = [
    [vocab_to_int[word] for word in question.split()]
    for question in question_texts
]

In [None]:
# Show the 20 most frequent words in the Counter built by the previous cell.
counts.most_common(20)

Well, this is a good result: I can now tell target 1 apart from target 0.

But I'll also remove the words that rank highly in both classes.

In [None]:
# Words to filter out of every question in this cell.
dup_words = ['people','would','get','like','india','think', 'many']

# Tokenise, drop the dup_words, and join back to a string in one pass per question.
train['question_text'] = train['question_text'].map(
    lambda question: ' '.join(
        token for token in word_tokenize(question) if token not in dup_words
    )
)

# Refresh the per-class frames so they see the filtered text.
train_1 = train.loc[train['target'] == 1]
train_0 = train.loc[train['target'] == 0]

In [None]:
# Word-frequency statistics over the target 0 questions after removing dup_words.
question_texts = train_0['question_text'].tolist()
text = ' '.join(question_texts)
# str.split() without an argument already splits on any run of whitespace, so
# the former split(' ') / join / split() round trip was redundant.
words = text.split()

# Rank the vocabulary by descending frequency and give each word an id starting at 1.
counts = Counter(words)
vocab = sorted(counts, key=counts.get, reverse=True)
vocab_to_int = {word: ii for ii, word in enumerate(vocab, 1)}

# Encode each question as a list of word ids. The previous loop iterated over
# single words, producing one tiny list per word instead of one list per question.
questions_ints = [
    [vocab_to_int[word] for word in question.split()]
    for question in question_texts
]

In [None]:
# Show the 20 most frequent words in the Counter built by the previous cell.
counts.most_common(20)

In [None]:
# Word-frequency statistics over the target 1 questions after removing dup_words.
question_texts = train_1['question_text'].tolist()
text = ' '.join(question_texts)
# str.split() without an argument already splits on any run of whitespace, so
# the former split(' ') / join / split() round trip was redundant.
words = text.split()

# Rank the vocabulary by descending frequency and give each word an id starting at 1.
counts = Counter(words)
vocab = sorted(counts, key=counts.get, reverse=True)
vocab_to_int = {word: ii for ii, word in enumerate(vocab, 1)}

# Encode each question as a list of word ids. The previous loop iterated over
# single words, producing one tiny list per word instead of one list per question.
questions_ints = [
    [vocab_to_int[word] for word in question.split()]
    for question in question_texts
]

In [None]:
# Show the 20 most frequent words in the Counter built by the previous cell.
counts.most_common(20)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.model_selection import train_test_split

# Hold out 33% of the questions for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    train['question_text'],
    train['target'],
    test_size=0.33,
    random_state=53,
)
# Plain NumPy array of training labels for the estimators below.
y_train = y_train.to_numpy()

# Bag-of-words counts; the vocabulary is learned from the training split only.
count_vectorizer = CountVectorizer(stop_words='english')
count_train = count_vectorizer.fit_transform(X_train)

# Keep the 300 count features with the highest chi-squared score against the target.
ch2 = SelectKBest(chi2, k=300)
X_new = ch2.fit_transform(count_train, y_train)

# Push the held-out questions through the already-fitted vectorizer and selector.
count_test = count_vectorizer.transform(X_test)
X_test_new = ch2.transform(X=count_test)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features; terms found in more than 70% of the training questions are ignored.
tfidf_vectorizer = TfidfVectorizer(max_df=0.7, stop_words='english')

# Learn the vocabulary and IDF weights on the training split ...
tfidf_train = tfidf_vectorizer.fit_transform(X_train)
# ... and reuse them unchanged for the held-out split.
tfidf_test = tfidf_vectorizer.transform(X_test)

In [None]:
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier

# Random forest with library defaults on the chi-squared-selected count features
# (fit() returns the fitted estimator itself).
clf = RandomForestClassifier().fit(X_new, y_train)

# Predicted labels for the held-out split.
pred = clf.predict(X_test_new)

# Report accuracy and F1 on the held-out split.
score = metrics.accuracy_score(y_true=y_test, y_pred=pred)
print('Accuracy is:',score)
f1 = metrics.f1_score(y_true=y_test, y_pred=pred)
print('F score is:',f1)

In [None]:
# confusion_matrix expects (y_true, y_pred): rows are the actual classes and
# columns the predicted ones. The arguments used to be swapped, which
# transposed the matrix; the axes are labelled to make the orientation explicit.
ax = sns.heatmap(metrics.confusion_matrix(y_test, pred),annot=True,fmt='2.0f')
ax.set_xlabel('Predicted label')
ax.set_ylabel('True label')

In [None]:
# Random forest with library defaults, this time on the TF-IDF features
# (fit() returns the fitted estimator itself).
clf = RandomForestClassifier().fit(tfidf_train, y_train)

# Predicted labels for the held-out split.
pred = clf.predict(tfidf_test)

# Report accuracy and F1 on the held-out split.
score = metrics.accuracy_score(y_true=y_test, y_pred=pred)
print('Accuracy is:',score)
f1 = metrics.f1_score(y_true=y_test, y_pred=pred)
print('F score is:',f1)

In [None]:
# confusion_matrix expects (y_true, y_pred): rows are the actual classes and
# columns the predicted ones. The arguments used to be swapped, which
# transposed the matrix; the axes are labelled to make the orientation explicit.
ax = sns.heatmap(metrics.confusion_matrix(y_test, pred),annot=True,fmt='2.0f')
ax.set_xlabel('Predicted label')
ax.set_ylabel('True label')

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial naive Bayes with library defaults on the chi-squared-selected
# count features (fit() returns the fitted estimator itself).
nb_classifier = MultinomialNB().fit(X_new, y_train)

# Predicted labels for the held-out split.
pred = nb_classifier.predict(X_test_new)

# Report accuracy and F1 on the held-out split.
score = metrics.accuracy_score(y_true=y_test, y_pred=pred)
print('Accuracy is:',score)
f1 = metrics.f1_score(y_true=y_test, y_pred=pred)
print('F score is:',f1)

In [None]:
# confusion_matrix expects (y_true, y_pred): rows are the actual classes and
# columns the predicted ones. The arguments used to be swapped, which
# transposed the matrix; the axes are labelled to make the orientation explicit.
ax = sns.heatmap(metrics.confusion_matrix(y_test, pred),annot=True,fmt='2.0f')
ax.set_xlabel('Predicted label')
ax.set_ylabel('True label')

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial naive Bayes with library defaults on the TF-IDF features
# (fit() returns the fitted estimator itself).
nb_classifier = MultinomialNB().fit(tfidf_train, y_train)

# Predicted labels for the held-out split.
pred = nb_classifier.predict(tfidf_test)

# Report accuracy and F1 on the held-out split.
score = metrics.accuracy_score(y_true=y_test, y_pred=pred)
print('Accuracy is:',score)
f1 = metrics.f1_score(y_true=y_test, y_pred=pred)
print('F score is:',f1)

In [None]:
# confusion_matrix expects (y_true, y_pred): rows are the actual classes and
# columns the predicted ones. The arguments used to be swapped, which
# transposed the matrix; the axes are labelled to make the orientation explicit.
ax = sns.heatmap(metrics.confusion_matrix(y_test, pred),annot=True,fmt='2.0f')
ax.set_xlabel('Predicted label')
ax.set_ylabel('True label')

In [None]:
from sklearn import svm

# Support-vector classifier with library defaults on the chi-squared-selected
# count features (fit() returns the fitted estimator itself).
# NOTE(review): kernel SVC training is expected to be slow on a training split
# of this size - confirm the runtime is acceptable.
clf = svm.SVC().fit(X_new, y_train)

# Predicted labels for the held-out split.
pred = clf.predict(X_test_new)

# Report accuracy and F1 on the held-out split.
score = metrics.accuracy_score(y_true=y_test, y_pred=pred)
print('Accuracy is:',score)
f1 = metrics.f1_score(y_true=y_test, y_pred=pred)
print('F score is:',f1)

In [None]:
# Support-vector classifier with library defaults on the TF-IDF features
# (fit() returns the fitted estimator itself).
clf = svm.SVC().fit(tfidf_train, y_train)

# Predicted labels for the held-out split.
pred = clf.predict(tfidf_test)

# Report accuracy and F1 on the held-out split.
score = metrics.accuracy_score(y_true=y_test, y_pred=pred)
print('Accuracy is:',score)
f1 = metrics.f1_score(y_true=y_test, y_pred=pred)
print('F score is:',f1)

In [None]:
# Load the unlabelled competition questions that need predictions.
test = pd.read_csv(
    '../input/quora-insincere-questions-classification/test.csv'
)

In [None]:
# The question id is not needed for feature extraction.
test = test.drop('qid',axis=1)

# Same normalisation as the training text: lower-case, then remove ASCII punctuation.
# A translation table deletes every punctuation character in a single pass
# per string instead of testing each character in a Python-level loop.
# (The vectorised .str accessor propagates missing values instead of raising.)
_strip_punctuation = str.maketrans('', '', punctuation)
test['question_text'] = (
    test['question_text'].str.lower().str.translate(_strip_punctuation)
)

In [None]:
# Apply the same cleaning steps, in the same order, as the training questions:
# tokenise -> drop stop words -> delete the typographic apostrophe ->
# re-tokenise -> drop dup_words.
# The apostrophe step (and the re-tokenisation that follows it) was missing
# here, so words containing that character could be encoded differently at
# prediction time than at training time.
test['question_text'] = test.question_text.apply(lambda x: word_tokenize(x))

test['question_text'] = test.question_text.apply(lambda x: [w for w in x if w not in stop_words])

test['question_text'] = test.question_text.apply(lambda x: ' '.join(x).replace('’', ""))

test['question_text'] = test.question_text.apply(lambda x: word_tokenize(x))

# Leaves a list of tokens per row, which the following cell joins back into a string.
test['question_text'] = test.question_text.apply(lambda x: [w for w in x if w not in dup_words])

In [None]:
# Turn each token list back into a single space-separated string.
test['question_text'] = test['question_text'].map(' '.join)

In [None]:
# Encode the test questions with the count vectorizer that was fitted on the training split
count = count_vectorizer.transform(test.question_text)

# Keep only the columns picked by the chi-squared selector fitted on the training split.
# NOTE(review): these features match the models fitted on X_new, not the ones fitted on tfidf_train.
X = ch2.transform(count)

In [None]:
# `clf` was last fitted on the TF-IDF matrix (tfidf_train), so the test
# questions must be encoded with the same fitted TF-IDF vectorizer. The
# chi-squared-selected count features in `X` have a different number of
# columns than the classifier was trained on.
y_pred = clf.predict(tfidf_vectorizer.transform(test.question_text))

In [None]:
# Start from the provided sample submission file and fill in its prediction column later.
submission = pd.read_csv(
    '../input/quora-insincere-questions-classification/sample_submission.csv'
)

In [None]:
# Store the predicted labels in the prediction column.
# NOTE(review): assumes the rows of sample_submission.csv are in the same order as test.csv - confirm.
submission['prediction'] = y_pred

In [None]:
# Write only the columns of the submission frame; without index=False pandas
# adds the row index as an extra unnamed first column.
submission.to_csv('submission.csv', index=False)