<a href="https://colab.research.google.com/github/gshreya5/colab/blob/main/NLP_quora_insincere_tagging.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Quora Insincere Questions Classification


**Goal:** Predict whether a question asked on Quora is sincere or insincere.

# Load Libraries

In [6]:
import pandas as pd

import zipfile

# Load Dataset

In [3]:
!kaggle competitions download -c quora-insincere-questions-classification

100% 6.02G/6.03G [00:48<00:00, 93.6MB/s]
100% 6.03G/6.03G [00:48<00:00, 134MB/s] 


In [7]:
# Unpack the downloaded competition archive into its own directory.
path_to_zip_file = '/content/quora-insincere-questions-classification.zip'
directory_to_extract_to = '/content/quora-insincere-questions-classification'
with zipfile.ZipFile(path_to_zip_file) as archive:  # read mode is the default
    archive.extractall(path=directory_to_extract_to)

In [8]:
df = pd.read_csv('/content/quora-insincere-questions-classification/train.csv')

# Explore data

In [12]:
df.head()

Unnamed: 0,qid,question_text,target
0,00002165364db923c7e6,How did Quebec nationalists see their province...,0
1,000032939017120e6e44,"Do you have an adopted dog, how would you enco...",0
2,0000412ca6e4628ce2cf,Why does velocity affect time? Does velocity a...,0
3,000042bf85aa498cd78e,How did Otto von Guericke used the Magdeburg h...,0
4,0000455dfa3e01eae3af,Can I convert montra helicon D to a mountain b...,0


In [9]:
df.shape

(1306122, 3)

In [10]:
df.columns

Index(['qid', 'question_text', 'target'], dtype='object')

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1306122 entries, 0 to 1306121
Data columns (total 3 columns):
 #   Column         Non-Null Count    Dtype 
---  ------         --------------    ----- 
 0   qid            1306122 non-null  object
 1   question_text  1306122 non-null  object
 2   target         1306122 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 29.9+ MB


In [82]:
df.isnull().sum().sum()

0

In [25]:
df.target.value_counts(normalize=True)

0    0.93813
1    0.06187
Name: target, dtype: float64

In [19]:
df[df.target==1].question_text.values[:5]

array(['Has the United States become the largest dictatorship in the world?',
       'Which babies are more sweeter to their parents? Dark skin babies or light skin babies?',
       "If blacks support school choice and mandatory sentencing for criminals why don't they vote Republican?",
       'I am gay boy and I love my cousin (boy). He is sexy, but I dont know what to do. He is hot, and I want to see his di**. What should I do?',
       'Which races have the smallest penis?'], dtype=object)

# Create a Working Sample




In [20]:
SAMPLE_SIZE = 100_000

In [21]:
# Draw a fixed-size random subset of the training data; seeded so the same rows are picked on every run.
sample = df.sample(n=SAMPLE_SIZE, random_state=42)

In [23]:
sample.target.value_counts()

0    93962
1     6038
Name: target, dtype: int64

# Implement Bag of Words

## Create vocabulary and configure text preprocessing using Count Vectorizer

In [44]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Text preprocessing: import the NLTK pieces used by the tokenizer and
# download the resources they need.

import nltk

# Step 1: word_tokenize (downloads the 'punkt' resource)
from nltk.tokenize import word_tokenize
nltk.download('punkt')
# Step 2: stop-word removal
# The corpus reader is imported under an alias so that the `stopwords` list
# defined below does not overwrite the object it was loaded from.
from nltk.corpus import stopwords as stopwords_corpus
nltk.download('stopwords')

# `stopwords` is the plain list of English stop words used downstream.
stopwords = stopwords_corpus.words('english')
# Step 3: stemming
from nltk.stem import PorterStemmer

In [70]:
# One stemmer shared by every call; the previous version constructed a new
# PorterStemmer for each token.
_STEMMER = PorterStemmer()

def text_preprocessor(text):
  """Tokenize `text`, drop stop words, and stem the remaining tokens.

  Args:
    text: The string to tokenize.

  Returns:
    A list of stemmed tokens in their original order, excluding every token
    that exactly matches an entry of the module-level `stopwords` list.
  """
  # Built per call so later changes to `stopwords` are still honoured, while
  # each membership test is a hash lookup instead of a linear list scan.
  stop_set = set(stopwords)
  return [_STEMMER.stem(word) for word in word_tokenize(text) if word not in stop_set]

In [76]:
# Bag-of-words vectorizer using the custom tokenizer, capped at 1000 features.
# `token_pattern=None` because a custom tokenizer is supplied: the default
# regex pattern would be ignored and scikit-learn warns about it at fit time.
vectorizer = CountVectorizer(
    lowercase=True,
    tokenizer=text_preprocessor,
    token_pattern=None,
    max_features=1000,
)
vectorizer.fit(sample.question_text)



In [72]:
vectorizer.get_feature_names_out()

array(['!', '$', '%', '&', "'", "''", "'m", "'re", "'s", "'ve", '(', ')',
       ',', '-', '.', '1', '10', '100', '12', '12th', '15', '2', '20',
       '2017', '2018', '3', '30', '4', '5', '6', '7', '8', ':', '?', '[',
       ']', '``', 'abl', 'abroad', 'abus', 'accept', 'access',
       'accomplish', 'accord', 'account', 'achiev', 'act', 'action',
       'activ', 'actor', 'actual', 'ad', 'add', 'address', 'admiss',
       'adult', 'advanc', 'advantag', 'advic', 'affect', 'africa',
       'african', 'age', 'agre', 'air', 'allow', 'almost', 'alon',
       'alreadi', 'also', 'altern', 'alway', 'amazon', 'america',
       'american', 'amount', 'analysi', 'android', 'anim', 'anoth',
       'answer', 'anxieti', 'anyon', 'anyth', 'apart', 'app', 'appear',
       'appl', 'appli', 'applic', 'approach', 'arab', 'area', 'armi',
       'around', 'art', 'asian', 'ask', 'associ', 'atheist', 'attack',
       'attend', 'attract', 'australia', 'avail', 'averag', 'avoid',
       'away', 'b', 'babi', 'b

## Transform text to vectors using Count Vectorizer

In [77]:
inputs = vectorizer.transform(sample.question_text)

In [78]:
inputs.shape

(100000, 1000)

In [86]:
# NOTE(review): this converts the whole matrix to a dense array just to
# display it; at the (100000, 1000) shape reported above that is a large
# allocation. Presumably `inputs` is sparse, so slicing first would be cheaper.
inputs.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [105]:
test = pd.read_csv('/content/quora-insincere-questions-classification/test.csv')

In [84]:
test_inputs = vectorizer.transform(test.question_text)

# ML for Text Classification

### Split into train and test sets

In [88]:
from sklearn.model_selection import train_test_split

In [90]:
# Hold out 20% of the sampled rows for evaluation; seeded for repeatability.
train_feature, test_feature, train_target, test_target = train_test_split(
    inputs,
    sample.target,
    test_size=0.2,
    random_state=42,
)

In [91]:
train_feature.shape

(80000, 1000)

### Train a logistic regression model

In [92]:
from sklearn.linear_model import LogisticRegression

In [97]:
# The 'sag' solver shuffles the data while optimizing, so the seed is fixed to
# make the fitted model repeatable across runs.
clf = LogisticRegression(max_iter=2000, solver='sag', random_state=42).fit(train_feature, train_target)



### Predict on test data

In [102]:
from sklearn.metrics import accuracy_score, f1_score

In [101]:
accuracy_score(test_target,clf.predict(test_feature) )

0.9456

In [103]:
f1_score(test_target,clf.predict(test_feature))

0.3853107344632768

# Submission

In [108]:
pred = clf.predict(test_inputs)

In [111]:
test['prediction'] = pred

In [113]:
del test['question_text']

In [116]:
# Write the submission file without the row index; `index` is a boolean flag,
# so pass False explicitly rather than relying on None being falsy.
test.to_csv('submission.csv', index=False)