In [1]:
# !pip install contractions

In [2]:
# !python3 -m nltk.downloader wordnet

In [3]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Libraries to tokenize and lemmatize the text and to remove English stop words
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet

# import contractions
import string

# Libraries for TF-IDF model and Logistic Regression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(folder, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/quora-insincere-questions-classification/sample_submission.csv
/kaggle/input/quora-insincere-questions-classification/embeddings.zip
/kaggle/input/quora-insincere-questions-classification/train.csv
/kaggle/input/quora-insincere-questions-classification/test.csv


In [4]:
# Extract the WordNet corpus next to its archive so NLTK can find it.
# -o overwrites without prompting, so re-running this cell (Restart & Run All)
# does not stop at unzip's "replace ...?" question when the files already exist.
!unzip -o /usr/share/nltk_data/corpora/wordnet.zip -d /usr/share/nltk_data/corpora

Archive:  /usr/share/nltk_data/corpora/wordnet.zip
   creating: /usr/share/nltk_data/corpora/wordnet/
  inflating: /usr/share/nltk_data/corpora/wordnet/lexnames  
  inflating: /usr/share/nltk_data/corpora/wordnet/data.verb  
  inflating: /usr/share/nltk_data/corpora/wordnet/index.adv  
  inflating: /usr/share/nltk_data/corpora/wordnet/adv.exc  
  inflating: /usr/share/nltk_data/corpora/wordnet/index.verb  
  inflating: /usr/share/nltk_data/corpora/wordnet/cntlist.rev  
  inflating: /usr/share/nltk_data/corpora/wordnet/data.adj  
  inflating: /usr/share/nltk_data/corpora/wordnet/index.adj  
  inflating: /usr/share/nltk_data/corpora/wordnet/LICENSE  
  inflating: /usr/share/nltk_data/corpora/wordnet/citation.bib  
  inflating: /usr/share/nltk_data/corpora/wordnet/noun.exc  
  inflating: /usr/share/nltk_data/corpora/wordnet/verb.exc  
  inflating: /usr/share/nltk_data/corpora/wordnet/README  
  inflating: /usr/share/nltk_data/corpora/wordnet/index.sense  
  inflating: /usr

In [5]:
# nltk.download('stopwords')
# nltk.download('wordnet')

---
# <font color=green>Preprocess the data</font>

In [6]:
# Directory holding the competition CSVs; defined once so both reads stay in sync
DATA_DIR = '/kaggle/input/quora-insincere-questions-classification'

# DataFrame with Test data
test_df = pd.read_csv(f'{DATA_DIR}/test.csv')
# DataFrame with Train Data
train_df = pd.read_csv(f'{DATA_DIR}/train.csv')

In [7]:
# Shared resources for text preprocessing.
# They are built once here instead of inside tokenize_string, because that
# function is applied row by row to whole DataFrame columns and re-reading the
# stop-word list on every call is wasted work.
_LEMMATIZER = WordNetLemmatizer()
_STOP_WORDS = frozenset(stopwords.words('english'))
_PUNCTUATION_CHARS = frozenset(string.punctuation)


def tokenize_string(text):
    """Turn a raw question into a list of cleaned, lemmatized tokens.

    Pipeline: lowercase -> NLTK word tokenization -> lemmatization with
    pos="v" -> drop punctuation-only tokens -> drop English stop words.

    Parameters
    ----------
    text : str
        Raw question text. Any non-string value (for example NaN coming from
        a missing cell) produces an empty list instead of raising.

    Returns
    -------
    list of str
        The remaining tokens, in their original order.
    """
    if not isinstance(text, str):
        return []

    # NOTE: contraction expansion (contractions.fix) is currently disabled
    # because the `contractions` import is commented out in this notebook.

    # Tokenize the data and use only lower letters
    tokens = word_tokenize(text.lower())

    # Lemmatize every token, treating it as a verb
    lemmas = (_LEMMATIZER.lemmatize(token, pos="v") for token in tokens)

    # A token counts as punctuation when *all* of its characters are
    # punctuation. The previous `word not in string.punctuation` was a
    # substring test against one long string, so multi-character tokens such
    # as "``", "''" or "..." were not removed.
    return [
        lemma
        for lemma in lemmas
        if lemma not in _STOP_WORDS
        and not all(char in _PUNCTUATION_CHARS for char in lemma)
    ]

In [8]:
# Tokenize the question text of the test frame first, then the train frame,
# announcing each one before its pass starts
for frame, notice in (
    (test_df, 'I have just started updatind test_df'),
    (train_df, 'I have just started updatind train_df'),
):
    print(notice)
    frame['Preprocessed_text'] = frame['question_text'].apply(tokenize_string)

I have just started updatind test_df
I have just started updatind train_df


In [9]:
# Show the first rows of the test frame, including the new Preprocessed_text column
test_df.head()

Unnamed: 0,qid,question_text,Preprocessed_text
0,0000163e3ea7c7a74cd7,Why do so many women become so rude and arroga...,"[many, women, become, rude, arrogant, get, lit..."
1,00002bd4fb5d505b9161,When should I apply for RV college of engineer...,"[apply, rv, college, engineer, bms, college, e..."
2,00007756b4a147d2b0b3,What is it really like to be a nurse practitio...,"[really, like, nurse, practitioner]"
3,000086e4b7e1c7146103,Who are entrepreneurs?,[entrepreneurs]
4,0000c4c3fbe8785a3090,Is education really making good people nowadays?,"[education, really, make, good, people, nowadays]"


---
# <font color=green>Creating baseline model based on Logistic Regression</font>

In [10]:
# Rebuild a space-separated string from each token list (input for the vectorizer)
train_df['text'] = train_df['Preprocessed_text'].apply(' '.join)
test_df['text'] = test_df['Preprocessed_text'].apply(' '.join)

In [11]:
# Create the TF-IDF vectorizer; no arguments are passed, so library defaults apply
vectorizer = TfidfVectorizer()

In [12]:
# Labels for the training rows
y_train = train_df['target'].values

# Fit the vectorizer on the training text only, then reuse the fitted
# vectorizer to transform the test text
X_train = vectorizer.fit_transform(train_df['text'])
X_test = vectorizer.transform(test_df['text'])

In [13]:
# Build the logistic regression model, then fit it on the vectorized training data
clf = LogisticRegression(solver='liblinear', random_state=0)
clf.fit(X_train, y_train)

In [14]:
# Predict a label for every row of the vectorized test set with the fitted classifier
y_pred = clf.predict(X_test)

---
# <font color=green>Making the final file</font>

In [15]:
# Pair each question id with its predicted label and save it as the submission file
submission = pd.DataFrame({'qid': test_df.qid, 'prediction': y_pred})
submission = submission.set_index('qid')
submission.to_csv('submission.csv')