In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import string
# Libraries to drop English stop words, tokenize and lemmatize the text
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet

from gensim.models import Word2Vec
from sklearn.linear_model import LogisticRegression

import os
# Print the full path of every file found anywhere under the Kaggle input directory
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))



/kaggle/input/quora-insincere-questions-classification/sample_submission.csv
/kaggle/input/quora-insincere-questions-classification/embeddings.zip
/kaggle/input/quora-insincere-questions-classification/train.csv
/kaggle/input/quora-insincere-questions-classification/test.csv


In [2]:
!unzip /usr/share/nltk_data/corpora/wordnet.zip -d /usr/share/nltk_data/corpora

Archive:  /usr/share/nltk_data/corpora/wordnet.zip
   creating: /usr/share/nltk_data/corpora/wordnet/
  inflating: /usr/share/nltk_data/corpora/wordnet/lexnames  
  inflating: /usr/share/nltk_data/corpora/wordnet/data.verb  
  inflating: /usr/share/nltk_data/corpora/wordnet/index.adv  
  inflating: /usr/share/nltk_data/corpora/wordnet/adv.exc  
  inflating: /usr/share/nltk_data/corpora/wordnet/index.verb  
  inflating: /usr/share/nltk_data/corpora/wordnet/cntlist.rev  
  inflating: /usr/share/nltk_data/corpora/wordnet/data.adj  
  inflating: /usr/share/nltk_data/corpora/wordnet/index.adj  
  inflating: /usr/share/nltk_data/corpora/wordnet/LICENSE  
  inflating: /usr/share/nltk_data/corpora/wordnet/citation.bib  
  inflating: /usr/share/nltk_data/corpora/wordnet/noun.exc  
  inflating: /usr/share/nltk_data/corpora/wordnet/verb.exc  
  inflating: /usr/share/nltk_data/corpora/wordnet/README  
  inflating: /usr/share/nltk_data/corpora/wordnet/index.sense  
  inflating: /usr

---
# <font color=green>Preprocess the data</font>

In [3]:
# Directory that holds the competition files (train.csv, test.csv, ...)
DATA_DIR = '/kaggle/input/quora-insincere-questions-classification'

# DataFrame with Test data (questions to predict; provides `qid` and `question_text`)
test_df = pd.read_csv(f'{DATA_DIR}/test.csv')
# DataFrame with Train data (labelled questions; provides `question_text` and `target`)
train_df = pd.read_csv(f'{DATA_DIR}/train.csv')

In [4]:
#  Text preprocessing: tokenize, lemmatize, then drop punctuation and stop words

# Built once instead of on every call: tokenize_string is applied row by row to
# whole DataFrame columns, so rebuilding these objects per row is pure overhead.
_LEMMATIZER = WordNetLemmatizer()
_STOP_WORDS = frozenset(stopwords.words('english'))
_PUNCTUATION_CHARS = frozenset(string.punctuation)


def _is_punctuation(token):
    """Return True when `token` consists solely of ASCII punctuation characters."""
    return all(char in _PUNCTUATION_CHARS for char in token)


def tokenize_string(text):
    """Turn a raw piece of text into a list of cleaned, lemmatized tokens.

    Steps, in order:
      1. lower-case the text and split it with NLTK's `word_tokenize`;
      2. lemmatize every token as a verb (`pos="v"`);
      3. drop tokens made up only of punctuation characters;
      4. drop English stop words.

    Parameters
    ----------
    text : str
        The raw text. Any non-string value (e.g. NaN coming from a missing
        CSV cell) is treated as empty text.

    Returns
    -------
    list of str
        The remaining tokens, in their original order. Empty when the text
        contains nothing but punctuation / stop words or is not a string.
    """
    # Missing values would otherwise crash on `.lower()`
    if not isinstance(text, str):
        return []

    # NOTE: contractions are not expanded before lemmatizing; an earlier draft
    # experimented with `contractions.fix(text)` here but it is not enabled.

    # Tokenize the data and use only lower letters
    tokens = word_tokenize(text.lower())

    # Lemmatize lazily; filtering below consumes the generator in one pass
    lemmas = (_LEMMATIZER.lemmatize(token, pos="v") for token in tokens)

    # Get rid of punctuation and stop words.
    # The punctuation test is per character: a plain `token in string.punctuation`
    # is a substring test and lets multi-character tokens such as '...' or '--' through.
    return [
        lemma
        for lemma in lemmas
        if not _is_punctuation(lemma) and lemma not in _STOP_WORDS
    ]

In [5]:
# Add a 'Preprocessed_text' column (one token list per question) to the train frame
print('I have just started updatind train_df')
train_df['Preprocessed_text'] = train_df['question_text'].apply(tokenize_string)

# Same preprocessing for the test frame
print('I have just started updatind test_df')
test_df['Preprocessed_text'] = test_df['question_text'].apply(tokenize_string)

I have just started updatind train_df
I have just started updatind test_df


In [6]:
# Training labels as a plain array, taken from the `target` column
y_train = train_df['target'].values

In [7]:
# Quick look at the first five token lists produced by the preprocessing
train_df.loc[:, 'Preprocessed_text'].head(n=5)

0    [quebec, nationalists, see, province, nation, ...
1    [adopt, dog, would, encourage, people, adopt, ...
2    [velocity, affect, time, velocity, affect, spa...
3    [otto, von, guericke, use, magdeburg, hemisphe...
4    [convert, montra, helicon, mountain, bike, cha...
Name: Preprocessed_text, dtype: object

In [8]:
# Train word embeddings on the tokenized training questions:
# 100-dimensional vectors, window of 5, min_count of 3.
word2vec_model = Word2Vec(
    sentences=train_df['Preprocessed_text'],
    vector_size=100,
    window=5,
    min_count=3,
)

In [9]:
# Represent a whole tokenized phrase by the average of its word vectors
def phrase_vector(word2vec_model, phrase):
    """Average the embeddings of the phrase's tokens into a single vector.

    Tokens missing from the model's vocabulary (`wv.key_to_index`) are ignored.
    When no token is left, a zero vector of length `vector_size` is returned.

    Parameters
    ----------
    word2vec_model : object exposing `.wv` (indexable by a list of words, with
        a `key_to_index` mapping) and `.vector_size`.
    phrase : iterable of str
        The tokens of one phrase.

    Returns
    -------
    numpy.ndarray
        One-dimensional vector: the element-wise mean of the known tokens'
        vectors, or all zeros if none of the tokens is known.
    """
    vocabulary = word2vec_model.wv.key_to_index
    known_tokens = [token for token in phrase if token in vocabulary]

    if known_tokens:
        return np.mean(word2vec_model.wv[known_tokens], axis=0)

    return np.zeros(word2vec_model.vector_size)

In [10]:
# Turn every tokenized question into its averaged word vector
def _phrases_to_matrix(phrases):
    """Stack the `phrase_vector` of each phrase into one array (one row per phrase)."""
    return np.array([phrase_vector(word2vec_model, tokens) for tokens in phrases])


X_train_vec = _phrases_to_matrix(train_df['Preprocessed_text'])
X_test_vec = _phrases_to_matrix(test_df['Preprocessed_text'])

---
# <font color=green>Creating baseline model based on Logistic Regression</font>

In [11]:
# Baseline classifier with scikit-learn's default settings,
# fitted on the averaged word vectors of the training questions
log_reg = LogisticRegression()
log_reg.fit(X=X_train_vec, y=y_train)

In [12]:
# Predict a class label for every test question
y_pred = log_reg.predict(X=X_test_vec)

---
# <font color=green>Creating baseline model based on CatBoost (currently disabled)</font>

In [13]:
# model = CatBoostClassifier(iterations=10000, learning_rate=0.1, depth=6, loss_function='Logloss', random_seed=42)
# model.fit(X_train, y_train, verbose=10)

In [14]:
# y_pred = model.predict(X_test)

In [15]:
# y_pred = y_pred.ravel()

---
# <font color=green>Making the final file</font>

In [16]:
# Pair each test question id with its predicted label and write the submission file,
# using `qid` as the index so the CSV columns are `qid,prediction`
submission = pd.DataFrame({'qid': test_df['qid'], 'prediction': y_pred})
submission.set_index('qid').to_csv('submission.csv')