# Import

In [None]:
import os
import numpy as np
import pandas as pd
import spacy

import matplotlib.pyplot as plt
import seaborn as sns

sns.set
%matplotlib inline

# Load the dataset

In [None]:
# Both competition CSVs live in the same input directory.
path = '../input'
df_train, df_test = (
    pd.read_csv(os.path.join(path, csv_name))
    for csv_name in ('train.csv', 'test.csv')
)

# Look at the dataset

In [None]:
# Preview the first five training rows.
df_train.head(n=5)

In [None]:
# (rows, columns) of the train frame, then of the test frame.
tuple(frame.shape for frame in (df_train, df_test))

In [None]:
# Column dtypes and non-null counts, used to check for missing values.
df_train.info()

The data is clean: there are no NaN values.

In [None]:
# Class balance of the target (0 = sincere, 1 = insincere). The title and axis
# labels let the figure stand on its own when the notebook is skimmed.
ax = df_train['target'].value_counts().plot(kind='bar')
ax.set(title='Target distribution', xlabel='target', ylabel='number of questions');

In [None]:
# Number of insincere questions per 100 sincere ones. The class counts are read
# from the data instead of being typed in by hand, so the figure stays correct
# if the training file changes.
target_counts = df_train['target'].value_counts()
insincere_ratio = float(target_counts.loc[1] / target_counts.loc[0] * 100)
insincere_ratio

In [None]:
# Features are the raw question text; the label is the binary target column.
X, y = df_train['question_text'], df_train['target']

In [None]:
# Questions labelled 1 (insincere).
X_insincere = X.loc[y.eq(1)]
X_insincere.head()

We can already notice the troll content within the questions.

In [None]:
# Questions labelled 0 (sincere).
X_sincere = X.loc[y.eq(0)]
X_sincere.head()

In contrast, the sincere questions look legitimate.

# Preprocessing

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the rows; stratifying on y keeps the class proportions the
# same in both splits, and the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [None]:
# Shapes of the four pieces produced by the split, in assignment order.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

## Tokenizing

We define a `tokenize` function so that the train and test sets are tokenized and cleaned in exactly the same way.

In [None]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

In [None]:
def tokenize(data):
    """Normalise raw documents into space-joined strings of stemmed tokens.

    Each document is split with NLTK's ``word_tokenize`` and lower-cased;
    tokens that are not purely alphabetic or that are English stop words are
    dropped, and the survivors are stemmed with the Porter stemmer.

    Parameters
    ----------
    data : iterable of str
        Raw documents, e.g. a pandas Series of questions.

    Returns
    -------
    list of str
        One cleaned string per input document, in input order. A document
        whose tokens are all filtered out yields an empty string.
    """
    # A set gives constant-time membership tests; the list returned by NLTK
    # would otherwise be scanned linearly for every single token.
    stop_words = set(stopwords.words('english'))
    stemmer = PorterStemmer()

    # Single pass per document instead of materialising a full copy of the
    # corpus after every cleaning step.
    cleaned = []
    for doc in data:
        lowered = (token.lower() for token in word_tokenize(doc))
        stems = [
            stemmer.stem(token)
            for token in lowered
            if token.isalpha() and token not in stop_words
        ]
        cleaned.append(' '.join(stems))
    return cleaned

In [None]:
# Clean both splits with the same function. The names are rebound to lists of
# cleaned strings, so re-running this cell would tokenize the cleaned text again.
X_train, X_test = (tokenize(split) for split in (X_train, X_test))

# Machine Learning

## Preprocessing

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import TruncatedSVD

In [None]:
# Candidate feature extractors. Only `cvec` is wired into a pipeline in the
# cells visible here; `tvec` and `svd` are defined but not used in them.
# NOTE(review): the text fed to these has already had NLTK stop words removed
# and been stemmed by tokenize(), so stop_words='english' acts as a second
# filter on stemmed tokens — confirm this is intended.
tvec = TfidfVectorizer(stop_words='english')
cvec = CountVectorizer(stop_words='english')
svd = TruncatedSVD(n_components=100, random_state=42)

## Machine learning models

### MultinomialNB

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB

In [None]:
# Multinomial naive Bayes classifier, constructed with no arguments.
mnb = MultinomialNB()

In [None]:
# Bag-of-words counts feeding the naive Bayes classifier.
pipe = Pipeline(
    steps=[
        ('vectorizer', cvec),
        ('mnb', mnb),
    ]
)

In [None]:
# Fit every step of the pipeline on the training split.
pipe.fit(X_train, y_train)

In [None]:
# Predicted labels for the hold-out split.
y_pred = pipe.predict(X_test)
y_pred

In [None]:
from sklearn.metrics import confusion_matrix
# Counts of hold-out questions by (true label, predicted label).
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
cm

In [None]:
# Rows are the true classes and columns the predicted ones (scikit-learn's
# confusion_matrix convention). Naming both axes makes the table readable on
# its own. Label order follows the sorted targets: 0 = sincere, 1 = insincere.
labels = ['sincere', 'insincere']
cm = pd.DataFrame(
    cm,
    index=pd.Index(labels, name='actual'),
    columns=pd.Index(labels, name='predicted'),
)
cm

## Validation

In [None]:
from sklearn.model_selection import cross_val_score
# Cross-validate on the training split only. cross_val_score fits a fresh clone
# of the pipeline on each fold, so `pipe` itself is left untouched, and the
# hold-out split (X_test, y_test) stays unseen for the final evaluation.
# `cv` holds the mean macro-F1 over the 5 folds.
cv = cross_val_score(pipe, X_train, y_train, scoring='f1_macro', cv=5).mean()

In [None]:
# Mean macro-F1 over the 5 cross-validation folds.
cv

In [None]:
from sklearn.metrics import classification_report
# Per-class precision, recall and F1 on the hold-out split. The report is a
# pre-formatted string, so it is printed rather than shown as a repr.
cr = classification_report(y_true=y_test, y_pred=y_pred)
print(cr)

# Test

In [None]:
# Clean the competition questions with the same function used for training.
test = tokenize(df_test['question_text'])

## Machine Learning

In [None]:
# Predict labels for the competition test questions.
# NOTE: this rebinds `y_pred`, replacing the hold-out predictions computed
# earlier in the notebook.
y_pred = pipe.predict(test)
y_pred

# Submit on the sample submission

In [None]:
# Load the submission template shipped with the competition data.
path = '../input'
submission_csv = os.path.join(path, 'sample_submission.csv')
df_sub = pd.read_csv(submission_csv)

In [None]:
# Store the predictions in the template's 'prediction' column.
# NOTE(review): assumes the rows of sample_submission.csv are in the same
# order as test.csv — confirm.
df_sub['prediction'] = y_pred

In [None]:
# index=False keeps the DataFrame index out of the written file.
df_sub.to_csv("submission.csv", index=False)