# Wikipedia Toxicity.

In [None]:
# importing pandas package
import pandas as pd

In [None]:
# 1.Load the data using read_csv function from pandas package
data = pd.read_csv('train.csv')
data.head()

In [None]:
# get data information
data.info()

In [None]:
# check the missing values
data.isnull().sum()

In [None]:
# 2.Get the comments into a list, for easy text cleanup and manipulation
comments = data["comment_text"]
comments.head()

In [None]:
# 3.Cleanup
import nltk
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import re

In [None]:
# import stopwords from nltk
nltk_stopwords = set(stopwords.words('english'))

In [None]:
# import stopwords from sklearn
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
sklearn_stopwords = set(ENGLISH_STOP_WORDS)

In [None]:
# Combining the stopwords from sklearn & NLTK
combined_stopwords = nltk_stopwords.union(sklearn_stopwords)

In [None]:
# Create a function to clean the text
def comment_clear(text):
    """Clean one raw comment and return it as a space-separated string of lemmas.

    The text is lower-cased, URLs and IPv4-style addresses are removed, every
    non-letter character is replaced by a space, and the result is tokenized
    with NLTK's ``word_tokenize``. Tokens found in ``combined_stopwords`` and
    tokens of two characters or fewer are dropped; the surviving tokens are
    lemmatized with the module-level ``lemmatizer``.

    Parameters
    ----------
    text : str
        Raw comment text.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces ('' if nothing survives).
    """
    # Normalize the casing first so the URL pattern also catches "HTTP://..."
    new_text = text.lower().strip()
    # Remove URLs, including scheme-less "www." links
    new_text = re.sub(r"(?:http|www\.)\S+", "", new_text)
    # Remove IP addresses wherever they occur (the previous pattern was
    # anchored with `$`, so it only matched an address at the very end)
    new_text = re.sub(r"\b(?:\d{1,3}\.){3}\d{1,3}\b", "", new_text)
    # Replace everything that is not a letter (digits, punctuation) with a space
    new_text = re.sub(r"[^a-zA-Z]", " ", new_text)
    # Tokenize, then drop stop words and tokens of two characters or fewer
    tokens = [
        token for token in word_tokenize(new_text)
        if token not in combined_stopwords and len(token) > 2
    ]
    # Lemmatize and join in one pass; str.join avoids repeated concatenation
    return ' '.join(lemmatizer.lemmatize(token) for token in tokens)

In [None]:
# Clean every comment and keep the result as a plain Python list
clean_comments = comments.apply(comment_clear).tolist()
# Preview only the first few entries instead of echoing the whole corpus
clean_comments[:5]

In [None]:
data['cleaned_comment_with_SW'] = clean_comments
data.head()

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Flatten every cleaned comment into one list of word tokens
all_words = [
    word
    for comment in data['cleaned_comment_with_SW']
    for word in comment.split()
]

# Token -> count over the whole corpus
freq_dist = nltk.FreqDist(all_words)

# Line plot of the 25 most frequent tokens
plt.figure(figsize=(12, 5))
plt.title('Top 25 most common words')
plt.xticks(fontsize=15)

freq_dist.plot(25, cumulative=False)
plt.show()

In [None]:
# 4.Using a counter, find the top terms in the data. 
#    Can any of these be considered contextual stop words? 
#    Words like “Wikipedia”, “page”, “edit” are examples of contextual stop words
#    If yes, drop these from the data

In [None]:
# Collect every token of the cleaned comments to look for contextual stop words.
# list.extend appends in place; the previous `a = a + b` form re-copied the
# growing list on every iteration (quadratic in the corpus size).
cleaned_comment_words = []
for item in clean_comments:
    cleaned_comment_words.extend(item.split())

# Token -> count over the whole corpus
comment_freq = nltk.FreqDist(cleaned_comment_words)

In [None]:
# display the common 200 words by frequency
comment_freq.most_common(200)

In [None]:
# find the contextual stop words in the data. 
domain_stopwords = ['article', 'page','wikipedia', 'edit', 'user','image' ]

In [None]:
# Create a function to clean the text with the contextual stop words
def comment_clear_contextual_stopwords(text):
    """Drop the contextual (domain) stop words from an already-cleaned comment.

    The text is split on whitespace, every token listed in the module-level
    ``domain_stopwords`` is discarded, and the remaining tokens are joined
    back together with single spaces.
    """
    kept = (word for word in text.split() if word not in domain_stopwords)
    return ' '.join(kept)

In [None]:
# apply the cleaner function to remove the contextual stop words
clean_comments_domain = (
    data['cleaned_comment_with_SW']
    .apply(comment_clear_contextual_stopwords)
    .tolist()
)
# Preview only the first few entries instead of echoing the whole corpus
clean_comments_domain[:5]


In [None]:
# add new column with clean comments 
data['cleaned_comment'] = clean_comments_domain

In [None]:
# display the details of data
data.head()

In [None]:
# Flatten every comment (contextual stop words removed) into one token list
all_words = [
    word
    for comment in data['cleaned_comment']
    for word in comment.split()
]

# Token -> count over the whole corpus
freq_dist = nltk.FreqDist(all_words)

# Line plot of the 25 most frequent tokens
plt.figure(figsize=(12, 5))
plt.title('Top 25 most common words')
plt.xticks(fontsize=15)

freq_dist.plot(25, cumulative=False)
plt.show()

In [None]:
# 5.Separate into train and test sets

# import the split package
from sklearn.model_selection import train_test_split

In [None]:
# create X & Y dataset
X = data['cleaned_comment']
y = data['toxic']

In [None]:
# create the train and test sets with 70-30 ratio (fixed seed for reproducibility)
# NOTE(review): the split is not stratified on y; later cells treat the classes
# as imbalanced, so passing `stratify=y` may be worth considering — confirm.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

In [None]:
# 6.Use TF-IDF values for the terms as feature to get into a vector space model

#   Import TF-IDF vectorizer from sklearn
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
#   Instantiate with a maximum of 4000 terms in your vocabulary
TFIDF = TfidfVectorizer( min_df=5, max_features=4000)

In [None]:
#   Fit and apply on the train set
X_train_vectorizer = TFIDF.fit_transform(X_train)

In [None]:
#   display the feature names
# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# get_feature_names_out() and fall back only on older versions.
if hasattr(TFIDF, "get_feature_names_out"):
    feature_names = TFIDF.get_feature_names_out()
else:
    feature_names = TFIDF.get_feature_names()
# Show a sample rather than printing every term of the vocabulary
print(list(feature_names[:50]))

In [None]:
print(len(TFIDF.get_feature_names()))

In [None]:
#   Apply on the test set
X_test_vectorizer = TFIDF.transform(X_test)

In [None]:
#  Apply transform to the X
X_vectorize = TFIDF.transform(X)

In [None]:
# 7.Model building: Support Vector Machine

#   Instantiate SVC from sklearn with a linear kernel
from sklearn.svm import SVC
svc = SVC(kernel='linear', random_state=1)

In [None]:
#   Fit on the train data
svc.fit(X_train_vectorizer, y_train)

In [None]:
#   Make predictions for the train and the test set
#   Predict Y train
Y_train_pred = svc.predict(X_train_vectorizer)
Y_train_pred

In [None]:
#  predict Y Test
Y_test_pred = svc.predict(X_test_vectorizer)
Y_test_pred

In [None]:
# 8.Model evaluation: Accuracy, recall, and f1_score

# import packages for metrics and reporting
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

In [None]:
#   Report the accuracy on the train set
accuracy_score(y_train, Y_train_pred)*100

In [None]:
#   Get the f1_score on the train set
print(classification_report(y_train, Y_train_pred))

In [None]:
#   Report the recall on the train set: decent, high, low?
# recall   - class 0 is 1.00 (high), class 1 is 0.71 (low)
# f1-score - class 0 is 0.99 (high), class 1 is 0.83 (decent)

In [None]:
print(classification_report(y_test, Y_test_pred))

In [None]:
# 9. Looks like you need to adjust  the class imbalance, as the model seems to focus on the 0s

#    Adjust the appropriate parameter in the SVC module
svc1 = SVC(kernel='linear', class_weight='balanced', random_state=1)

In [None]:
# 10. Train again with the adjustment and evaluate

#     Train the model on the train set
svc1.fit(X_train_vectorizer, y_train)

In [None]:
#  Evaluate the predictions on the validation set: accuracy, recall, f1_score
#  predict Y Train
Y_train_pred1 = svc1.predict(X_train_vectorizer)
Y_train_pred1

In [None]:
#  predict Y Test
Y_test_pred1 = svc1.predict(X_test_vectorizer)
Y_test_pred1

In [None]:
#   Report the accuracy of the train set
accuracy_score(y_train, Y_train_pred1)*100

In [None]:
#   Get the f1_score of the train set
print(classification_report(y_train, Y_train_pred1))

In [None]:
#   Get the f1_score of the test set
print(classification_report(y_test, Y_test_pred1))

In [None]:
#11. Hyperparameter tuning

#Import GridSearch and StratifiedKFold (because of class imbalance)
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
import numpy as np

In [None]:
# create a SVC model
svc_Hy = SVC(kernel='linear', class_weight='balanced', random_state=1)

In [None]:
# Tune hyper-parameters on the training split only.
# Re-fitting the shared TFIDF on every comment would (a) leak the held-out test
# comments into the cross-validated search whose result is later scored on that
# same test set, and (b) silently replace the vocabulary that
# X_train_vectorizer / X_test_vectorizer were built with.
X_vectorizer = X_train_vectorizer
# NOTE: from here on `y` holds the training labels used by the grid searches
y = y_train

In [None]:
# Provide the parameter grid to choose for ‘C’
C_values = np.arange(0.00001, 1, 0.05) 

In [None]:
# create the SKFold 
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [None]:
# Use a balanced class weight while instantiating the Support Vector Classifier
grid = GridSearchCV(estimator=svc_Hy, param_grid={'C': C_values}, cv=kfold, scoring='accuracy', 
                    return_train_score=True, verbose=2, n_jobs=-1)
grid_results = grid.fit(X_vectorizer,y)

In [None]:
# display best C parameter and scores
grid_results.best_params_, grid_results.best_score_, grid_results.best_index_

In [None]:
grid_results.cv_results_['mean_test_score']

In [None]:
grid_results.cv_results_['mean_train_score'][grid_results.best_index_]*100

In [None]:
grid_results.cv_results_['mean_test_score'][grid_results.best_index_]*100

In [None]:
grid_results.cv_results_['std_test_score'][grid_results.best_index_]*100

In [None]:
plt.plot(grid_results.cv_results_['mean_train_score'] - grid_results.cv_results_['mean_test_score'])

In [None]:
#12. Find the parameters with the best recall in cross validation

#  Choose ‘recall’ as the metric for scoring

grid_recall = GridSearchCV(estimator=svc_Hy, param_grid={'C': C_values}, cv=kfold, scoring='recall', 
                    return_train_score=True, verbose=2, n_jobs=-1)

In [None]:
# Choose stratified 5 fold cross validation scheme 
grid_results_recall = grid_recall.fit(X_vectorizer,y)

In [None]:
# Best C, its mean cross-validated recall, and its position in the grid.
# Parenthesised so the tuple can span several lines (a line that starts with
# a comma is a SyntaxError).
(grid_results_recall.best_params_,
 grid_results_recall.best_score_,
 grid_results_recall.best_index_)

In [None]:
grid_results_recall.cv_results_['mean_train_score'][grid_results_recall.best_index_]*100


In [None]:
grid_results_recall.cv_results_['mean_test_score'][grid_results_recall.best_index_]*100

In [None]:
grid_results_recall.cv_results_['std_test_score'][grid_results_recall.best_index_]*100

In [None]:
plt.plot(grid_results_recall.cv_results_['mean_train_score'] 
         - grid_results_recall.cv_results_['mean_test_score'])

In [None]:
#13. What are the best parameters?
# The best C parameter is 0.05. Mean train score and mean test score difference is 1.7359.

In [None]:
#14. Predict and evaluate using the best estimator

#   What is the recall on the test set for the toxic comments?
#   What is the f1_score?

In [None]:
#   Use the best C found by the recall grid search to build the final model.
#   Reading it from the search results (instead of a hard-coded copy of a past
#   run) keeps this cell in sync whenever the search is re-run.
model_final = SVC(kernel='linear', C=grid_results_recall.best_params_['C'],
                  class_weight='balanced', random_state=1)

In [None]:
# fit the model
model_final.fit(X_train_vectorizer, y_train)

In [None]:
# predict the values from the model
y_train_predict_final = model_final.predict(X_train_vectorizer)
y_train_predict_final

In [None]:
# predict the values from the model
y_test_predict_final = model_final.predict(X_test_vectorizer)
y_test_predict_final

In [None]:
#   Report the accuracy on the test set
#   (the cell previously scored the train predictions despite this heading)
accuracy_score(y_test, y_test_predict_final)*100

In [None]:
print(classification_report(y_train, y_train_predict_final))

In [None]:
print(classification_report(y_test, y_test_predict_final))

In [None]:
# Recall (in %) on the test set for the toxic class.
# The 'weighted avg' row is dominated by the majority class; the per-class row
# is keyed by the label rendered as a string (assumes toxic is the int label 1).
classification_report(y_test, y_test_predict_final, output_dict=True)['1']['recall']*100

In [None]:
# F1-score (in %) on the test set for the toxic class.
# The 'weighted avg' row is dominated by the majority class; the per-class row
# is keyed by the label rendered as a string (assumes toxic is the int label 1).
classification_report(y_test, y_test_predict_final, output_dict=True)['1']['f1-score']*100

In [None]:
#15. What are the most prominent terms in the toxic comments?

In [None]:
#    Separate the comments from the test set that the model identified as toxic
X_test[y_test_predict_final==1]

In [None]:
#    Make one large list of the terms
# list.extend appends in place; the previous `a = a + b` form re-copied the
# growing list for every comment (quadratic in the number of tokens).
merged_text = []
for item in X_test[y_test_predict_final==1]:
    merged_text.extend(item.split())

# Token -> count over the comments predicted as toxic
frequency_words = nltk.FreqDist(merged_text)

In [None]:
 frequency_words.most_common(100)

In [None]:
#    Get the top 15 terms
frequency_words.most_common(15)