In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import PowerTransformer
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import RandomizedSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem import WordNetLemmatizer
import re

In [None]:
data = pd.read_csv('YT_data.csv')

In [None]:
data.head()

In [None]:
data.drop('IsHomophobic',axis = 1,inplace = True)
data.drop('IsRadicalism',axis = 1,inplace = True)
data.drop('CommentId',axis = 1,inplace = True)
data.drop('VideoId',axis = 1,inplace = True)

In [None]:
data.head()

In [None]:
## Stop-word removal: stopword_removal() deletes stop words with a regex
## built from NLTK's English stop-word list.
from nltk.corpus import stopwords
# Set of English stop words from NLTK; read as a global by stopword_removal().
stop_words = set(stopwords.words("english"))
def stopword_removal(txt):
    """Remove the words in the global ``stop_words`` from ``txt``.

    Parameters
    ----------
    txt : str
        Text to clean.

    Returns
    -------
    list of str
        The whitespace-separated tokens that remain once every whole-word,
        case-insensitive stop-word match has been deleted. Empty tokens are
        never included.
    """
    # Longest alternatives first, so that "don't" is consumed as a whole
    # rather than through its prefix "don" (which would leave "'t" behind).
    # Sorting also makes the pattern independent of set iteration order,
    # which differs between interpreter runs.
    alternatives = sorted(stop_words, key=lambda word: (-len(word), word))
    stpwrds = re.compile(
        # re.escape keeps any regex metacharacter in a stop word literal.
        r'\b(?:' + r'|'.join(re.escape(word) for word in alternatives) + r')\b',
        # Match "The" as well as "the"; NLTK's English list is lowercase.
        re.IGNORECASE,
    )
    # split() without an argument drops the empty strings that would
    # otherwise appear wherever a stop word was deleted.
    return stpwrds.sub('', txt).split()

data["STOP"] = data["Text"].apply(lambda x:stopword_removal(x))
data.head()

In [None]:
nostop = []
for i in range(len(data)):
    nostop.append(' '.join(data["STOP"][i]))
data["NOSTOP"] = nostop
data.drop(['STOP'],axis=1)

In [None]:
# C. LEMMATIZATION
# Uses WordNetLemmatizer from NLTK.
from nltk.tokenize import sent_tokenize,word_tokenize
lemmatizer = WordNetLemmatizer()
def lemmatize(txt):
    """Tokenise ``txt`` with ``word_tokenize`` and return the list of lemmatised tokens."""
    return list(map(lemmatizer.lemmatize, word_tokenize(txt)))

# "lemm" holds the token lists; "LEMM" holds the same tokens joined by single spaces.
data["lemm"] = data["NOSTOP"].apply(lemmatize)
lemma = [' '.join(data["lemm"][row]) for row in range(len(data))]
data["LEMM"] = lemma

In [None]:
data.head()

In [None]:
data.drop(columns = ["Text" , "STOP" , "NOSTOP" , "lemm"] , inplace = True)

In [None]:
type(data)

In [None]:
data.to_csv('out.csv') 

In [None]:
# Target labels and preprocessed text for the bag-of-words experiments.
Y = data['IsToxic']
X = data['LEMM']
# Seed the split so the accuracies noted in the cells below can be reproduced;
# the TF-IDF section splits with the same seed, so both feature sets are
# evaluated on the same train/test partition.
X_train_bow, X_test_bow, Y_train , Y_test = train_test_split(X,Y,random_state = 1)

In [None]:
# Learn the vocabulary from the training comments only, then encode both splits as count vectors.
vectorizer = CountVectorizer()
X_train_bow = vectorizer.fit_transform(X_train_bow)
X_test_bow = vectorizer.transform(X_test_bow)

APPLYING SOME WELL-KNOWN CLASSIFIER ALGORITHMS

In [None]:
def LogisticClassifier(X_train,Y_train,X_test,Y_test):
    """Grid-search a LogisticRegression and return its accuracy on the test split.

    Parameters
    ----------
    X_train, Y_train
        Training features and labels handed to ``GridSearchCV.fit``.
    X_test, Y_test
        Held-out features and labels used for the reported score.

    Returns
    -------
    The value of ``accuracy_score`` for the search's predictions on ``X_test``.
    """
    solvers = ['lbfgs', 'liblinear', 'newton-cg', 'sag', 'saga']
    penalty = ['l1', 'l2', 'elasticnet', 'none']
    max_iter = [100, 1000, 2500, 5000]
    c_values = [0.01, 0.1, 0.45, 0.5, 0.65, 1, 1.5, 10, 100, 1000]
    clf = LogisticRegression()

    # NOTE(review): not every solver/penalty pair in this grid is supported by
    # LogisticRegression; confirm that relying on GridSearchCV to score the
    # failing candidates as NaN is intended, or restrict the grid.
    params_grid = {'solver': solvers, 'penalty': penalty, 'C': c_values, 'max_iter': max_iter}

    gS = GridSearchCV(clf, param_grid=params_grid, scoring='accuracy', n_jobs=-1)
    gS_ = gS.fit(X_train, Y_train)
    y_pred = gS_.predict(X_test)
    # The score used to be computed and discarded, so callers received None.
    return accuracy_score(Y_test, y_pred)
    
    
def MultinomialNaiveClassifier(X_train,Y_train,X_test,Y_test):
    """Fit a default MultinomialNB on the training split and return its accuracy on the test split."""
    predictions = MultinomialNB().fit(X_train, Y_train).predict(X_test)
    return accuracy_score(predictions, Y_test)


def GaussianNaiveClassifier(X_train,Y_train,X_test,Y_test):
    """Tune GaussianNB's ``var_smoothing`` by grid search and return the test accuracy.

    The search scores candidates by accuracy using repeated stratified k-fold
    cross-validation; the fitted search then predicts ``X_test`` and the
    result is compared with ``Y_test``.
    """
    # 100 candidate smoothing values, log-spaced from 1e0 down to 1e-9.
    smoothing_grid = {'var_smoothing': np.logspace(0,-9, num=100)}
    # 5 stratified folds, repeated 3 times, with a fixed seed.
    folds = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=999)

    search = GridSearchCV(
        estimator=GaussianNB(),
        param_grid=smoothing_grid,
        cv=folds,
        verbose=1,
        scoring='accuracy',
    )
    search.fit(X_train , Y_train)

    # Accuracy on the held-out split.
    return accuracy_score(Y_test, search.predict(X_test))

def DecisionTree(X_train,Y_train,X_test,Y_test):
    """Grid-search a seeded DecisionTreeClassifier with 10-fold CV and return the test accuracy."""
    search_space = {
        "criterion": ["gini", "entropy"],
        "max_depth": list(range(5, 13)),         # 5 .. 12
        "min_samples_split": list(range(2, 6)),  # 2 .. 5
        "min_samples_leaf": list(range(1, 6)),   # 1 .. 5
    }
    search = GridSearchCV(
        DecisionTreeClassifier(random_state=0),
        param_grid=search_space,
        cv=10,
        verbose=1,
        n_jobs=1,
    )
    search.fit(X_train, Y_train)

    return accuracy_score(search.predict(X_test), Y_test)

def RandomForest(X_train,Y_train,X_test,Y_test):
    """Randomised hyper-parameter search over a RandomForestClassifier.

    Parameters
    ----------
    X_train, Y_train
        Training features and labels handed to ``RandomizedSearchCV.fit``.
    X_test, Y_test
        Held-out features and labels used for the reported score.

    Returns
    -------
    The value of ``accuracy_score`` for the search's predictions on ``X_test``.
    """
    random_grid = {'bootstrap': [True, False],
               'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
               # 'auto' was removed from this list: for RandomForestClassifier it
               # was only an alias of 'sqrt', it was deprecated in scikit-learn 1.1
               # and it is rejected from 1.3 on, which made those candidates fail.
               'max_features': ['sqrt'],
               'min_samples_leaf': [1, 2, 4],
               'min_samples_split': [2, 5, 10],
               'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000] }
    # Seed the forest as well as the search so repeated runs give the same score.
    rf = RandomForestClassifier(random_state=42)
    # 100 sampled candidates, 3-fold CV each, on all available cores.
    rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
    rf_random.fit(X_train,Y_train)

    y_pred = rf_random.predict(X_test)
    return accuracy_score(y_pred,Y_test)

In [None]:
# Collects the bag-of-words score of each classifier, appended by the cells below as they run.
accuracies = []

In [None]:
# Dense NumPy versions of the count matrices and label Series; these are the
# inputs given to the two naive Bayes classifiers further down.
X_temp = X_train_bow.toarray()
Y_temp = Y_train.to_numpy()
X_temp_test = X_test_bow.toarray()
Y_temp_test = Y_test.to_numpy()

In [None]:
## LOGISTIC REGRESSION
## The figures below appear to be accuracies noted from earlier runs.
## 0.652
## 0.732 (simple)
## 0.692 (count vectorizer)
## 0.732 (first stopword removal then lemmatize then countvectorize)
accuracy_logistic = LogisticClassifier(X_train_bow,Y_train,X_test_bow,Y_test)
print(accuracy_logistic)
accuracies.append(accuracy_logistic)

## IMPROVING LR-MODEL

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Stand-alone copy of the grid search performed inside LogisticClassifier(),
# kept at top level so the fitted search object can be inspected in the next cells.
solvers = ['lbfgs' , 'liblinear' , 'newton-cg','sag','saga']
penalty = ['l1', 'l2','elasticnet', 'none']
max_iter = [100, 1000,2500, 5000]
c_values = [0.01 , 0.1 , 0.45 , 0.5  , 0.65 , 1 , 1.5 , 10 , 100 , 1000]
clf = LogisticRegression()

# NOTE(review): not every solver/penalty pair in this grid is supported by
# LogisticRegression — confirm that failed candidates are meant to be skipped.
params_grid = {'solver' : solvers , 'penalty' : penalty , 'C' : c_values , 'max_iter' : max_iter}

gS = GridSearchCV(clf , param_grid = params_grid , scoring = 'accuracy', n_jobs = -1)
gS_ = gS.fit(X_train_bow,Y_train)

In [None]:
# Predictions of the fitted search on the bag-of-words test split.
y_pred = gS_.predict(X_test_bow)

In [None]:
# Hyper-parameter combination selected by the search.
gS_.best_params_

In [None]:
# Test-split accuracy of those predictions.
accuracy_score(y_pred , Y_test)

In [None]:
## Decision tree classifier
## The figures below appear to be accuracies noted from earlier runs.
## 0.64
## 0.632 (simple)
## 0.664 (cv)
accuracy_dt = DecisionTree(X_train_bow,Y_train,X_test_bow,Y_test)
print(accuracy_dt)
accuracies.append(accuracy_dt)

In [None]:
## random forest classifier
## 0.636
## 0.656 (simple)
## 0.62 (cv)
accuracy_random_forest = RandomForest(X_train_bow,Y_train,X_test_bow,Y_test)
print(accuracy_random_forest)
accuracies.append(accuracy_random_forest)

In [None]:
## gaussian naive classifier
## 0.552 (preprocessed own)
## 0.54 (simple)
## 0.52
# Called with the dense arrays (X_temp, ...) rather than the sparse matrices.
accuracy_gaussian = GaussianNaiveClassifier(X_temp,Y_temp,X_temp_test,Y_temp_test)
print(accuracy_gaussian)
accuracies.append(accuracy_gaussian)

In [None]:
## multinomial naive classifier
## 0.624
## 0.676 (simple)
accuracy_multinomial = MultinomialNaiveClassifier(X_temp,Y_temp,X_temp_test,Y_temp_test)
print(accuracy_multinomial)
accuracies.append(accuracy_multinomial)

In [None]:
# Collects the scores of the TF-IDF runs, appended by the cells below as they run.
accuracies_tfidf = []

In [None]:
## changes to be done from here
## ----------------------------------
# Target labels and preprocessed text for the TF-IDF experiments.
Y_tf = data['IsToxic']
X_tf = data['LEMM']

X_train_tf,X_test_tf,Y_train_tf,Y_test_tf = train_test_split(X_tf,Y_tf,random_state = 1)

vectorizer_tf = TfidfVectorizer(lowercase=True,stop_words='english')
vectorizer_tf.fit(X_train_tf)
# Transform with the TF-IDF vectorizer fitted on the line above. This used to
# call the bag-of-words `vectorizer`, which produced raw counts over a
# vocabulary learned from a different train/test split instead of TF-IDF weights.
X_train_tf = vectorizer_tf.transform(X_train_tf)
X_test_tf = vectorizer_tf.transform(X_test_tf)

In [None]:
# Dense NumPy versions of X_train_tf / X_test_tf and of the label Series;
# every classifier call in this section receives these arrays.
X_temp_tf = X_train_tf.toarray()
Y_temp_tf = Y_train_tf.to_numpy()
X_temp_test_tf = X_test_tf.toarray()
Y_temp_test_tf = Y_test_tf.to_numpy()

In [None]:
## logistic classifier tf-idf
## The figures below appear to be accuracies noted from earlier runs.
## 0.668 (pre)
## 0.732 (simple)
## 0.692 (tf)
## 0.696 (stopword->lemmatize->tf-idf)
accuracy_logistic_tf = LogisticClassifier(X_temp_tf,Y_temp_tf,X_temp_test_tf,Y_temp_test_tf)
print(accuracy_logistic_tf)
accuracies_tfidf.append(accuracy_logistic_tf)

In [None]:
# Same logistic-regression grid search as in the bag-of-words section, fitted
# on the *_tf training arrays. The fitted object `gS_` is not read again in
# the cells visible here.
solvers = ['lbfgs' , 'liblinear' , 'newton-cg','sag','saga']
penalty = ['l1', 'l2','elasticnet', 'none']
max_iter = [100, 1000,2500, 5000]
c_values = [0.01 , 0.1 , 0.45 , 0.5  , 0.65 , 1 , 1.5 , 10 , 100 , 1000]
clf = LogisticRegression()

params_grid = {'solver' : solvers , 'penalty' : penalty , 'C' : c_values , 'max_iter' : max_iter}

gS = GridSearchCV(clf , param_grid = params_grid , scoring = 'accuracy', n_jobs = -1)
gS_ = gS.fit(X_temp_tf,Y_temp_tf)

In [None]:
## random forest classifier tf-idf
## 0.628 (tf)
## 0.648 (stopword->lemmatize->tf-idf)
accuracy_random_forest_tf = RandomForest(X_temp_tf,Y_temp_tf,X_temp_test_tf,Y_temp_test_tf)
print(accuracy_random_forest_tf)
accuracies_tfidf.append(accuracy_random_forest_tf)

In [None]:
## Decision tree classifier tf-idf
## 0.664 (tf)
## 0.672 (stopword->lemmatize->tf-idf)
accuracy_dt_tf = DecisionTree(X_temp_tf,Y_temp_tf,X_temp_test_tf,Y_temp_test_tf)
print(accuracy_dt_tf)
accuracies_tfidf.append(accuracy_dt_tf)

In [None]:
## gaussian naive classifier tf-idf
## 0.52(tf)
## 0.532 (stopword->lemmatize->tf-idf)
accuracy_gaussian_tf = GaussianNaiveClassifier(X_temp_tf,Y_temp_tf,X_temp_test_tf,Y_temp_test_tf)
print(accuracy_gaussian_tf)
accuracies_tfidf.append(accuracy_gaussian_tf)

In [None]:
## multinomial naive classifier tf-idf
## 0.676 (simple)
## 0.676 (tf)
## 0.664 (stopword->lemmatize->tf-idf)
accuracy_multinomial_tf = MultinomialNaiveClassifier(X_temp_tf,Y_temp_tf,X_temp_test_tf,Y_temp_test_tf)
print(accuracy_multinomial_tf)
accuracies_tfidf.append(accuracy_multinomial_tf)

In [None]:
# Names of the boolean-typed columns still present in `data`; the loop below
# uses each of them in turn as the target label.
classes = data.select_dtypes(include= ["bool"]).columns

In [None]:
# The loop below appends one row per label to each list. `columns` names the
# fields of a row: the label, then one score per classifier in call order.
data_bow = []
data_tfidf = []
columns = ['Label'   , 'Logistic Regression' , 'Decision Tree' , 'Random Forest' , 'Gaussian NB' , 'Multinomial NB']

In [None]:
# For every boolean label column, train all five classifiers on bag-of-words
# and on TF-IDF features, and record one row of scores per feature type.
for label in classes:

    # BOW
    accuracies = [label]
    Y = data[label]
    X = data['LEMM']
    # Seeded with the same value as the TF-IDF split below so both feature
    # types are scored on the same partition and reruns are reproducible.
    X_train_bow, X_test_bow, Y_train , Y_test = train_test_split(X,Y,random_state = 1)

    vectorizer = CountVectorizer()
    vectorizer.fit(X_train_bow)
    X_train_bow = vectorizer.transform(X_train_bow)
    X_test_bow = vectorizer.transform(X_test_bow)

    # Dense copies for the naive Bayes classifiers.
    X_temp = X_train_bow.toarray()
    Y_temp = Y_train.to_numpy()
    X_temp_test = X_test_bow.toarray()
    Y_temp_test = Y_test.to_numpy()

    ## logistic Regression
    accuracy_logistic = LogisticClassifier(X_train_bow,Y_train,X_test_bow,Y_test)
    accuracies.append(accuracy_logistic)

    ## Decision Tree
    accuracy_dt = DecisionTree(X_train_bow,Y_train,X_test_bow,Y_test)
    accuracies.append(accuracy_dt)

    ## Random Forest
    accuracy_random_forest = RandomForest(X_train_bow,Y_train,X_test_bow,Y_test)
    accuracies.append(accuracy_random_forest)

    ## Gaussian NB
    accuracy_gaussian = GaussianNaiveClassifier(X_temp,Y_temp,X_temp_test,Y_temp_test)
    accuracies.append(accuracy_gaussian)

    ## Multinomial NB
    accuracy_multinomial = MultinomialNaiveClassifier(X_temp,Y_temp,X_temp_test,Y_temp_test)
    accuracies.append(accuracy_multinomial)


    # TF-IDF
    accuracies_tfidf = [label]

    Y_tf = data[label]
    X_tf = data['LEMM']

    X_train_tf,X_test_tf,Y_train_tf,Y_test_tf = train_test_split(X_tf,Y_tf,random_state = 1)

    vectorizer_tf = TfidfVectorizer(lowercase=True,stop_words='english')
    vectorizer_tf.fit(X_train_tf)
    # Transform with the TF-IDF vectorizer fitted above. This used to call the
    # bag-of-words `vectorizer`, so the "TF-IDF" rows were really count features.
    X_train_tf = vectorizer_tf.transform(X_train_tf)
    X_test_tf = vectorizer_tf.transform(X_test_tf)

    X_temp_tf = X_train_tf.toarray()
    Y_temp_tf = Y_train_tf.to_numpy()
    X_temp_test_tf = X_test_tf.toarray()
    Y_temp_test_tf = Y_test_tf.to_numpy()

    ## logistic Regression
    accuracy_logistic_tf = LogisticClassifier(X_temp_tf,Y_temp_tf,X_temp_test_tf,Y_temp_test_tf)
    accuracies_tfidf.append(accuracy_logistic_tf)

    ## Decision Tree
    accuracy_dt_tf = DecisionTree(X_temp_tf,Y_temp_tf,X_temp_test_tf,Y_temp_test_tf)
    accuracies_tfidf.append(accuracy_dt_tf)

    ## Random Forest
    accuracy_random_forest_tf = RandomForest(X_temp_tf,Y_temp_tf,X_temp_test_tf,Y_temp_test_tf)
    accuracies_tfidf.append(accuracy_random_forest_tf)

    ## Gaussian NB
    accuracy_gaussian_tf = GaussianNaiveClassifier(X_temp_tf,Y_temp_tf,X_temp_test_tf,Y_temp_test_tf)
    accuracies_tfidf.append(accuracy_gaussian_tf)

    ## Multinomial NB
    accuracy_multinomial_tf = MultinomialNaiveClassifier(X_temp_tf,Y_temp_tf,X_temp_test_tf,Y_temp_test_tf)
    accuracies_tfidf.append(accuracy_multinomial_tf)


    # One finished row per feature type for this label.
    data_bow.append(accuracies)
    data_tfidf.append(accuracies_tfidf)

In [None]:
# Raw per-label score rows collected by the loop (bag-of-words).
data_bow

In [None]:
# Raw per-label score rows collected by the loop (TF-IDF).
data_tfidf

In [None]:
# Tabulate the rows; `columns` supplies the header names.
df_bow = pd.DataFrame(data_bow , columns= columns)

In [None]:
df_tfidf = pd.DataFrame(data_tfidf , columns= columns)

In [None]:
df_bow

In [None]:
df_tfidf

In [None]:
# Read back the CSV written earlier in the notebook to inspect what was saved.
check = pd.read_csv('out.csv')

In [None]:
check