# Logistic Regression and different pre-process techniques

In this notebook the aim is to try different pre-process techniques and NLP models with Logistic regression classification

## Set up

In [1]:
import pandas as pd
import numpy as np
import time
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, precision_score, recall_score, confusion_matrix
import spacy # (object oriented)
import nltk # natural language tool kit (string oriented)
from nltk.tokenize import word_tokenize
from nltk import ngrams
from collections import Counter
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords

nltk.download('stopwords')
nltk.download('wordnet')

#uncomment and Run this lines only once
#!python -m spacy download en_core_web_lg


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\eric_\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\eric_\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

## Getting data ready

In [2]:
data = pd.read_csv('data/data_usampl_60_40_cleaned.csv') #nrows only to get the fist 500 rows in the data for speed up testing
#please remove nrows parameter if you want to try with the complete data set

In [3]:
df_train = data[['raw', 'toxic']]

In [4]:
df_train.head()

Unnamed: 0,raw,toxic
0,Trudeau with a brain? I assume you are taking...,1
1,The Jones Act was immediately lifted to help T...,1
2,As long as the Church keeps preventing the Lor...,0
3,"Climate change, in the sense discussed in the ...",0
4,Fake news...now she is lying. figures....she i...,1


## Split Train and Test

In [5]:
#split the data in train and test

X_train, X_test, y_train, y_test = train_test_split(df_train['raw'], df_train['toxic'], random_state=42)

## Function to record different models performance

In [6]:
# initialize dataframe that will include the results
results_table = pd.DataFrame()


In [7]:
def evaluate_model(model, X_train,y_train,X_test,y_test,results_df,model_name="", parameters='', comments=''):
    start_time = time.time()
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    predict_probab = model.predict_proba(X_test)[:,1]
    duration = time.time() - start_time
    duration_format = f"{int(duration // 60)} minutes and {round(duration % 60, 2)} seconds"

    # Calculating all metrics

    accuracy = accuracy_score(y_test, predictions)
    f1 = f1_score(y_test, predictions)
    roc_auc = roc_auc_score(y_test, predictions)
    precision = precision_score(y_test, predictions)
    recall = recall_score(y_test, predictions)
    conf_matrix = str(confusion_matrix(y_test, predictions))

    # Create a dictionary including the results
    results = {
        'Name': model_name if model_name else model.__class__.__name__,
        'Parameters': parameters,
        'F1-Score': f1,
        'AUC-ROC': roc_auc,
        'Precision': precision,
        'Recall': recall,
        'Accuracy': accuracy,
        'Confusion Matrix': conf_matrix,
        'Training Time': duration_format,
        'Comments': comments
    }

    # Convert the dictionary to a DataFrame
    new_row_df = pd.DataFrame([results])
    # don't forget to append the result to the results dataframe
    results_df = pd.concat([results_df, new_row_df], ignore_index=True)

    return results_df

## Bag of Words (Baseline)

In [8]:
#Fit the CountVectorizer to the training data
vect = CountVectorizer().fit(X_train)

# Prepare X_train for the function, transforming the different comments in the training data to a sparse matrix
X_train_vectorized = vect.transform(X_train)
# Prepare X_test for the function
X_test_vectorized = vect.transform(X_test)

# Initialize the model you want to try
model = LogisticRegression(max_iter=1500)

# Call the function and store the row in the variable result
results_table = evaluate_model(model, X_train_vectorized, y_train, X_test_vectorized, y_test,results_table, parameters="", comments="Baseline" )


## Bag of Words(Binary)

In [9]:
#Fit the CountVectorizer to the training data
vect = CountVectorizer(binary=True).fit(X_train)

# Prepare X_train for the function, transforming the different comments in the training data to a sparse matrix
X_train_vectorized = vect.transform(X_train)
# Prepare X_test for the function
X_test_vectorized = vect.transform(X_test)

# Initialize the model you want to try
model = LogisticRegression(max_iter=1500)

# Call the function and store the row in the variable result
results_table = evaluate_model(model, X_train_vectorized, y_train, X_test_vectorized, y_test,results_table, parameters="binary", comments="Bag of words - Binary" )

## Bag of Words (Binary + Stop Words)

In [10]:
stop_words = set(stopwords.words('english'))                       
# stop_words contains a list of 179 words that we want to remove from our comments

In [11]:
#Fit the CountVectorizer to the training data
vect = CountVectorizer(binary=True, stop_words=list(stop_words)).fit(X_train)

# Prepare X_train for the function, transforming the different comments in the training data to a sparse matrix
X_train_vectorized = vect.transform(X_train)
# Prepare X_test for the function
X_test_vectorized = vect.transform(X_test)

# Initialize the model you want to try
model = LogisticRegression(max_iter=1500)

# Call the function and store the row in the variable result
results_table = evaluate_model(model, X_train_vectorized, y_train, X_test_vectorized, y_test,results_table, parameters="binary,stopwords", comments="Bag of words - Binary/StopWords" )

# TF - IDF + LogisticRegression

In [12]:
# Initialize the TfidfVectorizer with min_df
tfidf_vect = TfidfVectorizer(min_df=30)

# Prepare X_train for the function
X_train_tfidf = tfidf_vect.fit_transform(X_train)

# Prepare X_test for the function
X_test_tfidf = tfidf_vect.transform(X_test)

# Initialize the model you want to try
model = LogisticRegression(max_iter=1500)

# Call the function and store the row in the variable result
results_table = evaluate_model(model, X_train_tfidf, y_train, X_test_tfidf, y_test,results_table, parameters="min_df=30", comments="TfidfVectorizer" )


## Preprocess techniques

### Stemming(Bag of words) + LogisticRegression

In [13]:

# Initializing stemmer and countvectorizer 
stemmer = nltk.PorterStemmer()
cv_analyzer = CountVectorizer().build_analyzer()

def stemmed_words(doc):
    ''' 
    In this function the text is first passed through the build_analyzer() and then each word in the text is stemmed to its base form
    '''
    return (stemmer.stem(w) for w in cv_analyzer(doc))

# define CountVectorizer with stemming function 
stem_vectorizer = CountVectorizer(analyzer = stemmed_words)

# Prepare X_train for the function
X_train_stem_vectorized = stem_vectorizer.fit_transform(X_train)

# Prepare X_test for the function
X_test_stem_vectorized = stem_vectorizer.transform(X_test)

# Initialize the model you want to try
model = LogisticRegression(max_iter=2500)

# Call the function and store the row in the variable result
results_table = evaluate_model(model, X_train_stem_vectorized, y_train, X_test_stem_vectorized, y_test, results_table, parameters="", comments="Stemming_cv")

### Stemming(Bag of words(stopwords))

In [14]:

# Initializing stemmer and countvectorizer with Stop Words
stemmer = nltk.PorterStemmer()
cv_analyzer = CountVectorizer(stop_words=list(stop_words)).build_analyzer()

def stemmed_words(doc):
    ''' 
    In this function the text is first passed through the build_analyzer() and then each word in the text is stemmed to its base form
    '''
    return (stemmer.stem(w) for w in cv_analyzer(doc))

# define CountVectorizer with stemming function 
stem_vectorizer = CountVectorizer(analyzer = stemmed_words)

# Prepare X_train for the function
X_train_stem_vectorized = stem_vectorizer.fit_transform(X_train)

# Prepare X_test for the function
X_test_stem_vectorized = stem_vectorizer.transform(X_test)

# Initialize the model you want to try
model = LogisticRegression(max_iter=2500)

# Call the function and store the row in the variable result
results_table = evaluate_model(model, X_train_stem_vectorized, y_train, X_test_stem_vectorized, y_test,results_table, parameters="stopwords", comments="Stemming_cv")



### Stemming with TF - IDF and stopwords

In [15]:
stop_words = set(stopwords.words('english'))
                         
# stop_words contains a list of 179 words that we want to remove from our comments

# Initializing stemmer and countvectorizer with Stop Words
stemmer = nltk.PorterStemmer()
tfidf_analyzer = TfidfVectorizer(min_df=30, stop_words=list(stop_words)).build_analyzer()

def stemmed_words(doc):
    ''' 
    In this function the text is first passed through the build_analyzer() and then each word in the text is stemmed to its base form
    '''
    return (stemmer.stem(w) for w in cv_analyzer(doc))

# define CountVectorizer with stemming function 
stem_vectorizer = CountVectorizer(analyzer = stemmed_words)

# Prepare X_train for the function
X_train_stem_vectorized = stem_vectorizer.fit_transform(X_train)

# Prepare X_test for the function
X_test_stem_vectorized = stem_vectorizer.transform(X_test)

# Initialize the model you want to try
model = LogisticRegression(max_iter=2500)

# Call the function and store the row in the variable result
results_table = evaluate_model(model, X_train_stem_vectorized, y_train, X_test_stem_vectorized, y_test,results_table, parameters="min_df=30, stopwords", comments="Stemming_tfidf")


### Lemmatization with Bag of Words

In [16]:
# Initialization
WNlemma = nltk.WordNetLemmatizer()
cv_analyzer = CountVectorizer().build_analyzer()

def lemmatize_word(doc):
    ''' 
    In this function the text is first passed through the build_analyzer() and then each word in the text is stemmed to its base form
    '''
    return (WNlemma.lemmatize(t) for t in cv_analyzer(doc))

# define CountVectorizer with Lemmatization function 
lemm_vectorizer = CountVectorizer(analyzer = lemmatize_word)

# Prepare X_train for the function
X_train_lemm_vectorized = lemm_vectorizer.fit_transform(X_train)
# Prepare X_test for the function
X_test_lemm_vectorized  = lemm_vectorizer.transform(X_test)

# Initialize the model you want to try
model = LogisticRegression(max_iter=2500)

# Call the function and store the row in the variable result
results_table = evaluate_model(model, X_train_lemm_vectorized, y_train, X_test_lemm_vectorized, y_test,results_table, parameters="", comments="lemmatization_cv")



### Lemmatization with TF-IDF

In [17]:
# Initialization
WNlemma = nltk.WordNetLemmatizer()
cv_analyzer = TfidfVectorizer(min_df=30).build_analyzer()

def lemmatize_word(doc):
    ''' 
    In this function the text is first passed through the build_analyzer() and then each word in the text is stemmed to its base form
    '''
    return (WNlemma.lemmatize(t) for t in cv_analyzer(doc))

# define CountVectorizer with Lemmatization function 
lemm_vectorizer = CountVectorizer(analyzer = lemmatize_word)

# Prepare X_train for the function
X_train_lemm_vectorized = lemm_vectorizer.fit_transform(X_train)
# Prepare X_test for the function
X_test_lemm_vectorized  = lemm_vectorizer.transform(X_test)

# Initialize the model you want to try
model = LogisticRegression(max_iter=2500)

# Call the function and store the row in the variable result
results_table = evaluate_model(model, X_train_lemm_vectorized, y_train, X_test_lemm_vectorized, y_test,results_table, parameters="min_df=30", comments="lemmatization_tfidf")


### Lemmatization with Stopwords

In [18]:

# Initialization

stop_words = set(stopwords.words('english'))

WNlemma = nltk.WordNetLemmatizer()
cv_analyzer = CountVectorizer(stop_words=list(stop_words)).build_analyzer()

def lemmatize_word(doc):
    ''' 
    In this function the text is first passed through the build_analyzer() and then each word in the text is stemmed to its base form
    '''
    return (WNlemma.lemmatize(t) for t in cv_analyzer(doc))

# define CountVectorizer with Lemmatization function 
lemm_vectorizer = CountVectorizer(analyzer = lemmatize_word)

# Prepare X_train for the function
X_train_lemm_vectorized = lemm_vectorizer.fit_transform(X_train)
# Prepare X_test for the function
X_test_lemm_vectorized  = lemm_vectorizer.transform(X_test)

# Initialize the model you want to try
model = LogisticRegression(max_iter=2500)

# Call the function and store the row in the variable result
results_table = evaluate_model(model, X_train_lemm_vectorized, y_train, X_test_lemm_vectorized, y_test,results_table, parameters="stopwords", comments="lemmatization_cv")

## Word Vectors - Spacy library - Large

In [19]:
# This initialize a pre-trained model (the large version) that uses Neural Networks to build word vectors
nlp = spacy.load("en_core_web_lg")

# convert words into vectors and Prepare X_train for the function
docs = [nlp(text) for text in X_train]
X_train_word_vectors = [x.vector for x in docs]

# Prepare X_test for the function
docs_test = [nlp(text) for text in X_test]
X_test_word_vectors = [x.vector for x in docs_test]

# Initialize the model you want to try
model = LogisticRegression(max_iter=2500)

# Call the function and store the row in the variable result
results_table = evaluate_model(model, X_train_word_vectors, y_train, X_test_word_vectors, y_test,results_table, parameters="", comments="word_vectors_spacy_lg")

## Results

In [20]:
results_table

Unnamed: 0,Name,Parameters,F1-Score,AUC-ROC,Precision,Recall,Accuracy,Confusion Matrix,Training Time,Comments
0,LogisticRegression,,0.463768,0.625071,0.615385,0.372093,0.704,[[72 10]\n [27 16]],0 minutes and 0.11 seconds,Baseline
1,LogisticRegression,binary,0.457143,0.618973,0.592593,0.372093,0.696,[[71 11]\n [27 16]],0 minutes and 0.06 seconds,Bag of words - Binary
2,LogisticRegression,"binary,stopwords",0.474576,0.650596,0.875,0.325581,0.752,[[80 2]\n [29 14]],0 minutes and 0.03 seconds,Bag of words - Binary/StopWords
3,LogisticRegression,min_df=30,0.214286,0.527085,0.461538,0.139535,0.648,[[75 7]\n [37 6]],0 minutes and 0.01 seconds,TfidfVectorizer
4,LogisticRegression,,0.465753,0.618406,0.566667,0.395349,0.688,[[69 13]\n [26 17]],0 minutes and 0.08 seconds,Stemming_cv
5,LogisticRegression,stopwords,0.477612,0.637266,0.666667,0.372093,0.72,[[74 8]\n [27 16]],0 minutes and 0.12 seconds,Stemming_cv
6,LogisticRegression,"min_df=30, stopwords",0.477612,0.637266,0.666667,0.372093,0.72,[[74 8]\n [27 16]],0 minutes and 0.04 seconds,Stemming_tfidf
7,LogisticRegression,,0.478873,0.630601,0.607143,0.395349,0.704,[[71 11]\n [26 17]],0 minutes and 0.07 seconds,lemmatization_cv
8,LogisticRegression,min_df=30,0.478873,0.630601,0.607143,0.395349,0.704,[[71 11]\n [26 17]],0 minutes and 0.06 seconds,lemmatization_tfidf
9,LogisticRegression,stopwords,0.412698,0.60848,0.65,0.302326,0.704,[[75 7]\n [30 13]],0 minutes and 0.03 seconds,lemmatization_cv
