# Baseline Model: Logistic Regression

In [1]:
#workflow and statistics
import pandas as pd
import numpy as np
import matplotlib as matplotlib
import matplotlib.pyplot as plt

#for showing missing values
import missingno as msno

#visualisation
import seaborn as sns

#natural language processing toolkit
import nltk
import string
from nltk.corpus import stopwords  # removes useless words
from nltk.stem.lancaster import LancasterStemmer  #converts the words to base form; aggressive
from nltk.stem import porter
from nltk.stem.util import suffix_replace, prefix_replace
from nltk.stem.api import StemmerI
from nltk.stem import SnowballStemmer
from nltk.corpus import stopwords
nltk.download('stopwords')

#accessing google cloud storage
#from google.cloud import storage
#from io import BytesIO
#client = storage.Client()
#bucket = "bilderkennung_nf_2020"

#building baseline classifier
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer

#evaluation metrics

from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.metrics import accuracy_score, precision_score, classification_report, confusion_matrix
from sklearn.metrics import f1_score, recall_score, roc_curve
from sklearn.model_selection import cross_val_predict, cross_val_score, cross_validate

# Grid search cross validation
from sklearn.model_selection import GridSearchCV

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/student/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
train = pd.read_csv('./jigsaw-toxic-comment-train.csv')

In [3]:
train.shape

(223549, 8)

## Preprocessing

In [6]:
def remove_punctuation(text):
    '''a function for removing punctuation'''
    import string
    translator = str.maketrans('', '', string.punctuation)
    return text.translate(translator)
train['comment_text'] = train['comment_text'].apply(remove_punctuation)

In [7]:
sw=stopwords.words('english')
def removesw(text):
    '''a function for removing the stopword'''
    # removing the stop words and lowercasing the selected words
    text = [word.lower() for word in text.split() if word.lower() not in sw]
    # joining the list of words with space separator
    return " ".join(text)
train['comment_text'] = train['comment_text'].apply(removesw)

In [8]:
stemmer = SnowballStemmer("english")

def stemming(text):    
    '''a function which stems each word in the given text'''
    text = [stemmer.stem(word) for word in text.split()]
    return " ".join(text) 
train['comment_text'] = train['comment_text'].apply(stemming)

Logistic Regression is [well suited](https://blog.insightdatascience.com/always-start-with-a-stupid-model-no-exceptions-3a22314b9aaa) as a baseline model for classification and natural language processing. Baseline models take less tims to construct since there architecture is relatively simple. They are easier and faster to train, so you can iterate quickly through them. This advantage helps to deal with bugs that point to data issues, as well.
Hence, they give you information to build on in a short time. Baseline models function as benchmarks for more complex models. By studying the shortcomings and struggles of our baseline model we can make decisions on what complex model to deploy next. Therefore, the baseline model also gives a methodological orientation. 

**Verctorize**

In [9]:
vectorizer = CountVectorizer(min_df=0, lowercase=False)
vectorizer.fit(train.comment_text)
vectorizer.vocabulary_

{'explan': 88396,
 'edit': 81549,
 'made': 152874,
 'usernam': 254756,
 'hardcor': 108116,
 'metallica': 159534,
 'fan': 89960,
 'revert': 205028,
 'werent': 263330,
 'vandal': 256984,
 'closur': 59454,
 'gas': 99293,
 'vote': 260309,
 'new': 170572,
 'york': 275297,
 'doll': 77451,
 'fac': 89129,
 'pleas': 189250,
 'dont': 77738,
 'remov': 203483,
 'templat': 238821,
 'talk': 233306,
 'page': 182321,
 'sinc': 219400,
 'im': 125883,
 'retir': 204811,
 'now892053827': 175196,
 'daww': 69837,
 'match': 156091,
 'background': 37614,
 'colour': 60798,
 'seem': 214187,
 'stuck': 228505,
 'thank': 240439,
 '2151': 9420,
 'januari': 134314,
 '11': 2217,
 '2016': 8578,
 'utc': 256092,
 'hey': 110914,
 'man': 154213,
 'realli': 201148,
 'tri': 246454,
 'war': 261476,
 'guy': 106097,
 'constant': 63031,
 'relev': 203167,
 'inform': 129541,
 'instead': 130323,
 'care': 52267,
 'format': 95277,
 'actual': 22613,
 'info': 129465,
 'cant': 51839,
 'make': 153683,
 'real': 201076,
 'suggest': 229888,

In [10]:
vectorizer.transform(train.comment_text).toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

**Train Test Split**

In [11]:
X = train['comment_text']#.values
y = train['toxic']#.values

In [12]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=1000)

# shape of train and test splits
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((167661,), (55888,), (167661,), (55888,))

In [15]:
vectorizer = CountVectorizer()
vectorizer.fit(X_train)

X_train = vectorizer.transform(X_train)
X_test  = vectorizer.transform(X_test)
X_train

<167661x240435 sparse matrix of type '<class 'numpy.int64'>'
	with 4462743 stored elements in Compressed Sparse Row format>

**Baseline Model Logistic Regression**

In [16]:
classifier = LogisticRegression()
classifier.fit(X_train, y_train)
score = classifier.score(X_test, y_test)

print("Accuracy:", score)

Accuracy: 0.9491125107357572


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [17]:
preds = classifier.predict(X_test)

In [18]:
print("Accuracy:", accuracy_score(y_test, preds))
print("Precision:", precision_score(y_test, preds))
print(classification_report(y_test, preds))
print(confusion_matrix(y_test, preds))

Accuracy: 0.9491125107357572
Precision: 0.7913915382734165
              precision    recall  f1-score   support

           0       0.96      0.98      0.97     50661
           1       0.79      0.62      0.69      5227

    accuracy                           0.95     55888
   macro avg       0.88      0.80      0.83     55888
weighted avg       0.95      0.95      0.95     55888

[[49808   853]
 [ 1991  3236]]
