In [1]:
import warnings
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [2]:
# Load the cleaned toxicity dataset; later cells read its 'comment' (text)
# and 'toxicity' (0/1 label) columns.
toxic = pd.read_csv('../data/toxicity_cleaned.csv')

In [3]:
# Preview the first rows to sanity-check the load and the column layout.
toxic.head()

Unnamed: 0,rev_id,comment,toxicity
0,2232,This One can make an analogy in mathematical ...,0
1,4216,"Clarification for you (and Zundark's right, i...",0
2,8953,Elected or Electoral? JHK,0
3,26547,This is such a fun entry. DevotchkaI once ha...,0
4,28959,Please relate the ozone hole to increases in c...,0


In [4]:
# Share of each label (normalize=True returns proportions, not counts).
# The majority-class share is the accuracy baseline a model must beat.
toxic['toxicity'].value_counts(normalize=True)

0    0.884373
1    0.115627
Name: toxicity, dtype: float64

In [5]:
def split_data(data, pct_positive, test_size=None, train_size=None,
               random_state=None):
    """
    A custom train_test_split implementation that creates a training dataset
    with a specific proportion of positive classes. This is meant to
    combat class imbalance in training data ONLY. Testing data will retain the
    original class proportions, because it is drawn by plain random sampling
    before any rebalancing happens.

    Args:
        data -- dataframe with a 'comment' column (model input) and a binary
            'toxicity' column (1 = positive class). Test rows are removed
            from the training pool by index label, so the index should be
            unique. (Originally documented as the output of
            merge_with_target(), which is not defined in this notebook.)
        pct_positive -- desired proportion of positive classes in training
            data. (float, 0 < pct_positive <= 1)
        test_size -- number of samples to include in the testing dataset.
            (integer, default None --> 30% of the full dataset)
        train_size -- number of samples to include in the training dataset.
            (integer, default None --> maximum training size for given
            parameters)
        random_state -- seed (or random state object) forwarded to every
            pandas ``sample`` call so the whole split is reproducible.
            (default None --> non-deterministic split)

    Returns:
        (X_train, X_test, y_train, y_test) -- the 'comment' and 'toxicity'
        columns of the rebalanced, shuffled training rows and of the test
        rows.

    Raises:
        ValueError -- if pct_positive is not in (0, 1], or (raised by pandas)
            if more negatives are requested than the training pool holds.
    """
    if not 0 < pct_positive <= 1:
        raise ValueError(
            f'pct_positive must be in the interval (0, 1], got {pct_positive}')

    # get test data
    if test_size:
        num_test_samples = int(test_size)
    else:
        num_test_samples = int(0.3 * data.shape[0])

    df_test = data.sample(num_test_samples, random_state=random_state)
    X_test = df_test['comment']
    y_test = df_test['toxicity']

    # get train data
    # separate positive and negative classes
    df_train = data.drop(df_test.index)
    df_train_pos = df_train[df_train['toxicity'] == 1]
    df_train_neg = df_train[df_train['toxicity'] == 0]

    # The positive class is the scarce one, so it bounds the training size:
    # using every available positive at the requested ratio gives the maximum.
    max_train_size = int(df_train_pos.shape[0] / pct_positive)
    if not train_size:
        train_size = max_train_size
    elif train_size > max_train_size:
        warnings.warn(f'train_size of {train_size} exceeds the amount of '
                      'training data available for the given parameters. '
                      f'Resetting train size to {max_train_size}',
                      stacklevel=2)
        train_size = max_train_size
    else:
        train_size = int(train_size)

    # assemble train dataframe
    num_pos_samples = int(pct_positive * train_size)
    num_neg_samples = train_size - num_pos_samples

    # Seed *both* class samples and the final shuffle; leaving any one of them
    # unseeded makes the split differ between runs.
    df_train = pd.concat([
        df_train_pos.sample(num_pos_samples, random_state=random_state),
        df_train_neg.sample(num_neg_samples, random_state=random_state),
    ]).sample(frac=1, random_state=random_state)

    X_train = df_train['comment']
    y_train = df_train['toxicity']

    return (X_train, X_test, y_train, y_test)

In [6]:
# Build the train/test split with split_data: the training set is resampled to
# 50% toxic / 50% non-toxic, while the test set keeps the original class mix.
# A fixed seed is passed so the sampling does not change on every Run All.
X_train, X_test, y_train, y_test = split_data(
    data=toxic, pct_positive=0.5, random_state=42)

In [7]:
# Empty results table: one row per fitted model, holding the chosen
# CountVectorizer hyperparameters and the train/test scores.
column_names = [
    'model',
    'cv__max_df',
    'cv__max_features',
    'cv__min_df',
    'training_score',
    'test_score',
]
model_df = pd.DataFrame(columns=column_names)

# Create function to add results to model df

def add_to_model(df, model, name, train=None, test=None):
    """
    Return a copy of the results table with one row appended for `model`.

    Args:
        df -- results dataframe to extend (left unmodified).
        model -- fitted search object exposing `best_params_` (with the
            'cv__max_df', 'cv__max_features' and 'cv__min_df' keys) and
            `score(X, y)`.
        name -- label stored in the 'model' column.
        train -- optional (X, y) pair used for 'training_score'.
            (default None --> the notebook-level X_train / y_train)
        test -- optional (X, y) pair used for 'test_score'.
            (default None --> the notebook-level X_test / y_test)

    Returns:
        A new dataframe: `df` plus the new row, with a fresh 0..n-1 index.
    """
    # Only touch the notebook globals when the caller did not pass data in.
    X_tr, y_tr = (X_train, y_train) if train is None else train
    X_te, y_te = (X_test, y_test) if test is None else test

    row = pd.DataFrame([{
        'model': name,
        'cv__max_df': model.best_params_['cv__max_df'],
        'cv__max_features': model.best_params_['cv__max_features'],
        'cv__min_df': model.best_params_['cv__min_df'],
        'training_score': model.score(X_tr, y_tr),
        'test_score': model.score(X_te, y_te),
    }])

    # DataFrame.append was removed in pandas 2.0, so rows are added via concat.
    # An empty table is handled separately: concatenating an empty frame is
    # deprecated in recent pandas and would force every column to object dtype.
    if len(df) == 0:
        extra = [col for col in row.columns if col not in df.columns]
        return row.reindex(columns=[*df.columns, *extra])
    return pd.concat([df, row], ignore_index=True)

In [8]:
# Two-step pipeline: bag-of-words counts ('cv') feeding a logistic
# regression classifier ('lr'). The step names are the prefixes used in the
# grid-search parameter keys.
pipe = Pipeline(steps=[
    ('cv', CountVectorizer()),
    ('lr', LogisticRegression(max_iter=1000)),
])

In [10]:
# Hyperparameter grid for the CountVectorizer step (keys are
# '<step name>__<parameter>').
pipe_params = dict(
    cv__max_features=[3000, 5000, 8000, 10000, 12000],
    cv__min_df=[2, 3],
    cv__max_df=[0.9, 0.95],
)

In [11]:
# Grid search over the pipeline's parameter grid with 5-fold cross-validation
gs = GridSearchCV(pipe, pipe_params, cv=5)

In [12]:
# Fit the grid search on the rebalanced training split only; the test split
# is held out and is first touched when the model is scored.
# NOTE(review): this is the slow cell (every grid combination x 5 folds).

gs.fit(X_train, y_train)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('cv', CountVectorizer()),
                                       ('lr',
                                        LogisticRegression(max_iter=1000))]),
             param_grid={'cv__max_df': [0.9, 0.95],
                         'cv__max_features': [3000, 5000, 8000, 10000, 12000],
                         'cv__min_df': [2, 3]})

In [13]:
# Record the best parameters and train/test scores for this search.
# NOTE(review): this cell is not idempotent - model_df is reassigned from
# itself, so re-running the cell adds a duplicate row.
model_df = add_to_model(model_df, gs, 'logistic regression - toxic')

In [14]:
# Show the results table. The training score is measured on the rebalanced
# training split, the test score on the original (imbalanced) class mix.
# NOTE(review): compare the test score against the majority-class share.
model_df

Unnamed: 0,model,cv__max_df,cv__max_features,cv__min_df,training_score,test_score
0,logistic regression - toxic,0.9,10000,3,0.962645,0.898008


In [15]:
# Append this run's results to the shared CSV. The header is written only when
# the file is new or empty; otherwise each run would insert another header
# line into the middle of the data.
results_path = Path('../data/lr_model_results.csv')
write_header = not results_path.exists() or results_path.stat().st_size == 0
model_df.to_csv(results_path, index=False, mode='a', header=write_header)