In [1]:
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [2]:
# Load the cleaned attack dataset from the project's data directory
attacks = pd.read_csv(filepath_or_buffer='../data/attack_clean.csv')

In [3]:
# Preview the first five rows
attacks.head(n=5)

Unnamed: 0,rev_id,comment,target
0,37675,This is not creative Those are the dictionar...,0
1,44816,the term standard model is itself less NPOV...,0
2,49851,True or false the situation as of March 2002...,0
3,89320,Next maybe you could work on being less conde...,0
4,93890,This page will need disambiguation,0


In [4]:
# Relative frequency of each label in the 'target' column
attacks.loc[:, 'target'].value_counts(normalize=True)

0    0.865338
1    0.134662
Name: target, dtype: float64

In [5]:
def split_data(data, pct_positive, test_size=None, train_size=None,
               random_state=None):
    """
    A custom train_test_split implementation that creates a training dataset
    with a specific proportion of positive classes. This is meant to
    combat class imbalance in training data ONLY. Testing data is a plain
    random sample of `data` and so retains the original class proportions.

    Args:
        data -- dataframe with a 'comment' column (returned as X) and a
            'target' column (returned as y; 1 = positive, 0 = negative).
            Test rows are removed from the training pool by index label.
        pct_positive -- desired proportion of positive classes in training data.
            (float, 0 < pct_positive <= 1)
        test_size -- number of samples to include in the testing dataset.
            (integer, default None --> 30% of the full dataset)
        train_size -- number of samples to include in the training dataset.
            (integer, default None --> maximum training size for given
            parameters)
        random_state -- seed passed to every sampling step (test sample,
            positive sample, negative sample and final shuffle) so that the
            whole split is reproducible. (default None --> unseeded)

    Returns:
        Tuple (X_train, X_test, y_train, y_test) of pandas Series.

    Raises:
        ValueError -- if pct_positive is not in the interval (0, 1].

    Warns:
        UserWarning -- if train_size has to be reduced because there are not
            enough positive or negative rows to honour the request.
    """
    # Reject proportions that would divide by zero or ask for a negative
    # number of samples further down.
    if not 0 < pct_positive <= 1:
        raise ValueError(
            f'pct_positive must be in the interval (0, 1], got {pct_positive}')

    # get test data
    if test_size:
        num_test_samples = int(test_size)
    else:
        num_test_samples = int(0.3 * data.shape[0])

    df_test = data.sample(num_test_samples, random_state=random_state)
    X_test = df_test['comment']
    y_test = df_test['target']

    # get train data
    # separate positive and negative classes
    df_train = data.drop(df_test.index)
    df_train_pos = df_train[df_train['target'] == 1]
    df_train_neg = df_train[df_train['target'] == 0]
    num_neg_available = df_train_neg.shape[0]

    # some input validation for train_size
    # (the positive class limits how large the training set can be)
    max_train_size = int(df_train_pos.shape[0] / pct_positive)
    if not train_size:
        train_size = max_train_size
    elif train_size > max_train_size:
        warnings.warn(f'train_size of {train_size} exceeds the amount of '
                      'training data available for the given parameters. '
                      f'Resetting train size to {max_train_size}')
        train_size = max_train_size
    else:
        train_size = int(train_size)

    # assemble train dataframe
    num_pos_samples = int(pct_positive * train_size)
    num_neg_samples = train_size - num_pos_samples

    # A small pct_positive can require more negatives than exist; shrink the
    # training set instead of letting DataFrame.sample raise.
    # (num_neg_samples > 0 here implies pct_positive < 1, so no zero division.)
    if num_neg_samples > num_neg_available:
        train_size = int(num_neg_available / (1 - pct_positive))
        warnings.warn('Not enough negative samples available for the given '
                      f'parameters. Resetting train size to {train_size}')
        num_pos_samples = int(pct_positive * train_size)
        num_neg_samples = min(train_size - num_pos_samples, num_neg_available)

    # Seed BOTH class samples and the shuffle so the split is reproducible.
    df_train = pd.concat([
        df_train_pos.sample(num_pos_samples, random_state=random_state),
        df_train_neg.sample(num_neg_samples, random_state=random_state)
    ]).sample(frac=1, random_state=random_state)

    X_train = df_train['comment']
    y_train = df_train['target']

    return (X_train, X_test, y_train, y_test)

In [6]:
# Create custom train/test split using the function Christian created, so that
# the training data has equal class proportions. A fixed random_state seeds the
# sampling so the split does not change on every run of the notebook.

X_train, X_test, y_train, y_test = split_data(data = attacks, pct_positive = 0.5, random_state = 42)

In [7]:
# Set up an empty dataframe that collects one row of results per fitted model

column_names = [
    'model',
    'file',
    'cv__stop_words',
    'cv__max_df',
    'cv__max_features',
    'cv__min_df',
    'cv__ngram_range',
    'training_score',
    'test_score',
]

model_df = pd.DataFrame(columns=column_names)

# Create function to add results to model df

def add_to_model(df, model, name, file):
    """
    Return `df` extended by one row describing a fitted grid search.

    Args:
        df -- results dataframe to extend (it is not modified in place).
        model -- fitted search object exposing `best_params_` (containing the
            five 'cv__*' keys recorded below) and `score(X, y)`.
        name -- label stored in the 'model' column.
        file -- label stored in the 'file' column.

    Returns:
        A new dataframe holding the rows of `df` followed by the new row,
        indexed 0..n-1.

    Note:
        Scores are computed on the notebook-level X_train / y_train /
        X_test / y_test variables, which must exist before calling this.
    """
    best_params = model.best_params_
    record = {'model'           : name,
              'file'            : file,
              'cv__stop_words'  : best_params['cv__stop_words'],
              'cv__max_df'      : best_params['cv__max_df'],
              'cv__max_features': best_params['cv__max_features'],
              'cv__min_df'      : best_params['cv__min_df'],
              'cv__ngram_range' : best_params['cv__ngram_range'],
              'training_score'  : model.score(X_train, y_train),
              'test_score'      : model.score(X_test, y_test)}
    new_row = pd.DataFrame([record])

    # DataFrame.append was removed in pandas 2.0, so build the result with
    # pd.concat. An empty `df` is handled separately: concatenating an empty
    # frame triggers a dtype FutureWarning in recent pandas versions.
    if len(df) == 0:
        columns = list(df.columns) + [col for col in new_row.columns
                                      if col not in df.columns]
        return new_row.reindex(columns=columns)
    return pd.concat([df, new_row], ignore_index=True)

In [8]:
# Two-step pipeline: count-vectorize the text ('cv'), then fit a logistic regression ('lr')

pipe = Pipeline(steps=[
    ('cv', CountVectorizer()),
    ('lr', LogisticRegression(max_iter=1000)),
])

In [19]:
# Hyperparameter grid for the pipeline's 'cv' (CountVectorizer) step

pipe_params = dict(
    cv__stop_words=[None, 'english'],
    cv__max_features=[11_000, 12_000, 13_000],
    cv__min_df=[2, 3],
    cv__max_df=[.80, .85],
    cv__ngram_range=[(1, 1), (2, 2)],
)

In [20]:
# Set up a 5-fold cross-validated grid search over the pipeline parameters

gs = GridSearchCV(pipe, pipe_params, cv=5)

In [21]:
# Run the grid search on the training split

gs.fit(X=X_train, y=y_train)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('cv', CountVectorizer()),
                                       ('lr',
                                        LogisticRegression(max_iter=1000))]),
             param_grid={'cv__max_df': [0.8, 0.85],
                         'cv__max_features': [11000, 12000, 13000],
                         'cv__min_df': [2, 3],
                         'cv__ngram_range': [(1, 1), (2, 2)],
                         'cv__stop_words': [None, 'english']})

In [22]:
# Record the fitted search in the results table.
# model_df is reassigned from itself, so every run of this cell adds another row.
model_df = add_to_model(
    df=model_df,
    model=gs,
    name='logistic regression',
    file='attacks',
)

In [23]:
# Display the accumulated results table
model_df

Unnamed: 0,model,file,cv__stop_words,cv__max_df,cv__max_features,cv__min_df,cv__ngram_range,training_score,test_score
0,logistic regression,attacks,,0.9,10000,2,"(1, 1)",0.956063,0.883812
1,logistic regression,attacks,,0.85,12000,2,"(1, 1)",0.960635,0.884822
2,logistic regression,attacks,,0.8,12000,2,"(1, 1)",0.960635,0.884822


In [24]:
# Append the result rows to the results CSV; neither a header row nor the
# index is written.
model_df.to_csv(
    path_or_buf='../data/lr_model_results.csv',
    mode='a',
    header=None,
    index=False,
)