In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
aggression = pd.read_csv('../data/aggression_clean_data.csv')

In [3]:
aggression.head()

Unnamed: 0,rev_id,comment,year,logged_in,ns,sample,split,aggression,aggression_score,label
0,37675,This is not creative Those are the dictionary...,2002,True,article,random,train,0.1,0.0,0
1,44816,the term standard model is itself less NPOV t...,2002,True,article,random,train,0.0,0.111111,0
2,49851,True or false the situation as of March 2002 w...,2002,True,article,random,train,0.0,0.1,0
3,89320,Next maybe you could work on being less conde...,2002,True,article,random,dev,0.444444,-0.444444,0
4,93890,This page will need disambiguation,2002,True,article,random,train,0.0,0.333333,0


In [4]:
aggression['label'].value_counts(normalize=True)

0    0.852805
1    0.147195
Name: label, dtype: float64

In [5]:
def split_data(data, pct_positive, test_size=None, train_size=None,
random_state=None):
    """
    A custom train_test_split implementation that creates a training dataset
    with a specific proportion of positive classes. This is meant to
    combat class imbalance in training data ONLY. Testing data will retain the
    original class proportions.
    Args:
        data -- the dataframe acquired from merge_with_target()
        pct_positive -- desired proportion of positive classes in training data.
            (float between 0 and 1)
        test_size -- number of samples to include in the testing dataset.
            (integer, default None --> 20% of the full dataset)
        train_size -- number of samples to include in the training dataset.
            (integer, default None --> maximum training size for given
            parameters)
    """
    # get test data
    if test_size:
        num_test_samples = int(test_size)
    else:
        num_test_samples = int(0.3 * data.shape[0])

    df_test = data.sample(num_test_samples, random_state=random_state)
    X_test = df_test['comment']
    y_test = df_test['label']

    # get train data
    # separate positive and negative classes
    df_train = data.drop(df_test.index)
    df_train_pos = df_train[df_train['label'] == 1]
    df_train_neg = df_train[df_train['label'] == 0]

    # some input validation for train_size
    max_train_size = int(df_train_pos.shape[0] / pct_positive)
    if not train_size:
        train_size = max_train_size
    elif train_size > max_train_size:
        warnings.warn(f'train_size of {train_size} exceeds the amount of '
        'training data available for the given parameters. '
        f'Resetting train size to {max_train_size}')
        train_size = max_train_size
    else:
        train_size = int(train_size)

    # assemble train dataframe
    num_pos_samples = int(pct_positive * train_size)
    num_neg_samples = train_size - num_pos_samples

    df_train = pd.concat([
        df_train_pos.sample(num_pos_samples, random_state=random_state),
        df_train_neg.sample(num_neg_samples)]).sample(
        frac=1, random_state=random_state)

    X_train = df_train['comment']
    y_train = df_train['label']

    return (X_train, X_test, y_train, y_test)

In [6]:
# Create custom train/test split using function Christian created to create equal classes

X_train, X_test, y_train, y_test = split_data(data = aggression, pct_positive = 0.5)

In [32]:
# Create a dataframe to store model results

column_names = ['model', 'file', 'cv__stop_words', 'cv__max_df', 'cv__max_features', 'cv__min_df', 'training_score', 'test_score']

model_df = pd.DataFrame(columns = column_names)

# Create function to add results to model df

def add_to_model(df, model, name, file):
    df = df.append({'model'           : name,
                    'file'            : file,
                    'cv__stop_words'  : model.best_params_['cv__stop_words'],
                    'cv__max_df'      : model.best_params_['cv__max_df'],
                    'cv__max_features': model.best_params_['cv__max_features'],
                    'cv__min_df'      : model.best_params_['cv__min_df'],
                    'cv__ngram_range' : model.best_params_['cv__ngram_range'],
                    'training_score'  : model.score(X_train, y_train),
                    'test_score'      : model.score(X_test, y_test)},
                    ignore_index = True)
    return df

In [7]:
# Create a pipeline to count vectorize and run logistic regression

pipe = Pipeline([
    ('cv', CountVectorizer()),
    ('lr', LogisticRegression(max_iter = 1_000))
]
)

In [8]:
# Create parameters for pipeline

pipe_params = {
    'cv__stop_words'  : [None, 'english'],
    'cv__max_features': [12_000, 15_000],
    'cv__min_df'      : [2, 3],
    'cv__max_df'      : [.85, .9],
    'cv__ngram_range' : [(1, 1), (2, 2)]
}

In [9]:
# Instantiate gridsearch

gs = GridSearchCV(
    estimator  = pipe,
    param_grid = pipe_params,
    cv = 5
)

In [10]:
# Fit the gridsearch

gs.fit(X_train, y_train)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('cv', CountVectorizer()),
                                       ('lr',
                                        LogisticRegression(max_iter=1000))]),
             param_grid={'cv__max_df': [0.85, 0.9],
                         'cv__max_features': [12000, 15000],
                         'cv__min_df': [2, 3],
                         'cv__ngram_range': [(1, 1), (2, 2)],
                         'cv__stop_words': [None, 'english']})

In [61]:
model_df = add_to_model(model_df, gs, 'logistic regression', 'aggression')

In [62]:
model_df

Unnamed: 0,model,file,cv__stop_words,cv__max_df,cv__max_features,cv__min_df,training_score,test_score,cv__ngram_range
0,logistic regression,aggression,,0.9,12000,2,0.953827,0.869772,"(1, 1)"
1,logistic regression,aggression,,0.9,12000,2,0.953827,0.869772,"(1, 1)"
2,logistic regression,aggression,,0.85,15000,3,0.959457,0.871129,"(1, 1)"


In [64]:
model_df = model_df[['model', 'file', 'cv__stop_words', 'cv__max_df', 'cv__max_features', 'cv__min_df', 'cv__ngram_range', 'training_score', 'test_score']]

In [65]:
model_df.to_csv('../data/lr_model_results.csv', index = False)

In [66]:
results = pd.read_csv('../data/lr_model_results.csv')

#### Results of gridsearch on the different models

In [67]:
results

Unnamed: 0,model,file,cv__stop_words,cv__max_df,cv__max_features,cv__min_df,cv__ngram_range,training_score,test_score
0,logistic regression,aggression,,0.9,12000,2,"(1, 1)",0.953827,0.869772
1,logistic regression,aggression,,0.9,12000,2,"(1, 1)",0.953827,0.869772
2,logistic regression,aggression,,0.85,15000,3,"(1, 1)",0.959457,0.871129
3,logistic regression,toxic,,0.9,12000,3,"(1, 1)",0.965972,0.896686
4,logistic regression,toxic,,0.85,12000,3,"(1, 1)",0.965972,0.896686
5,logistic regression,toxic,,0.85,12000,3,"(1, 1)",0.965972,0.896686
6,logistic regression,attacks,,0.9,10000,2,"(1, 1)",0.956063,0.883812
7,logistic regression,attacks,,0.85,12000,2,"(1, 1)",0.960635,0.884822
8,logistic regression,attacks,,0.8,12000,2,"(1, 1)",0.960635,0.884822


#### Add in Regularization

In [16]:
# Create a pipeline to count vectorize and run logistic regression with regularization

pipe_reg = Pipeline([
    ('cv', CountVectorizer()),
    ('lr', LogisticRegressionCV(penalty = 'l1', solver = 'liblinear', max_iter = 1_000))
]
)

In [17]:
# Create parameters for pipeline

pipe_params_reg = {
    'cv__stop_words'  : [None, 'english'],
    'cv__max_features': [12_000, 15_000],
    'cv__min_df'      : [2, 3],
    'cv__max_df'      : [.85, .9],
    'cv__ngram_range' : [(1, 1), (2, 2)],
    'lr__Cs'          : [1]
}

In [18]:
# Instantiate gridsearch

gs_reg = GridSearchCV(
    estimator  = pipe_reg,
    param_grid = pipe_params_reg,
    cv = 5
)

In [19]:
# Fit the gridsearch

gs_reg.fit(X_train, y_train)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('cv', CountVectorizer()),
                                       ('lr',
                                        LogisticRegressionCV(max_iter=1000,
                                                             penalty='l1',
                                                             solver='liblinear'))]),
             param_grid={'cv__max_df': [0.85, 0.9],
                         'cv__max_features': [12000, 15000],
                         'cv__min_df': [2, 3],
                         'cv__ngram_range': [(1, 1), (2, 2)],
                         'cv__stop_words': [None, 'english'], 'lr__Cs': [1]})

In [22]:
gs_reg.best_params_

{'cv__max_df': 0.85,
 'cv__max_features': 12000,
 'cv__min_df': 2,
 'cv__ngram_range': (1, 1),
 'cv__stop_words': None,
 'lr__Cs': 1}

In [20]:
gs_reg.score(X_train, y_train)

0.5

In [21]:
gs_reg.score(X_test, y_test)

0.8534360839322308