In [2]:

import sys
import pandas as pd
import numpy as np
import re 
import nltk 
import datetime 
import matplotlib.pyplot as plt
import uuid
import ujson
import sqlite3
import sys
import psycopg2
import shutil
import os
import logging 
import csv 


from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA, TruncatedSVD

from nltk.corpus import stopwords
from nltk import SnowballStemmer

from scipy.sparse import hstack


In [8]:
# OPTIONS 
REFRESH_DATABASE = 0

In [9]:
# Create table with corpora information 

try:
    conn = psycopg2.connect("host=localhost dbname=toxic_ml_features user=postgres")
except:
    print("I am unable to connect to the database")
    
# create cursor 
c = conn.cursor()

if REFRESH_DATABASE:
    print("Dropping classifiers table and removing Results/preds")
    c.execute('DROP TABLE corpora')
    c.execute('DROP TABLE document_stats')
#         try: 
#             shutil.rmtree('./Results/preds') 
#         except:
#             print("directory ./Results/preds does not exist")

#     os.makedirs('./Results/preds')    

c.execute('''CREATE TABLE IF NOT EXISTS corpora (
             corpus_id text,
             train_corpus_file text, 
             test_corpus_file json, 
             train_vectorizer text, 
             test_vectorizer text
             )''')

c.execute('''CREATE TABLE IF NOT EXISTS document_stats (
             feature_names json,
             processing_options json
         )''')

conn.commit()
conn.close()

    



In [3]:

# FEATURE GENERATION 

OVERLAP_MODE = 'remove'
DO_REPEAT_SCORE = 1
DOTEST = 0
DO_NUM_NONZERO = 0
DO_NUM_REPEATS = 0 
DO_TOKENIZE_AND_STEM=1
DO_CURSEWORD_COUNT = 1

def tokenize_and_stem(comment, stemmer = SnowballStemmer("english")):
    """
    Takes a reddit comment and stemmer as input
    and returns a list of stemmed words
    
    params: 
        str comment: comment to stem 
        nltk.stemmer stemmer: nltk stemmer object 
        
    rtype: ls[str]
    """
    comment = comment.strip().split()
    stemmed_comment = [stemmer.stem(word) for word in comment]
    return stemmed_comment

# def generate_features( OVERLAP_MODE = 'remove', DO_REPEAT_SCORE = 1, 
#     DO_NUM_NONZERO = 0, DO_NUM_REPEATS = 0, DOTEST = 0):
#     """
#     bool DOTEST: should we run in test mode?
#     """


In [5]:

# CORPUS GENERATION 

#### Options ##############################################################

logging.basicConfig(stream=sys.stdout, level=logging.INFO)

# parameters for vectorizer
ANALYZER = "word" # unit of features are single words rather then phrases of words
STRIP_ACCENTS = 'unicode'
TOKENIZER = None
NGRAM_RANGE = (0,2) # Range for n-grams
MAX_DF = 0.8  # Exclude words that are contained in more than x percent of documents
MIN_DF = 50   # Exclude words that appear in less than 50 documents 

###########################################################################
# read in data
test_file = 'data/test.csv'
train_file = 'data/train.csv'

tr = pd.read_csv(train_file)
te = pd.read_csv(test_file)

if DOTEST:
    tr = tr.iloc[1:1000]
    te = te.iloc[1:1000]
    MIN_DF = 10
    MAX_DF = 1.0
    
Ntr = tr.shape[0]
Nte = te.shape[0]

logging.info( """
    Options
    OVERLAP_MODE: {}
    DO_REPEAT_SCORE: {}
    DOTEST: {}

    ANALYZE: {}
    STRIP_ACCENTS: {}
    TOKENIZER: {}
    NGRAM_RANGE: {}
    MAX_DF: {}
    MIN_DF: {}
    """.format( OVERLAP_MODE, DO_REPEAT_SCORE, DOTEST, ANALYZER, \
        STRIP_ACCENTS, TOKENIZER, NGRAM_RANGE, MAX_DF, MIN_DF) )

train_ids = tr.id
test_ids = te.id

def preprocess_comment( comment, RE_PREPROCESS = \
    re.compile(r""" \W + # one or more nonword characters
        |    # the or operator
        \d+  # digits""", re.VERBOSE), 
    URL_PREPROCESS = re.compile(r"""[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*)""", \
        re.VERBOSE)):
    """
    str comment:
    compiled regular expression: RE_PREPROCESS 
    """
    return re.sub( RE_PREPROCESS, ' ', comment).lower()

if DO_TOKENIZE_AND_STEM:
    TOKENIZER = tokenize_and_stem
else: 
    TOKENIZER = None 
    
tr_vectorizer = CountVectorizer(analyzer=ANALYZER,
                            tokenizer = TOKENIZER, 
                            ngram_range=NGRAM_RANGE,
                            preprocessor = preprocess_comment, 
                            stop_words = stopwords.words('english'),
                            strip_accents=STRIP_ACCENTS,
                            min_df = MIN_DF,
                            max_df = MAX_DF)

te_vectorizer = CountVectorizer(analyzer=ANALYZER,
                            tokenizer = TOKENIZER, 
                            ngram_range=NGRAM_RANGE,
                            preprocessor = preprocess_comment,
                            stop_words = stopwords.words('english'),
                            strip_accents=STRIP_ACCENTS,
                            min_df = MIN_DF,
                            max_df = MAX_DF)

# tokenize
logging.info("tokenizing comments")
train_bag_of_words = tr_vectorizer.fit_transform( tr['comment_text'])
test_bag_of_words = te_vectorizer.fit_transform( te['comment_text'])

tr_vocab = tr_vectorizer.get_feature_names()
te_vocab = te_vectorizer.get_feature_names()

if OVERLAP_MODE:
    logging.info("Removing overlapping vocabulary in test and train")
    # compute percent overlap in vocabulary
    overlap = set(tr_vocab).intersection( te_vocab )

    # Remove any features that aren't in both training and test
    train_in_overlap  = [i for i, word in enumerate(tr_vocab) if word in overlap]
    test_in_overlap  = [i for i, word in enumerate(te_vocab) if word in overlap]
    tr_vocab = [tr_vocab[i] for i in train_in_overlap]
    te_vocab = [te_vocab[i] for i in test_in_overlap]
    train_bag_of_words = train_bag_of_words[:,train_in_overlap]
    test_bag_of_words = test_bag_of_words[:, test_in_overlap]


#return train_bag_of_words, test_bag_of_words, train_feat, test_feat

INFO:root:
    Options
    OVERLAP_MODE: remove
    DO_REPEAT_SCORE: 1
    DOTEST: 0

    ANALYZE: word
    STRIP_ACCENTS: unicode
    TOKENIZER: None
    NGRAM_RANGE: (0, 2)
    MAX_DF: 0.8
    MIN_DF: 50
    
INFO:root:tokenizing comments
INFO:root:Removing overlapping vocabulary in test and train


In [6]:
# PRE VECTORIZED FEATURE GENERATION 

# regexp preprocessing
RE_PREPROCESS = re.compile(r""" \W + # one or more nonword characters
                               |    # the or operator
                               \d+  # digits""", re.VERBOSE)
train_comments = list( tr.comment_text )
test_comments = list( te.comment_text )
processed_tr_comments = [ re.sub(RE_PREPROCESS, ' ', comment).lower() for comment in train_comments]
processed_te_comments = [ re.sub(RE_PREPROCESS, ' ', comment).lower() for comment in test_comments] 

train_feat = pd.DataFrame( index = tr.index )
test_feat = pd.DataFrame( index = te.index )

# Generate repeat score
if DO_REPEAT_SCORE:
    logging.info("Computing repeat score")
    
    tr_num_nonzero, tr_repeat_score, tr_num_repeats = [], [], []
    for comment in processed_tr_comments:
        vec = comment.strip().split() 
        tr_num_nonzero.append( len(set(vec)))
        tr_num_repeats.append( len(vec))
        tr_rep_score = ( len( set( vec)) + 1)/(len(vec) + 1) - 1
        tr_repeat_score.append( tr_rep_score) 
        
    te_num_nonzero, te_repeat_score, te_num_repeats = [], [], []
    for comment in processed_te_comments:
        vec = comment.strip().split() 
        te_num_nonzero.append( len(set(vec)))
        te_num_repeats.append( len(vec))
        te_rep_score = ( len( set( vec)) + 1)/(len(vec) + 1) - 1
        te_repeat_score.append( te_rep_score) 

    # Create Feature DataFrame
    test_feat['repeat_score'] = te_repeat_score
    train_feat['repeat_score'] = tr_repeat_score 

    
if DO_NUM_NONZERO:
    train_feat['num_nonzero'] = tr_num_nonzero
    test_feat['num_nonzero'] = te_num_nonzero
    
if DO_NUM_REPEATS: 
    test_feat['num_repeats'] = te_num_repeats
    train_feat['num_repeats'] = tr_num_repeats

INFO:root:Computing repeat score


In [None]:
# CURSE WORD COUNTER 
if DO_CURSEWORD_COUNT: 
    curseword_file = 'wordlists/cursewords.csv'
    curseword_list = []
    with open( curseword_file , 'r') as f:
        reader = csv.reader(f)
        curseword_list = list(reader)

    curseword_list = [l[0] for l in curseword_list]

    # regexp preprocessing 
    # (Different than for repeat_score, we're keeping digits because some cursewords contain digits, 
    # e.g., assh0le )
    RE_PLUS_URL_PREPROCESS = re.compile(r"""\W+ | [-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*)""", re.VERBOSE)
    train_comments = list( tr.comment_text )
    test_comments = list( te.comment_text )
    processed_tr_comments = [ re.sub(RE_PLUS_URL_PREPROCESS, ' ', comment).lower() \
                             for comment in train_comments]
    processed_te_comments = [ re.sub(RE_PLUS_URL_PREPROCESS, ' ', comment).lower() \
                             for comment in test_comments] 

    curse_overlap = set(curseword_list).intersection( tr_vocab2 )
    indices = [ tr_vocab2.index( word ) for word in curse_overlap]

    train_feat['curseword_count'] = train_bag_of_words[:, indices].sum(1)
    test_feat['curseword_count'] = test_bag_of_words[:, indices].sum(1)