In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')


import numpy as np
import pandas as pd
import joblib

import datetime
import os
import numpy as np
import time
import multiprocessing as mp
import re 

In [2]:
# Make the parent directory importable so project-local packages can be loaded.
import inspect, os
# Directory of the file backing the current frame.
# NOTE(review): inside a notebook the frame has no real file on disk, so this
# presumably resolves to the kernel's working directory - confirm.
currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
parentdir = os.path.dirname(currentdir)
# Prepend (index 0) so the parent directory takes precedence on the import path.
os.sys.path.insert(0,parentdir) 
from data_generation.diff_utils import clean_and_filter

# Run Models

#### Load Models

In [3]:
# Load the two pickled models; `apply_models` calls `predict_proba` on both.
# NOTE(review): joblib.load unpickles the file and can execute arbitrary code,
# so only load model files from a trusted source.
m_agg = joblib.load( '../../models/aggression_ngram.pkl')
m_rec = joblib.load( '../../models/recipient_ngram.pkl')

#### Load annotated diffs

In [17]:
def apply_models(df):
    """Add aggression and recipient model predictions to ``df``.

    Uses the notebook-level models ``m_agg`` and ``m_rec``; both are called
    through ``predict_proba`` on the ``clean_diff`` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``clean_diff`` column holding the text to score.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object (modified in place) with these columns added:
        ``pred_aggressive``, ``pred_neutral``, ``pred_friendly`` (columns 0, 1
        and 2 of the aggression model output), ``pred_aggression_score``
        (column 0 minus column 2) and ``pred_recipient_score`` (column 1 of
        the recipient model output).
    """
    # Fetch the text column once and reuse it for both models.
    diffs = df['clean_diff']

    agg_scores = m_agg.predict_proba(diffs)
    # Weights (1, 0, -1) collapse the three columns into a single signed score:
    # column 0 minus column 2, with column 1 contributing nothing.
    df['pred_aggression_score'] = agg_scores.dot(np.array([1, 0, -1]))
    df['pred_aggressive'] = agg_scores[:, 0]
    df['pred_neutral'] = agg_scores[:, 1]
    df['pred_friendly'] = agg_scores[:, 2]

    rec_scores = m_rec.predict_proba(diffs)
    df['pred_recipient_score'] = rec_scores[:, 1]
    return df
    

In [19]:
# Build one aggregated label row per revision for each namespace and save it as a TSV.
# NOTE(review): `load_annotations`, `plurality` and `average` are not defined in the
# cells shown here; they must already exist in the kernel namespace - confirm their origin.
annotations = load_annotations()

for ns in ['user', 'article']:

    d_annotations = annotations[ns]['random']
    
    # Map every aggression label x to 1 - x, written back onto the loaded frame.
    # NOTE(review): presumably re-orients the label scale - confirm against the annotation scheme.
    d_annotations['aggression'] = (d_annotations['aggression'] -1) * -1

    # Keep one row per rev_id, then overwrite the label columns with aggregates
    # computed over all annotation rows (NaN labels dropped first).
    # NOTE(review): `assign` aligns on the index, so the helpers presumably return
    # series indexed compatibly with d_annotations - confirm.
    d_annotated = d_annotations\
                .drop_duplicates(subset=['rev_id'])\
                .assign(
                    recipient = plurality(d_annotations['recipient'].dropna()),
                    recipient_score = average(d_annotations['recipient'].dropna()),
                    aggression = plurality(d_annotations['aggression'].dropna()),
                    aggression_score = average(d_annotations['aggression'].dropna()))

    # Saved under the namespace's `clean` directory; the scoring cell lists
    # `d_annotated.tsv` among its samples and reads it from there.
    d_annotated.to_csv('../../data/samples/%s/clean/d_annotated.tsv' % ns, sep = '\t')

#### Load samples and apply models

We take various diff datasets from Hive, apply the clean-and-filter function, and then score the clean diffs using the models.

In [20]:
def pred_helper(df):
    """Prepare one chunk of diffs and score it with ``apply_models``.

    Returns ``None`` for an empty chunk. Otherwise ``rev_timestamp`` is parsed
    with ``pd.to_datetime`` and ``clean_diff`` is cast to ``str`` on a new
    frame, which is then passed to ``apply_models``.
    """
    if not len(df):
        return None

    prepared = df.assign(
        rev_timestamp = lambda frame: pd.to_datetime(frame.rev_timestamp),
        clean_diff = lambda frame: frame['clean_diff'].astype(str),
    )
    return apply_models(prepared)

    
def prep_in_parallel(path, k = 8):
    """Read a tab-separated file of diffs and score it with ``k`` worker processes.

    Rows are assigned a random integer ``key`` in ``[0, 5*k)``, the frame is
    split by that key, and each piece is handed to ``pred_helper`` through a
    ``multiprocessing.Pool``.

    Parameters
    ----------
    path : str
        Path of the UTF-8, tab-separated input file.
    k : int, default 8
        Number of worker processes; the data is split into at most ``5*k`` pieces.

    Returns
    -------
    pandas.DataFrame
        Concatenation of the scored pieces. The helper ``key`` column is still
        present, and because the keys are random and unseeded the row order
        differs from run to run.

    Notes
    -----
    The pool is always shut down: on success it is closed and joined, and if
    scoring raises, the workers are terminated before the exception propagates.
    """
    df = pd.read_csv(path, sep = '\t', encoding = 'utf-8')\
           .assign(key = lambda x: np.random.randint(0, high=5*k, size=x.shape[0]))
    chunks = [chunk for _, chunk in df.groupby('key')]

    pool = mp.Pool(k)
    try:
        scored = pool.map(pred_helper, chunks)
        pool.close()
    except BaseException:
        # Do not leave worker processes behind when a chunk fails.
        pool.terminate()
        raise
    finally:
        # join() is valid after either close() or terminate().
        pool.join()
    return pd.concat(scored)

In [22]:
# Score every sample file of every namespace: read it from `<base>/<ns>/clean/`,
# run the models in parallel, and write the selected columns to `<base>/<ns>/scored/`.
base = '../../data/samples/'
nss = ['user', 'article']
samples = [
    'talk_diff_no_admin_sample.tsv',
    'talk_diff_no_admin_2015.tsv',
    'all_blocked_user.tsv',
    'd_annotated.tsv',
]

# Columns written for every sample.
base_cols = [
    'rev_id',
    'clean_diff',
    'rev_timestamp',
    'pred_aggression_score',
    'pred_recipient_score',
    'page_title',
    'user_text',
    'user_id',
]
# Label columns that are additionally written for `d_annotated.tsv`.
extra_cols = [
    'recipient',
    'recipient_score',
    'aggression',
    'aggression_score',
]

for ns in nss:
    for s in samples:
        inf = os.path.join(base, ns, 'clean', s)
        outf = os.path.join(base, ns, 'scored', s)
        cols = base_cols + extra_cols if s == 'd_annotated.tsv' else base_cols
        prep_in_parallel(inf, k = 4)[cols].to_csv(outf, sep = '\t', index = False)