In [16]:
import json
import pickle as cPickle
import numpy as np

from sklearn import svm
from scipy.sparse import csr_matrix
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.metrics import classification_report
import random

In [2]:
# Vocabulary files for the bag-of-words features. Each pickle is loaded into a
# list-like object that _get_term_features iterates over.
UNIGRAMS_FILENAME = "bow_features/unigram20.pkl"
BIGRAMS_FILENAME = "bow_features/bigram50.pkl"

# NOTE(security): unpickling can execute arbitrary code. Only load these files
# if they were produced by a trusted pipeline.
# `with` closes the file handles deterministically instead of leaving them to
# the garbage collector.
with open(UNIGRAMS_FILENAME, "rb") as unigrams_file:
    UNIGRAMS_LIST = cPickle.load(unigrams_file)
with open(BIGRAMS_FILENAME, "rb") as bigrams_file:
    BIGRAMS_LIST = cPickle.load(bigrams_file)

In [3]:
# Load the lexicon table (JSON). _get_action_features uses its top-level keys
# as the names of per-action counters to accumulate.
with open('feature_extraction/lexicons') as lexicon_file:
    LEXICONS = json.load(lexicon_file)

In [4]:
def features(document):
    """Build the full feature dictionary for one conversation.

    Merges the term features with the action features. When both helpers
    produce the same key, the action-feature value wins.

    Args:
        document: conversation record passed through unchanged to
            ``_get_term_features`` and ``_get_action_features``.

    Returns:
        dict mapping feature name to feature value.
    """
    return {**_get_term_features(document), **_get_action_features(document)}

def _get_term_features(document):
    actions = document['action_feature']
    unigrams, bigrams = set([]), set([])
    end_time = 0
    for action in actions:
        if action['timestamp_in_sec'] > end_time:
            end_time = action['timestamp_in_sec'] 
    for action in actions:
        if action['timestamp_in_sec'] == end_time or \
            not(action['comment_type'] == 'COMMENT_ADDING' or\
                action['comment_type'] == 'SECTION_CREATION' or\
                action['comment_type'] == 'COMMENT_MODIFICATION'):
            continue
        unigrams = unigrams | set(action['unigrams'])
        bigrams = bigrams | set([tuple(x) for x in action['bigrams']]) 
    f = {}
    f.update(dict(map(lambda x: ("UNIGRAM_" + str(x), 1 if x in unigrams else 0), UNIGRAMS_LIST)))
    f.update(dict(map(lambda x: ("BIGRAM_" + str(x), 1 if tuple(x) in bigrams else 0), BIGRAMS_LIST)))
    return f 

In [14]:
def _get_action_features(document):
    # polarity, politeness --> max, min
    # length --> max, min
    # has_agree, has_disagree 
    
    # based on user --> balance among users(avg of each feature)
    
    # pronoun usage
    
    # lexicon ratio
    actions = document['action_feature']
    unigrams, bigrams = set([]), set([])
    end_time = 0
    for action in actions:
        if action['timestamp_in_sec'] > end_time:
            end_time = action['timestamp_in_sec'] 
    ret = {'max_polarity': -1, 'min_polarity': 1, 'max_politeness': 0, 'min_politeness': 1, 'max_length': 0, \
        'min_length': 1000000}
    for key in LEXICONS.keys():
        ret[key] = 0
    ret['length'] = 0
    has_politeness = False
    for action in actions:
        if action['timestamp_in_sec'] == end_time or \
            not(action['comment_type'] == 'COMMENT_ADDING' or\
                action['comment_type'] == 'SECTION_CREATION' or\
                action['comment_type'] == 'COMMENT_MODIFICATION'):
            continue
        # Polarity
        polarity = []
        for p in action['polarity']:
            polarity.append(p['compound'])
        ret['max_polarity'] = max(ret['max_polarity'], np.average(polarity))
        ret['min_polarity'] = min(ret['min_polarity'], np.average(polarity))

        
        # Politeness
        if action['is_request']:
            ret['max_politeness'] = max(ret['max_politeness'], action['politeness_score']['polite'])
            ret['min_politeness'] = min(ret['min_politeness'], action['politeness_score']['polite'])
            has_politeness = True
        
        # Pronoun
        for key in LEXICONS.keys():
            ret[key] += action[key]
            
        ret['length'] += action['length'] 
        ret['max_length'] = max(ret['max_length'], action['length'])
        ret['min_length'] = min(ret['min_length'], action['length'])

    for key in LEXICONS.keys():
        ret[key] /= ret['length']
    if not(has_politeness):
        ret['max_politeness'] = ret['min_politeness'] = 0.5
    return ret

In [6]:
def documents2feature_vectors(documents):
    """Turn (conversation, class) pairs into a sparse matrix and label array.

    The column order is the sorted key list of the first non-empty feature
    dictionary; every later conversation is indexed with those same keys.
    Progress is printed every 1000 documents.

    Args:
        documents: iterable of ``(conversation, class_label)`` pairs.

    Returns:
        ``(X, y)`` where ``X`` is a ``csr_matrix`` with one row per document
        and ``y`` is a NumPy array of the class labels.
    """
    feature_names = None
    rows, labels = [], []
    for index, (conversation, label) in enumerate(documents):
        feature_map = features(conversation)
        if not feature_names:
            feature_names = sorted(feature_map.keys())
        row = [feature_map[name] for name in feature_names]
        if index % 1000 == 0:
            print(index)
        rows.append(row)
        labels.append(label)
    return csr_matrix(np.asarray(rows)), np.asarray(labels)

In [7]:
def train_svm(X, y):
    """Grid-search the C parameter of a LinearSVC and print the CV scores.

    Runs a 5-fold cross-validated grid search scored by accuracy, then prints
    the best estimator and one ``mean (+/-spread) for params`` line per grid
    point.

    Args:
        X: feature matrix passed to ``GridSearchCV.fit``.
        y: class labels passed to ``GridSearchCV.fit``.

    Returns:
        None. Results are only printed.
    """
    print("Fitting")
    # An RBF grid was tried earlier and is kept for reference:
    # {'kernel': ['rbf'], 'gamma': [1e-3, 1e-4], 'C': [1, 10, 100, 1000]}
    tuned_parameters = [{'C': [0.0002, 0.0003, 0.0004, 0.00045]}]

    clf = GridSearchCV(svm.LinearSVC(), tuned_parameters, cv=5, scoring='accuracy')
    clf.fit(X, y)

    print("Best parameters set found on development set:")
    print()
    print(clf.best_estimator_)
    print()
    print("Grid scores on development set:")
    print()
    # `grid_scores_` was removed from GridSearchCV in scikit-learn 0.20;
    # `cv_results_` exposes the same per-candidate mean/std/params.
    results = clf.cv_results_
    for mean_score, std_score, params in zip(results['mean_test_score'],
                                             results['std_test_score'],
                                             results['params']):
        # The spread is reported as std / 2 so new runs stay comparable with
        # the numbers already recorded in this notebook.
        print("%0.3f (+/-%0.03f) for %r"
              % (mean_score, std_score / 2, params))
    print()

#    print(scores.mean())
#    print scores

In [8]:
# Read the training set. Each line is a JSON value that unpacks into
# (conversation id, class label, conversation); only the conversation and its
# label are kept, as (conversation, label) pairs.
with open('/scratch/wiki_dumps/train_test/train.json') as train_file:
    records = map(json.loads, train_file)
    documents = [(conversation, label) for _conv_id, label, conversation in records]


In [None]:
"""
176208    unigram total                                                                                               
1354891   bigram total 
UNIGRAM:
15324     10                                                                                                  
11853   15                                                                                              
9826    20                                                                                                    
5386    50                                                                                                    
3354    100

BIGRAM
55380  10                                                                                                      
30036  20                                                                                                     
12967  50                                                                                                    
6613   100                                                                                                     
1130   500
"""

In [None]:
# C = 0.00075
# 57.6%
# unigram50, bigram100

# C = 0.00075
# 57.4%
# unigram100, bigram100

# C = 0.00075
# 57.4%
# unigram50, bigram500

# C= 0.0006
# 58%
# unigram20, bigram100

# 0.581 (+/-0.003) for {'C': 0.0004}
# unigram20, bigram50

In [15]:
# Seed the RNG before shuffling so that the document order, and with it the
# row order of X and y, is identical on every run of the notebook.
SHUFFLE_SEED = 0
random.seed(SHUFFLE_SEED)
random.shuffle(documents)
X, y = documents2feature_vectors(documents)

['2008-11-27 19:34:14 UTC', '2008-11-27 16:43:21 UTC', '2008-11-27 16:43:21 UTC', '2008-11-27 18:29:48 UTC', '2008-11-27 18:31:23 UTC']
True
0
['2014-06-23 16:14:32 UTC', '2014-06-23 16:14:32 UTC', '2014-06-23 16:15:42 UTC']
True
['2010-01-14 02:02:55 UTC', '2010-01-14 02:02:55 UTC', '2010-02-03 00:09:51 UTC', '2010-02-03 00:08:52 UTC', '2010-01-14 02:03:39 UTC']
True
['2011-10-16 10:07:59 UTC', '2010-09-30 14:12:50 UTC', '2010-09-30 14:12:50 UTC']
False
['2008-08-27 14:43:27 UTC', '2008-08-26 16:53:27 UTC', '2008-08-26 14:42:13 UTC', '2008-08-26 19:32:41 UTC', '2008-08-26 16:31:59 UTC', '2008-08-26 15:27:37 UTC', '2008-08-27 14:43:27 UTC', '2008-08-26 14:42:13 UTC']
True
['2007-04-30 19:20:36 UTC', '2007-05-01 01:46:06 UTC', '2007-04-30 19:20:36 UTC']
False
['2009-06-07 17:27:20 UTC', '2009-06-07 17:27:20 UTC', '2009-06-07 17:27:20 UTC', '2009-06-07 17:27:20 UTC', '2009-06-07 17:27:20 UTC', '2009-06-07 17:27:20 UTC', '2009-06-07 17:27:20 UTC', '2009-06-07 17:27:20 UTC', '2009-06-07 17

ZeroDivisionError: division by zero

In [40]:
# Recorded run: the output below reports C = 0.0002, 0.0003, 0.0004, 0.00045,
# i.e. the grid currently hard-coded in train_svm (best: C=0.0004, 0.581).
train_svm(X, y)

Fitting
Best parameters set found on development set:

LinearSVC(C=0.0004, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

Grid scores on development set:

0.569 (+/-0.007) for {'C': 0.0002}
0.574 (+/-0.006) for {'C': 0.0003}
0.581 (+/-0.003) for {'C': 0.0004}
0.579 (+/-0.003) for {'C': 0.00045}





In [12]:
# Recorded run with a different grid: the output below reports C = 0.01,
# 0.015, 0.005, which is not the grid currently hard-coded in train_svm, so
# re-running this cell will not reproduce it.
train_svm(X, y)

Fitting
Best parameters set found on development set:

LinearSVC(C=0.005, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

Grid scores on development set:

0.558 (+/-0.002) for {'C': 0.01}
0.555 (+/-0.002) for {'C': 0.015}
0.564 (+/-0.002) for {'C': 0.005}





In [14]:
# Recorded run with a different grid: the output below reports C = 0.0001 and
# 0.0025, which is not the grid currently hard-coded in train_svm.
train_svm(X, y)

Fitting
Best parameters set found on development set:

LinearSVC(C=0.0025, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

Grid scores on development set:

0.565 (+/-0.001) for {'C': 0.0001}
0.569 (+/-0.003) for {'C': 0.0025}





In [16]:
# Recorded run with a different grid: the output below reports C = 0.001,
# 0.002, 0.003, 0.004, which is not the grid currently hard-coded in train_svm.
train_svm(X, y)

Fitting
Best parameters set found on development set:

LinearSVC(C=0.001, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

Grid scores on development set:

0.573 (+/-0.002) for {'C': 0.001}
0.569 (+/-0.002) for {'C': 0.002}
0.567 (+/-0.003) for {'C': 0.003}
0.565 (+/-0.002) for {'C': 0.004}





In [18]:
# Recorded run with a different grid: the output below reports C = 0.0005 and
# 0.00075, which is not the grid currently hard-coded in train_svm.
train_svm(X, y)

Fitting
Best parameters set found on development set:

LinearSVC(C=0.00075, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

Grid scores on development set:

0.574 (+/-0.005) for {'C': 0.0005}
0.576 (+/-0.002) for {'C': 0.00075}





In [20]:
# Recorded run with a different grid: the output below reports C = 0.0008 and
# 0.0009, which is not the grid currently hard-coded in train_svm.
train_svm(X, y)

Fitting
Best parameters set found on development set:

LinearSVC(C=0.0008, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

Grid scores on development set:

0.575 (+/-0.002) for {'C': 0.0008}
0.574 (+/-0.002) for {'C': 0.0009}



