In [1]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
import numpy as np
import scipy
import os

In [2]:
# Load the rebalanced, pre-cleaned train/test samples (tab-separated files).
train_df = pd.read_csv(
    '../data/samples/pol_train_rebalanced_cleaned.csv',
    delimiter='\t',
)
test_df = pd.read_csv(
    '../data/samples/pol_test_rebalanced_cleaned.csv',
    delimiter='\t',
)

In [3]:
# Peek at the first rows to sanity-check the parsed columns.
train_df.head()

Unnamed: 0,label,comment,author,subreddit,score,ups,downs,date,created_utc,parent_comment
0,0,says,Trumpbart,politics,2,-1,-1,2016-11,1478638229,s
1,0,,Julythisyear,politics,1,-1,-1,2016-10,1476545869,nice job straw manning neighbor
2,0,back original stuff fact roe v wade risk ever ...,lipsyncforyourlife,politics,7,-1,-1,2016-11,1479152336,trump s positions actually pretty consistent t...
3,0,lepage called idiot would really sit rethink e...,lipsyncforyourlife,politics,96,-1,-1,2016-10,1476983334,lepage calls trump s election results comment ...
4,0,pillage got,lipsyncforyourlife,politics,3,-1,-1,2016-10,1477885687,except fact nt office nt make mistake words vs...


In [4]:
# (rows, columns) of the training sample.
train_df.shape

(119293, 10)

In [5]:
# Replace every missing value with an empty string so that the text columns
# can be fed to the vectorizers without NaN entries.
train_df = train_df.fillna('')

In [6]:
# Same missing-value treatment for the test sample: NaN -> ''.
test_df = test_df.fillna('')

In [7]:
# Pull the comment and parent-comment columns out of each frame as 1-D arrays.
train_comment_flat, train_parent_comment_flat = (
    train_df[[column]].values.flatten() for column in ('comment', 'parent_comment')
)
test_comment_flat, test_parent_comment_flat = (
    test_df[[column]].values.flatten() for column in ('comment', 'parent_comment')
)

In [8]:
# Fit on all: the vocabulary of word 2- and 3-grams is learned from every
# comment and parent comment in both the train and the test sample.
# NOTE(review): the test texts therefore influence the vocabulary — confirm
# that this is intended for the evaluation setup.
corpus = np.concatenate([
    train_comment_flat,
    train_parent_comment_flat,
    test_comment_flat,
    test_parent_comment_flat,
])
# binary=True records presence/absence of an n-gram rather than its count.
count_vectorizer = CountVectorizer(ngram_range=(2,3), binary=True).fit(corpus)

In [9]:
# Encode each text collection with the fitted word n-gram vocabulary.
train_comment_vec, train_parent_comment_vec = map(
    count_vectorizer.transform,
    (train_comment_flat, train_parent_comment_flat),
)
test_comment_vec, test_parent_comment_vec = map(
    count_vectorizer.transform,
    (test_comment_flat, test_parent_comment_flat),
)

In [10]:
# Stack all four matrices row-wise so that column sums cover every text
# (train + test, comments + parent comments).
total_ngram_vec = scipy.sparse.vstack((
    train_comment_vec, train_parent_comment_vec,
    test_comment_vec, test_parent_comment_vec,
))

In [11]:
# Minimum number of texts an n-gram must be exceeded by to be kept
# (value taken from the Princeton paper).
threshhold = 100 # from pton paper
# The vectors are binary, so a column sum is the number of texts containing
# that n-gram. Compare all columns in one vectorized step instead of a
# Python-level loop; np.asarray(...).ravel() yields a 1-D array whether
# sum() returns a (1, n) matrix or an already flat array.
which_cols_to_keep = np.asarray(total_ngram_vec.sum(axis=0)).ravel() > threshhold

In [12]:
# Number of word n-gram columns that pass the frequency threshold.
which_cols_to_keep.sum()

774

In [13]:
# Convert each matrix to column-oriented (CSC) format and keep only the
# columns flagged in the boolean mask.
# NOTE: this overwrites its own inputs, so the cell is not idempotent —
# re-running it would apply the mask to already-filtered matrices.
train_comment_vec, train_parent_comment_vec, test_comment_vec, test_parent_comment_vec = (
    matrix.tocsc()[:, which_cols_to_keep]
    for matrix in (
        train_comment_vec,
        train_parent_comment_vec,
        test_comment_vec,
        test_parent_comment_vec,
    )
)

In [14]:
# Shape after column selection: (training rows, kept word n-grams).
train_comment_vec.shape

(119293, 774)

## POS n-grams

In [15]:
# Part-of-speech (POS) features: helper that turns a sentence into its POS-tag sequence.
import nltk
from nltk.tokenize import word_tokenize
def extract_pos(sentence):
    """Return the part-of-speech tags of `sentence` as one space-separated string.

    The sentence is split with NLTK's ``word_tokenize``, tagged with
    ``nltk.pos_tag``, and only the tags (not the words) are kept, in order.
    """
    tagged_tokens = nltk.pos_tag(word_tokenize(sentence))
    tags = [tag for _word, tag in tagged_tokens]
    return ' '.join(tags)

In [16]:
# Re-extract the raw text columns as 1-D arrays for POS tagging
# (same extraction as performed before the word n-gram step).
train_comment_flat, train_parent_comment_flat = (
    train_df[[column]].values.flatten() for column in ('comment', 'parent_comment')
)
test_comment_flat, test_parent_comment_flat = (
    test_df[[column]].values.flatten() for column in ('comment', 'parent_comment')
)

In [17]:
# Map every text to its POS-tag string (one tagger call per text).
train_comment_ngram = list(map(extract_pos, train_comment_flat))
train_parent_comment_ngram = list(map(extract_pos, train_parent_comment_flat))
test_comment_ngram = list(map(extract_pos, test_comment_flat))
test_parent_comment_ngram = list(map(extract_pos, test_parent_comment_flat))

In [18]:
# Learn the POS n-gram vocabulary from all tag strings (train + test,
# comments + parent comments).
# NOTE(review): CountVectorizer's default tokenization lowercases and keeps
# only word-character tokens, so tags such as PRP$ presumably collapse into
# PRP — confirm whether that is intended.
corpus = [
    *train_comment_ngram,
    *train_parent_comment_ngram,
    *test_comment_ngram,
    *test_parent_comment_ngram,
]
# n-gram lengths 3 to 6, presence/absence only -- from ptacek's paper
pos_vectorizer = CountVectorizer(ngram_range=(3,6), binary=True).fit(corpus)

In [19]:
# Encode each collection of POS-tag strings with the fitted POS vocabulary.
train_comment_ngram_vec, train_parent_comment_ngram_vec, test_comment_ngram_vec, test_parent_comment_ngram_vec = [
    pos_vectorizer.transform(tag_strings)
    for tag_strings in (
        train_comment_ngram,
        train_parent_comment_ngram,
        test_comment_ngram,
        test_parent_comment_ngram,
    )
]

In [20]:
# Stack all four POS matrices row-wise so that column sums cover every text.
total_pos_ngram_vec = scipy.sparse.vstack((
    train_comment_ngram_vec, train_parent_comment_ngram_vec,
    test_comment_ngram_vec, test_parent_comment_ngram_vec,
))

In [21]:
# Keep only POS n-grams that occur in more than `threshhold` texts.
# The original paper says 5, but we do not want to work with ~100k features;
# the Princeton paper uses 100 as its threshold as well.
threshhold = 100
# The vectors are binary, so a column sum is the number of texts containing
# that POS n-gram. Compare all columns in one vectorized step instead of a
# Python-level loop; np.asarray(...).ravel() yields a 1-D array whether
# sum() returns a (1, n) matrix or an already flat array.
# NOTE: this reuses the mask name of the word n-gram step, so the word-level
# column-selection cell must not be re-run after this one.
which_cols_to_keep = np.asarray(total_pos_ngram_vec.sum(axis=0)).ravel() > threshhold

In [22]:
# Number of POS n-gram columns that pass the frequency threshold.
which_cols_to_keep.sum()

9140

In [23]:
# Convert each POS matrix to column-oriented (CSC) format and keep only the
# columns flagged in the boolean mask.
# NOTE: this overwrites its own inputs, so the cell is not idempotent —
# re-running it would apply the mask to already-filtered matrices.
train_comment_ngram_vec, train_parent_comment_ngram_vec, test_comment_ngram_vec, test_parent_comment_ngram_vec = (
    matrix.tocsc()[:, which_cols_to_keep]
    for matrix in (
        train_comment_ngram_vec,
        train_parent_comment_ngram_vec,
        test_comment_ngram_vec,
        test_parent_comment_ngram_vec,
    )
)

In [25]:
# Shape after column selection: (training rows, kept POS n-grams).
train_comment_ngram_vec.shape

(119293, 9140)

In [26]:
# Directory the feature matrices are written to (relative path, resolved
# against the notebook's working directory).
data_base_dir = '../data/features'


In [27]:
# Then making our different data sets

# First, n-gram words only: comment features followed by parent-comment
# features, side by side, for each split.
words_only_vec_train = scipy.sparse.hstack([train_comment_vec, train_parent_comment_vec])
words_only_vec_test = scipy.sparse.hstack([test_comment_vec, test_parent_comment_vec])

# Hand save_npz the destination path rather than an open() handle that is
# never closed: save_npz then opens, writes and closes the file itself, so no
# file descriptor is leaked and the data is fully flushed to disk.
scipy.sparse.save_npz(os.path.join(data_base_dir, "ngram_train_words_only_balanced.npz"), words_only_vec_train.tocoo())
scipy.sparse.save_npz(os.path.join(data_base_dir, "ngram_test_words_only_balanced.npz"), words_only_vec_test.tocoo())

In [30]:
# Train and test must end up with the same number of feature columns.
print(words_only_vec_train.shape, words_only_vec_test.shape, sep='\n')

(119293, 1548)
(29526, 1548)


In [28]:
# Then, POS n-gram only: comment features followed by parent-comment
# features, side by side, for each split.
pos_only_train_vec = scipy.sparse.hstack([train_comment_ngram_vec, train_parent_comment_ngram_vec])
pos_only_test_vec = scipy.sparse.hstack([test_comment_ngram_vec, test_parent_comment_ngram_vec])

# Hand save_npz the destination path rather than an open() handle that is
# never closed: save_npz then opens, writes and closes the file itself, so no
# file descriptor is leaked and the data is fully flushed to disk.
scipy.sparse.save_npz(os.path.join(data_base_dir, "ngram_train_pos_only_balanced.npz"), pos_only_train_vec.tocoo())
scipy.sparse.save_npz(os.path.join(data_base_dir, "ngram_test_pos_only_balanced.npz"), pos_only_test_vec.tocoo())

In [33]:
# Train and test must end up with the same number of feature columns.
print(pos_only_train_vec.shape, pos_only_test_vec.shape, sep='\n')

(119293, 18280)
(29526, 18280)


In [29]:
# Finally, all: word n-gram columns (comment, parent comment) followed by
# POS n-gram columns (comment, parent comment) for each split.

train_vec = scipy.sparse.hstack([train_comment_vec, train_parent_comment_vec, train_comment_ngram_vec, train_parent_comment_ngram_vec])
test_vec = scipy.sparse.hstack([test_comment_vec, test_parent_comment_vec, test_comment_ngram_vec, test_parent_comment_ngram_vec])

# Hand save_npz the destination path rather than an open() handle that is
# never closed: save_npz then opens, writes and closes the file itself, so no
# file descriptor is leaked and the data is fully flushed to disk.
scipy.sparse.save_npz(os.path.join(data_base_dir, "ngram_train_balanced.npz"), train_vec.tocoo())
scipy.sparse.save_npz(os.path.join(data_base_dir, "ngram_test_balanced.npz"), test_vec.tocoo())

In [31]:
# Sanity check: (training rows, word + POS n-gram feature columns).
print(train_vec.shape)

(119293, 19828)


In [32]:
# Sanity check: (test rows, word + POS n-gram feature columns).
print(test_vec.shape)

(29526, 19828)
