In [83]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
import numpy as np
import scipy
import os

In [84]:
# Load the train and test samples; both files are tab-separated despite the .csv suffix.
train_df, test_df = (
    pd.read_csv(path, sep='\t')
    for path in (
        '../data/samples/pol_train_rebalanced_cleaned.csv',
        '../data/samples/pol_test_rebalanced_cleaned.csv',
    )
)

In [85]:
# Preview the first rows to sanity-check that the columns parsed as expected.
train_df.head()

Unnamed: 0,label,comment,author,subreddit,score,ups,downs,date,created_utc,parent_comment
0,0,says,Trumpbart,politics,2,-1,-1,2016-11,1478638229,s
1,0,,Julythisyear,politics,1,-1,-1,2016-10,1476545869,nice job straw manning neighbor
2,0,back original stuff fact roe v wade risk ever ...,lipsyncforyourlife,politics,7,-1,-1,2016-11,1479152336,trump s positions actually pretty consistent t...
3,0,lepage called idiot would really sit rethink e...,lipsyncforyourlife,politics,96,-1,-1,2016-10,1476983334,lepage calls trump s election results comment ...
4,0,pillage got,lipsyncforyourlife,politics,3,-1,-1,2016-10,1477885687,except fact nt office nt make mistake words vs...


In [86]:
# (rows, columns) of the training frame.
train_df.shape

(119293, 10)

In [87]:
# Missing cells (e.g. the empty comment visible in the preview above) become
# empty strings so that every text value handed to the vectorizers is a str.
train_df = train_df.fillna('')

In [88]:
# Replace missing cells in the test frame with empty strings, mirroring the
# treatment of the training frame.
test_df = test_df.fillna('')

In [89]:
# Pull each text column out of its single-column frame as a flat 1-D array.
train_comment_flat = train_df[['comment']].values.flatten()
train_parent_comment_flat = train_df[['parent_comment']].values.flatten()
test_comment_flat = test_df[['comment']].values.flatten()
test_parent_comment_flat = test_df[['parent_comment']].values.flatten()

In [90]:
# Learn the word bigram/trigram vocabulary from every text at once: train and
# test, comments and parent comments. binary=True records presence/absence
# rather than raw counts.
# NOTE(review): the vocabulary is fitted on test text as well as train text —
# confirm this is intended.
corpus = np.concatenate([
    train_comment_flat,
    train_parent_comment_flat,
    test_comment_flat,
    test_parent_comment_flat,
])
count_vectorizer = CountVectorizer(ngram_range=(2, 3), binary=True).fit(corpus)

In [91]:
# The article I read filtered from 3-50. Let's filter by the upper bound
def get_filtered_ngram_matrix(documents, filter=50, vectorizer=None):
    """Vectorize ``documents`` and keep only the sufficiently frequent columns.

    Args:
        documents: Iterable of raw text documents, passed straight to the
            vectorizer's ``transform`` method.
        filter: Frequency threshold. A column is kept only when its column
            sum is strictly greater than this value. (The name shadows the
            builtin; it is kept so existing keyword callers keep working.)
        vectorizer: Fitted object exposing ``transform(documents)`` that
            returns a scipy sparse matrix. When omitted, the notebook-level
            ``count_vectorizer`` is used.

    Returns:
        A CSC sparse matrix with one row per document, restricted to the
        retained columns.
    """
    if vectorizer is None:
        # Fall back to the vectorizer fitted earlier in the notebook.
        vectorizer = count_vectorizer
    print('Vectorizing...')
    vectorized = vectorizer.transform(documents)
    print('Original shape is: ' + str(vectorized.shape))
    vectorized = vectorized.tocsc()
    print('Done.')
    print('Filtering...')
    # axis=0 collapses the rows, leaving a single total per column.
    col_totals = np.asarray(vectorized.sum(axis=0)).ravel()
    which_cols = col_totals > filter
    print('Done')
    new_vectorized = vectorized[:, which_cols]
    print('New shape is: ' + str(new_vectorized.shape))
    # Previously this returned the undefined name `new_train_vectorized`,
    # which raised NameError on every call.
    return new_vectorized


In [92]:
# Encode each text collection against the shared word n-gram vocabulary.
(
    train_comment_vec,
    train_parent_comment_vec,
    test_comment_vec,
    test_parent_comment_vec,
) = (
    count_vectorizer.transform(texts)
    for texts in (
        train_comment_flat,
        train_parent_comment_flat,
        test_comment_flat,
        test_parent_comment_flat,
    )
)

In [93]:
# Stack the four document-term matrices row-wise so that per-column totals
# can be taken over all of them at once.
total_vec = scipy.sparse.vstack(
    (train_comment_vec, train_parent_comment_vec, test_comment_vec, test_parent_comment_vec)
)

In [94]:
# Boolean mask over the vocabulary: True where the column total of the stacked
# matrix is strictly greater than the threshold.
threshhold = 50
column_totals = np.asarray(total_vec.sum(axis=0)).ravel()
which_cols_to_keep = column_totals > threshhold

In [95]:
# Number of columns that pass the threshold (each True counts as 1).
sum(which_cols_to_keep)

2349

In [96]:
# Keep only the retained columns in every matrix. Each matrix is converted to
# CSC before the column selection (presumably because CSC is the
# column-oriented layout).
# NOTE: this rebinds the *_vec names, so re-running this cell on its own will
# fail because the mask no longer matches the reduced column count.
(
    train_comment_vec,
    train_parent_comment_vec,
    test_comment_vec,
    test_parent_comment_vec,
) = (
    matrix.tocsc()[:, which_cols_to_keep]
    for matrix in (
        train_comment_vec,
        train_parent_comment_vec,
        test_comment_vec,
        test_parent_comment_vec,
    )
)

In [97]:
# Check the training matrix shape after the column filter.
train_comment_vec.shape

(119293, 2349)

## POS n-grams

In [98]:
# Also build part-of-speech (POS) tag n-gram features.
import nltk
from nltk.tokenize import word_tokenize
def extract_pos(sentence):
    """Return the POS tags of ``sentence`` as a single space-separated string.

    The sentence is split with ``word_tokenize`` and tagged with
    ``nltk.pos_tag``; only the tag of each (token, tag) pair is kept.
    """
    tagged_tokens = nltk.pos_tag(word_tokenize(sentence))
    tags = [tag for _token, tag in tagged_tokens]
    return ' '.join(tags)

In [99]:
# Re-extract the raw text columns as flat 1-D arrays for POS tagging
# (the same extraction is performed earlier in the notebook).
train_comment_flat = train_df[['comment']].values.flatten()
train_parent_comment_flat = train_df[['parent_comment']].values.flatten()
test_comment_flat = test_df[['comment']].values.flatten()
test_parent_comment_flat = test_df[['parent_comment']].values.flatten()

In [100]:
# Convert every text into its space-separated POS-tag string.
# Despite the *_ngram names, these are lists of tag strings, not n-grams yet.
train_comment_ngram = list(map(extract_pos, train_comment_flat))
train_parent_comment_ngram = list(map(extract_pos, train_parent_comment_flat))
test_comment_ngram = list(map(extract_pos, test_comment_flat))
test_parent_comment_ngram = list(map(extract_pos, test_parent_comment_flat))

In [101]:
# Learn the vocabulary of POS-tag 3- to 5-grams over all four tag-string lists.
# NOTE(review): CountVectorizer's default settings lowercase the input and keep
# only word-character tokens of two or more characters, so tags such as PRP$
# are likely merged with PRP and punctuation tags dropped — confirm this is
# intended (otherwise set lowercase=False and a custom token_pattern).
corpus = [
    *train_comment_ngram,
    *train_parent_comment_ngram,
    *test_comment_ngram,
    *test_parent_comment_ngram,
]
pos_vectorizer = CountVectorizer(ngram_range=(3, 5), binary=True).fit(corpus)

In [102]:
# Encode each list of POS-tag strings against the shared POS n-gram vocabulary.
(
    train_comment_ngram_vec,
    train_parent_comment_ngram_vec,
    test_comment_ngram_vec,
    test_parent_comment_ngram_vec,
) = (
    pos_vectorizer.transform(tag_strings)
    for tag_strings in (
        train_comment_ngram,
        train_parent_comment_ngram,
        test_comment_ngram,
        test_parent_comment_ngram,
    )
)

In [103]:
# Stack the four POS n-gram matrices row-wise so that per-column totals can be
# taken over all of them at once. This rebinds total_vec from the word n-gram step.
total_vec = scipy.sparse.vstack(
    (
        train_comment_ngram_vec,
        train_parent_comment_ngram_vec,
        test_comment_ngram_vec,
        test_parent_comment_ngram_vec,
    )
)

In [104]:
# The original paper used a cutoff of 5; a higher cutoff of 50 is used here to
# avoid ending up with on the order of 100k features.
threshhold = 50
# Boolean mask: True where the column total of the stacked matrix is strictly
# greater than the threshold.
column_totals = np.asarray(total_vec.sum(axis=0)).ravel()
which_cols_to_keep = column_totals > threshhold

In [105]:
# Number of POS n-gram columns that pass the threshold (each True counts as 1).
sum(which_cols_to_keep)

13120

In [106]:
# Keep only the retained POS n-gram columns in every matrix, converting to CSC
# before the column selection.
# NOTE: this rebinds the *_ngram_vec names, so re-running this cell on its own
# will fail because the mask no longer matches the reduced column count.
(
    train_comment_ngram_vec,
    train_parent_comment_ngram_vec,
    test_comment_ngram_vec,
    test_parent_comment_ngram_vec,
) = (
    matrix.tocsc()[:, which_cols_to_keep]
    for matrix in (
        train_comment_ngram_vec,
        train_parent_comment_ngram_vec,
        test_comment_ngram_vec,
        test_parent_comment_ngram_vec,
    )
)

In [107]:
# Check the training POS n-gram matrix shape after the column filter.
train_comment_ngram_vec.shape

(119293, 13120)

In [108]:
# Concatenate the feature groups column-wise. Column order per row:
# comment word n-grams | parent word n-grams | comment POS n-grams | parent POS n-grams.
train_vec = scipy.sparse.hstack(
    (
        train_comment_vec,
        train_parent_comment_vec,
        train_comment_ngram_vec,
        train_parent_comment_ngram_vec,
    )
)
test_vec = scipy.sparse.hstack(
    (
        test_comment_vec,
        test_parent_comment_vec,
        test_comment_ngram_vec,
        test_parent_comment_ngram_vec,
    )
)

In [109]:
# Shape of the combined training feature matrix.
train_vec.shape

(119293, 30938)

In [110]:
# Shape of the combined test feature matrix; the column count should match train_vec.
test_vec.shape

(29526, 30938)

In [111]:
# Persist the combined feature matrices in scipy's .npz format.
# The paths are passed directly to save_npz so that it opens and closes the
# files itself; the previous open(...) handles were never closed.
data_base_dir = '../data/features'
scipy.sparse.save_npz(os.path.join(data_base_dir, "ngram_train_balanced.npz"), train_vec.tocoo())
scipy.sparse.save_npz(os.path.join(data_base_dir, "ngram_test_balanced.npz"), test_vec.tocoo())