Merging the features and getting an 80/20 split.

This has a circular dependency on the ngram_filtered_extraction notebook (whoops). Check out that notebook for running details.

In [43]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_recall_fscore_support
import pandas as pd
import numpy as np
import scipy
import os

## Step 1: Loading in everything and making the full feature set

In [44]:
# Directory (relative to the notebook's working directory) that holds the feature files
# this notebook reads and the merged matrices it writes back out.
data_base_dir = '../data/features'

In [45]:
# First loading in our CSVs

def to_sparse_matrix(df):
    """Return the contents of DataFrame `df` as a SciPy CSC sparse matrix."""
    dense_values = df.values
    return scipy.sparse.csc_matrix(dense_values)

def _load_csv_feature_matrix(csv_name):
    """Read one feature CSV from data_base_dir and return it as a sparse matrix."""
    csv_path = os.path.join(data_base_dir, csv_name)
    return to_sparse_matrix(pd.read_csv(csv_path))


# Every CSV-backed feature family comes as a train/test pair of files.
pointedness_train_matr = _load_csv_feature_matrix("pointedness_train.csv")
pointedness_test_matr = _load_csv_feature_matrix("pointedness_test.csv")

synset_train_matr = _load_csv_feature_matrix("synset_train.csv")
synset_test_matr = _load_csv_feature_matrix("synset_test.csv")

frequency_train_matr = _load_csv_feature_matrix('frequency_train.csv')
frequency_test_matr = _load_csv_feature_matrix('frequency_test.csv')

sentiment_train_matr = _load_csv_feature_matrix('sentiment_train.csv')
sentiment_test_matr = _load_csv_feature_matrix('sentiment_test.csv')

In [46]:
# Then loading in Artur's pattern work

# The pattern features are already stored as sparse .npz matrices, one per split.
pattern_train_path = os.path.join(data_base_dir, "pattern_training.npz")
pattern_train = scipy.sparse.load_npz(pattern_train_path)

pattern_test_path = os.path.join(data_base_dir, "pattern_test.npz")
pattern_test = scipy.sparse.load_npz(pattern_test_path)

In [47]:
# The cleaned sample files are tab-separated; both splits are read the same way.
tsv_read_options = dict(sep='\t')
train_df = pd.read_csv('../data/samples/pol_train_cleaned.csv', **tsv_read_options)
test_df = pd.read_csv('../data/samples/pol_test_cleaned.csv', **tsv_read_options)

In [48]:
# Column blocks are stacked left-to-right in this order: pointedness, synset, frequency,
# sentiment, pattern. The train and test lists must list the families in the same order so
# that a given column index means the same feature in both matrices.
training_features = [
    pointedness_train_matr,
    synset_train_matr,
    frequency_train_matr,
    sentiment_train_matr,
    pattern_train,
]
X_train_full = scipy.sparse.hstack(training_features)

testing_features = [
    pointedness_test_matr,
    synset_test_matr,
    frequency_test_matr,
    sentiment_test_matr,
    pattern_test,
]
X_test_full = scipy.sparse.hstack(testing_features)

Step 1.1: Validating to ourselves that the output is what we expect

In [49]:
X_train_full.shape

(3049316, 491)

In [50]:
X_test_full.shape

(764172, 491)

Step 1.2: Writing our full data matrix

In [51]:
# Hand save_npz the path instead of an open() handle: it then opens and closes the file
# itself, so no unclosed file object is left behind in the kernel.
scipy.sparse.save_npz(os.path.join(data_base_dir, "X_train_full.npz"), X_train_full)

In [52]:
# Hand save_npz the path instead of an open() handle: it then opens and closes the file
# itself, so no unclosed file object is left behind in the kernel.
scipy.sparse.save_npz(os.path.join(data_base_dir, "X_test_full.npz"), X_test_full)

### Step 2: Thinning things out to make a more balanced data set.

In [53]:
# The double brackets select a one-column DataFrame, so .values is a 2-D column of shape
# (n_rows, 1) rather than a flat vector.
y_train_full = train_df[['label']].values
y_test_full = test_df[['label']].values

In [54]:
sum(y_train_full)/X_train_full.shape[0] # Roughly 0.8% of our data is positive

array([0.0078657])

In [55]:
sum(y_test_full)/X_test_full.shape[0] # And same with our testing

array([0.00774695])

In [56]:
# Seeds NumPy's *global* RNG, which get_more_balanced_data_set draws from. The sampled rows
# are only reproducible when this cell and the balancing calls are re-run together, in the
# same order, from this seed.
np.random.seed(15) # For deterministic output

def get_more_balanced_data_set(X, y, df, prop_min=0.2):
    '''
    Randomly thin out the negative class so that positives make up roughly
    `prop_min` of the returned rows. Every positive row is kept.

    X: CSR Matrix of our features (one row per sample)
    y: Numpy array of our labels (1 = positive, 0 = negative), aligned row-for-row
       with X. May be an (n, 1) column or a flat (n,) vector.
    df: DataFrame aligned row-for-row with X; it is filtered with the same row mask.
    prop_min: Float that represents what proportion of our data should be minority
              (approximately -- see the note on frac_needed below)

    Returns a tuple (X_subset, y_subset, df_subset) restricted to the kept rows,
    in their original order.

    We're assuming the +1 class is positive.

    Randomness comes from NumPy's global RNG (np.random); call np.random.seed(...)
    beforehand for reproducible output. One uniform draw is consumed per row of X.
    '''
    # Setting up how many negative classifications needed.
    # y.sum() replaces the builtin sum(y)[0]: same value for an (n, 1) label column,
    # but it runs in C and also works for flat label vectors.
    num_positive = y.sum()
    num_total = X.shape[0]
    num_majority_needed = (1-prop_min) * (num_positive/prop_min)

    # Boolean row masks for each class
    positive_indices = (y==1).flatten()
    negative_indices = (y==0).flatten()

    # Probability with which each negative row is kept.
    # NOTE(review): this divides by the TOTAL row count rather than the number of
    # negative rows, so slightly fewer negatives survive than num_majority_needed and
    # the minority share lands a little above prop_min. Left unchanged on purpose so
    # that the rows selected for a given seed stay exactly the same.
    frac_needed = num_majority_needed/(num_total * 1.0)

    # One uniform draw per row, taken in a single vectorized call. This consumes the
    # same random stream as drawing row by row in a Python loop, so the selection is
    # identical -- just without millions of interpreter-level iterations.
    draws = np.random.uniform(size=len(negative_indices))
    # A negative row is dropped when its draw exceeds frac_needed.
    negative_indices &= ~(draws > frac_needed)

    # Merging the two together
    indices_desired = np.logical_or(positive_indices, negative_indices)

    # Extracting rows then returning.
    return (X[indices_desired, ], y[indices_desired], df[indices_desired])

In [57]:
# Convert to CSR before balancing: get_more_balanced_data_set picks rows with a boolean
# mask, which needs a row-indexable format.
# NOTE(review): presumably hstack handed back COO matrices, which cannot be indexed -- confirm.
X_test_full_csr = X_test_full.tocsr()
X_train_full_csr = X_train_full.tocsr()

In [58]:
X_test_balanced, y_test_balanced, test_df_balanced = get_more_balanced_data_set(X_test_full_csr, y_test_full, test_df)

In [59]:
X_train_balanced, y_train_balanced, train_df_balanced = get_more_balanced_data_set(X_train_full_csr, y_train_full, train_df)

In [60]:
# Then to validate to ourselves this worked
X_train_balanced.shape

(119293, 491)

In [61]:
X_test_balanced.shape

(29526, 491)

In [62]:
len(y_train_balanced)

119293

In [63]:
len(y_test_balanced)

29526

In [64]:
sum(y_train_balanced)/len(y_train_balanced) # Just about

array([0.20105958])

In [65]:
sum(y_test_balanced)/len(y_test_balanced) # This too

array([0.20050125])

In [66]:
# Then saving everything

In [67]:
# Then, a last minute addition:

def _load_feature_npz(npz_name):
    """Load one sparse .npz matrix from data_base_dir."""
    return scipy.sparse.load_npz(os.path.join(data_base_dir, npz_name))


# Three n-gram variants (all / words only / POS only), each as a train/test pair.
ngram_all_train = _load_feature_npz("ngram_train_balanced.npz")
ngram_all_test = _load_feature_npz("ngram_test_balanced.npz")

ngram_words_only_train = _load_feature_npz("ngram_train_words_only_balanced.npz")
ngram_words_only_test = _load_feature_npz("ngram_test_words_only_balanced.npz")

ngram_pos_only_train = _load_feature_npz("ngram_train_pos_only_balanced.npz")
ngram_pos_only_test = _load_feature_npz("ngram_test_pos_only_balanced.npz")

In [68]:
def _append_ngrams(base_matrix, ngram_matrix):
    """Stack the n-gram columns to the right of the base feature columns."""
    return scipy.sparse.hstack([base_matrix, ngram_matrix])


# hstack requires matching row counts, so each n-gram matrix must have exactly as many
# rows as the balanced matrix it is appended to.
X_train_balanced_all = _append_ngrams(X_train_balanced, ngram_all_train)
X_test_balanced_all = _append_ngrams(X_test_balanced, ngram_all_test)

X_train_balanced_words_only = _append_ngrams(X_train_balanced, ngram_words_only_train)
X_test_balanced_words_only = _append_ngrams(X_test_balanced, ngram_words_only_test)

X_train_balanced_pos_only = _append_ngrams(X_train_balanced, ngram_pos_only_train)
X_test_balanced_pos_only = _append_ngrams(X_test_balanced, ngram_pos_only_test)

In [69]:
# One shape per line: all n-grams, words only, POS only.
print(
    X_train_balanced_all.shape,
    X_train_balanced_words_only.shape,
    X_train_balanced_pos_only.shape,
    sep="\n",
)

(119293, 20319)
(119293, 2039)
(119293, 18771)


In [70]:
# Paths are passed straight to save_npz (instead of open(..., "wb+") handles that were never
# closed): save_npz opens and closes each file itself, so no file objects leak.
# Matrices are converted to COO before saving, as before.
scipy.sparse.save_npz(os.path.join(data_base_dir, "X_train_balanced_all.npz"), X_train_balanced_all.tocoo())
scipy.sparse.save_npz(os.path.join(data_base_dir, "X_test_balanced_all.npz"), X_test_balanced_all.tocoo())

scipy.sparse.save_npz(os.path.join(data_base_dir, "X_train_words_only_balanced.npz"), X_train_balanced_words_only.tocoo())
scipy.sparse.save_npz(os.path.join(data_base_dir, "X_test_words_only_balanced.npz"), X_test_balanced_words_only.tocoo())

scipy.sparse.save_npz(os.path.join(data_base_dir, "X_train_pos_only_balanced.npz"), X_train_balanced_pos_only.tocoo())
scipy.sparse.save_npz(os.path.join(data_base_dir, "X_test_pos_only_balanced.npz"), X_test_balanced_pos_only.tocoo())

# The balanced matrices without any n-gram columns.
scipy.sparse.save_npz(os.path.join(data_base_dir, "X_train_no_ngram.npz"), X_train_balanced.tocoo())
scipy.sparse.save_npz(os.path.join(data_base_dir, "X_test_no_ngram.npz"), X_test_balanced.tocoo())


In [71]:
# Paths are passed straight to np.save (instead of open(..., "wb+") handles that were never
# closed): np.save opens and closes each file itself.
np.save(os.path.join(data_base_dir, "y_train_balanced.npy"), arr=y_train_balanced)
np.save(os.path.join(data_base_dir, "y_test_balanced.npy"), arr=y_test_balanced)

In [72]:
# Write out the sample rows that survived balancing (tab-separated, without the index).
# These frames were filtered with the same row mask as the balanced X/y arrays, so they
# line up with them row-for-row.
train_df_balanced.to_csv('../data/samples/pol_train_rebalanced_cleaned.csv', sep='\t', index=False)
test_df_balanced.to_csv('../data/samples/pol_test_rebalanced_cleaned.csv', sep='\t', index=False)

In [73]:
# Then, getting our feature names.
# Feature names are simply the CSV's column headers, in file order.
pointedness_df = pd.read_csv(os.path.join(data_base_dir, "pointedness_train.csv"))
pointedness_cols = list(pointedness_df.columns)

In [74]:
# Same recipe for the remaining CSV-backed families: column headers become feature names.
synset_df = pd.read_csv(os.path.join(data_base_dir, "synset_train.csv"))
frequency_df = pd.read_csv(os.path.join(data_base_dir, 'frequency_train.csv'))
sentiment_df = pd.read_csv(os.path.join(data_base_dir, 'sentiment_train.csv'))

synset_cols = list(synset_df.columns)
frequency_cols = list(frequency_df.columns)
sentiment_cols = list(sentiment_df.columns)

In [75]:
import json

In [76]:
# Read the pattern feature names inside a `with` block so the file handle is closed
# (json.load(open(...)) left it open).
with open(os.path.join(data_base_dir, 'pattern_features_names.json'), 'r') as pattern_names_file:
    pattern_cols_raw = json.load(pattern_names_file)
# Each raw entry is turned into its string form so it can serve as a flat feature name.
pattern_cols = [str(x) for x in pattern_cols_raw]

In [83]:
pattern_cols

["['O', '101', ['in', 'a']]",
 "['O', '101', ['i', 'been']]",
 "['O', '1001', ['i', 'for']]",
 "['O', '101', ['they', 'it']]",
 "['O', '101', ['and', 'they']]",
 "['O', '1001', ['in', 'the']]",
 "['O', '101', ['could', 'a']]",
 "['O', '101', ['we', 'had']]",
 '[\'O\', \'101\', ["\'m", \'he\']]',
 '[\'O\', \'1101\', [\'i\', "\'m", \'he\']]',
 "['O', '101', ['of', 'you']]",
 "['O', '1101', ['are', 'you', 'about']]",
 "['O', '101', ['you', 'the']]",
 "['O', '101', ['a', 'more']]",
 "['O', '101', ['is', 'going']]",
 "['O', '10001', ['is', 'and']]",
 "['O', '1101', ['in', 'the', 'that']]",
 "['O', '101', ['not', 'about']]",
 "['O', '1101', ['would', 'be', 'to']]",
 "['O', '1001', ['i', 'in']]",
 "['O', '101', ['that', 'him']]",
 "['O', '101', ['and', 'him']]",
 '[\'O\', \'1101\', [\'you\', "\'re", \'the\']]',
 "['O', '1101', ['have', 'a', 'of']]",
 "['O', '101', ['your', 'is']]",
 "['O', '1001', ['i', 'that']]",
 "['O', '1001', ['i', 'to']]",
 '[\'O\', \'101\', ["n\'t", \'me\']]',
 "['O', '

In [77]:
len(pattern_cols)

468

In [78]:
pattern_train.shape[1] # Make sure to replace this with Artur's stuff

468

In [79]:
# Names of the word n-gram features, as a plain list of the array's elements.
ngram_words_cols = list(np.load('../data/features/ngram_words_features.npy'))

In [80]:
# Names of the POS n-gram features, as a plain list of the array's elements.
ngram_pos_cols = list(np.load('../data/features/ngram_pos_features.npy'))

In [81]:
# Flatten the per-family name lists into one list. The order here is: pointedness, synset,
# frequency, sentiment, pattern, word n-grams, POS n-grams.
# NOTE(review): this must match the column order of the matrix the names are used with --
# confirm that the combined n-gram matrix is laid out words-then-POS.
all_features = [
    feature_name
    for family_cols in (
        pointedness_cols,
        synset_cols,
        frequency_cols,
        sentiment_cols,
        pattern_cols,
        ngram_words_cols,
        ngram_pos_cols,
    )
    for feature_name in family_cols
]

In [82]:
np.save('../data/features/all_features_arr.npy', all_features)