Merging the features.

In [1]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_recall_fscore_support
import pandas as pd
import numpy as np
import scipy
import os

## Step 1: Loading in everything and making the full feature set

In [2]:
data_base_dir = '../data/features'

In [4]:
# First loading in our CSVs

def to_sparse_matrix(df):
    return scipy.sparse.csc_matrix(df.values)

pointedness_train_matr = to_sparse_matrix(pd.read_csv(os.path.join(data_base_dir, "pointedness_train.csv")))
pointedness_test_matr = to_sparse_matrix(pd.read_csv(os.path.join(data_base_dir, "pointedness_test.csv")))

synset_train_matr = to_sparse_matrix(pd.read_csv(os.path.join(data_base_dir, "synset_train.csv")))
synset_test_matr = to_sparse_matrix(pd.read_csv(os.path.join(data_base_dir, "synset_test.csv")))

frequency_train_matr = to_sparse_matrix(pd.read_csv(os.path.join(data_base_dir, 'frequency_train.csv')))
frequency_test_matr = to_sparse_matrix(pd.read_csv(os.path.join(data_base_dir, 'frequency_test.csv')))

sentiment_train_matr = to_sparse_matrix(pd.read_csv(os.path.join(data_base_dir, 'sentiment_train.csv')))
sentiment_test_matr = to_sparse_matrix(pd.read_csv(os.path.join(data_base_dir, 'sentiment_test.csv')))

In [5]:
# Then loading in Artur's pattern work

pattern_train = scipy.sparse.load_npz(os.path.join(data_base_dir, "pattern_training.npz"))
pattern_test = scipy.sparse.load_npz(os.path.join(data_base_dir, "pattern_test.npz"))

In [16]:
training_features = [
    pointedness_train_matr,
    synset_train_matr,
    frequency_train_matr,
    sentiment_train_matr,
    pattern_train
]

testing_features = [
    pointedness_test_matr,
    synset_test_matr,
    frequency_test_matr,
    sentiment_test_matr,
    pattern_test
]

X_train_full = scipy.sparse.hstack(training_features)
X_test_full = scipy.sparse.hstack(testing_features)

Step 1.1: Validating to ourselves that the output is what we expect

In [17]:
X_train_full.shape

(3049316, 491)

In [18]:
X_test_full.shape

(764172, 491)

Step 1.2: Writing our full data matrix

In [20]:
np.savez(open(os.path.join(data_base_dir, "X_train_full.npz"), "wb+"), arr=X_train_full)

In [21]:
np.savez(open(os.path.join(data_base_dir,"X_test_full.npz"), "wb+"), arr=X_test_full)

### Step 2: Thinning things out to make a more balanced data set.

In [22]:
y_train_full = pd.read_csv('../data/samples/pol_train_cleaned.csv', sep='\t')[['label']].values
y_test_full = pd.read_csv('../data/samples/pol_test_cleaned.csv', sep='\t')[['label']].values

In [33]:
sum(y_train_full)/X_train_full.shape[0] # Roughly 0.7% of our data is positive

array([0.0078657])

In [35]:
sum(y_test_full)/X_test_full.shape[0] # And same with our testing

array([0.00774695])

In [66]:
np.random.seed(15) # For deterministic output

def get_more_balanced_data_set(X, y, prop_min=0.2):
    '''
    X: CSR Matrix of our features
    y: Numpy array of our labels
    prop_min: Float that represents what proportion of our data should be minority (at most)
    
    We're assuming the +1 class is positive.
    '''
    # Setting up how many negative classifications needed
    num_positive = sum(y)[0]
    num_total = X.shape[0]
    num_majority_needed = (1-prop_min) * (num_positive/prop_min)
    
    # Getting our positively classified values
    positive_indices = (y==1).flatten()
    
    # Getting our negatively classified values
    negative_indices = (y==0).flatten()
    # Then randomly removing rows until we get how many we need
    frac_needed = num_majority_needed/(num_total * 1.0)
    for i in range(len(negative_indices)):
        uniform_distribution_draw = np.random.uniform()
        if uniform_distribution_draw > frac_needed:
            negative_indices[i] = False
    
    # Merging the two together
    indices_desired = np.logical_or(positive_indices, negative_indices)
    
    # Extracting rows then returning.
    return (X[indices_desired, ], y[indices_desired])

In [62]:
X_test_full_csr = X_test_full.tocsr()
X_train_full_csr = X_train_full.tocsr()

In [67]:
X_test_balanced, y_test_balanced = get_more_balanced_data_set(X_test_full_csr, y_test_full)

In [69]:
X_train_balanced, y_train_balanced = get_more_balanced_data_set(X_train_full_csr, y_train_full)

In [74]:
# Then to validate to ourselves this worked
X_train_balanced.shape

(119293, 491)

In [75]:
X_test_balanced.shape

(29526, 491)

In [76]:
len(y_train_balanced)

119293

In [78]:
len(y_test_balanced)

29526

In [79]:
sum(y_train_balanced)/len(y_train_balanced) # Just about

array([0.20105958])

In [82]:
sum(y_test_balanced)/len(y_test_balanced) # This too

array([0.20050125])

In [None]:
# Then saving everything

In [83]:
np.savez(open(os.path.join(data_base_dir, "X_train_balanced.npz"), "wb+"), arr=X_train_balanced)
np.savez(open(os.path.join(data_base_dir,"X_test_balanced.npz"), "wb+"), arr=X_test_balanced)

In [84]:
np.save(open(os.path.join(data_base_dir, "y_train_balanced.npy"), "wb+"), arr=y_train_balanced)
np.save(open(os.path.join(data_base_dir,"y_test_balanced.npy"), "wb+"), arr=y_test_balanced)