### Imports, preparation

(This notebook uses the trick described in https://github.com/googlecolab/colabtools/issues/253#issuecomment-648634717 to obtain more RAM in Google Colab.)

In [1]:
!pip install -q torchtext==0.6.0

[K     |████████████████████████████████| 71kB 2.2MB/s 
[K     |████████████████████████████████| 1.1MB 8.8MB/s 
[?25h

In [2]:
# Mount Google Drive at /content/drive (this triggers the interactive
# authorization prompt shown in the cell output).
from google.colab import drive
drive.mount('/content/drive')

# Root folder on the mounted Drive; the dataset, embedding-cache, result and
# submission paths used later in this notebook are all built from it.
ROOT_PATH = '/content/drive/My Drive/cil'

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [3]:
import sys
import os

import pandas as pd
# import matplotlib.pyplot as plt
import numpy as np
import joblib

import datetime
from collections import Counter
import pickle
# from tqdm import tqdm
from tqdm.notebook import tqdm
tqdm.pandas()

from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn import linear_model

Choice of parameters: word-embedding method, data preprocessing method...

In [4]:
# Pretrained GloVe word vectors ("twitter.27B" variant); `cache` points at
# ROOT_PATH/CIL-aux-data on the mounted Drive.
# NOTE(review): `dim` is passed as the string "200" rather than an int --
# this evidently works with the pinned torchtext 0.6.0 (the embedding
# dimension reported further down is 200), but confirm before upgrading.
from torchtext.vocab import GloVe
glove = GloVe(name="twitter.27B", dim="200", cache=os.path.join(ROOT_PATH, "CIL-aux-data"))

In [5]:
# --- Dataset locations -------------------------------------------------------
# Alternative preprocessing (kept for reference):
# PREPROCESSED_TWITTER_DATASETS_DIR = os.path.join(ROOT_PATH, "xiaochen-clean-dataset-2")
PREPROCESSED_TWITTER_DATASETS_DIR = os.path.join(ROOT_PATH, "stanford_glove_preprocessed")
# Both CSVs are read with pandas below; the column 'preprocessed_segmented_tweet'
# is used from each, and the training file additionally provides 'label'.
TWEETS_TRAIN_FILENAME = os.path.join(PREPROCESSED_TWITTER_DATASETS_DIR, "dataset_stanfordglove_segmented_full.csv")
TWEETS_TEST_FILENAME = os.path.join(PREPROCESSED_TWITTER_DATASETS_DIR, "test_stanfordglove_segmented.csv")

# --- SIF weighting parameters --------------------------------------------------
# These are only referenced by the (currently commented-out) cells that build
# `word2weight`; they have no effect while the plain mean embedding is selected.
weightparam = 1e-3 # alpha in the SIF paper
vocab_size = None # 100000 # only keep the `vocab_size` most frequent words; set to `None` to keep all the words

# other parameters are set inline (e.g params for the final linear classifier)

### Training

In [6]:
# train data
# Read the raw CSV into its own name so that `X_train_txt` always denotes the
# array of tweet strings (it used to be a DataFrame first and an array later).
train_df = pd.read_csv(TWEETS_TRAIN_FILENAME)
# train_df = pd.read_csv(TWEETS_TRAIN_FILENAME, nrows=100000) # for dev: only keep nrows first samples
n_samples = train_df.shape[0]
# Use a concrete integer dtype: passing the abstract `np.integer` as a dtype is
# deprecated in NumPy (it silently meant the platform default integer).
y_train = train_df['label'].to_numpy().astype(np.int64, copy=False)
assert y_train.shape == (n_samples,)
X_train_txt = train_df['preprocessed_segmented_tweet'].to_numpy()
assert X_train_txt.shape == (n_samples,)
# Drop the DataFrame reference; only the two arrays are needed from here on.
del train_df
# Display the first (tweet, label) pair as a sanity check.
X_train_txt[0], y_train[0]

('<user> i dunno justin read my mention or not . only justin and god knows about that , but i hope you will follow me <hashtag> believe <number>',
 1)

In [7]:
# test data
# Only the preprocessed tweet text column is needed from the test CSV.
test_df = pd.read_csv(TWEETS_TEST_FILENAME)
X_test_txt = test_df['preprocessed_segmented_tweet'].to_numpy()
# The test file is expected to hold exactly 10000 tweets.
assert X_test_txt.shape == (10000,)
# Display the first test tweet as a sanity check.
X_test_txt[0]

'sea doo pro sea scooter ( sports with the portable seadoo sea scooter save air , stay longer in the water and . <repeat> <url>'

In [8]:
# ## get word occurrence count from the training data
# occ_dict = Counter()
# for tweet in tqdm(X_train_txt):
#     words = tweet.lower().split()
#     occ_dict.update(words)

# # and also from the test data
# for tweet in tqdm(X_test_txt):
#     words = tweet.lower().split()
#     occ_dict.update(words)

# # ## get word occurrence count from an external data source
# # WORD_OCC_COUNT_FILE = os.path.join(ROOT_PATH, "CIL-aux-data/enwiki_vocab_min200.txt")
# # with open(WORD_OCC_COUNT_FILE) as f:
# #     lines = f.readlines()
# # for line in tqdm(lines):
# #     line = line.strip()
# #     line = line.split()
# #     if len(line) != 2:
# #         print(line) # something went wrong...
# #     else:
# #         occ_dict[line[0]] = int(line[1])

# len(occ_dict)

In [9]:
# tot_count = sum( count for word,count in tqdm(occ_dict.most_common(vocab_size)) )
# word2weight = {}
# for word, count in tqdm(occ_dict.most_common(vocab_size)):
#     word2weight[word] = weightparam / (weightparam + count/tot_count)

In [10]:
def mean_embed_tweet(tweet):
    """Embed a tweet as the unweighted mean of its tokens' GloVe vectors.

    The tweet is lower-cased and split on whitespace. If that yields no token
    at all, the placeholder tokens ``["empty", "tweet"]`` are embedded instead
    so that the mean is always taken over at least one vector.

    Parameters
    ----------
    tweet : str
        Preprocessed tweet text.

    Returns
    -------
    The mean over axis 0 of the per-token vectors returned by
    ``glove.get_vecs_by_tokens`` (one entry per embedding dimension).
    """
    # An empty token list is falsy, so `or` substitutes the placeholder tokens.
    tokens = tweet.lower().split() or ["empty", "tweet"]
    vectors = glove.get_vecs_by_tokens(tokens, lower_case_backup=True)
    # Debug aid -- print the tokens whose vector has a zero entry
    # (presumably the out-of-vocabulary tokens):
    # for idx in np.where(~vectors.bool().all(axis=1))[0]:
    #     print(tokens[idx])
    return vectors.mean(axis=0)

def SIF_embed_tweet(tweet):
    """Embed a tweet as a frequency-weighted average of its tokens' GloVe vectors.

    Each token's vector is multiplied by ``word2weight[token]`` (tokens missing
    from ``word2weight`` get weight 0), the weighted vectors are summed, and the
    sum is divided by the number of tokens.

    Requires the global ``word2weight`` mapping (token -> weight). It is built
    by the word-count cells of this notebook, which must be un-commented and
    run before this function can be used.

    Parameters
    ----------
    tweet : str
        Preprocessed tweet text; it is lower-cased and split on whitespace.
        If no token remains, the placeholders ``["empty", "tweet"]`` are used.

    Returns
    -------
    numpy.ndarray
        1-D float array with one entry per embedding dimension.

    Raises
    ------
    NameError
        If ``word2weight`` has not been defined.
    """
    # Fail with an actionable message instead of a bare NameError raised from
    # the middle of the weight lookup below.
    if "word2weight" not in globals():
        raise NameError(
            "word2weight is not defined: run the cells that build the word "
            "occurrence counts and word2weight before using SIF_embed_tweet"
        )
    words = tweet.lower().split()
    if len(words) == 0: # avoid program crashing...
        words = ["empty", "tweet"]
    # Convert the torch tensor to numpy explicitly so that the dot product below
    # is a plain numpy operation rather than an implicit cross-library coercion.
    word_emb = glove.get_vecs_by_tokens(words, lower_case_backup=True).numpy()
    # for idx in np.where(~word_emb.all(axis=1))[0]: # print OOV words
    #     print(idx, words[idx])
    # TODO: find a noninformative weight other than 0 for tokens without a weight
    word_weights = np.array([word2weight.get(word, 0.0) for word in words],
                            dtype=np.float64)
    return word_weights.dot(word_emb) / len(words)
    # return word_weights.dot(word_emb)


## choose the tweet embedding method.
# Either `mean_embed_tweet` or `SIF_embed_tweet`. The SIF variant reads the
# global `word2weight`, which is only created by the (currently commented-out)
# word-count cells above, so those cells must be enabled before switching.
my_embed_tweet = mean_embed_tweet

# Embed a throwaway tweet once to discover the embedding dimensionality; it is
# used below to pre-allocate the feature matrices.
tweet_emb_dim = my_embed_tweet("this is a dummy tweet").shape[0]
tweet_emb_dim

200

In [11]:
# Feature matrix for the training set: one row per tweet, one column per
# embedding dimension. It is pre-allocated and filled row by row so that no
# intermediate list of per-tweet vectors has to be kept in memory.
n_train = y_train.shape[0]
X_train = np.empty((n_train, tweet_emb_dim))
for row, text in enumerate(tqdm(X_train_txt)):
    X_train[row] = my_embed_tweet(text)

HBox(children=(FloatProgress(value=0.0, max=2500000.0), HTML(value='')))




In [12]:
from sklearn.base import BaseEstimator
from sklearn.base import TransformerMixin
from sklearn.decomposition import TruncatedSVD

from sklearn.utils import check_array

class RemovePC(TransformerMixin, BaseEstimator):
    """Remove each sample's projection onto the leading singular direction(s) of the training data.

    ``fit`` computes the top ``n_components`` right singular vectors of ``X``
    with ``TruncatedSVD`` (which does not center the data, so this is the
    leading direction of the raw data matrix rather than a PCA component of the
    centered data). ``transform`` subtracts from every row its projection onto
    the span of those vectors.

    Parameters
    ----------
    n_components : int, default=1
        Number of leading singular directions to remove.
    random_state : int, RandomState instance or None, default=None
        Forwarded to ``TruncatedSVD``; pass an int for reproducible fits.

    Attributes
    ----------
    princip_comp_ : ndarray of shape (n_components, n_features)
        The directions found by ``fit``, one per row.
    n_features_in_ : int
        Number of features seen during ``fit``.
    """
    def __init__(self, n_components=1, random_state=None):
        # Per scikit-learn convention, __init__ only stores the parameters.
        self.n_components = n_components
        self.random_state = random_state

    def fit(self, X, y=None):
        """Estimate the direction(s) to remove from ``X``; ``y`` is ignored. Returns ``self``."""
        X = check_array(X)
        self.n_features_in_ = X.shape[1]
        svd = TruncatedSVD(n_components=self.n_components,
                           random_state=self.random_state)
        self.princip_comp_ = svd.fit(X).components_
        return self

    def transform(self, X):
        """Return ``X`` with its projection onto ``princip_comp_`` subtracted (a new array)."""
        from sklearn.utils.validation import check_is_fitted
        check_is_fitted(self, "princip_comp_")
        # Keep the validated array: check_array converts array-likes to ndarray.
        X = check_array(X)
        comp = self.princip_comp_
        # In mathematical (column-vector) notation: remove comp comp^T x from x.
        # In numpy notation X is n*d and comp is k*d (orthonormal rows), so:
        #     remove (X comp^T) comp from X
        # For k == 1 this equals the broadcasted product (X @ comp.T) * comp.
        return X - (X @ comp.T) @ comp

# Run scikit-learn's estimator API compliance checks against a fresh instance of
# the transformer defined in this cell; failures raise, so a silent cell means
# the checks passed.
from sklearn.utils.estimator_checks import check_estimator
check_estimator(RemovePC()) # passes the check!

In [13]:
# Snapshot the embedded training matrix and the labels, tagged with the current
# timestamp. The file names are relative, so the .npy files are written to the
# current working directory (not under ROOT_PATH).
ts = datetime.datetime.now().isoformat()
np.save(f"X_train__{ts}.npy", X_train)
np.save(f"y_train__{ts}.npy", y_train)

Fit a classifier on the tweet embedding vectors

In [14]:
# Linear classifier trained by stochastic gradient descent with hinge loss and
# an L2 penalty; verbose=1 prints per-epoch progress during fit.
# NOTE(review): no random_state is passed (the printed estimator below shows
# random_state=None with shuffle=True), so repeated runs are presumably not
# bit-for-bit reproducible -- consider fixing a seed.
clf = linear_model.SGDClassifier(loss="hinge", penalty="l2", verbose=1)
# Alternative that was tried (LinearSVC is not imported in this notebook):
# clf = LinearSVC() # too slow

# model = clf

In [15]:
# Full model: first remove the projection onto the leading direction of the
# training embeddings (RemovePC), then classify. The commented-out steps are
# optional stages that were experimented with; VarianceThreshold,
# SelectPercentile and f_classif are not imported in this notebook and would
# need imports before being re-enabled.
pipe = Pipeline([
    ('remove_pc', RemovePC()),
    # ('standardize', StandardScaler()),
#     ('vt_feat_select', VarianceThreshold()),
#     ('feat_select', SelectPercentile(score_func=f_classif)),
    ('classification', clf),
], verbose=1)

# `model` is the name used by the training, saving and prediction cells below.
model = pipe

In [16]:
# Fit the whole pipeline on the embedded training set and report the wall-clock
# time taken by `fit`.
print('_' * 80)
print("Training: ")
print(model)
from time import time
t0 = time()

# Fits every pipeline step in order on (X_train, y_train).
model.fit(X_train, y_train)

train_time = time() - t0
print("train time: %0.3fs" % train_time)

________________________________________________________________________________
Training: 
Pipeline(memory=None,
         steps=[('remove_pc', RemovePC()),
                ('classification',
                 SGDClassifier(alpha=0.0001, average=False, class_weight=None,
                               early_stopping=False, epsilon=0.1, eta0=0.0,
                               fit_intercept=True, l1_ratio=0.15,
                               learning_rate='optimal', loss='hinge',
                               max_iter=1000, n_iter_no_change=5, n_jobs=None,
                               penalty='l2', power_t=0.5, random_state=None,
                               shuffle=True, tol=0.001, validation_fraction=0.1,
                               verbose=1, warm_start=False))],
         verbose=1)
[Pipeline] ......... (step 1 of 2) Processing remove_pc, total=  18.0s
-- Epoch 1
Norm: 11.40, NNZs: 200, Bias: 0.151837, T: 2500000, Avg. loss: 0.516133
Total training time: 1.97 seconds.
-- Epoch

In [17]:
# save model
# `model` is the whole fitted pipeline, so the direction(s) estimated by the
# 'remove_pc' step are pickled together with the classifier; nothing extra has
# to be saved in order to transform X_test consistently later on.
ts = datetime.datetime.now().isoformat()
# PATH_TO_SAVE_TRAINED_MODEL = f"trained_model__{ts}.pkl"
RESULTS_DIR = os.path.join(ROOT_PATH, "CIL-results")
# Create the results folder if needed so that a long training run is not lost
# to a FileNotFoundError at dump time.
os.makedirs(RESULTS_DIR, exist_ok=True)
PATH_TO_SAVE_TRAINED_MODEL = os.path.join(RESULTS_DIR, f"trained_model__{ts}.pkl")
joblib.dump(model, PATH_TO_SAVE_TRAINED_MODEL)
print(f"saved model to {PATH_TO_SAVE_TRAINED_MODEL}")

saved model to /content/drive/My Drive/cil/CIL-results/trained_model__2020-07-27T15:38:30.676903.pkl


In [18]:
# predict
# Predictions on the same data the model was fitted on: the resulting score is
# a training-set score, not an estimate of generalization (no held-out split is
# made in this notebook).
y_predtrain = model.predict(X_train)
# evaluate
score_train = accuracy_score(y_train, y_predtrain)
print(f"(Unvalidated) accuracy score on the training set: {score_train}")
print("confusion matrix for training set:")
# Called as confusion_matrix(y_true, y_pred), i.e. with the true labels first.
print(confusion_matrix(y_train, y_predtrain))

(Unvalidated) accuracy score on the training set: 0.7892032
confusion matrix for training set:
[[ 938479  311521]
 [ 215471 1034529]]


### Predicting on the test dataset

In [19]:
# (already loaded)
# The test tweets were read into X_test_txt near the top of the Training
# section; the loading code is repeated here, commented out, for reference.

# X_test_txt = pd.read_csv(TWEETS_TEST_FILENAME)
# X_test_txt = X_test_txt['preprocessed_segmented_tweet'].to_numpy()
# assert X_test_txt.shape == (10000,)
# Display the first test tweet as a sanity check.
X_test_txt[0]

'sea doo pro sea scooter ( sports with the portable seadoo sea scooter save air , stay longer in the water and . <repeat> <url>'

In [20]:
# Feature matrix for the test set, built exactly like the training matrix:
# pre-allocate, then embed one tweet per row with the selected embedding method.
n_test = X_test_txt.shape[0]
X_test = np.empty((n_test, tweet_emb_dim))
for row, text in enumerate(tqdm(X_test_txt)):
    X_test[row] = my_embed_tweet(text)

HBox(children=(FloatProgress(value=0.0, max=10000.0), HTML(value='')))




In [21]:
# PATH_TO_LOAD_TRAINED_MODEL = PATH_TO_SAVE_TRAINED_MODEL

# model = joblib.load(PATH_TO_LOAD_TRAINED_MODEL)

In [22]:
# Predict labels for the embedded test tweets with the fitted pipeline.
y_pred = model.predict(X_test)

In [23]:
# Remap the predicted labels: 1 -> -1 and 0 -> 1 (any other value is kept).
# Both masks are computed from the untouched predictions first, so the result
# does not depend on the order of the two assignments below.
# NOTE: this cell rebinds `y_pred`; re-running it without re-running the
# prediction cell would remap the already-remapped labels.
is_one = (y_pred == 1)
is_zero = (y_pred == 0)
# Concrete dtype: passing the abstract `np.integer` as a dtype is deprecated in
# NumPy. astype() copies here, so the array returned by predict() is not mutated.
y_pred = y_pred.astype(np.int64)
y_pred[is_one] = -1
y_pred[is_zero] = 1

In [24]:
# Report how many test tweets received each submission label.
# NOTE(review): after the remap in the previous cell, -1 stands for model label
# 1 and +1 for model label 0, and this message calls -1 "positive" -- confirm
# that this matches both the label convention of the training CSV and the
# convention expected by the submission format.
print(f"predict {np.count_nonzero(y_pred==-1)} positive, {np.count_nonzero(y_pred==1)} negative")

predict 5313 positive, 4687 negative


In [25]:
# Write the submission CSV under ROOT_PATH, tagged with the current timestamp:
# a header line followed by one "Id,Prediction" row per test tweet.
ts = datetime.datetime.now().isoformat()
SUBMISSION_FILENAME = os.path.join(ROOT_PATH, f"SIF_baseline_submission_{ts}.csv")

with open(SUBMISSION_FILENAME, "w") as f:
    f.write("Id,Prediction\n")
    # Ids are 1-based and follow the order of the test tweets.
    f.writelines(f"{i},{label}\n" for i, label in enumerate(y_pred, start=1))
