In [1]:
%load_ext autoreload
%autoreload 1
%aimport pmip.feature_engineering
%aimport pmip.data

In [2]:
import os
from datetime import datetime
import urllib.request
import zipfile

from sklearn import preprocessing, decomposition, model_selection, metrics, pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
import pandas as pd
import numpy as np

from pmip.data import pickle_to_fs
from pmip.feature_engineering import TextSelector, NumberSelector, Tokenizer

In [3]:
RUNID=datetime.now().strftime("%Y%m%d")
DATA_DIR=os.path.join("..", "data", RUNID)

In [15]:
# load the GloVe vectors in a dictionary:

glove_data_dir = os.path.join(DATA_DIR, "..", "glove", "glove.840B.300d.txt")

if not os.path.exists(os.path.join(glove_data_dir, "glove.840B.300d.txt")):
    zip_archive = os.path.join(glove_data_dir, "glove.840B.300d.zip")
    if not os.path.isdir(glove_data_dir):
        os.makedirs(glove_data_dir)
    urllib.request.urlretrieve(
        "http://www-nlp.stanford.edu/data/glove.840B.300d.zip", 
        zip_archive,
    )
    with zipfile.ZipFile(zip_archive, "r") as zip_ref:
        zip_ref.extractall(glove_data_dir)


KeyboardInterrupt: 

In [None]:

embeddings_index = {}
with open(os.path.join(DATA_DIR, 'glove.840B.300d.txt'), 'r') as f:
    for line in tqdm(f):
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

In [5]:
if os.getenv("ENVIRONMENT", "") == "dev":
    zip_archive = os.path.join(DATA_DIR, "YouTube-Spam-Collection-v1.zip")
    if not os.path.isdir(DATA_DIR):
        os.makedirs(DATA_DIR)
    urllib.request.urlretrieve(
        "https://archive.ics.uci.edu/ml/machine-learning-databases/00380/YouTube-Spam-Collection-v1.zip", 
        zip_archive,
    )
    with zipfile.ZipFile(zip_archive, "r") as zip_ref:
        zip_ref.extractall(DATA_DIR)
        
training_files = os.listdir(DATA_DIR)
training_files

['Youtube03-LMFAO.csv',
 'Youtube04-Eminem.csv',
 'Youtube05-Shakira.csv',
 'Youtube02-KatyPerry.csv',
 '__MACOSX',
 'YouTube-Spam-Collection-v1.zip',
 'Youtube01-Psy.csv']

In [6]:
training_df_list = []
for file_ in [os.path.join(DATA_DIR, file) for file in training_files if file.endswith(".csv")]:
    df = pd.read_csv(file_, index_col=None, header=0)
    training_df_list.append(df)

training_df = pd.concat(training_df_list, axis = 0, ignore_index = True)
training_df.sample(10)

Unnamed: 0,COMMENT_ID,AUTHOR,DATE,CONTENT,CLASS
1220,_2viQ_Qnc69Nq0Ytk1jCpzWPCrpGEk6T7cdVAxfSlAk,Shadrach Grentz,2013-07-29T17:39:24.876000,Hey Music Fans I really appreciate any of you ...,1
964,z13rgdjjjzmkuhnvf23dd1wxkqzwvjiij04,Jessica Onyekwere,2015-05-23T17:42:14.383000,"This song is special, because is a song for Af...",0
96,z13qwl2rznzohbhqy04ch3cy5tnihrkhlt40k,Thejaynetts,2015-05-23T00:53:59.385000,Never get old ﻿,0
1566,z13sx5nrhq22yvste23fvnirosixy55ag04,The Robot Green Hypno,2014-11-07T13:37:51,i like this song the video goes perfect with it﻿,0
1113,_2viQ_Qnc68hNPCfXGAxIxW9V7wcDDxSdp-gyHTkgho,ranferi delgado,2013-10-04T03:56:04.784000,best song eva,0
1813,z12ufrszxq3zstw0r22yfbipvqvaypold,josson 64,2014-11-07T15:58:57,SUPER!!! !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!...,0
1512,z12dynswht2rujq1e22xi5dappq1vrlh504,ali aydın,2014-10-25T16:59:02,I like you . Katy Perry 600▲60▲6▲﻿,0
884,z12hfp2wmyuqztkw504cgblyxtbsxjuzeow0k,Jesse Pinkman,2015-05-06T11:42:44.601000,Rihanna looks so beautiful with red hair ;)﻿,0
1931,z12gwldoxpvgjru4004cj3fxyvvvwffjqjg,Ripazha Gaming,2014-11-12T17:37:08,http://hackfbaccountlive.com/?ref=5242575﻿,1
1082,_2viQ_Qnc69vgWhC2acrKSH-tvjKq1KuKBca1UtB8wk,Louis Bryant,2013-10-12T15:20:19.887000,You guys should check out this EXTRAORDINARY w...,1


https://medium.com/@chrisfotache/text-classification-in-python-pipelines-nlp-nltk-tf-idf-xgboost-and-more-b83451a327e0

In [7]:
xtrain, xvalid, ytrain, yvalid = train_test_split(
    training_df.CONTENT, 
    training_df.CLASS, 
    stratify=training_df.CLASS, 
    random_state=42, 
    test_size=0.1, 
    shuffle=True
)
print (xtrain.shape)
print (xvalid.shape)

(1760,)
(196,)


In [8]:
# Always start with these features. They work (almost) everytime!
tfv = TfidfVectorizer(
    min_df=3,  
    max_features=None,
    strip_accents='unicode', 
    analyzer='word',
    token_pattern=r'\w{1,}',
    ngram_range=(1, 3), 
    use_idf=1,
    smooth_idf=1,
    sublinear_tf=1,
    stop_words = 'english'
)

# Fitting TF-IDF to both training and test sets (semi-supervised learning)
tfv.fit(list(xtrain) + list(xvalid))
xtrain_tfv =  tfv.transform(xtrain) 
xvalid_tfv = tfv.transform(xvalid)


In [10]:
# Initialize SVD
svd = TruncatedSVD()
    
# Initialize the standard scaler 
scl = preprocessing.StandardScaler()

# We will use logistic regression here..
lr_model = LogisticRegression()

# Create the pipeline 
clf = pipeline.Pipeline([('svd', svd),
                         ('scl', scl),
                         ('lr', lr_model)])


In [11]:
param_grid = {'svd__n_components' : [120, 180],
              'lr__C': [0.1, 1.0, 10], 
              'lr__penalty': ['l1', 'l2']}


In [13]:
# Initialize Grid Search Model
model = GridSearchCV(
    estimator=clf, 
    param_grid=param_grid, 
    verbose=10, 
    n_jobs=-1, 
    iid=True, 
    refit=True, 
    cv=2
)

# Fit Grid Search Model
model.fit(xtrain_tfv, ytrain)  # we can use the full data here but im only using xtrain


Fitting 2 folds for each of 12 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    9.2s
[Parallel(n_jobs=-1)]: Done   6 tasks      | elapsed:   13.5s
[Parallel(n_jobs=-1)]: Done  13 tasks      | elapsed:   22.4s
[Parallel(n_jobs=-1)]: Done  16 out of  24 | elapsed:   28.7s remaining:   14.4s
[Parallel(n_jobs=-1)]: Done  19 out of  24 | elapsed:   31.1s remaining:    8.2s
[Parallel(n_jobs=-1)]: Done  22 out of  24 | elapsed:   34.3s remaining:    3.1s
[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:   35.3s finished


GridSearchCV(cv=2, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('svd', TruncatedSVD(algorithm='randomized', n_components=2, n_iter=5,
       random_state=None, tol=0.0)), ('scl', StandardScaler(copy=True, with_mean=True, with_std=True)), ('lr', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False))]),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'svd__n_components': [120, 180], 'lr__C': [0.1, 1.0, 10], 'lr__penalty': ['l1', 'l2']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=10)

In [14]:
print("Best score: %0.3f" % model.best_score_)
print("Best parameters set:")
best_parameters = model.best_estimator_.get_params()
for param_name in sorted(param_grid.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

Best score: 0.939
Best parameters set:
	lr__C: 1.0
	lr__penalty: 'l1'
	svd__n_components: 180


In [None]:
pickle_to_fs()