In [1]:
%load_ext autoreload
%autoreload 1
%aimport pmip.feature_engineering
%aimport pmip.data

In [2]:
import os
from datetime import datetime
import urllib.request
import zipfile

from sklearn import preprocessing, decomposition, model_selection, metrics, pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import LogisticRegression
import pandas as pd
import numpy as np

from pmip.data import pickle_to_fs
from pmip.feature_engineering import TextSelector, NumberSelector, Tokenizer

In [3]:
DATA_DIR=os.path.join("..", "data")

In [4]:
training_files = os.listdir(DATA_DIR)
training_files

['model-training-20190112.ipynb',
 'Youtube03-LMFAO.csv',
 'Youtube04-Eminem.csv',
 'Youtube05-Shakira.csv',
 'Youtube02-KatyPerry.csv',
 'model.pkl',
 '__MACOSX',
 'model-training-20190112.html',
 'YouTube-Spam-Collection-v1.zip',
 'Youtube01-Psy.csv']

In [5]:
training_df_list = []
for file_ in [os.path.join(DATA_DIR, file) for file in training_files if file.endswith(".csv")]:
    df = pd.read_csv(file_, index_col=None, header=0)
    training_df_list.append(df)

training_df = pd.concat(training_df_list, axis = 0, ignore_index = True)
training_df.sample(10)

Unnamed: 0,COMMENT_ID,AUTHOR,DATE,CONTENT,CLASS
904,z12cydggrzyesrklw23qwzexerqavj11e,monkey moments,2015-05-27T09:24:10.239000,i love this song thumsb up to you﻿,0
557,LneaDw26bFvnMp5q__XiHBvyky8sw2uF9b-ZfDZi2to,WeTheDopeSquad,,WATCH MY VIDEOS AND SUBSCRIBE,1
1146,_2viQ_Qnc6-adCzTDLAhqNVQ5hFYcjPyPI5m7pHY4BY,Lizzy Molly,2013-09-09T17:34:07.052000,PLEASE CHECK OUT MY VIDEO CALLED &quot;WE LOVE...,1
72,z12pd3wq0p3wzzt5p04cd33pwnrreduadn40k,Brianna Reed,2015-05-25T03:02:02.615000,I LOVE YOUR SONGS﻿,0
1245,_2viQ_Qnc6_fgKR1W7-k1lbVURi8hVbMlQAMSOCSnyk,ThirdDegr3e,2013-07-13T20:48:22.967000,**CHECK OUT MY NEW MIXTAPE**** **CHECK OUT MY ...,1
278,z12hvvki4xite3sjl04cfp34wxy0fr0qjgs,latisha garcia,2015-03-09T00:33:08.463000,Check out this playlist on YouTube: I tried﻿,1
593,z121i1eqppzph3eod04cixfgwknydnfzq3k,railn j sander,2015-05-26T05:32:15.041000,I guss this song is one of my worst fears in l...,0
707,LneaDw26bFvv8RbyHRBDnA-4Bb1lhF9UlpzJf_5FkWM,이 정훈,,"This great Warning will happen soon. ,0\nLneaD...",1
1311,z13ax5sicvvxsx3lx23vuf45tye4etl1g,Kasia Hill,2014-09-06T02:44:43,http://shhort.com/a?r=G8iX5cTKd﻿,1
1302,z135drnwswvsgvkyq04cfjh4xpb3cn2hugg,Sam Klein,2014-08-31T03:52:29,"She named the tiger Kitty Purry No, seriously...",0


https://medium.com/@chrisfotache/text-classification-in-python-pipelines-nlp-nltk-tf-idf-xgboost-and-more-b83451a327e0

https://www.kaggle.com/abhikbanerjee/approaching-almost-any-nlp-problem-on-kaggle

In [6]:
xall = training_df.CONTENT
yall = training_df.CLASS
print(xall.shape)
print(yall.shape)

(1956,)
(1956,)


In [7]:
xtrain, xvalid, ytrain, yvalid = train_test_split(
    xall, 
    yall, 
    stratify=yall, 
    random_state=42, 
    test_size=0.1, 
    shuffle=True
)
print (xtrain.shape)
print (xvalid.shape)

(1760,)
(196,)


In [8]:
# Always start with these features. They work (almost) everytime!
tfv = TfidfVectorizer(
    min_df=3,  
    max_features=None,
    strip_accents='unicode', 
    analyzer='word',
    token_pattern=r'\w{1,}',
    ngram_range=(1, 3), 
    use_idf=1,
    smooth_idf=1,
    sublinear_tf=1,
    stop_words = 'english'
)

# Fitting TF-IDF to both training and test sets (semi-supervised learning)
tfv.fit(list(xtrain) + list(xvalid))
xtrain_tfv =  tfv.transform(xtrain) 
xvalid_tfv = tfv.transform(xvalid)
xtrain_tfv

<1760x2129 sparse matrix of type '<class 'numpy.float64'>'
	with 18636 stored elements in Compressed Sparse Row format>

In [9]:
# Initialize SVD
svd = TruncatedSVD()
    
# Initialize the standard scaler 
scl = preprocessing.StandardScaler()

# We will use logistic regression here..
lr_model = LogisticRegression()

# Create the pipeline 
clf = pipeline.Pipeline([('tfv', tfv),
                         ('svd', svd),
                         ('scl', scl),
                         ('lr', lr_model)])


In [10]:
param_grid = {'svd__n_components' : [120, 180],
              'lr__C': [0.1, 1.0, 10], 
              'lr__penalty': ['l1', 'l2']}


In [11]:
# Initialize Grid Search Model
model = GridSearchCV(
    estimator=clf, 
    param_grid=param_grid, 
    verbose=10, 
    n_jobs=-1, 
    iid=True, 
    refit=True, 
    cv=5
)

# Fit Grid Search Model
model.fit(xall, yall)  # we can use the full data here


Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    2.6s
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:    3.4s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    6.0s
[Parallel(n_jobs=-1)]: Done  14 tasks      | elapsed:    8.1s
[Parallel(n_jobs=-1)]: Done  21 tasks      | elapsed:   11.1s
[Parallel(n_jobs=-1)]: Done  28 tasks      | elapsed:   15.7s
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed:   20.4s
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:   28.2s
[Parallel(n_jobs=-1)]: Done  57 tasks      | elapsed:   36.7s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:   39.1s finished


GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('tfv', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=3,
        ngram_range=(1, 3), norm='l2', preprocessor=None, smooth_idf=1,
    ...penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False))]),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'svd__n_components': [120, 180], 'lr__C': [0.1, 1.0, 10], 'lr__penalty': ['l1', 'l2']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=10)

In [12]:
print("Best score: %0.3f" % model.best_score_)
print("Best parameters set:")
best_parameters = model.best_estimator_.get_params()
for param_name in sorted(param_grid.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

Best score: 0.927
Best parameters set:
	lr__C: 1.0
	lr__penalty: 'l2'
	svd__n_components: 120


In [13]:
pickle_to_fs(model.best_estimator_, filename="model.pkl", subdirectory=DATA_DIR)

In [14]:
xtrain.iloc[0]

'Help me get 10000000 subscribers by tomorrow!<br /><br /><br /><br /><br /><br /><br /><br /><br /><br /><br />(Joking don&#39;t get butt hurt)  \ufeff'

In [15]:
model.best_estimator_.predict(
    [
        "Check it out this free stuff!!!",
        "I take issue with your characterization.",
    ]
)

array([1, 0])