In [None]:
# Load IPython's autoreload extension so edits to the project modules below are
# picked up without restarting the kernel.
%load_ext autoreload
# Mode 1: only modules registered with %aimport are reloaded before each cell runs.
%autoreload 1
%aimport pmip.feature_engineering
%aimport pmip.data

In [None]:
import os
from datetime import datetime
import urllib.request
import zipfile

from sklearn import preprocessing, decomposition, model_selection, metrics, pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
import pandas as pd
import numpy as np

from pmip.data import pickle_to_fs
from pmip.feature_engineering import TextSelector, NumberSelector, Tokenizer

In [None]:
# Relative path of the directory that holds the downloaded data set and the saved model.
DATA_DIR = os.path.join("..", "data")

In [None]:
# In the "dev" environment, fetch the YouTube Spam Collection archive from the UCI
# repository and unpack it into DATA_DIR. Other environments are expected to have
# DATA_DIR populated already.
if os.getenv("ENVIRONMENT", "") == "dev":
    zip_archive = os.path.join(DATA_DIR, "YouTube-Spam-Collection-v1.zip")
    # exist_ok=True replaces the separate isdir() check and avoids the
    # check-then-create race.
    os.makedirs(DATA_DIR, exist_ok=True)
    # Download only when no usable archive is on disk, so Restart & Run All does not
    # re-fetch the data every time. is_zipfile() returns False for a missing file and
    # for a file without a valid zip trailer (e.g. an interrupted download).
    if not zipfile.is_zipfile(zip_archive):
        urllib.request.urlretrieve(
            "https://archive.ics.uci.edu/ml/machine-learning-databases/00380/YouTube-Spam-Collection-v1.zip",
            zip_archive,
        )
    with zipfile.ZipFile(zip_archive, "r") as zip_ref:
        zip_ref.extractall(DATA_DIR)

# os.listdir() returns entries in arbitrary order; sort so the listing (and anything
# built from it) is the same on every machine.
training_files = sorted(os.listdir(DATA_DIR))
training_files

In [None]:
# Read every CSV found in DATA_DIR and stack them into a single training frame.
# Paths are sorted so the row order -- and with it the seeded train/validation split
# performed later -- does not depend on file-system listing order.
csv_paths = sorted(
    os.path.join(DATA_DIR, name) for name in training_files if name.endswith(".csv")
)
if not csv_paths:
    # pd.concat([]) would raise a ValueError with a message that does not mention
    # the missing data; fail with the same exception type but a useful message.
    raise ValueError("No .csv files found in %r - has the data set been downloaded?" % DATA_DIR)

training_df_list = [pd.read_csv(path, index_col=None, header=0) for path in csv_paths]

training_df = pd.concat(training_df_list, axis=0, ignore_index=True)
# Seeded so the preview is identical on every run.
training_df.sample(10, random_state=42)

Reference: <https://medium.com/@chrisfotache/text-classification-in-python-pipelines-nlp-nltk-tf-idf-xgboost-and-more-b83451a327e0>

In [None]:
# Hold out 10% of the comments for validation. The split is shuffled, stratified on
# the CLASS label and seeded, so it is repeatable for a given row order.
xtrain, xvalid, ytrain, yvalid = train_test_split(
    training_df["CONTENT"],
    training_df["CLASS"],
    test_size=0.1,
    shuffle=True,
    stratify=training_df["CLASS"],
    random_state=42,
)
# Show the size of each split, one per line.
print(xtrain.shape, xvalid.shape, sep="\n")

In [None]:
# Baseline text features: word-level TF-IDF over 1- to 3-grams, ignoring English stop
# words and terms that appear in fewer than 3 documents.
tfv = TfidfVectorizer(
    min_df=3,
    max_features=None,
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{1,}',
    ngram_range=(1, 3),
    # Real booleans: recent scikit-learn releases validate parameter types and
    # reject the integer 1 for these flags.
    use_idf=True,
    smooth_idf=True,
    sublinear_tf=True,
    stop_words='english',
)

# Learn the vocabulary and IDF weights from the training split only, then apply the
# fitted vectorizer to the validation split. Fitting on train + validation text would
# leak validation statistics into the features and make any validation score optimistic.
xtrain_tfv = tfv.fit_transform(xtrain)
xvalid_tfv = tfv.transform(xvalid)


In [None]:
# Reduce the sparse TF-IDF matrix to a dense low-rank representation. The number of
# components is left at its default here and tuned by the grid search.
# Seeded because the default randomized solver is otherwise non-deterministic.
svd = TruncatedSVD(random_state=42)

# Standardise the SVD components before they reach the linear model.
scl = preprocessing.StandardScaler()

# liblinear supports both the 'l1' and 'l2' penalties explored in the grid search;
# the default lbfgs solver in current scikit-learn cannot fit 'l1'.
lr_model = LogisticRegression(solver='liblinear', random_state=42)

# SVD -> scaling -> logistic regression, tuned and fitted as one estimator.
clf = pipeline.Pipeline([
    ('svd', svd),
    ('scl', scl),
    ('lr', lr_model),
])


In [None]:
# Hyper-parameter grid for the pipeline; keys follow scikit-learn's
# "<step name>__<parameter>" convention.
param_grid = {
    'svd__n_components': [120, 180],
    'lr__C': [0.1, 1.0, 10],
    'lr__penalty': ['l1', 'l2'],
}


In [None]:
# Exhaustive search over param_grid with 2-fold cross-validation, using all CPU cores.
# refit=True retrains the best configuration on all of the data passed to fit().
# The former `iid` argument is omitted: it was deprecated and then removed from
# GridSearchCV, so passing it raises a TypeError on current scikit-learn.
model = GridSearchCV(
    estimator=clf,
    param_grid=param_grid,
    verbose=10,
    n_jobs=-1,
    refit=True,
    cv=2,
)

# Fit on the training split only; the validation split stays held out.
model.fit(xtrain_tfv, ytrain)


In [None]:
# Report the best cross-validated score and the winning value of each tuned parameter.
best_parameters = model.best_estimator_.get_params()
report_lines = ["Best score: %0.3f" % model.best_score_, "Best parameters set:"]
report_lines.extend(
    "\t%s: %r" % (param_name, best_parameters[param_name])
    for param_name in sorted(param_grid)
)
print("\n".join(report_lines))

In [None]:
# Persist the refitted best pipeline (SVD -> scaler -> logistic regression) to DATA_DIR.
# NOTE(review): the fitted TfidfVectorizer `tfv` is not part of this pipeline, so the
# pickle alone cannot score raw text -- confirm how the consumer obtains the vectorizer.
best_pipeline = model.best_estimator_
pickle_to_fs(best_pipeline, filename="model.pkl", subdirectory=DATA_DIR)