In [35]:
# import libraries
import pandas as pd
import numpy as np

import sqlite3
from sqlalchemy import create_engine
import os

import re
import nltk
import pickle

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer

from sklearn.pipeline import Pipeline,FeatureUnion
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, make_scorer, confusion_matrix, classification_report
from sklearn.model_selection import RandomizedSearchCV
from sklearn.svm import SVC
from sklearn.base import BaseEstimator, TransformerMixin

import warnings
warnings.simplefilter('ignore')

[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/jovyan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
fp_clean_data = './data/final/clean_data_version_20210620_0200.csv'
df_clean = pd.read_csv(fp_clean_data).rename(columns = {'label':'Y', 'text':'X'}).drop(columns = 'dataset')
X = df_clean.X
Y = df_clean.Y

In [3]:
def tokenize(text):
    ''' Natural Language Processing: Normalize, Tokenize, Stem/Lemmatize
    '''
    
    # Convert text to lowercase and remove punctuation
    text = text.lower()
    text = re.sub(r"[^a-zA-Z0-9]", " ", text)
    
    # Tokenize words
    tokens = word_tokenize(text)
    
    # Remove Stop words, Stem & Lemmed words
    stop_word = stopwords.words("english")
    stemmer = PorterStemmer()
    lemmatizer = WordNetLemmatizer()
    
    #stemmed = [stemmer.stem(w) for w in tokens if w not in stop_word]
    lemmed = [lemmatizer.lemmatize(w) for w in tokens if w not in stop_word]
    
    return lemmed

pipeline = Pipeline([
    ('vect', CountVectorizer(tokenizer=tokenize)),
    ('tfidf', TfidfTransformer()),
    ('clf', RandomForestClassifier(n_jobs = -1))
])

In [10]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.1, random_state = 0)

# seed
np.random.seed(0)
pipeline.fit(X_train, Y_train)

# predictions
Y_train_pred = pipeline.predict(X_train)
Y_test_pred = pipeline.predict(X_test)


In [19]:
pipeline._get_param_names()

['memory', 'steps', 'verbose']

In [None]:


parameters = {
    'vect__min_df': [5,10],
    'tfidf__use_idf':[True, False],
    'clf__n_estimators': [20, 30, 50],
    'clf__max_depth': [2, 4],
    'clf__min_samples_leaf': [2, 4]
             }

scorer = make_scorer(performance_metric)
cv = RandomizedSearchCV(pipeline, param_distributions=parameters, scoring = 'f1', verbose = 10)

# Find best parameters
np.random.seed(0)
tuned_model = cv.fit(X_train, Y_train)


Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV 1/5; 1/10] START clf__max_depth=4, clf__min_samples_leaf=2, clf__n_estimators=30, tfidf__use_idf=True, vect__min_df=10
[CV 1/5; 1/10] END clf__max_depth=4, clf__min_samples_leaf=2, clf__n_estimators=30, tfidf__use_idf=True, vect__min_df=10;, score=0.000 total time=   1.4s
[CV 2/5; 1/10] START clf__max_depth=4, clf__min_samples_leaf=2, clf__n_estimators=30, tfidf__use_idf=True, vect__min_df=10
[CV 2/5; 1/10] END clf__max_depth=4, clf__min_samples_leaf=2, clf__n_estimators=30, tfidf__use_idf=True, vect__min_df=10;, score=0.013 total time=   1.4s
[CV 3/5; 1/10] START clf__max_depth=4, clf__min_samples_leaf=2, clf__n_estimators=30, tfidf__use_idf=True, vect__min_df=10
[CV 3/5; 1/10] END clf__max_depth=4, clf__min_samples_leaf=2, clf__n_estimators=30, tfidf__use_idf=True, vect__min_df=10;, score=0.013 total time=   1.4s
[CV 4/5; 1/10] START clf__max_depth=4, clf__min_samples_leaf=2, clf__n_estimators=30, tfidf__use_idf=True, v

In [41]:
estimator.get_params().keys()

NameError: name 'estimator' is not defined