In [1]:
# import ssl
# import os
# try:
#     _create_unverified_https_context = ssl._create_unverified_context
# except AttributeError:
#     pass
# else:
#     ssl._create_default_https_context = _create_unverified_https_context

import nltk
import h2o
from h2o.automl import H2OAutoML
import pandas as pd
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
nltk.download("stopwords")
nltk.download('words')
stop_english = set(stopwords.words('english'))
#English corpus added and functions to remove english/non english
english_words = set(x.lower() for x in nltk.corpus.words.words())


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Ramsay\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\Ramsay\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


In [15]:
def clean(df,field):
	print("start cleaning")
	df[field] = df[field].str.replace('[^a-zA-Z ]', '')
	df[field].str.lower()
	df = df.dropna(how='all')
	print("cleaning done")
	return df


def stem_sentences(sentence):
	tokens = sentence.split()
	#Drop stopwords
	stemmed_tokens = [stemmer.stem(token) for token in tokens]
	return ' '.join(stemmed_tokens)

def stem_df(df,field,language): 
	print("Stemming")
	
	global stemmer
	stemmer = SnowballStemmer(language)
	df[field] = df[field].apply(stem_sentences)
	
	return df
    
def drop_stop_df(df,field):
	print("Dropping stop")
	
	df[field] = df[field].apply(drop_stop_sentence)
	
	return df

def drop_stop_sentence(sentence):
	tokens = sentence.split()
	stopped_tokens = [x for x in tokens if x not in stop_english]
	return ' '.join(stopped_tokens)

def remove_english(df,field):
	print("Dropping english")
	df[field] = df[field].apply(drop_eng_sentence)
	
	return df

def drop_eng_sentence(sentence):
	tokens = sentence.split()
	stopped_tokens = [x for x in tokens if x not in english_words]
	return ' '.join(stopped_tokens)

def remove_non_english(df,field):
	print("Dropping english")
	df[field] = df[field].apply(drop_non_eng_sentence)
	
	return df

def drop_non_eng_sentence(sentence):
	tokens = sentence.split()
	stopped_tokens = [x for x in tokens if x in english_words]
	return ' '.join(stopped_tokens)

def apply_tfidf(verbs):
    tfidf = TfidfVectorizer(sublinear_tf= True,
                            min_df= 3,
                            norm= 'l2',
                            encoding= 'latin-1',
                            ngram_range= (1,2),
                            stop_words= 'english',
#                             max_features = 40000,
                           )
    features = tfidf.fit_transform(verbs)
    
    return features, tfidf

In [25]:
def build(features, target, target_name, conf):
            try:
                h2o.cluster().shutdown()
            except:
                pass
            h2o.init(max_mem_size=200)
            print(f"Train data size : {features.shape}, targets size :{target.shape}")
            print(f"Targets : {target.unique()}")
            target= target.astype('category')
            target= target.cat.codes
            obs = target.shape[0]

            if obs>=conf['min_train_records'] and len(target.unique())>1:
                feature_list= features.columns
                features[target_name] = target
                
                # build model on training data
                print(features)
                train_data_h = h2o.H2OFrame(features)
                train_data_h[target_name] = train_data_h[target_name].asfactor()
                training_types = (train_data_h.types)

                y = target_name
                x = list(feature_list)
                
                # Partition data into 80% ,20% chunks
                splits = train_data_h.split_frame(ratios=[conf['train_perc']], seed=1)
                train = splits[0]
                valid = splits[1]
                
                print("AutoML Starting ")
                try:
                    aml = H2OAutoML(max_models = int(conf['max_models_fit']),
                                    max_runtime_secs =conf['max_model_runtime'],
                                    seed = 1234,
                                    stopping_metric=conf['stopping_measure'],
                                    balance_classes=True,
                                    nfolds=conf['nfolds'],
                                    include_algos = conf['include_algos'])

                    aml.train(  x = x,
                                y = y,
                                training_frame =train,
                                validation_frame=valid)

                    ###Uncomment below to enable scoring
                    model_path = conf['model_folder']

                    model_name = h2o.save_model(aml.leader,
                                                path= model_path,
                                                force=True)


                    # c_w.write(f'ndicate/model/dtypes_{aml.leader.model_id}.json',training_types,'processing')  ##USED WHERE MODEL HAS SPECIFIC DTYPES PER FEATURE

                    print(f"\n\n\t\t\t\t\t{aml.leader.model_id}\n\n")

                    return(True)
                except Exception as E:
                    print(f"Failiure Training for {target_name} ")
                    print(E)
                    return False
            else:
                print(f"Not enough training records to build: {target_name}")
                return(False)

In [7]:
# Format Data
f = open("data/smsspamcollection/SMSSpamCollection", "r")
targets = []
texts = []
for line in f:
    columns = line.split("\t", 1)
    texts.append(columns[1])
    if columns[0] == "ham":
        targets.append(0)
    elif columns[0] == "spam":
        targets.append(1)
    else:
        print("Error")
        break
# for i in range(5):
#     print("{}: {}".format(targets[i], texts[i]))

0: Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...

0: Ok lar... Joking wif u oni...

1: Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's

0: U dun say so early hor... U c already then say...

0: Nah I don't think he goes to usf, he lives around here though



In [13]:
df = pd.DataFrame()
df["target"] = targets
df["text"] = texts
df.head(5)

Unnamed: 0,target,text
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...\n
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [14]:
text_field = "text"
text_only = clean(df, text_field)
text_only = remove_non_english(text_only, text_field)
dropped_stop_text = drop_stop_df(text_only, text_field)
stemmed_text = stem_df(dropped_stop_text, text_field, "english")

stemmed_text.head(5)


start cleaning
cleaning done
Dropping english
Dropping stop
Stemming


Unnamed: 0,target,text
0,0,point crazi n great world la e buffet got wat
1,0,lar u
2,1,entri win final st receiv entri appli
3,0,dun say earli c alreadi say
4,0,dont think goe around though


In [22]:
conf = {
    "text_field": "text",
#     "id_fields" : [BI_ID, audio_id],
	"segment": True,
	"low_memory": False,
    "train_perc": 0.9,
	"min_retrain_cnt": 50,
	"min_train_records": 10, 
    "train_perc": 0.8,
    "model_train_start_date": "1900-01-01",
    "max_models_fit": 5,
    "max_model_runtime": 3600,
    "nfolds": 0,
    "accuracy_measure": "AUC",
    "stopping_measure": "AUC",
    "include_algos": ["GLM", "DRF", "GBM"],
    "exclude_algos": ["StackedEnsemble", "XGBoost", "DeepLearning"],
    "model_folder": "/data/models/"
}

In [None]:
print(f"Starting Training for Spam")
train_df = stemmed_text.loc[~stemmed_text["target"].isna()]
features, tfidf = apply_tfidf(train_df[conf['text_field']])
tfidf_df = pd.DataFrame(features.todense())
if tfidf_df.shape[0] != train_df.shape[0]:
    print("Mis match on training data sizes")
    print("TFIDF Shape {}".format(tfidf_df.shape))
    print("Train Shape {}".format(train_df.shape))
else:
    features_df = pd.merge(tfidf_df, train_df.reset_index(drop=True), left_index=True, right_index=True).copy()  
    training_completed = build(features_df, train_df["target"], "target", conf)
    if training_completed:   
        print("Completed Training")

Starting Training for Spam
H2O session _sid_b114 closed.
Checking whether there is an H2O instance running at http://localhost:54321 ..

In [7]:
# CampaignNumber=3
# Client = 'Affinity'
# BI_ID = 'id'
# audio_id = 'audio_file'
# text_field = 'transcript'

In [11]:
##Manually remove targets if required
# sub_training_df = sub_training_df.drop(sub_training_df.columns[[1, 2, 3, 4]], axis=1)

In [191]:
# Shutdown h2o after build
h2o.cluster().shutdown()