In [2]:
import numpy as np
import scipy
import re
import pandas as pd
from nltk import word_tokenize
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords
from hunspell import Hunspell
import string
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import OneHotEncoder
from sklearn.svm import SVC
from sklearn.feature_extraction.text import HashingVectorizer, CountVectorizer, TfidfVectorizer

# --- Uncomment this code if the custom libraries need to be reloaded from disk
#import importlib
#import src
#importlib.reload(src.s3)
#importlib.reload(src.transformation)
#importlib.reload(src.tunning)

# Custom libraries
from src.transformation import TxtFeatureSelector, CategoricalFeatureSelector
from src.s3 import get_file, get_comments

# Shared NLP resources used by the token transformers defined further down.
# NOTE(review): both paths are absolute paths on one SageMaker instance; consider
# deriving them from a single configurable base directory.

# Hunspell spell checker loaded with the 'es-CR' dictionary.
h = Hunspell(
    'es-CR',
    hunspell_data_dir='/home/ec2-user/SageMaker/user-feedback/include/huspell/dic/es-CR',
)

# Snowball stemmer for Spanish, constructed with ignore_stopwords=True.
spanishStemmer = SnowballStemmer(
    "spanish",
    ignore_stopwords=True,
)

# Correction table read from correction.json (orient='index'); WordCorrector looks
# misspelled tokens up in its index.
correctedToken = pd.read_json(
    '/home/ec2-user/SageMaker/user-feedback/include/correction.json',
    orient='index',
)

In [119]:
# Two hand-written feedback rows (index labels 1 and 2) used to try out the
# preprocessing and prediction steps below.
test_predict = pd.DataFrame(
    {
        'creator_department': ["Customer Success", "Sales"],
        'resource_type': ["contact", "contact"],
        'comment': [
            "Conectar la aplicación con Google Calendar",
            "Cambiar el tamaño de la letra a la factura",
        ],
    },
    index=[1, 2],
)
test_predict.head()

Unnamed: 0,creator_department,resource_type,comment
1,Customer Success,contact,Conectar la aplicación con Google Calendar
2,Sales,contact,Cambiar el tamaño de la letra a la factura


In [127]:
# Inspect the resource_type column of the predictors.
# NOTE(review): in this notebook's cell order `variables` is only assigned in a later
# cell (the data-loading cell below), so this cell depends on out-of-order execution
# and will not survive Restart & Run All -- move it after the data load.
variables['resource_type']

Unnamed: 0_level_0,resource_type
id,Unnamed: 1_level_1
74421406,contact
75323747,contact
75594152,contact
75594918,contact
75597209,deal
...,...
224560920,contact
224628362,lead
225013006,deal
225018816,deal


In [129]:
# Load the comments dataset, using the 'id' column as the row index.
# NOTE(review): absolute path on one SageMaker instance -- consider a configurable base dir.
data = pd.read_csv("/home/ec2-user/SageMaker/user-feedback/include/proccesed_comments.csv", index_col='id')

# Predictors: every column except the target label.
variables = data.drop('primary_category', axis = 1)

# Target labels as a plain NumPy array.
response = data['primary_category'].values

# Trim surrounding whitespace from resource_type. The vectorised .str accessor is
# used instead of a per-row lambda so missing values stay missing rather than
# raising AttributeError on a float NaN.
variables['resource_type'] = variables['resource_type'].str.strip()

In [121]:
class Preprocessing( BaseEstimator, TransformerMixin ):
    """Common base for the token-level transformers defined in this notebook.

    It supplies a no-op ``fit`` and the row-wise plumbing (``convert``); the
    subclasses below each add their own ``transform``.
    """

    def fit( self, X, y = None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def getToken(self, row):
        """Return the value stored under the ``'tokens'`` key of ``row``."""
        return row['tokens']

    def convert(self, X, CallBack):
        """Run ``CallBack`` on the tokens of every row of ``X``.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame whose rows expose a ``'tokens'`` entry.
        CallBack : callable
            Receives one row's tokens and returns the processed tokens.

        Returns
        -------
        pandas.DataFrame
            Built from the per-row results with a single requested column,
            ``'tokens'``.
        """
        def _process_row(row):
            return CallBack(self.getToken(row))

        per_row = X.apply(_process_row, axis=1)
        return pd.DataFrame(per_row, columns=['tokens'])

class LowerCaser(Preprocessing):
    """Lower-case every token of each row."""

    def toLower(self, tokens):
        """Return ``tokens`` lower-cased as a ``numpy.ndarray`` of strings.

        The input is coerced to a string array before calling
        ``np.char.lower``. Without the explicit ``dtype=str`` an empty token
        list becomes a float64 array and ``np.char.lower`` rejects it, so a
        row with no tokens would abort the whole transform.
        """
        return np.char.lower(np.asarray(tokens, dtype=str))

    def transform( self, X, y = None ):
        """Lower-case the ``'tokens'`` column of ``X`` row by row."""
        return self.convert(X=X, CallBack=self.toLower)
    
class PuntuationRemover(Preprocessing):
    """Drop tokens that consist solely of punctuation characters."""

    def removePuntuation(self, tokens):
        """Return the tokens that are not pure punctuation.

        A token is dropped when every one of its characters belongs to
        ``string.punctuation``. Checking character by character (rather than
        testing whether the token is a substring of ``string.punctuation``)
        also removes multi-character tokens such as ``...`` or ``''``.
        The empty token is dropped as well. Only the ASCII punctuation listed
        in ``string.punctuation`` is recognised.

        Returns
        -------
        numpy.ndarray
            1-D string array of the kept tokens (empty when nothing is kept).
        """
        kept = [
            token for token in tokens
            if not all(char in string.punctuation for char in token)
        ]
        # Build the array once instead of re-allocating it for every token.
        return np.array(kept, dtype=str)

    def transform( self, X, y = None ):
        """Remove punctuation tokens from the ``'tokens'`` column of ``X``."""
        return self.convert(X=X, CallBack=self.removePuntuation)

class Striper(Preprocessing):
    """Trim surrounding whitespace from tokens and discard the empty ones."""

    def strip(self, tokens):
        """Return the stripped, non-empty tokens as a ``numpy.ndarray``.

        When no token survives, the untouched ``(0, 0)`` empty string array is
        returned.
        """
        trimmed = (token.strip() for token in tokens)
        survivors = [token for token in trimmed if token != ""]
        blank = np.empty((0,0), dtype=str, order='C')
        if not survivors:
            return blank
        # np.append flattens both operands, giving a 1-D array of the survivors.
        return np.append(blank, survivors)

    def transform( self, X, y = None ):
        """Strip every token in the ``'tokens'`` column of ``X``."""
        return self.convert(X=X, CallBack=self.strip)

class StopWordsRemover(Preprocessing):
    """Drop Spanish stopwords and a fixed list of subject tokens."""

    # Tokens naming the subject of a sentence (people, domains, titles, URL
    # schemes). Compared against the lower-cased token.
    SUBJECT_TOKENS = frozenset(['leonardo.quintanilla','hulihealth.com','dulce.rodriguez','liz.barrantes','hulilabs.com','doctor', 'doctora', 'dr', 'dra', 'secretaria', 'secretario', 'doctores', 'doctoras', 'cemim', 'http', 'https'])

    # Lazily filled cache of the NLTK Spanish stopword list, shared by all instances.
    _spanish_stopwords = None

    def _getStopWords(self):
        """Return the Spanish stopwords as a frozenset, loading them once."""
        if StopWordsRemover._spanish_stopwords is None:
            StopWordsRemover._spanish_stopwords = frozenset(stopwords.words('spanish'))
        return StopWordsRemover._spanish_stopwords

    def removeStopWords(self, tokens):
        """Return the tokens that are neither stopwords nor subject tokens.

        The stopword comparison is case-sensitive (the token is compared as
        is), while the subject-token comparison uses the lower-cased token.
        The stopword list is fetched once rather than once per token.

        Returns
        -------
        numpy.ndarray
            1-D string array of the kept tokens (empty when nothing is kept).
        """
        spanish_stopwords = self._getStopWords()
        kept = []
        for token in tokens:
            text = str(token)
            if text in spanish_stopwords:
                continue
            # remove the subject of the sentences
            if text.lower() in self.SUBJECT_TOKENS:
                continue
            kept.append(token)
        return np.array(kept, dtype=str)

    def transform( self, X, y = None ):
        """Remove stopwords from the ``'tokens'`` column of ``X``."""
        return self.convert(X=X, CallBack=self.removeStopWords)

class TokenCleaner(Preprocessing):
    """Delete unwanted characters from each token and drop what is left empty.

    Parameters
    ----------
    pattern : str, default ``'[^A-Za-z]+'``
        Regular expression whose matches are deleted from every token.
        The default keeps ASCII letters only, which also deletes accented
        Spanish letters (``tamaño`` becomes ``tamao``). Pass a wider class,
        e.g. ``'[^A-Za-zÁÉÍÓÚÜÑáéíóúüñ]+'``, to keep them.
        NOTE(review): the default is left unchanged because the training data
        was presumably cleaned with it -- confirm before switching.
    """

    def __init__(self, pattern='[^A-Za-z]+'):
        self.pattern = pattern

    def tokenCleaner(self, tokens):
        """Return the cleaned tokens that are still alphabetic.

        Returns
        -------
        numpy.ndarray
            1-D string array of the cleaned tokens (empty when nothing is kept).
        """
        unwanted = re.compile(self.pattern)
        kept = []
        for token in tokens:
            cleaned = unwanted.sub('', token)
            # isalpha() is False for the empty string, so emptied tokens are dropped.
            if cleaned.isalpha():
                kept.append(cleaned)
        return np.array(kept, dtype=str)

    def transform(self, X, y = None ):
        """Clean every token in the ``'tokens'`` column of ``X``."""
        return self.convert(X=X, CallBack=self.tokenCleaner)
    
class WordCorrector(Preprocessing):
    """Spell-correct tokens using the notebook-level ``h`` and ``correctedToken``."""

    def _fixSpelling(self, word):
        """Return the corrected form of ``word``.

        Words accepted by ``h.spell`` are returned unchanged. Otherwise the
        entry in ``correctedToken`` wins when the word is in its index; failing
        that, the first ``h.suggest`` proposal is used, and the word is kept as
        is when there is no proposal.
        """
        if h.spell(word):
            return word
        if word in correctedToken.index:
            return correctedToken.loc[word,:][0]
        suggestions = h.suggest(word)
        if len(suggestions) != 0:
            # take the first suggestion as the corrected value
            return suggestions[0]
        return word

    def correctWord(self, tokens):
        """Return the corrected tokens as a ``numpy.ndarray``.

        A correction may consist of several words, so each corrected value is
        passed through ``word_tokenize`` and its pieces are appended separately.
        """
        corrected = np.empty((0,0), dtype=str, order='C')
        for raw in tokens:
            pieces = word_tokenize(self._fixSpelling(str(raw)))
            corrected = np.append(corrected, pieces)
        return corrected

    def transform(self, X, y = None ):
        """Correct the words in the ``'tokens'`` column of ``X``."""
        return self.convert(X=X, CallBack=self.correctWord)
    
class Stemmer(Preprocessing):
    """Replace each token by its stem from the notebook-level ``spanishStemmer``."""

    def stemm(self, tokens):
        """Return the stems of ``tokens`` as a ``numpy.ndarray``.

        With no tokens, the untouched ``(0, 0)`` empty string array is returned.
        """
        stems = [spanishStemmer.stem(str(token)) for token in tokens]
        blank = np.empty((0,0), dtype=str, order='C')
        # np.append flattens both operands, giving a 1-D array of the stems.
        return np.append(blank, stems) if stems else blank

    def transform(self, X, y = None ):
        """Stem the ``'tokens'`` column of ``X`` to reduce dimensionality."""
        return self.convert(X=X, CallBack=self.stemm)

class SingleLetterRemover(Preprocessing):
    """Discard tokens whose text is a single character (or empty)."""

    def removeSingleLetter(self, tokens):
        """Return the tokens longer than one character as a ``numpy.ndarray``.

        When no token qualifies, the untouched ``(0, 0)`` empty string array is
        returned.
        """
        longer = [token for token in tokens if len(str(token)) > 1]
        blank = np.empty((0,0), dtype=str, order='C')
        if len(longer) == 0:
            return blank
        # np.append flattens both operands, giving a 1-D array of the kept tokens.
        return np.append(blank, longer)

    def transform(self, X, y = None ):
        """Remove single-letter tokens from the ``'tokens'`` column of ``X``."""
        return self.convert(X=X, CallBack=self.removeSingleLetter)

In [122]:
# Token clean-up stages, applied in this order to the 'tokens' column.
token_steps = [
    ('LowerCaser', LowerCaser()),
    ('PuntuationRemover', PuntuationRemover()),
    ('Striper', Striper()),
    ('TokenCleaner', TokenCleaner()),
    ('WordCorrector', WordCorrector()),
    ('StopWordsRemover', StopWordsRemover()),
    ('SingleLetterRemover', SingleLetterRemover()),
    ('Stemmer', Stemmer()),
]

# Only 'tokens' goes through the clean-up; every other column is passed through.
preprocess = ColumnTransformer(
    transformers=[
        ('preprocess', Pipeline(steps=token_steps), ['tokens']),
    ],
    remainder='passthrough',
)

# Tokenise the raw comment text of each sample row.
test_predict['tokens'] = test_predict['comment'].apply(word_tokenize)

# The column names below list the transformed 'tokens' column first, followed by
# the passed-through columns of test_predict.reset_index().
proccessed = preprocess.fit_transform(test_predict.reset_index())
proccessed = pd.DataFrame(
    data=proccessed,
    columns=["tokens","index","creator_department","resource_type","comment"],
)

# Replace the raw comment by the cleaned tokens joined with single spaces, then
# keep only the three columns the model is fitted on.
proccessed['comment'] = proccessed['tokens'].apply(' '.join)
proccessed = proccessed[['creator_department','resource_type','comment']]

In [123]:
# Preview the preprocessed sample rows; `comment` now holds the space-joined cleaned tokens.
proccessed.head()

Unnamed: 0,creator_department,resource_type,comment
0,Customer Success,contact,conect aplic googl calendari
1,Sales,contact,cambi tam letr factur


# Fit model

In [137]:
# Categorical branch: project selector followed by one-hot encoding.
# (presumably CategoricalFeatureSelector picks the categorical columns -- see src.transformation)
# NOTE(review): OneHotEncoder() keeps handle_unknown='error', so predicting on a
# category value absent from the training data raises.
categorical_branch = Pipeline(steps = [
    ('selector', CategoricalFeatureSelector()),
    ('encoding', OneHotEncoder()),
])

# Text branch: project selector followed by unigram TF-IDF.
# (presumably TxtFeatureSelector picks the comment text -- see src.transformation)
text_branch = Pipeline(steps = [
    ('selector', TxtFeatureSelector()),
    ('vectorizer', TfidfVectorizer(ngram_range=(1,1), binary=False)),
])

# Both branches are concatenated and fed to a linear-kernel SVM with
# probability estimates enabled.
svm_optimal_pipeline = Pipeline(steps = [
    ('all', FeatureUnion(transformer_list = [
        ('cat_feature', categorical_branch),
        ('txt_feature', text_branch),
    ])),
    # Chi-squared feature selection step, currently disabled:
    #('fect_selec', SelectKBest(chi2, k=1200)),
    ('model', SVC(kernel='linear', gamma="scale", C=2, probability=True)),
])

In [138]:
# Fit the pipeline on the three predictor columns against the primary_category labels.
svm_optimal_pipeline.fit(variables[['creator_department','resource_type','comment']], response)

Pipeline(memory=None,
         steps=[('all',
                 FeatureUnion(n_jobs=None,
                              transformer_list=[('cat_feature',
                                                 Pipeline(memory=None,
                                                          steps=[('selector',
                                                                  CategoricalFeatureSelector()),
                                                                 ('encoding',
                                                                  OneHotEncoder(categories='auto',
                                                                                drop=None,
                                                                                dtype=<class 'numpy.float64'>,
                                                                                handle_unknown='error',
                                                                                sparse=True))],
                      

In [142]:
# Class-probability estimates for the preprocessed sample rows (one output row per input row).
svm_optimal_pipeline.predict_proba(proccessed)

array([[0.93592181, 0.00792153, 0.0059089 , 0.00453867, 0.00364923,
        0.04205986],
       [0.00487574, 0.0643802 , 0.83370136, 0.00512442, 0.00365564,
        0.08826265]])