In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
import numpy as np
from nltk.corpus import stopwords
import unidecode
from nltk.stem import SnowballStemmer

ModuleNotFoundError: No module named 'nltk'

In [2]:
df_tickets = pd.read_csv('tickets_asunto.csv', sep=';', quotechar = '\'')

In [3]:
df_tickets.dropna(inplace=True)

In [4]:
df_tickets

Unnamed: 0,tecnico,asunto,descripcion
0,63602,RE: Petición : ##268042## ha sido finalizada.,"<body> <div> <p><i><span style=""font-family: C..."
1,129127,Problemas con Notebook,"<div style=""font-family: Calibri , Arial , Hel..."
2,139841,SOLICITUD TRANSACCION K_TP_VALU,<body> Estiamdos otorgar privilegios para la t...
3,72603,ERROR EN GATILLADOR,<body> <div> <p>Estimados:</p> <p> </p> <p>Agr...
4,129127,RV: Asignar Equipo Notebook y Cuentas Correo /...,"<div> <p>Estimados,</p> <p> </p> <p>Reitero so..."
...,...,...,...
60182,146764,INSTALACION GOOGLE EARTH PRO,"<body> <div style=""font-family: Calibri, Arial..."
60183,171004,Problema en ACP Espejo para cargar tareas en s...,"<body style=""""> <div> <p class=""MsoNormal"">Est..."
60184,149280,Solicitud Configuración PC a Ploter Sr. Gabrie...,"<div><p class=""MsoNormal""><u>Sres. Soporte</u>..."
60185,165002,Agente Paillaco,"<body> <div style=""font-family: Calibri, Arial..."


In [5]:
def removeAccent(text):
    text = unidecode.unidecode(text)
    return text

In [6]:
df_tickets['asunto'] = df_tickets['asunto'].apply(removeAccent)
df_tickets['descripcion'] = df_tickets['descripcion'].apply(removeAccent)

In [7]:
data = stopwords.words('spanish')

for i in range(len(data)):    
    data[i] = removeAccent(data[i])

In [8]:
def removeString(data, regex):
    return data.str.lower().str.replace(regex.lower(), ' ', regex=True)

In [9]:
def cleanDataset(dataset, columnsToClean, regexList):
    for column in columnsToClean:
        for regex in regexList:
            dataset[column] = removeString(dataset[column], regex)
    return dataset

In [10]:
def getRegexList():
    regexList = []
    regexList += ['estimados']
    regexList += ['estimado']
    regexList += ['gt']
    regexList += ['lt']
    regexList += ['&nbsp']
    regexList += ['osorno']
    regexList += ['gerencia']
    regexList += ['href']
    regexList += ['asunto']
    regexList += ['enviado']
    regexList += ['saesa']
    regexList += ['area']
    regexList += ['ramirez']
    regexList += ['fecha']
    regexList += ['atte']
    regexList += ['ayuda']
    regexList += ['gracias']
    regexList += ['piso']
    regexList += ['empresa']
    regexList += ['bulnes']
    regexList += ['fono']
    regexList += ['eleuterio']
    regexList += ['gruposaesa']
    regexList += ['muchas']
    regexList += ['\xa0']
    regexList += ['[0-9]']
    regexList += ['www']
    regexList += ['CC:']
    regexList += ['RE:']  # received data line
    regexList += ['RV:']  # received data line
    regexList += ['\[cid:(.*)]']  # images cid
    regexList += ['https?:[^\]\n\r]+']  # https & http
    regexList += ['<[^<]+?>']
    regexList += ['[^\w\s]']

    return regexList

In [11]:
columnsToClean = ['asunto', 'descripcion']
cleanDataset(df_tickets, columnsToClean, getRegexList())

Unnamed: 0,tecnico,asunto,descripcion
0,63602,peticion ha sido finalizada,a la no hemos recibido el lector n...
1,129127,problemas con notebook,personal de soporte soy julio ah...
2,139841,solicitud transaccion k_tp_valu,estiamdos otorgar privilegios para la transa...
3,72603,error en gatillador,agradecere ver el sistema de med...
4,129127,asignar equipo notebook y cuentas correo sap,reitero solicitud saludos ...
...,...,...,...
60182,146764,instalacion google earth pro,favor su en instalacion de a...
60183,171004,problema en acp espejo para cargar t s en subz...,favor su con problema detecta...
60184,149280,solicitud configuracion pc a ploter sr gabrie...,sres soporte junto con saludar ag...
60185,165002,agente paillaco,favor su para incorporar agente a pc ...


In [12]:
tecnico = df_tickets['tecnico']

In [13]:
df_tickets['todo'] = df_tickets['asunto'] + ' ' + df_tickets['descripcion']

In [14]:
del df_tickets['tecnico']
del df_tickets['asunto']
del df_tickets['descripcion']

In [15]:
X_train, X_test, y_train, y_test  = train_test_split(df_tickets['todo'], tecnico, test_size = 0.2, random_state = 1)

In [44]:
stemmer = SnowballStemmer('spanish', ignore_stopwords=True)

In [50]:
class StemmedCountVectorizer(CountVectorizer):
    def build_analyzer(self):
        analyzer = super(StemmedCountVectorizer, self).build_analyzer()
        return lambda doc: ([stemmer.stem(w) for w in analyzer(doc)])
stemmed_count_vect = StemmedCountVectorizer(stop_words=data, ngram_range=(1, 2))

In [37]:
text_clf = Pipeline([('vect', CountVectorizer(stop_words=data, ngram_range=(1, 2))),
                     ('tfidf', TfidfTransformer()),
                     ('clf', MultinomialNB(alpha=0.001)),
                    ])
text_clf = text_clf.fit(X_train, y_train)

In [38]:
predicted = text_clf.predict(X_test)
np.mean(predicted == y_test)

0.6701836005649248

In [18]:
parameters = {'vect__ngram_range': [(1, 1), (1, 2)],
              'tfidf__use_idf': (True, False),
              'clf__alpha': (1e-2, 1e-3),
             }

In [19]:
gs_clf = GridSearchCV(text_clf, parameters, n_jobs=-1)
gs_clf = gs_clf.fit(X_train, y_train)



In [20]:
gs_clf.best_score_

0.6593415723335757

In [21]:
gs_clf.best_params_

{'clf__alpha': 0.001, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 2)}

In [51]:
text_mnb_stemmed = Pipeline([('vect', stemmed_count_vect),
                             ('tfidf', TfidfTransformer()),
                             ('mnb', MultinomialNB(alpha=0.001)),
                            ])

In [52]:
text_mnb_stemmed = text_mnb_stemmed.fit(X_train, y_train)
predicted_mnb_stemmed = text_mnb_stemmed.predict(X_test)
np.mean(predicted_mnb_stemmed == y_test)

0.6681066710974495