In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
import numpy as np
from nltk.corpus import stopwords
import unidecode

In [2]:
df_tickets = pd.read_csv('tickets_asunto.csv', sep=';', quotechar = '\'')

In [3]:
df_tickets.dropna(inplace=True)

In [4]:
df_tickets

Unnamed: 0,tecnico,asunto,descripcion
0,63602,RE: Petición : ##268042## ha sido finalizada.,"<body> <div> <p><i><span style=""font-family: C..."
1,129127,Problemas con Notebook,"<div style=""font-family: Calibri , Arial , Hel..."
2,139841,SOLICITUD TRANSACCION K_TP_VALU,<body> Estiamdos otorgar privilegios para la t...
3,72603,ERROR EN GATILLADOR,<body> <div> <p>Estimados:</p> <p> </p> <p>Agr...
4,129127,RV: Asignar Equipo Notebook y Cuentas Correo /...,"<div> <p>Estimados,</p> <p> </p> <p>Reitero so..."
...,...,...,...
60182,146764,INSTALACION GOOGLE EARTH PRO,"<body> <div style=""font-family: Calibri, Arial..."
60183,171004,Problema en ACP Espejo para cargar tareas en s...,"<body style=""""> <div> <p class=""MsoNormal"">Est..."
60184,149280,Solicitud Configuración PC a Ploter Sr. Gabrie...,"<div><p class=""MsoNormal""><u>Sres. Soporte</u>..."
60185,165002,Agente Paillaco,"<body> <div style=""font-family: Calibri, Arial..."


In [5]:
def removeAccent(text):
    text = unidecode.unidecode(text)
    return text

In [6]:
df_tickets['asunto'] = df_tickets['asunto'].apply(removeAccent)
df_tickets['descripcion'] = df_tickets['descripcion'].apply(removeAccent)

In [7]:
data = stopwords.words('spanish')

for i in range(len(data)):    
    data[i] = removeAccent(data[i])

In [8]:
def removeString(data, regex):
    return data.str.lower().str.replace(regex.lower(), ' ', regex=True)

In [9]:
def cleanDataset(dataset, columnsToClean, regexList):
    for column in columnsToClean:
        for regex in regexList:
            dataset[column] = removeString(dataset[column], regex)
    return dataset

In [10]:
def getRegexList():
    regexList = []
    regexList += ['estimados']
    regexList += ['estimado']
    regexList += ['gt']
    regexList += ['lt']
    regexList += ['&nbsp']
    regexList += ['osorno']
    regexList += ['gerencia']
    regexList += ['href']
    regexList += ['asunto']
    regexList += ['enviado']
    regexList += ['saesa']
    regexList += ['informacion']
    regexList += ['area']
    regexList += ['ramirez']
    regexList += ['fecha']
    regexList += ['atte']
    regexList += ['correo']
    regexList += ['ayuda']
    regexList += ['gracias']
    regexList += ['piso']
    regexList += ['empresa']
    regexList += ['solicitud']
    regexList += ['comentarios']
    regexList += [' s ']
    regexList += ['bulnes']
    regexList += ['fono']
    regexList += ['eleuterio']
    regexList += ['gruposaesa']
    regexList += ['muchas']

    regexList += ['www']
    regexList += ['CC']
    regexList += ['RE:']  # received data line
    regexList += ['RV:']  # received data line
    #regexList += ['From:(.*)\r\n']  # from line
    # regexList += ['RITM[0-9]*'] # request id
    # regexList += ['INC[0-9]*'] # incident id
    # regexList += ['TKT[0-9]*'] # ticket id
    #regexList += ['Sent:(.*)\r\n']  # sent to line
    #regexList += ['Received:(.*)\r\n']  # received data line
    #regexList += ['@saesa.cl']  # received data line
    #regexList += ['To:(.*)\r\n']  # to line
    #regexList += ['CC:(.*)\r\n']  # cc line
    #regexList += ['The information(.*)infection']  # footer
    #regexList += ['Endava Limited is a company(.*)or omissions']  # footer
    #regexList += ['The information in this email is confidential and may be legally(.*)interference if you are not the intended recipient']  # footer
    regexList += ['\[cid:(.*)]']  # images cid
    regexList += ['https?:[^\]\n\r]+']  # https & http
    #regexList += ['Subject:']
    regexList += ['[\w\d\-\_\.]+@[\w\d\-\_\.]+']  # emails
    # regexList += ['[0-9][\-0–90-9 ]+']  # phones
    # regexList += ['[0-9]']  # numbers
    # regexList += ['[^a-zA-z 0-9]+']  # anything that is not a letter
    # regexList += ['[\r\n]']  # \r\n
    # regexList += [' [a-zA-Z] ']  # single letters
    # regexList += [' [a-zA-Z][a-zA-Z] ']  # two-letter words
    regexList += ['<[^<]+?>']
    regexList += ["  "]  # double spaces
    regexList += ['^[_a-z0-9-]+(\.[_a-z0-9-]+)*@[a-z0-9-]+(\.[a-z0-9-]+)*(\.[a-z]{2,4})$']
    regexList += ['[^a-zA-Z]']
    

    return regexList

In [11]:
columnsToClean = ['asunto', 'descripcion']
cleanDataset(df_tickets, columnsToClean, getRegexList())

Unnamed: 0,tecnico,asunto,descripcion
0,63602,peticion ha sido finalizada,a la no hemos recibido el lector nuevo ...
1,129127,problemas con notebook,personal de soporte soy julio ahumada ...
2,139841,transa ion k tp valu,estiamdos otorgar privilegios para la transa ...
3,72603,error en gatillador,agradecere ver el sistema de medicion ...
4,129127,asignar equipo notebook y cuentas sap,reitero saludos christia...
...,...,...,...
60182,146764,instalacion google earth pro,favor su en instalacion de aplicacio...
60183,171004,problema en acp espejo para cargar t en subzon...,favor su con problema detectado por ...
60184,149280,configuracion pc a ploter sr gabriel martinez,sres soporte junto con saludar agrade...
60185,165002,agente paillaco,favor su para incorporar agente a pc de ...


In [12]:
tecnico = df_tickets['tecnico']

In [13]:
df_tickets['todo'] = df_tickets['asunto'] + ' ' + df_tickets['descripcion']

In [14]:
del df_tickets['tecnico']
del df_tickets['asunto']
del df_tickets['descripcion']

In [15]:
X_train, X_test, y_train, y_test  = train_test_split(df_tickets['todo'], tecnico, test_size = 0.2, random_state = 1)

In [16]:
text_clf_svm = Pipeline([('vect', CountVectorizer(stop_words=data)),
                         ('tfidf', TfidfTransformer()),
                         ('clf-svm', SGDClassifier(loss='hinge', penalty='l2',alpha=1e-3, max_iter=15, random_state=42)),
                        ])
_ = text_clf_svm.fit(X_train, y_train)
predicted_svm = text_clf_svm.predict(X_test)
np.mean(predicted_svm == y_test)

0.6075434078258702

In [None]:
print(data)

In [None]:
counts = pd.Series(''.join(df_tickets.todo).split()).value_counts()
counts.to_csv(r'File Name.csv')

In [None]:
df_tickets2 = pd.read_csv('test1.csv', sep=';', quotechar = '\'')

In [None]:
df_tickets2['asunto'] = df_tickets2['asunto'].apply(removeAccent)
df_tickets2['descripcion'] = df_tickets2['descripcion'].apply(removeAccent)

In [None]:
cleanDataset(df_tickets2, columnsToClean, getRegexList())

In [None]:
df_tickets2['todo'] = df_tickets2['asunto'] + ' ' + df_tickets2['descripcion']

In [None]:
del df_tickets2['tecnico']
del df_tickets2['asunto']
del df_tickets2['descripcion']

In [None]:
df_tickets2

In [None]:
X_train

In [None]:
predicted_svm2 = text_clf_svm.predict(df_tickets2)

In [None]:
print(predicted_svm2)