In [6]:
import os
import re
import string

import numpy as np
import pandas as pd

# NLP
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Preprocessing & Model Preparation
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import LabelEncoder

# Performance Evaluation
from sklearn.metrics import confusion_matrix, classification_report

In [5]:
!pip install sklearn

Collecting sklearn
  Using cached sklearn-0.0.tar.gz (1.1 kB)
Collecting scikit-learn
  Using cached scikit_learn-0.23.2-cp38-cp38-win_amd64.whl (6.8 MB)
Collecting threadpoolctl>=2.0.0
  Using cached threadpoolctl-2.1.0-py3-none-any.whl (12 kB)
Using legacy setup.py install for sklearn, since package 'wheel' is not installed.
Installing collected packages: threadpoolctl, scikit-learn, sklearn
    Running setup.py install for sklearn: started
    Running setup.py install for sklearn: finished with status 'done'
Successfully installed scikit-learn-0.23.2 sklearn-0.0 threadpoolctl-2.1.0


You should consider upgrading via the 'e:\springboard\works\support_ticket_label_classification\venv\scripts\python.exe -m pip install --upgrade pip' command.


In [2]:
# Read the support-ticket CSV export into a pandas DataFrame.
file_location = '../../data/support_ticket.csv'
df = pd.read_csv(filepath_or_buffer=file_location)

In [3]:
# Show the dimensions of the loaded frame as (rows, columns).
df.shape

(579621, 12)

In [9]:
# List the column labels of the loaded frame.
df.columns

Index(['Unnamed: 0', 'full_text', 'submitted_via', 'sub_label', 'label',
       'word_count', 'unique_word_count', 'stop_word_count', 'url_count',
       'mean_word_length', 'char_count', 'punctuation_count'],
      dtype='object')

In [7]:
# Tokenization helper: splits a sentence into lowercase word tokens and can
# drop stopwords, punctuation, numbers and the 'XX' / 'XXXX' / 'XX/XX/XXXX'
# placeholders that mask classified data.

# Lowercased placeholder tokens that stand in for classified (redacted) data.
_CLASSIFIED_TOKENS = frozenset({'xx', 'xxxx', 'xx/xx/xxxx'})

# Cache for the default NLTK English stopword list (filled on first use).
_DEFAULT_STOP_WORDS = None


def _resolve_stop_words(stop_words):
    """Turn the `stop_words` argument into a set of words, or None to skip filtering."""
    global _DEFAULT_STOP_WORDS
    if stop_words is True:
        # Load the NLTK list once; reading the corpus on every call would be slow.
        if _DEFAULT_STOP_WORDS is None:
            _DEFAULT_STOP_WORDS = frozenset(stopwords.words('english'))
        return _DEFAULT_STOP_WORDS
    if not stop_words:
        return None
    if isinstance(stop_words, (set, frozenset)):
        return stop_words
    return frozenset(stop_words)


def tokenize_sentence(sentence: str, stop_words=True, punctuation=True, numbers=True, classified=True)->list:
    """
    Tokenize a given string and return its lowercased words as a list.

    Parameters
    ----------
    sentence:
        The text to tokenize (split with nltk's ``word_tokenize``).
    stop_words:
        ``True`` (default) removes NLTK's English stopwords; a collection of
        words removes exactly those words (tokens are lowercased before the
        comparison, so the collection should be lowercase); ``False``/``None``
        disables stopword filtering.
    punctuation:
        If truthy, drop tokens that consist only of punctuation characters
        (``string.punctuation``), including multi-character ones such as
        ``...`` or the quote tokens emitted by the tokenizer.
    numbers:
        If truthy, drop tokens made up solely of digits.
    classified:
        If truthy, drop the placeholders 'XX', 'XXXX' and 'XX/XX/XXXX' that
        mark classified data.

    Returns
    -------
    list
        The remaining lowercase tokens, in their original order.
    """
    tokens = [token.lower() for token in word_tokenize(sentence)]

    if classified:
        tokens = [token for token in tokens if token not in _CLASSIFIED_TOKENS]

    # The parameter may be a flag or a word collection; it must never be used
    # directly in a membership test (``x in True`` raises TypeError).
    excluded_words = _resolve_stop_words(stop_words)
    if excluded_words is not None:
        tokens = [token for token in tokens if token not in excluded_words]

    if punctuation:
        # Character-wise check: ``token in string.punctuation`` would be a
        # substring test and miss tokens like '...' or "''".
        tokens = [
            token for token in tokens
            if not all(char in string.punctuation for char in token)
        ]

    if numbers:
        tokens = [token for token in tokens if not token.isdigit()]

    return tokens

In [8]:
# English stopwords from the NLTK corpus, stored as a set for fast membership tests.
# Inside tokenize_sentence the parameter named `stop_words` shadows this global,
# so pass it explicitly (stop_words=stop_words) to filter with this set.
# NOTE(review): presumably requires the NLTK 'stopwords' corpus to be downloaded — confirm.
stop_words = set(stopwords.words('english')) 

In [15]:
def lemmatize_sentence(sentence, return_form = 'string'):
    """
    Lemmatize every word of a sentence with NLTK's WordNetLemmatizer.

    Input:
    ------
        sentence:
            Either the raw sentence (string) or an already tokenized
            sequence of words (e.g. a list or tuple). A raw string is split
            into words (apostrophes are kept inside words) and the
            punctuation marks . , ! ? ; before lemmatizing.
        return_form:
            'string' (default) returns the lemmatized words joined by single
            spaces; any other value returns them as a list.
    Returns:
    -------
        The lemmatized words as one space-separated string when
        return_form == 'string', otherwise as a list of strings.
    """
    # Only a raw string needs tokenizing; any other input is treated as an
    # iterable of tokens and used as-is.
    if isinstance(sentence, str):
        sentence = re.findall(r"[\w']+|[.,!?;]", sentence)

    lemmatizer = WordNetLemmatizer()
    lemmas = [lemmatizer.lemmatize(word) for word in sentence]
    return ' '.join(lemmas) if return_form == 'string' else lemmas

In [None]:
# Pickle the file so we do not need to reprocess each time
# NOTE(review): `project_dir` is not defined in any cell visible here — it must be
# set (presumably to the project root; confirm) before this cell runs.
pickle_processed_df_filename = 'complaints_processed.pkl'
# Resulting path: <project_dir>/Data/complaints_processed.pkl
pickled_file_loc = os.path.join(project_dir, 'Data', pickle_processed_df_filename)

In [None]:
# df to pickle: write the processed frame to `pickled_file_loc`.
# NOTE(review): `complaints_processed` is not created in any cell visible here;
# it must exist before this cell runs.
complaints_processed.to_pickle(pickled_file_loc)

In [None]:
# pickled file to df: reload the processed frame from `pickled_file_loc`.
# Unpickling can execute arbitrary code, so only load pickle files you created yourself.
complaints_processed = pd.read_pickle(pickled_file_loc)

In [None]:
# Encode each value of the 'Product' column as an integer id and store it in a
# new 'Product_Id' column. LabelEncoder comes from the notebook's import cell,
# so all dependencies are declared in one place.
label_encoder = LabelEncoder()

complaints_processed['Product_Id'] = label_encoder.fit_transform(complaints_processed['Product'])

In [None]:
# It is also handy to have the categories as a dictionary: {Product_Id: Product}.
# Only the two relevant columns are kept and de-duplicated first, so the
# conversion touches one row per category instead of turning every column of
# every row into a dict.
product_map = (
    complaints_processed[['Product_Id', 'Product']]
    .drop_duplicates(subset='Product_Id')
    .set_index('Product_Id')['Product']
    .to_dict()
)
product_map

In [None]:
# Preview the first 10 rows of the processed frame.
complaints_processed.head(10)

In [None]:
# Model inputs: X is the 'Complaint_Clean' column, y is the 'Product_Id' column.
# NOTE(review): presumably 'Complaint_Clean' holds the preprocessed complaint text — confirm.
X = complaints_processed['Complaint_Clean']
y = complaints_processed['Product_Id']

In [None]:

# Count how many training samples fall into each distinct value of y_train.
# NOTE(review): `y_train` is not defined in any cell visible here — presumably the
# train/test split cell (train_test_split on X, y) is missing; confirm and restore it.
y_train.groupby(y_train).count()

In [None]:
# Side-by-side bar charts of the per-class sample counts in the training and test labels.
# NOTE(review): `plt`, `sns`, `y_train` and `y_test` are not defined in the cells
# visible here; they must be imported/created before this cell runs.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(7, 5))
for axis, labels, title in (
    (ax1, y_train, 'Number of Complaints - Training Set'),
    (ax2, y_test, 'Number of Complaints - Test Set'),
):
    class_ids = sorted(labels.unique())
    class_counts = labels.groupby(labels).count()
    sns.barplot(x=class_ids, y=class_counts, ax=axis).set_title(title)
fig.tight_layout()
plt.show()