In [48]:
import pandas as pd
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import decomposition, ensemble
import xgboost, numpy, textblob, string, pickle
from nltk.corpus import stopwords

In [49]:
# Load Data
data = pd.read_csv("../input/sample.csv",names=['desc', 'label'], header=None)
data.head()

Unnamed: 0,desc,label
0,Depreciation of tangible fixed assets - owned ...,d_and_a
1,Cost of equity settled share based payments,adjustment
2,Amortisation of software,d_and_a
3,Auditor's remuneration: Audit,other
4,Operating lease payables - equipment,other


In [50]:
# Clean Text
def cleanText(text):
    # Remove unwanted /r, /n, spaces and quotes
    textParsed = text.replace("\r", " ")
    textParsed = textParsed.replace("\n", " ")
    textParsed = textParsed.replace("    ", " ")
    textParsed = textParsed.replace('""', '')

    # Convert text to lower case
    textParsed = textParsed.lower()

    # Remove punctuations
    punctuation_signs = list("?:!.,);-/(")
    for punct_sign in punctuation_signs:
        textParsed = textParsed.replace(punct_sign, '')

    # Remove's
    textParsed = textParsed.replace("'s", "")
    
    # Remove stopwords
    stop_words = stopwords.words('english')
    for stop_word in stop_words:
        regex_stopword = r"\b" + stop_word + r"\b"
        textParsed = textParsed.replace(regex_stopword, '')
    return textParsed

In [51]:
descParsed = []
for text in data['desc']:
    textparsed = cleanText(text)
    descParsed.append(textparsed)
data['desc_parsed'] = descParsed
data.head()

Unnamed: 0,desc,label,desc_parsed
0,Depreciation of tangible fixed assets - owned ...,d_and_a,depreciation of tangible fixed assets owned b...
1,Cost of equity settled share based payments,adjustment,cost of equity settled share based payments
2,Amortisation of software,d_and_a,amortisation of software
3,Auditor's remuneration: Audit,other,auditor remuneration audit
4,Operating lease payables - equipment,other,operating lease payables equipment


In [53]:
desc = data['desc']
label = data['label']

# label encode the target variable 
encoder = preprocessing.LabelEncoder()
label = encoder.fit_transform(label)

train_x, test_x, train_y, test_y = model_selection.train_test_split(desc, label,test_size=0.20, random_state=10)

In [54]:
# create count vectorizer feature 
count_vect = CountVectorizer()
#count_vect.fit(desc)
train_count =  count_vect.fit_transform(train_x)
test_count =  count_vect.transform(test_x)

In [55]:
# word level tf-idf
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=5000)
tfidf_vect.fit(desc)
train_tfidf =  tfidf_vect.transform(train_x)
test_tfidf =  tfidf_vect.transform(test_x)

# ngram level tf-idf 
tfidf_vect_ngram = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', ngram_range=(2,3), max_features=5000)
tfidf_vect_ngram.fit(desc)
train_tfidf_ngram =  tfidf_vect_ngram.transform(train_x)
test_tfidf_ngram =  tfidf_vect_ngram.transform(test_x)

# characters level tf-idf
tfidf_vect_ngram_chars = TfidfVectorizer(analyzer='char', token_pattern=r'\w{1,}', ngram_range=(2,3), max_features=5000)
tfidf_vect_ngram_chars.fit(desc)
train_tfidf_ngram_chars =  tfidf_vect_ngram_chars.transform(train_x) 
test_tfidf_ngram_chars =  tfidf_vect_ngram_chars.transform(test_x) 

In [56]:
def train_model(classifier, feature_train, label, feature_test, model_name, is_neural_net=False):
    # fit the training dataset on the classifier
    classifier.fit(feature_train, label)
    with open(model_name, 'wb') as picklefile:
        pickle.dump(classifier,picklefile)
    # predict the labels on validation dataset
    predictions = classifier.predict(feature_test)
    
    if is_neural_net:
        predictions = predictions.argmax(axis=-1)
    
    return metrics.accuracy_score(predictions, test_y)

In [57]:
# NAIVE BAYES

# Count Vectors
accuracy = train_model(naive_bayes.MultinomialNB(), train_count, train_y, test_count, 'NaiveBayesCV')
print ("NB, Count Vectors: ", accuracy)

# Word Level TF IDF Vectors
accuracy = train_model(naive_bayes.MultinomialNB(), train_tfidf, train_y, test_tfidf, 'NaiveBayesTfidf')
print ("NB, WordLevel TF-IDF: ", accuracy)

# Ngram Level TF IDF Vectors
accuracy = train_model(naive_bayes.MultinomialNB(), train_tfidf_ngram, train_y, test_tfidf_ngram, 'NaiveBayesTfidfNgram')
print ("NB, N-Gram Vectors: ", accuracy)

# Character Level TF IDF Vectors
accuracy = train_model(naive_bayes.MultinomialNB(), train_tfidf_ngram_chars, train_y, test_tfidf_ngram_chars, 'NaiveBayesTfidfNgramChars')

print ("NB, CharLevel Vectors: ", accuracy)

NB, Count Vectors:  0.9357142857142857
NB, WordLevel TF-IDF:  0.95
NB, N-Gram Vectors:  0.8357142857142857
NB, CharLevel Vectors:  0.9642857142857143


In [58]:
# LOGISTIC REGRESSION

# Count Vectors
accuracy = train_model(linear_model.LogisticRegression(), train_count, train_y, test_count, 'LogisticRegressionCV')
print ("LR, Count Vectors: ", accuracy)

# Word Level TF IDF Vectors
accuracy = train_model(linear_model.LogisticRegression(), train_tfidf, train_y, test_tfidf, 'LogisticRegressionTfidf')
print ("LR, WordLevel TF-IDF: ", accuracy)

# Ngram Level TF IDF Vectors
accuracy = train_model(linear_model.LogisticRegression(), train_tfidf_ngram, train_y, test_tfidf_ngram, 'LogisticRegressionTfidfNgram')
print ("LR, N-Gram Vectors: ", accuracy)

# Character Level TF IDF Vectors
accuracy = train_model(linear_model.LogisticRegression(), train_tfidf_ngram_chars, train_y, test_tfidf_ngram_chars, 'LogisticRegressionTfidfNgramChars')
print ("LR, CharLevel Vectors: ", accuracy)

LR, Count Vectors:  0.95
LR, WordLevel TF-IDF:  0.9571428571428572
LR, N-Gram Vectors:  0.7928571428571428
LR, CharLevel Vectors:  0.9714285714285714




In [59]:
# RANDOM FOREST

# Count Vectors
accuracy = train_model(ensemble.RandomForestClassifier(), train_count, train_y, test_count, 'RandomForestCV')
print ("RF, Count Vectors: ", accuracy)

# Word Level TF IDF Vectors
accuracy = train_model(ensemble.RandomForestClassifier(), train_tfidf, train_y, test_tfidf, 'RandomForestTfidf')
print ("RF, WordLevel TF-IDF: ", accuracy)

# Ngram Level TF IDF Vectors
accuracy = train_model(ensemble.RandomForestClassifier(), train_tfidf_ngram, train_y, test_tfidf_ngram, 'RandomForestTfidfNgram')
print ("RF, Count Vectors: ", accuracy)

# Character Level TF IDF Vectors
accuracy = train_model(ensemble.RandomForestClassifier(), train_tfidf_ngram_chars, train_y, test_tfidf_ngram_chars, 'RandomForestTfidfNgramChars')
print ("RF, WordLevel TF-IDF: ", accuracy)

RF, Count Vectors:  0.9571428571428572
RF, WordLevel TF-IDF:  0.9357142857142857
RF, Count Vectors:  0.7714285714285715
RF, WordLevel TF-IDF:  0.9428571428571428




In [60]:
#EXTREME GRADIENT BOOSTING

# Count Vectors
accuracy = train_model(xgboost.XGBClassifier(), train_count.tocsc(), train_y, test_count.tocsc(),'XGBoostCV')
print ("Xgb, Count Vectors: ", accuracy)

# Word Level TF IDF Vectors
accuracy = train_model(xgboost.XGBClassifier(), train_tfidf.tocsc(), train_y, test_tfidf.tocsc(),'XGBoostTfidf')
print ("Xgb, WordLevel TF-IDF: ", accuracy)

# Ngram Level TF IDF Vectors
accuracy = train_model(xgboost.XGBClassifier(), train_tfidf_ngram.tocsc(), train_y, test_tfidf_ngram.tocsc(),'XGBoostTfidfNgram')
print ("Xgb, Ngram Vectors: ", accuracy)

# Character Level TF IDF Vectors
accuracy = train_model(xgboost.XGBClassifier(), train_tfidf_ngram_chars.tocsc(), train_y, test_tfidf_ngram_chars.tocsc(),'XGBoostTfidfNgramChars')
print ("Xgb, CharLevel Vectors: ", accuracy)



Xgb, Count Vectors:  0.9428571428571428
Xgb, WordLevel TF-IDF:  0.9142857142857143
Xgb, Ngram Vectors:  0.8214285714285714
Xgb, CharLevel Vectors:  0.9642857142857143


In [61]:
with open('../input/LogisticRegressionCV', 'rb') as training_model:
    LRCVModel = pickle.load(training_model)
with open('../input/LogisticRegressionTfidf', 'rb') as training_model:
    LRTfidfModel = pickle.load(training_model)
with open('../input/LogisticRegressionTfidfNgramChars', 'rb') as training_model:
    LRTfidfNgramCharsModel = pickle.load(training_model)

In [62]:
while(1):
    text = input("Enter a text to categorize Otherwise press 'q' to exit:")
    if (text == 'q') or (text == 'Q'):
        break
    text = [cleanText(text)]
    #text = []
    #text.append(textCleaned)
    text
    text_cv = count_vect.transform(text)
    text_tfidf = tfidf_vect.transform(text)
    text_tfidf_ngram_chars = tfidf_vect_ngram_chars.transform(text)
    pred1 = LRTfidfNgramCharsModel.predict(text_tfidf_ngram_chars)
    pred2 = LRCVModel.predict(text_cv)
    pred3 = LRTfidfModel.predict(text_tfidf)
    print('\n\nCategory based on Character level ngrams using Logistic Regrssion - ', encoder.inverse_transform(pred1)[0])
    print('Category based on count vector using Logistic Regrssion - ', encoder.inverse_transform(pred2)[0])
    print('Category based on tfidf using Logistic Regrssion - ', encoder.inverse_transform(pred3)[0], '\n\n')

Enter a text to categorize Otherwise press 'q' to exit:Foreign exchange differences (crediting)


Category based on Character level ngrams using Logistic Regrssion -  adjustment
Category based on count vector using Logistic Regrssion -  adjustment
Category based on tfidf using Logistic Regrssion -  adjustment 


Enter a text to categorize Otherwise press 'q' to exit:exceptional administrative expenses - EBT settlement


Category based on Character level ngrams using Logistic Regrssion -  adjustment
Category based on count vector using Logistic Regrssion -  adjustment
Category based on tfidf using Logistic Regrssion -  adjustment 


Enter a text to categorize Otherwise press 'q' to exit:Depreciation of intangible fixed assets - owned by the group


Category based on Character level ngrams using Logistic Regrssion -  d_and_a
Category based on count vector using Logistic Regrssion -  d_and_a
Category based on tfidf using Logistic Regrssion -  d_and_a 


Enter a text to categorize Otherwis