# NLTK Basic

In [None]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

## Web Scraping & Frequency Distribution

In [None]:
from bs4 import BeautifulSoup
# `import urllib` alone does not load the `request` submodule, so
# `urllib.request.urlopen` could fail with AttributeError; import it explicitly.
import urllib.request

# Download the page. The context manager closes the HTTP response even if
# reading it fails.
with urllib.request.urlopen('http://php.net/') as response:
    html = response.read()

# Parse the markup and keep only the text content of the page.
soup = BeautifulSoup(html, 'html.parser')
text = soup.get_text(strip=True)

# Tokenize on runs of word characters (the pattern r'\w+').
tokenizer = RegexpTokenizer(r'\w+')
tokens = tokenizer.tokenize(text)
# Alternative tokenizer: tokens = word_tokenize(text)

In [None]:
# Build the token frequency table and preview its first entries.
freq = nltk.FreqDist(tokens)
for position, (word, occurrences) in enumerate(freq.items(), start=1):
    print("[{}] {}: {}".format(position, word, occurrences))
    # Stop right after the 11th entry has been printed.
    if position > 10:
        break

# Plot the distribution, limited to 20 samples, as plain (non-cumulative) counts.
freq.plot(20, cumulative=False)

In [None]:
# English stop-word list from NLTK.
# NOTE(review): requires the NLTK 'stopwords' corpus to be available locally — confirm.
sw = stopwords.words('english')

# Drop every stop word in a single pass. A set gives O(1) membership tests,
# replacing the quadratic "scan the list, then list.remove()" loop while
# producing the same tokens in the same order.
stop_set = set(sw)
clean_tokens = [token for token in tokens if token not in stop_set]

# Recompute and plot the distribution without the stop words.
freq = nltk.FreqDist(clean_tokens)
freq.plot(20, cumulative=False)

## Stemming and Lemmatization

In [None]:
# Reduce a word to its stem with NLTK's Porter stemmer and print the result.
stemmer = PorterStemmer()
print(stemmer.stem('increase'))

In [None]:
# Lemmatize a word with the WordNet lemmatizer and print the result.
# NOTE(review): presumably needs the WordNet corpus to be downloaded first — confirm.
lemmatizer = WordNetLemmatizer()
print(lemmatizer.lemmatize('increases'))

# Keras IMDB

In [None]:
# Dependencies for the Keras IMDB section.
import tensorflow as tf
from tensorflow import keras
import numpy as np

# Show which TensorFlow version this notebook is running against.
print(tf.__version__)

In [None]:
# Load the IMDB review dataset, keeping only the 10,000 most frequent words.
imdb = keras.datasets.imdb
train_split, test_split = imdb.load_data(num_words=10000)
train_data, train_labels = train_split
test_data, test_labels = test_split

n_entries = len(train_data)
n_labels = len(train_labels)
print('Training entries: {}, labels: {}'.format(n_entries, n_labels))

In [None]:
# Fetch the word -> index mapping and shift every index up by 3 so that
# indices 0-3 are free for the special tokens registered below.
raw_word_index = imdb.get_word_index()
word_index = {word: index + 3 for word, index in raw_word_index.items()}
word_index.update({
    "<PAD>": 0,
    "<START>": 1,
    "<UNK>": 2,  # unknown
    "<UNUSED>": 3,
})

# Inverse lookup (index -> word), used to turn encoded reviews back into text.
reverse_word_index = {index: word for word, index in word_index.items()}

def decode_review(text):
    """Return the words for a sequence of word indices, joined by spaces.

    Each index in ``text`` is looked up in ``reverse_word_index``; an index
    with no entry is rendered as ``'?'``.
    """
    words = (reverse_word_index.get(index, '?') for index in text)
    return ' '.join(words)

In [None]:
# Pad (at the end, with the <PAD> index) or cut every review to 256 tokens.
# Both splits share the same settings, so they are defined once.
pad_options = dict(value=word_index["<PAD>"], padding='post', maxlen=256)
pad_sequences = keras.preprocessing.sequence.pad_sequences

train_data = pad_sequences(train_data, **pad_options)
test_data = pad_sequences(test_data, **pad_options)

In [None]:
# Size of the embedding's input vocabulary.
vocab_size = 10000

# Embedding -> average over the sequence -> small dense layer -> sigmoid output.
model = keras.Sequential([
    keras.layers.Embedding(vocab_size, 16),
    keras.layers.GlobalAveragePooling1D(),
    keras.layers.Dense(16, activation=tf.nn.relu),
    keras.layers.Dense(1, activation=tf.nn.sigmoid),
])

# Binary classification setup; accuracy is tracked under the name 'acc'.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['acc'],
)

In [None]:
# Hold out the first 10,000 training samples for validation; train on the rest.
x_val, partial_x_train = train_data[:10000], train_data[10000:]
y_val, partial_y_train = train_labels[:10000], train_labels[10000:]

# Train for 10 epochs in batches of 512, validating on the held-out samples.
history = model.fit(
    partial_x_train,
    partial_y_train,
    epochs=10,
    batch_size=512,
    validation_data=(x_val, y_val),
)

In [None]:
# Evaluate the trained model on the test split and print what evaluate() returns
# (presumably the loss followed by the 'acc' metric configured in compile()).
results = model.evaluate(test_data, test_labels)
print(results)

# BQ Analysis

# Pandas - Read Excel into DataFrame

In [None]:
# Dependencies for this session
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn import metrics
from sklearn.pipeline import Pipeline

In [None]:
# Read Excel
#df_original = pd.read_excel('BQ Analysis for Python.xlsx', sheet_name='Project BQ')
#df_original.columns = ['Contract', 'BqRef', 'BqTrade', 'BqSubTrade', 'Heading', 'SubHeading', 
#              'ItemHeading', 'ItemDesc', 'Qty', 'UOM', 'AcCostID', 'AcTrade', 'AcSubTrade']

In [None]:
#df_original.to_csv('data.csv', index=False)

In [None]:
# Load the exported BQ data and replace its headers with short column names.
column_names = [
    'Contract', 'BqRef', 'BqTrade', 'BqSubTrade', 'Heading', 'SubHeading',
    'ItemHeading', 'ItemDesc', 'Qty', 'UOM', 'AcCostID', 'AcTrade', 'AcSubTrade',
]
df_original = pd.read_csv('data.csv')
df_original.columns = column_names

In [None]:
# Fill Missing Data
# Work on a copy so df_original stays untouched, and replace every missing
# cell with a single space.
df = df_original.copy().fillna(' ')
# Optionally skip some troublesome Trades:
#df = df[(df.AcTrade != 'IP') & (df.AcTrade != 'SU') & (df.AcTrade != 'DA') & (df.AcTrade != 'AS')]

## Combine columns into "Super Description"

In [None]:
# Build one combined description per row from the trade, heading, item heading
# and item description columns (SubHeading is deliberately left out here).
df['SuperDesc'] = (
    df['BqTrade'] + ' ' + df['Heading'] + ' ' + df['ItemHeading'] + ' ' + df['ItemDesc']
)

# Sanity check: number of rows whose combined description is missing.
missing_descriptions = df['SuperDesc'].isnull().sum()
print(missing_descriptions)

## Encode Target Variable

<b>Categorical to Numerical transformation</b>

In [None]:
# Encode AcTrade once: keep the Categorical (its .categories holds the labels)
# and store each row's integer code as the numeric target column.
ac_trade_categories = pd.Categorical(df.AcTrade)
df['AcTradeCode'] = ac_trade_categories.codes
#print(ac_trade_categories)

In [None]:
# Display the distinct AcTrade labels; a label's position here is its code in AcTradeCode.
ac_trade_categories.categories

## Train Test Splitting

In [None]:
# Train Test with full population
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

# Features are the combined descriptions, targets the encoded trades;
# both are shuffled together with a fixed seed.
x, y = shuffle(df.SuperDesc, df.AcTradeCode, random_state=99)

# Keep 20% of the shuffled rows as the test set.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=99,
)

In [None]:
# Test Set Manipulation
# Evaluate on the full shuffled population instead of the 20% hold-out.
# NOTE(review): after this cell the test set contains every training row, so
# scores computed from it are not a held-out estimate — confirm this is intended.
x_test = x
y_test = y

In [None]:
# Display the distinct contract identifiers present in the data.
set(df.Contract)

In [None]:
# Train Test with Target Project
from sklearn.utils import shuffle

# Rows of the target contract form the test set; all other rows are for training.
target_contract = 'FL49'
is_target = df.Contract == target_contract
train_rows = df[~is_target]
test_rows = df[is_target]

# Shuffle the training pairs together with a fixed seed.
x_train, y_train = shuffle(
    train_rows.SuperDesc.values,
    train_rows.AcTradeCode.values,
    random_state=99,
)

x_test = test_rows.SuperDesc.values
y_test = test_rows.AcTradeCode.values

## Model Training - Support Vector Machine

In [None]:
## Model Training - Support Vector Machine
svm_steps = [
    # Binary word-level unigram and bigram features, English stop words removed.
    ('vect', CountVectorizer(ngram_range=(1,2), stop_words='english', analyzer='word', binary=True)),
    # Alternative classifiers that were tried:
    #('clf', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-5, random_state=99, max_iter=200, tol=1e-6))
    #('clf', RandomForestClassifier(n_estimators=100, max_depth=20, random_state=99))
    ('clf', LinearSVC()),
]
text_clf = Pipeline(svm_steps)
text_clf.fit(x_train, y_train)

# Compare the predictions with the expected codes element by element.
y_pred = text_clf.predict(x_test)
is_correct = y_pred == y_test
print(np.mean(is_correct)) # Score
print((~is_correct).sum()) # Count of incorrect predictions

In [None]:
# Report only the classes that occur in y_test or y_pred.
y_report_set = set(y_test).union(set(y_pred))
# Iteration order of a set is not guaranteed to be sorted, so sort the codes
# and pass them explicitly as `labels`; that way every entry of `target_names`
# is paired with the code it belongs to.
y_report_labels = sorted(y_report_set)
y_report_names = list(ac_trade_categories.categories[y_report_labels])
print(metrics.classification_report(
    y_test,
    y_pred,
    labels=y_report_labels,
    target_names=y_report_names,
))

In [None]:
# Cross Validation
# Run 5-fold cross-validation of the pipeline on the training data; the notebook
# displays the returned per-fold scores.
from sklearn.model_selection import cross_val_score
cross_val_score(text_clf, x_train, y_train, cv=5)

In [None]:
# Display the code -> AcTrade label mapping.
dict(enumerate(pd.Categorical(df.AcTrade).categories))

In [None]:
# Attach the predicted trade label to every row and list the misclassified rows.
# - Codes are translated through `.categories`: indexing the Categorical itself
#   (`ac_trade_categories[i]`) returns the value stored at row i, not the label
#   whose code is i.
# - Predictions are computed on df.SuperDesc in row order, because y_pred only
#   covers (and is ordered like) the current test set, which may be a shuffled
#   or partial view of df and therefore cannot be assigned to df directly.
pred_codes = text_clf.predict(df.SuperDesc)
df['Pred'] = np.asarray(ac_trade_categories.categories)[pred_codes]
df_result = df[df.AcTrade != df.Pred]
print(df_result)
#df_result.to_excel('result.xlsx')

## Model Training - Naive Bayes

In [None]:
# Model Fitting
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

# Raw word counts fed into a multinomial Naive Bayes classifier.
nb_steps = [
    ('vect', CountVectorizer()),
    ('clf', MultinomialNB()),
]
text_clf = Pipeline(nb_steps)
text_clf.fit(x_train, y_train)

# Evaluation: fraction of test samples predicted correctly (shown by the notebook).
y_pred = text_clf.predict(x_test)
np.mean(y_pred == y_test)

In [None]:
# classification_report expects (y_true, y_pred): the true labels must be those
# of the evaluated test set (y_test), not the full label vector `y`.
# The names must be the category labels of the codes being reported, so the
# codes are sorted and passed explicitly as `labels` to keep both aligned.
nb_report_labels = sorted(set(y_test).union(set(y_pred)))
nb_report_names = list(ac_trade_categories.categories[nb_report_labels])
print(metrics.classification_report(
    y_test,
    y_pred,
    labels=nb_report_labels,
    target_names=nb_report_names,
))

## Model Training - Multinomial Naive Bayes (Verbose)

In [None]:
# Learn the vocabulary from the training texts and convert them to a count matrix;
# the notebook displays the matrix shape.
count_vect = CountVectorizer()
x_train_count = count_vect.fit_transform(x_train)
x_train_count.shape

In [None]:
# Look up the vocabulary entry of each word in a sample phrase; words that are
# not in the learned vocabulary come back as None.
text = 'concrete grade sign way'
lookup = count_vect.vocabulary_.get
temp = list(map(lookup, text.split()))
print(temp)

In [None]:
# TfidfTransformer is not among the notebook's earlier imports (only
# TfidfVectorizer is), so import it here to avoid a NameError.
from sklearn.feature_extraction.text import TfidfTransformer

# Fit the TF-IDF weighting on the training counts and apply it;
# the notebook displays the resulting matrix shape.
tfidf_transformer = TfidfTransformer()
x_train_tfidf = tfidf_transformer.fit_transform(x_train_count)
x_train_tfidf.shape

In [None]:
# Model Training
# Fit a multinomial Naive Bayes classifier on the TF-IDF weighted training matrix.
clf = MultinomialNB().fit(x_train_tfidf, y_train)

In [None]:
# Model Evaluation
# Reuse the already-fitted vectorizer and transformer (transform, not
# fit_transform) so the test texts are encoded like the training texts.
x_test_count = count_vect.transform(x_test)
x_test_tfidf = tfidf_transformer.transform(x_test_count)

# Fraction of test samples predicted correctly (displayed by the notebook).
y_pred = clf.predict(x_test_tfidf)
np.mean(y_pred == y_test)