Working With Text Data
===

- https://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html

In [3]:
import pandas as pd
import numpy as np
import joblib

In [10]:
#urls = 'https://github.com/dragon-library/work_space/raw/main/HS_Code/HS/hs_code.xlsx'
urls = 'data/hs_code.xlsx'
types = 'section'


def get_master(sheets, types='section'):
    data = pd.read_excel(urls, sheet_name=sheets)
    data[types] = data[types].map('{:02}'.format)
    data = data[[types, 'description']]
    data['description'] = data['description'].str.lower()
    #   data = data.rename(columns={'heading' : 'target', 'product_desc' : 'question_text'})

    return data


def manage_data(df):
    df.columns = ['target', 'data']

    return df

In [5]:
import joblib
files = "data/train_section.pkl"
print("Load DataFrame")
twenty_train = joblib.load(files)
#df.columns =  ['Conference', 'Title'] 


twenty_train.head()

Load DataFrame


Unnamed: 0,target,data
0,1,"horses; live, purebred breeding animals - pure..."
1,1,"horses; live, other than purebred breeding ani..."
2,1,asses; live - other
3,1,mules and hinnies; live- other
4,1,"cattle; live, purebred breeding animals - pure..."


In [6]:
from collections import Counter
Counter(twenty_train["target"])

Counter({'01': 1721,
         '02': 1487,
         '03': 327,
         '04': 2208,
         '05': 691,
         '06': 5045,
         '07': 2868,
         '08': 483,
         '09': 745,
         '10': 901,
         '11': 4048,
         '12': 394,
         '13': 992,
         '14': 590,
         '15': 10406,
         '16': 8452,
         '17': 5134,
         '18': 1470,
         '19': 89,
         '20': 1053,
         '21': 33})

In [11]:
types = "section"


sheets = "deci"
deci = get_master(sheets,types)



twenty_test = manage_data(deci)

print(len(twenty_test.data))
twenty_test.head()

23344


Unnamed: 0,target,data
0,17,porsche 911 carrera 2020 2981cc:385hp:n/a benz...
1,6,95569-07700-10 eaa hyaluron moist.booster 2y\r...
2,6,95569-07700-10 eaa hyaluron moist.booster 2y\r...
3,6,96845-07700-11 euc.int calm 2y\r\n(eucerin ins...
4,6,96845-07700-11 euc.int calm 2y\r\n(eucerin ins...


In [12]:
from collections import Counter
Counter(twenty_test["target"])

Counter({'17': 4,
         '06': 19,
         '18': 78,
         '16': 19956,
         '04': 90,
         '20': 2,
         '07': 1196,
         '10': 131,
         '15': 1568,
         '05': 53,
         '13': 207,
         '11': 26,
         '09': 13,
         '03': 1})

In [4]:
 twenty_train.target.unique()

array(['01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11',
       '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '00'],
      dtype=object)

In [5]:
 twenty_test.target.unique()

array(['01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11',
       '12', '13', '14', '15', '16', '17', '18', '19', '20', '21'],
      dtype=object)

In [13]:

category =  twenty_train.target.unique()

twenty_train.target_names =  twenty_train.target.unique()
twenty_test.target_name =  twenty_test.target.unique()

  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.


In [15]:
len(twenty_train.data)

49137

In [16]:
twenty_train.target[:10]

0    01
1    01
2    01
3    01
4    01
5    01
6    01
7    01
8    01
9    01
Name: target, dtype: object

In [17]:
for t in twenty_train.target[:10]:
    a = int(t)
    print(twenty_train.target[a])

01
01
01
01
01
01
01
01
01
01


In [19]:
from sklearn.feature_extraction.text import CountVectorizer
>>> count_vect = CountVectorizer()
>>> X_train_counts = count_vect.fit_transform(twenty_train.data)
>>> X_train_counts.shape

(49137, 32497)

In [20]:
count_vect.vocabulary_.get(u'algorithm')

In [21]:
>>> from sklearn.feature_extraction.text import TfidfTransformer
>>> tf_transformer = TfidfTransformer(use_idf=False).fit(X_train_counts)
>>> X_train_tf = tf_transformer.transform(X_train_counts)
>>> X_train_tf.shape

(49137, 32497)

In [22]:
>>> tfidf_transformer = TfidfTransformer()
>>> X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
>>> X_train_tfidf.shape

(49137, 32497)

In [23]:
>>> from sklearn.naive_bayes import MultinomialNB
>>> clf = MultinomialNB().fit(X_train_tfidf, twenty_train.target)

In [24]:
>>> docs_new = ["Textile fabrics","Horse"]
>>> X_new_counts = count_vect.transform(docs_new)
>>> X_new_tfidf = tfidf_transformer.transform(X_new_counts)

>>> predicted = clf.predict(X_new_tfidf)

>>> for doc, category in zip(docs_new, predicted):
...     print('%r => %s' % (doc, twenty_train.target_names[int(category)]))

'Textile fabrics' => 12
'Horse' => 16


In [25]:
>>> from sklearn.pipeline import Pipeline
>>> text_clf = Pipeline([
...     ('vect', CountVectorizer()),
...     ('tfidf', TfidfTransformer()),
...     ('clf', MultinomialNB()),
... ])

In [26]:
text_clf.fit(twenty_train.data, twenty_train.target)


Pipeline(steps=[('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                ('clf', MultinomialNB())])

In [27]:
>>> import numpy as np

>>> docs_test = twenty_test.data
>>> predicted = text_clf.predict(docs_test)
>>> np.mean(predicted == twenty_test.target)

0.6231579849211789

In [34]:
>>> from sklearn.linear_model import SGDClassifier
>>> text_clf = Pipeline([
...     ('vect', CountVectorizer()),
...     ('tfidf', TfidfTransformer()),
...     ('clf', SGDClassifier(loss='hinge', penalty='l2',
...                           alpha=1e-3, random_state=42,
...                           max_iter=5, tol=None)),
... ])

>>> text_clf.fit(twenty_train.data, twenty_train.target)

>>> predicted = text_clf.predict(docs_test)
>>> np.mean(predicted == twenty_test.target)

0.5909869773817683

In [30]:
>>> from sklearn import metrics
>>> print(metrics.classification_report(twenty_test.target, predicted))

  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

          01       0.00      0.00      0.00         0
          02       0.00      0.00      0.00         0
          03       0.00      0.00      0.00         1
          04       0.57      0.28      0.37        90
          05       0.00      0.00      0.00        53
          06       0.02      0.47      0.04        19
          07       0.24      0.29      0.26      1196
          09       0.00      0.00      0.00        13
          10       0.01      0.05      0.02       131
          11       0.25      0.81      0.39        26
          12       0.00      0.00      0.00         0
          13       0.73      1.00      0.84       207
          14       0.00      0.00      0.00         0
          15       0.21      0.74      0.33      1568
          16       0.94      0.60      0.73     19956
          17       0.00      0.75      0.00         4
          18       0.20      0.28      0.23        78
          20       0.00    

In [31]:
metrics.confusion_matrix(twenty_test.target, predicted)

array([[    0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0],
       [    0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0],
       [    0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     1,     0,     0,     0],
       [    1,     1,     0,    25,     2,    28,     0,     2,     0,
            0,     0,     0,     5,     0,    20,     6,     0,     0],
       [    0,     0,     5,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,    48,     0,     0,     0],
       [    0,     0,     0,     0,     0,     9,     0,     0,     0,
            0,     0,     0,     0,     0,     8,     2,     0,     0],
       [    0,     0,    11,     0,     0,    97,   350,     0,     0,
            3,     0,     0,     0,   225,   415,    70,    25,     0],

In [32]:
>>> from sklearn.model_selection import GridSearchCV
>>> parameters = {
...     'vect__ngram_range': [(1, 1), (1, 2)],
...     'tfidf__use_idf': (True, False),
...     'clf__alpha': (1e-2, 1e-3),
... }

In [52]:
txt = ["Textile fabrics","Horse"]

docs_test = txt
predicted = text_clf.predict(docs_test)

predicted 


array(['11', '06'], dtype='<U2')

In [51]:
text_clf.fit(twenty_train.data, twenty_train.target)

predicted = text_clf.predict(txt)
predicted

array(['11', '10'], dtype='<U2')

In [33]:
gs_clf = GridSearchCV(text_clf, parameters, cv=5, n_jobs=-1)
gs_clf = gs_clf.fit(twenty_train.data, twenty_train.target)
twenty_train.target[gs_clf.predict(['Textile fabrics'])[0]]

KeyError: '11'

In [None]:
gs_clf = GridSearchCV(text_clf, parameters, cv=5, n_jobs=-1)
gs_clf = gs_clf.fit(twenty_train.data, twenty_train.target)
twenty_train.target = gs_clf.predict(['Textile fabrics'])

In [None]:
gs_clf.best_score_

In [None]:
for param_name in sorted(parameters.keys()):
...     print("%s: %r" % (param_name, gs_clf.best_params_[param_name]))