In [None]:

#  Here we demonstrate the following Dask DataFrame techniques:
#
#     .  How to create a Dask DataFrame from an array.
#     .  Introduction to  sklearn.feature_extraction.text.CountVectorizer


In [None]:

#  Number of partitions requested when the Pandas DataFrame is converted
#  into a Dask DataFrame  (passed as  npartitions  to from_pandas).
NUM_PARTITIONS = 3

print("--")


In [None]:

import numpy as np
import pandas as pd
   #
import dask.array as da
import dask.dataframe as dd
   #
from dask.dataframe import from_pandas

from tabulate import tabulate

print("--")


In [45]:

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

print("--")


--


In [None]:

#  Build our source Dask DataFrame in three steps:  numpy array -> Pandas -> Dask.
#     (We use this later, briefly, for comparisons.)

np_airports = np.array(
   [
      ["MKE", "Milwaukee"     , "WI", "Airport"],
      ["ORD", "Chicago O-Hare", "IL", "Airport"],
      ["SJC", "San Jose"      , "CA", "Airport"],
      ["LAX", "Los Angeles"   , "CA", "Airport"],
      ["DEN", "Denver"        , "CO", "Airport"],
   ],
   dtype = "str",
   )

#  Name the four columns, then split the Pandas DataFrame into Dask partitions.
pd_airports = pd.DataFrame(
   np_airports,
   columns = ["airport_code", "airport_name", "state_code", "LABEL"],
   )
dd_airports = from_pandas(pd_airports, npartitions = NUM_PARTITIONS)


#  Each row comes back as a tuple whose fields are reachable by column name.
for l_airport in dd_airports.itertuples():
   l_fields = (l_airport.airport_code, l_airport.airport_name, l_airport.state_code, l_airport.LABEL)
   print("Airport:  %3s   %-18s   %-2s   %-10s" % l_fields)

print("--")

#  Sample output,
#
#  Airport:  MKE   Milwaukee            WI   Airport
#  Airport:  ORD   Chicago O-Hare       IL   Airport
#  Airport:  SJC   San Jose             CA   Airport
#  Airport:  LAX   Los Angeles          CA   Airport
#  Airport:  DEN   Denver               CO   Airport


#  Introduction to CountVectorizer

In [None]:

#  Working with  sklearn.feature_extraction.text.CountVectorizer
#
#     See,
#        https://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html#tokenizing-text-with-scikit-learn


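#  .  The output columns follow the learned vocabulary, which is sorted alphabetically.
#        ..  So below, baseball occupies the output's zero'th column,
#            where cricket will occupy the 1'st.
#  .  Words will be automatically split on whitespace and punctuation, and (by default)
#     tokens shorter than 2 characters are dropped.
#        ..  So I plan to do my own splitting beforehand, lest I not know what the output array represents.
#               O-Hare          was split on the hyphen; "O" was then dropped (too short), leaving only "hare"
#               football-helmet was split
#        ..  my_cv.get_feature_names_out()  (scikit-learn 1.0+) lists the learned words in column order.
#  .  Duplicate input rows each receive their own row in the output  (see the two cricket rows below).
#
#  .  A 2d sparse matrix of int64 counts is output; when printed, each line reads  (row, col)    count
#        ..  row   is the position of the input record in the input array
#        ..  col   is the position of the word in the sorted vocabulary
#        ..  count is how many times that word occurs in that record
#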

#  The seven input records, one word each; note that cricket is entered twice.
my_input_arr = [
   "baseball", "football", "cricket", "golf", "racing", "fencing", "cricket",
   ]

#  Learn the vocabulary and count the words of every input record, in one call.
my_cv        = CountVectorizer()
my_cv_counts = my_cv.fit_transform(my_input_arr)
   #
#  Alternate inputs; swap one in to reproduce the airport sample output below.
#
#  my_cv_counts = my_cv.fit_transform(dd_airports.airport_code)
#  my_cv_counts = my_cv.fit_transform(dd_airports.airport_name)


print(type(my_cv_counts))
print(my_cv_counts.shape)
print(my_cv_counts)

#  Sample output of the three prints above,
#
#     <class 'scipy.sparse._csr.csr_matrix'>
#
#     For my_input_arr above
#     -----------------------------------
#     (7, 6)
#     (0, 0)    1
#     (1, 3)    1
#     (2, 1)    1
#     (3, 4)    1
#     (4, 5)    1
#     (5, 2)    1
#     (6, 1)    1
#
#     For dd_airports.airport_code above
#     -----------------------------------
#     (5, 5)
#     (0, 2)    1
#     (1, 3)    1
#     (2, 4)    1
#     (3, 1)    1
#     (4, 0)    1
#
#     For dd_airports.airport_name above
#     -----------------------------------
#     (5, 8)
#     (0, 6)    1
#     (1, 1)    1
#     (1, 3)    1
#     (2, 7)    1
#     (2, 4)    1
#     (3, 5)    1
#     (3, 0)    1
#     (4, 2)    1

print("")

#  Expand the sparse counts into a dense Pandas DataFrame, then print every
#  row as the  (row index, Series of counts)  pair that iterrows() produces.
df_words = pd.DataFrame(my_cv_counts.toarray())
   #
for l_row_number, l_row_counts in df_words.iterrows():
   print((l_row_number, l_row_counts))


#  For my_input_arr above  (printed as it is output)
# 
#  (0, 0    1 1    0 2    0 3    0 4    0 5    0 Name: 0, dtype: int64)
#  (1, 0    0 1    0 2    0 3    1 4    0 5    0 Name: 1, dtype: int64)
#  (2, 0    0 1    1 2    0 3    0 4    0 5    0 Name: 2, dtype: int64)
#  (3, 0    0 1    0 2    0 3    0 4    1 5    0 Name: 3, dtype: int64)
#  (4, 0    0 1    0 2    0 3    0 4    0 5    1 Name: 4, dtype: int64)
#  (5, 0    0 1    0 2    1 3    0 4    0 5    0 Name: 5, dtype: int64)
#  (6, 0    0 1    1 2    0 3    0 4    0 5    0 Name: 6, dtype: int64)

#  Above better formatted as,
#
#  (0,    0 1    1 0    2 0    3 0   4 0   5 0   Name: 0, dtype: int64)
#  (1,    0 0    1 0    2 0    3 1   4 0   5 0   Name: 1, dtype: int64)
#  (2,    0 0    1 1    2 0    3 0   4 0   5 0   Name: 2, dtype: int64)
#  (3,    0 0    1 0    2 0    3 0   4 1   5 0   Name: 3, dtype: int64)
#  (4,    0 0    1 0    2 0    3 0   4 0   5 1   Name: 4, dtype: int64)
#  (5,    0 0    1 0    2 1    3 0   4 0   5 0   Name: 5, dtype: int64)
#  (6,    0 0    1 1    2 0    3 0   4 0   5 0   Name: 6, dtype: int64)
#
#   A     B C    B C    B C   ......
#
#  So,
#     A   == row number, offset into the array, 0-6 (7) total rows   from my_input_arr
#     B   == col number, offset inside the row, 0-5 (6) unique words from my_input_arr
#     C   ==  1|0  is this keyword  0-5 (6)  found in this row  0-6 (7)
#
#  So, if you had  1000  input records times  20  unique words, the array would be  1000x20
#
#
#  If the value of row-2 was (football, cricket, golf), its entry would appear as,
# 
#  (1,    0 0    1 1    2 0    3 1   4 1   5 0 Name: 1, dtype: int64)
#
#     Recall that the unique keywords sort as; (0)baseball (1)cricket (2)fencing (3)football (4)golf (5)racing


print("--")



#  A more complete example

In [40]:

#  More complete [ training ] data, as required ..
#
#     .  data[]       -- Our records, emails, nodes in a graph, .. in this case, a simple word
#     .  category[]   -- Our unique list of node labels, for example
#     .  target[]     -- Parallel to data[] above; for each record, the index into category[] of its label.

my_train = {}

my_train["data"] = [
   "baseball" ,
   "football" ,
   "cricket"  ,
   "golf"     ,
   "racing"   ,
   "fencing"  ,
      #
   "eggs"     ,
   "bread"    ,
   "cheese"   ,
   "wine"     ,
   ]
   #
my_train["category"] = [
   "sport",
   "food" ,
   ]


my_train["data"].sort()                #  We could have entered the data pre-sorted
my_train["category"].sort()            #  Here, just reminding us that we should sort.


#  target[] is hand-entered against the *sorted* lists above;  0 == food, 1 == sport
#
my_train["target"] = [1, 0, 0, 1, 0, 1, 1, 1, 1, 0]

#  zip() below silently stops at the shorter list, and a negative code would silently
#  index category[] from the end, so fail loudly if the hand-entered lists drift apart.
#
assert len(my_train["target"]) == len(my_train["data"]), \
   "target[] must hold exactly one entry per data[] record"
assert all(0 <= l_code < len(my_train["category"]) for l_code in my_train["target"]), \
   "every target[] entry must be a valid index into category[]"

   ###

for l_word, l_code in zip(my_train["data"], my_train["target"]):
   print("Data: %-18s   Category: %s" % (l_word, my_train["category"][l_code]) )

#  Sample output,
#
#     Data: baseball             Category: sport
#     Data: bread                Category: food
#     Data: cheese               Category: food
#     Data: cricket              Category: sport
#     Data: eggs                 Category: food
#     Data: fencing              Category: sport
#     Data: football             Category: sport
#     Data: golf                 Category: sport
#     Data: racing               Category: sport
#     Data: wine                 Category: food


Data: baseball             Category: sport
Data: bread                Category: food
Data: cheese               Category: food
Data: cricket              Category: sport
Data: eggs                 Category: food
Data: fencing              Category: sport
Data: football             Category: sport
Data: golf                 Category: sport
Data: racing               Category: sport
Data: wine                 Category: food


In [43]:

#  Same as we did before/above;  learn the vocabulary and count the words per record.
#
my_cv        = CountVectorizer()
my_cv_counts = my_cv.fit_transform(my_train["data"])


#  New: Now normalize fact that longer (documents/Nodes/other) have more words and would get unfair weights
#       term frequency / inverse document frequency
#       (Tf-idf)
#
#       Effectively, reduce the weight of words that occur in more (documents/Nodes/other),
#          in favor of words that occur in fewer (documents/Nodes/other)
#
#  Note: the class is named  TfidfTransformer  (as imported above), and we need an
#        instance of it, just like  my_cv  above.
#        Next step (not yet run here) would be  my_tf_transformer.fit_transform(my_cv_counts)

my_tf_transformer = TfidfTransformer()

