<img src="https://bdaaosu.org/img/Logo.png" width="60%"/>

# Text Classification Workshop

In [None]:
# Helper function
def most_associated_words(tfidf, features, labels):
    from sklearn.feature_selection import chi2
    import numpy as np
    N = 5
    for cond in list(set(labels)):
      features_chi2 = chi2(features, labels == cond)
      indices = np.argsort(features_chi2[0])
      feature_names = np.array(tfidf.get_feature_names())[indices]
      unigrams = [v for v in feature_names if len(v.split(' ')) == 1]
      bigrams = [v for v in feature_names if len(v.split(' ')) == 2]
      #trigrams = [v for v in feature_names if len(v.split(' ')) == 3]
      print("# '{}':".format(cond))
      print("  . Most correlated unigrams:\n. {}".format('\n. '.join(unigrams[-N:])))
      print("  . Most correlated bigrams:\n. {}".format('\n. '.join(bigrams[-N:])))
      #print("  . Most correlated trigrams:\n. {}".format('\n. '.join(trigrams[-N:])))
      print('\n')

## How important is a word?

In [None]:
# Two example "documents"
example = [
    'Perfectly cooked and seasoned',
    'Illustrious and a symbol of strength'
]

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer # Import the "TfidfVectorizer" from scikit-learn
import pandas as pd # Allows us to view data as a nicely formatted table!

# Make a TF-IDF vectorizer object
tfidf = TfidfVectorizer( 
    lowercase=True, # Make every word in our documents lowercase
    stop_words='english' # Remove common words like "a", "is", "they", "with", etc.
)

# Transform the "documents" we have to a matrix of TF-IDF values
features = tfidf.fit_transform(example).toarray()

print(
    # Follow along!
)

### Let's put on our math hats

Why does the word "illustrious" have a TF-IDF value of 0.57735?

In [81]:
import numpy as np

num_documents = 2
num_words = 3
term_frequency = 1/num_words
inverse_document_frequency = np.log(num_documents/1)+1

tf_idf = (1/3)*(np.log(2)+1)
print(tf_idf)

0.5643823935199818


### Okay, _cool math dude_, so now we have a huge matrix. 
### So, how do we use it to make classifications?

# What the f@!k is Multinomial Naive Bayes?

## Enter Bayes Rule
<img src="https://miro.medium.com/max/512/0*EfYTXtTJ9X-Ua9Nh.png" />

## From our first (small) example
<img src="https://i.imgur.com/WnKCeD3.png" width="80%"/>

## Food or not food?
<img src="https://i.imgur.com/a83Evsd.png" width="55%"/>

## We're going to need some training data...

## Why not classify some listings together?

<img src="https://i.imgur.com/qXcsZPi.png" width="50%"/>

<center><h2>go.osu.edu/bdaa_ctc</h2></center>

## Let's do some classification

In [87]:
# Get the classifications we came up with!
descriptions = pd.read_csv('https://bdaa-text-workshop.s3.amazonaws.com/iPhone+Listing+Descriptions.csv')

In [None]:
# What's the distribution of the condition classes we came up with?
# Follow along!

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(
    min_df=.05, 
    lowercase=True,
    ngram_range=(1, 2), # Consider both one word and two word combinations
    stop_words='english'
)

features = tfidf.fit_transform(descriptions.ItemDescription).toarray()
labels = descriptions.Condition

In [None]:
# Get most associated words with each condition category 
# Follow along!

## Train, train, train

In [98]:
from sklearn.model_selection import train_test_split # Split a dataset into training and test datasets
from sklearn.feature_extraction.text import CountVectorizer # Make a matrix of word counts
from sklearn.feature_extraction.text import TfidfTransformer # Transform a matrix of word counts into TF-IDF values
from sklearn.naive_bayes import MultinomialNB # Make a Multinomial Naive Bayes model

# Make training and testing datasets
X_train, X_test, y_train, y_test = train_test_split(
    descriptions['ItemDescription'], 
    descriptions['Condition'], 
    random_state = 0
)

# Transform our training data into word counts
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X_train)

# ..and then TF-IDF values
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

# Train the Multinomial Naive Bayes model!
clf = MultinomialNB().fit(X_train_tfidf, y_train)

## Classify the real data!

In [None]:
# We need some value data!
iphone_listings = pd.read_csv('https://bdaa-text-workshop.s3.amazonaws.com/eBay+iPhone+Listings.csv', error_bad_lines=False)

In [None]:
# What does the data look like?
# Follow along!

In [None]:
print(
    # Use the model we trained to predict labels!
    clf.predict(
        # Transform the listing descriptions into a matrix of counts
        count_vect.transform(
            iphone_listings.head(100).ItemDescription.tolist()
        )
    )
)

In [102]:
# Make a new column in the real data with the predicted Condition
iphone_listings['Condition'] = clf.predict(
        # Transform the listing descriptions into a matrix of counts
        count_vect.transform(
            iphone_listings.ItemDescription.tolist()
        )
    )

In [None]:
# How many of each class did we predict?
# Follow along!

# Who'se got a used iPhone?