# Tutorial - Text Mining - Classification - SCIKIT-LEARN - PIPELINE

We will predict the category of discussion posts in a newsgroup.

**The unit of analysis is a discussion post**

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the newsgroup discussion posts from news.csv into a DataFrame
news = pd.read_csv('news.csv')

In [None]:
# Preview the first five rows
news.head(5)

## Assign the "target" variable

This is a multi-class classification problem. There are three categories we will predict:<br>
Whether a post is "graphics," "hockey," or "medical" related

In [None]:
# The label to predict is the 'newsgroup' column
target = news['newsgroup']

## Assign the "text" (input) variable

In [None]:
# Count the missing values in the text column before using it as model input

news[['TEXT']].isnull().sum()

In [None]:
# Double brackets keep the input as a one-column DataFrame rather than a Series
input_data = news[['TEXT']]

## Split the data

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the posts for testing; the fixed seed makes the split repeatable
train_set, test_set, train_y, test_y = train_test_split(
    input_data,
    target,
    test_size=0.3,
    random_state=42,
)

In [None]:
# Shapes of the training inputs and the training labels
train_set.shape, train_y.shape

In [None]:
# Shapes of the test inputs and the test labels
test_set.shape, test_y.shape

# Data Prep

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

from sklearn.preprocessing import FunctionTransformer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

## Sklearn: Text preparation

### Step 1:
We need to create the document-term matrix (one row per document, one column per term). We'll use sklearn's TfidfVectorizer, which creates this matrix using TF-IDF weights. <br>
TfidfVectorizer: https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html

If you don't use the TfidfVectorizer, you have to do all the text prep on your own:<br>
1- Convert to lowercase<br>
2- Remove numbers (if needed)<br>
3- Remove punctuation<br>
4- Remove whitespace<br>
5- Tokenize<br>
6- Calculate TFIDF<br>
etc.

### Step 2:
We need to reduce the dimensionality (i.e., the number of columns) by applying singular value decomposition (SVD). We'll use sklearn's TruncatedSVD to do this.

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

## Caveat for creating a pipeline for text columns

**TfidfVectorizer requires the text data to be a one-dimensional list/array. That's why we need a function to convert the dataframe column to a one-dimensional array.**

In [None]:
def new_col(df):
    """Flatten a one-column DataFrame into a one-dimensional NumPy array.

    TfidfVectorizer needs a flat sequence of documents rather than a
    two-dimensional frame, so this helper is used as the first pipeline step.

    Parameters
    ----------
    df : array-like
        The data to flatten, e.g. a DataFrame holding only the text column.

    Returns
    -------
    numpy.ndarray
        A new one-dimensional array containing the values of ``df``.
    """
    # np.array() already builds a new array, so no separate df.copy() is needed
    # to protect the caller's data; ravel() then makes the result one-dimensional.
    return np.array(df).ravel()

In [None]:
# Quick check: the helper should return the training posts as a flat array
new_col(train_set)

##  Identify the text column

In [None]:
# Column(s) that will be routed through the text pipeline
text_column = ['TEXT']

# Pipeline

In [None]:
# Number of components (output columns) kept by the truncated SVD step
number_svd_components = 300

In [None]:
# Text-preparation pipeline applied to the text column:
#   1. flatten the one-column DataFrame into a 1-D array (required by TfidfVectorizer)
#   2. build the TF-IDF document-term matrix, dropping English stop words
#   3. reduce the matrix to `number_svd_components` columns with truncated SVD
text_transformer = Pipeline(steps=[
                ('my_new_column', FunctionTransformer(new_col)),
                ('text', TfidfVectorizer(stop_words='english')),
                # random_state pins the randomized SVD solver so reruns produce the same
                # components (same seed as the one used for the train/test split)
                ('svd', TruncatedSVD(n_components=number_svd_components, n_iter=10, random_state=42))
            ])

In [None]:
# Send the text column through the text pipeline. Columns that are not listed
# are discarded because remainder='drop'; using remainder='passthrough' instead
# would keep them unchanged.
preprocessor = ColumnTransformer(
    transformers=[('text', text_transformer, text_column)],
    remainder='drop',
)

# Transform: fit_transform() for TRAIN

In [None]:
# Fit the preprocessor on the training data and transform that data in the same call
train_x = preprocessor.fit_transform(train_set)

train_x

In [None]:
# Shape of the preprocessed training matrix
train_x.shape

# Transform: transform() for TEST

In [None]:
# Transform the test data with the already-fitted preprocessor (transform only, no refit)
test_x = preprocessor.transform(test_set)

test_x

In [None]:
# Shape of the preprocessed test matrix
test_x.shape

# Baseline

In [None]:
from sklearn.dummy import DummyClassifier

# Baseline model that uses the "most_frequent" strategy
dummy_clf = DummyClassifier(strategy="most_frequent")

# Fit the baseline on the preprocessed training data
dummy_clf.fit(train_x, train_y)

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
# Accuracy of the baseline classifier on the training data
dummy_train_pred = dummy_clf.predict(train_x)
baseline_train_acc = accuracy_score(y_true=train_y, y_pred=dummy_train_pred)

print(f'Baseline Train Accuracy: {baseline_train_acc}')

In [None]:
# Accuracy of the baseline classifier on the test data
dummy_test_pred = dummy_clf.predict(test_x)
baseline_test_acc = accuracy_score(y_true=test_y, y_pred=dummy_test_pred)

print(f'Baseline Test Accuracy: {baseline_test_acc}')

## Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier 

from sklearn.metrics import accuracy_score

In [None]:
# Random forest with 100 trees, each limited to 16 leaf nodes, using all CPU cores.
# random_state fixes the randomness of the forest so the reported accuracies are
# reproducible (same seed as the one used for the train/test split).
rnd_clf = RandomForestClassifier(n_estimators=100, max_leaf_nodes=16, n_jobs=-1, random_state=42)

rnd_clf.fit(train_x, train_y)



## Accuracy

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
# Random forest: accuracy on the training data
train_y_pred = rnd_clf.predict(train_x)
train_acc = accuracy_score(y_true=train_y, y_pred=train_y_pred)

print(f'Train acc: {train_acc}')

In [None]:
# Random forest: accuracy on the test data
test_y_pred = rnd_clf.predict(test_x)
test_acc = accuracy_score(y_true=test_y, y_pred=test_y_pred)

print(f'Test acc: {test_acc}')

# Confusion Matrix

In [None]:
from sklearn.metrics import confusion_matrix

# Confusion matrix for the test set: true labels versus the latest test predictions
confusion_matrix(y_true=test_y, y_pred=test_y_pred)

## Stochastic Gradient Descent Classifier

In [None]:
from sklearn.linear_model import SGDClassifier

# Linear classifier trained with stochastic gradient descent, at most 100 epochs.
# random_state fixes the shuffling of the training data so the fitted model and
# its accuracy are the same on every run.
sgd_clf = SGDClassifier(max_iter=100, random_state=42)


In [None]:
# Train the SGD classifier on the preprocessed training data
sgd_clf.fit(train_x, train_y)

## Accuracy

In [None]:
# SGD classifier: accuracy on the training data
train_y_pred = sgd_clf.predict(train_x)
train_acc = accuracy_score(y_true=train_y, y_pred=train_y_pred)

print(f'Train acc: {train_acc}')

In [None]:
# SGD classifier: accuracy on the test data
test_y_pred = sgd_clf.predict(test_x)
test_acc = accuracy_score(y_true=test_y, y_pred=test_y_pred)

print(f'Test acc: {test_acc}')

# Confusion Matrix

In [None]:
from sklearn.metrics import confusion_matrix

# Confusion matrix for the test set: true labels versus the latest test predictions
confusion_matrix(y_true=test_y, y_pred=test_y_pred)

# Explore the SVDs - OPTIONAL

In [None]:
# Let's retrieve the fitted Truncated SVD from the column transformer by name.
# "preprocessor.named_transformers_" maps each transformer name to its fitted object,
# so ['text'] is the fitted text pipeline and .named_steps['svd'] is its SVD step.
# Looking the steps up by name keeps working if transformers or steps are reordered,
# unlike positional indexing such as transformers_[0][1][2].

svd = preprocessor.named_transformers_['text'].named_steps['svd']

svd

In [None]:
# Now, retrieve the share of the total variance explained by each component and sum them.
# explained_variance_ratio_ holds fractions, so the sum reads directly as the
# proportion of variance kept by the retained components; explained_variance_
# holds raw variances, whose sum has no such interpretation.

svd.explained_variance_ratio_.sum()

In [None]:
# These are all the components:
svd.components_

In [None]:
# Shape of the components matrix
svd.components_.shape

In [None]:
# Take the first row of the components matrix, i.e. the first component

first_component = svd.components_[0]

first_component

In [None]:
# Sort the weights in the first component and keep the resulting indices as a list

indeces = first_component.argsort().tolist()

In [None]:
# Be careful: np.argsort sorts in ascending order, so the indices of the
# smallest weights come first and those of the largest weights come last

print(indeces)

In [None]:
# Let's get the feature names from the TF-IDF vectorizer:
# First, we need to retrieve the fitted TfidfVectorizer from the column transformer.
# It is the 'text' step inside the 'text' pipeline, so we look both up by name
# instead of relying on their positions.

tfidf = preprocessor.named_transformers_['text'].named_steps['text']

tfidf

In [None]:
# Now, get the feature names from the fitted vectorizer

feat_names = tfidf.get_feature_names_out()

In [None]:
# Print the last 10 terms (i.e., the 10 terms that have the highest weights)

for index in indeces[-10:]:
    print(f'term: {feat_names[index]}\t weight = {first_component[index]}')