# Text Classification Model

Importing spacy and sklearn based packages

In [4]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline

import spacy 
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English 
import string 

Loading the clustered dataframe from the unsupervised learning model.

In [5]:
df = pd.read_csv('data_clusters', index_col = 0)
df

Unnamed: 0,Text,Type,General Category,Word Count,Lexile,Named Entities,Polarity,subjectivity,Cluster
An Unlikely Parasite: The Mistletoe,"During the holidays, many people hang mistleto...",Nonfiction,Life Science,303.0,790.0,4,0.146667,0.354583,21
Thanksgiving: Fact or Fiction,This article is provided courtesy of History.c...,Nonfiction,U. S. History,1217.0,1460.0,179,0.119618,0.433465,2
Native American Conflicts,"Jamestown logo for World's Fair in 1907,Prior ...",Nonfiction,Geography& Societies,554.0,1340.0,65,0.105740,0.360446,11
A Monument for Peace,"In December, 1864, the Civil War was nearly ov...",Nonfiction,Arts& Culture,517.0,910.0,58,0.102357,0.381615,3
Pictures of the Year,People today take lots of photos. Many people ...,Nonfiction,Arts& Culture,206.0,730.0,14,0.230267,0.538781,4
...,...,...,...,...,...,...,...,...,...
Martin Luther King Jr.,Martin Luther King Jr. was a leader. When he w...,Nonfiction,U. S. History,87.0,470.0,8,0.294378,0.644082,0
Native American Powwows,Some Native Americans hold powwows today. Thes...,Nonfiction,Arts& Culture,96.0,660.0,19,0.071851,0.364643,4
What is a Talking Stick?,"Photo Credit: Library of Congress, ,For hundre...",Nonfiction,Arts& Culture,108.0,550.0,7,0.312500,0.500000,0
People Need the Ocean,People today could not live without the ocean....,Nonfiction,Geography& Societies,119.0,460.0,2,0.103409,0.484375,0


Cleaning the unprocessed text to avoid input bias. This classification model is primarily for future usage, to classify potential articles into one of the clusters based on text alone.

In [6]:
punctuations = string.punctuation
nlp = spacy.load('en_core_web_sm')
stop_words = spacy.lang.en.stop_words.STOP_WORDS
parser = English()

def spacy_tokenizer(sentence):
    
    textTokens = parser(sentence)
    textTokens = [word.lemma_.lower().strip() if word.lemma_ != "-PRON-" else word.lower_ for word in textTokens]
    textTokens = [word for word in textTokens if word not in stop_words and word not in punctuations]

    return textTokens

Creating a cleaning class and tokenizing the text. 

In [8]:
class predictors(TransformerMixin):
    def transform(self, X, **transform_params):
        # Cleaning Text
        return [clean_text(text) for text in X]

    def fit(self, X, y=None, **fit_params):
        return self

    def get_params(self, deep=True):
        return {}

# Basic function to clean the text
def clean_text(text):
    # Removing spaces and converting text into lowercase
    return text.strip().lower() 

bow_vector = CountVectorizer(tokenizer = spacy_tokenizer, ngram_range=(1,1))
tfidf_vector = TfidfVectorizer(tokenizer = spacy_tokenizer)

Using train_test_split() to split our data into training and model validation data. Using the clusters as the ylabel.

In [12]:
from sklearn.model_selection import train_test_split
X = df['Text'] 
ylabels = df['Cluster'].astype(str)
X_train, X_test, y_train, y_test = train_test_split(X, ylabels, test_size=0.3)

Creating a pipeline with the cleaner, vectorizer, and classifier. Using a logicistic regression for cluster classification.  

In [14]:
# Logistic Regression Classifier
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(max_iter=5000)

# Create pipeline using Bag of Words
pipe = Pipeline([("cleaner", predictors()),
                 ('vectorizer', bow_vector),
                 ('classifier', classifier)])

# model generation
pipe.fit(X_train,y_train)

Pipeline(memory=None,
         steps=[('cleaner', <__main__.predictors object at 0x000001F972DA5608>),
                ('vectorizer',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=<function spacy_tokenizer at 0x000001F966D07DC8>,
                                 vocabulary=None)),
                ('classifier',
                 LogisticRegression(C=1.0, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_

In [17]:
from sklearn import metrics
# Predicting with a test dataset
predicted = pipe.predict(X_test)

# Model Accuracy
print("Logistic Regression Accuracy:",metrics.accuracy_score(y_test, predicted))
print("Logistic Regression Precision:",metrics.precision_score(y_test, predicted, average = 'micro'))
print("Logistic Regression Recall:",metrics.recall_score(y_test, predicted, average = 'micro'))

Logistic Regression Accuracy: 0.26535626535626533
Logistic Regression Precision: 0.26535626535626533
Logistic Regression Recall: 0.26535626535626533
