# Machine Learning - Pipeline Customization

## Import Libraries

In [3]:
import numpy as np
import pandas as pd

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import FeatureUnion, Pipeline

import nltk
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('vader_lexicon')

import re

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/gabriele/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/gabriele/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/gabriele/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


## Import Data

In [4]:
categories = [
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'alt.atheism',
 'soc.religion.christian',
]

dataset = fetch_20newsgroups(subset='train', categories=categories, shuffle=True, remove=('headers', 'footers', 'quotes'))
df = pd.DataFrame(dataset.data, columns=["corpus"]).sample(10)

## Text Processing

In [5]:
def preprocess_text(text: str, remove_stopwords: bool) -> str:
    text = re.sub(r"http\S+", "", text) # removes link
    text = re.sub("[^A-Za-z]+", " ", text) # removes numbers and symbols
    if remove_stopwords: # removes stopwords
        tokens = nltk.word_tokenize(text)
        tokens = [w for w in tokens if not w.lower() in stopwords.words("english")]
        text = " ".join(tokens)
    text = text.lower().strip() # removes spaces and apply lower case
    return text


def get_sentiment(text: str):
  vader = SentimentIntensityAnalyzer()
  return vader.polarity_scores(text)['compound']

def get_nchars(text: str): # returns the length of a string
  return len(text)

def get_nsentences(text: str): # returns number of words in a text
  return len(text.split("."))

## Transformers

In [6]:
class DummyTransformer(BaseEstimator, TransformerMixin):
  def __init__(self):
    return None

  def fit(self, X=None, y=None):
    return self

  def transform(self, X=None):
    return self

In [7]:
class Preprocessor(DummyTransformer):
  def __init__(self, remove_stopwords: bool):
    self.remove_stopwords = remove_stopwords
    return None
  def transform(self, X=None):
    preprocessed = X.apply(lambda x: preprocess_text(x, self.remove_stopwords)).values
    return preprocessed

class SentimentAnalysis(DummyTransformer):
  def transform(self, X=None):
    sentiment = X.apply(lambda x: get_sentiment(x)).values
    return sentiment.reshape(-1, 1) # <-- da notare il reshape per trasformare un vettore riga in uno colonna

class NChars(DummyTransformer):
  def transform(self, X=None):
    n_chars = X.apply(lambda x: get_nchars(x)).values
    return n_chars.reshape(-1, 1)

class NSententences(DummyTransformer):
  def transform(self, X=None):
    n_sentences = X.apply(lambda x: get_nsentences(x)).values
    return n_sentences.reshape(-1, 1)

class FromSparseToArray(DummyTransformer):
  def transform(self, X=None):
    arr = X.toarray()
    return arr

In [8]:
vectorization_pipeline = Pipeline(steps=[
    ('preprocess', Preprocessor(remove_stopwords=True)),
    ('tfidf_vectorization', TfidfVectorizer()),
    ('arr', FromSparseToArray()),
    ])
# preprocess the text -> tfidf vectorization -> transformation of the vectorization into an array (so that we can put it into a dataframe)

In [9]:
features = [
  ('vectorization', vectorization_pipeline), # vectorization of the text into a dataframe
  ('sentiment', SentimentAnalysis()), # creation of the sentiment analysis feature
  ('n_chars', NChars()), # creation of the nchar feature
  ('n_sentences', NSententences()) # creation of the nsentences feature
]
combined = FeatureUnion(features)
combined

In [10]:
combined.fit_transform(X=df['corpus'])

array([[1.18218802e-01, 1.18218802e-01, 0.00000000e+00, ...,
        9.49500000e-01, 6.10000000e+02, 5.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        7.09600000e-01, 9.34000000e+02, 1.00000000e+01],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        9.78600000e-01, 6.93000000e+02, 7.00000000e+00],
       ...,
       [0.00000000e+00, 0.00000000e+00, 1.15529689e-01, ...,
        9.72400000e-01, 4.06000000e+02, 5.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        3.15900000e-01, 8.77000000e+02, 2.30000000e+01],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        2.73200000e-01, 9.76000000e+02, 5.00000000e+00]])

In [11]:
cols = list(vectorization_pipeline.steps[1][1].get_feature_names_out())+ ["sentiment", "n_chars", "n_sentences"]
features_df = pd.DataFrame(combined.transform(df['corpus']), columns=cols)

In [12]:
# features_df.iloc[:, -6:] # truncated output
features_df.head()

Unnamed: 0,absurd,act,actually,adams,administrators,admittedly,adult,agreed,allows,almost,...,worthy,would,wrong,yet,york,young,youth,sentiment,n_chars,n_sentences
0,0.118219,0.118219,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.236438,...,0.0,0.07817,0.0,0.0,0.0,0.0,0.0,0.9495,610.0,5.0
1,0.0,0.0,0.0,0.0,0.0,0.103508,0.0,0.103508,0.103508,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.7096,934.0,10.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.098062,0.194524,0.0,0.0,0.0,0.0,0.0,0.9786,693.0,7.0
3,0.0,0.0,0.0,0.127124,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.084058,0.0,0.108067,0.0,0.0,0.0,0.0516,447.0,15.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.403194,0.0,0.0,0.0,68.0,1.0
