# Import libraries

In [None]:
import joblib
import pandas as pd
import numpy as np

import re
import string
import nltk
nltk.download('punkt')
nltk.download('stopwords')

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# Mount Google Drive at /content/drive (Colab-only helper); the model files
# loaded further down are read from paths under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Functions and classes

In [None]:
# English stop-word list from the NLTK corpus; read by remove_stopwords().
stopwords = nltk.corpus.stopwords.words('english')
# Single Porter stemmer instance shared by every call to stemming().
stemmer = nltk.PorterStemmer()

def remove_urls(data):
    """Replace every URL in ``data`` with a single space.

    A URL is taken to be a run of non-whitespace characters that starts with
    ``www.``, ``http://`` or ``https://``.

    Args:
        data: Text to clean.

    Returns:
        The text with each URL replaced by one space character.
    """
    # ``\S`` (any non-whitespace character) replaces the former ``[^s]``,
    # which excluded the letter "s": URLs were cut at their first "s" and the
    # match could run across spaces into the following words. The dot after
    # "www" is escaped so it only matches a literal dot.
    # NOTE: a model fitted on text cleaned with the old pattern may see
    # different tokens for inputs that contain URLs.
    return re.sub(r'((www\.\S+)|(https?://\S+))', ' ', data)

class RemoveUrlTrasformer(BaseEstimator, TransformerMixin):
    """Pipeline step that runs ``remove_urls`` over one column of ``X``.

    Args:
        text_column: Label of the column that holds the text to clean.
    """

    def __init__(self, text_column):
        self.text_column = text_column

    def fit(self, X, y=None):
        """Nothing is learned from the data; return ``self`` unchanged."""
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with ``remove_urls`` applied to the text column."""
        column = self.text_column
        # Work on a copy so the caller's object is left untouched.
        cleaned = X.copy()
        cleaned[column] = cleaned[column].apply(remove_urls)
        return cleaned


def remove_numbers(data):
    """Delete every run of ASCII digits (0-9) from ``data`` and return the result."""
    digit_run = '[0-9]+'
    without_digits = re.sub(digit_run, '', data)
    return without_digits

class RemoveNumbersTransformer(BaseEstimator, TransformerMixin):
    """Pipeline step that runs ``remove_numbers`` over one column of ``X``.

    Args:
        text_column: Label of the column that holds the text to clean.
    """

    def __init__(self, text_column):
        self.text_column = text_column

    def fit(self, X, y=None):
        """Nothing is learned from the data; return ``self`` unchanged."""
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with ``remove_numbers`` applied to the text column."""
        column = self.text_column
        # Work on a copy so the caller's object is left untouched.
        cleaned = X.copy()
        cleaned[column] = cleaned[column].apply(remove_numbers)
        return cleaned


def remove_usernames(data):
    """Remove ``@mention``-style user names from ``data``.

    A user name is an ``@`` followed by one or more word characters
    (letters, digits or underscore).

    Args:
        data: Text to clean.

    Returns:
        The text with every ``@name`` occurrence deleted.
    """
    # Raw string: in a normal string literal ``\w`` is an invalid escape
    # sequence, which newer Python versions warn about.
    return re.sub(r'@\w+', '', data)

class RemoveUserNamesTransformer(BaseEstimator, TransformerMixin):
    """Pipeline step that runs ``remove_usernames`` over one column of ``X``.

    Args:
        text_column: Label of the column that holds the text to clean.
    """

    def __init__(self, text_column):
        self.text_column = text_column

    def fit(self, X, y=None):
        """Nothing is learned from the data; return ``self`` unchanged."""
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with ``remove_usernames`` applied to the text column."""
        column = self.text_column
        # Work on a copy so the caller's object is left untouched.
        cleaned = X.copy()
        cleaned[column] = cleaned[column].apply(remove_usernames)
        return cleaned


def remove_punctuation(text):
    """Return ``text`` without any character listed in ``string.punctuation``."""
    punctuation_marks = string.punctuation
    # Iterating a string yields single characters, so this filters char by char.
    kept_chars = (char for char in text if char not in punctuation_marks)
    return ''.join(kept_chars)

class RemovePunctuationTransformer(BaseEstimator, TransformerMixin):
    """Pipeline step that runs ``remove_punctuation`` over one column of ``X``.

    Args:
        text_column: Label of the column that holds the text to clean.
    """

    def __init__(self, text_column):
        self.text_column = text_column

    def fit(self, X, y=None):
        """Nothing is learned from the data; return ``self`` unchanged."""
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with ``remove_punctuation`` applied to the text column."""
        column = self.text_column
        # Work on a copy so the caller's object is left untouched.
        cleaned = X.copy()
        cleaned[column] = cleaned[column].apply(remove_punctuation)
        return cleaned


def tokenizer(text):
    """Lower-case ``text`` and split it into tokens.

    Returns whatever ``nltk.word_tokenize`` produces for the lower-cased text.
    """
    return nltk.word_tokenize(text.lower())
    
class TokenizationTransformer(BaseEstimator, TransformerMixin):
    """Pipeline step that runs ``tokenizer`` over one column of ``X``.

    Args:
        text_column: Label of the column that holds the text to tokenize.
    """

    def __init__(self, text_column):
        self.text_column = text_column

    def fit(self, X, y=None):
        """Nothing is learned from the data; return ``self`` unchanged."""
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with ``tokenizer`` applied to the text column."""
        column = self.text_column
        # Work on a copy so the caller's object is left untouched.
        tokenized = X.copy()
        tokenized[column] = tokenized[column].apply(tokenizer)
        return tokenized


def remove_stopwords(text):
    """Return the items of ``text`` that are not in the module-level ``stopwords``.

    ``text`` is iterated item by item; the order of the kept items is preserved.
    """
    return [token for token in text if token not in stopwords]

class RemoveStopWordsTransformer(BaseEstimator, TransformerMixin):
    """Pipeline step that runs ``remove_stopwords`` over one column of ``X``.

    Args:
        text_column: Label of the column that holds the tokens to filter.
    """

    def __init__(self, text_column):
        self.text_column = text_column

    def fit(self, X, y=None):
        """Nothing is learned from the data; return ``self`` unchanged."""
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with ``remove_stopwords`` applied to the text column."""
        column = self.text_column
        # Work on a copy so the caller's object is left untouched.
        filtered = X.copy()
        filtered[column] = filtered[column].apply(remove_stopwords)
        return filtered


def remove_short_tokens(text):
    """Keep only the items of ``text`` that are at least three characters long."""
    return [token for token in text if len(token) >= 3]

class RemoveShortTokensTransformer(BaseEstimator, TransformerMixin):
    """Pipeline step that runs ``remove_short_tokens`` over one column of ``X``.

    Args:
        text_column: Label of the column that holds the tokens to filter.
    """

    def __init__(self, text_column):
        self.text_column = text_column

    def fit(self, X, y=None):
        """Nothing is learned from the data; return ``self`` unchanged."""
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with ``remove_short_tokens`` applied to the text column."""
        column = self.text_column
        # Work on a copy so the caller's object is left untouched.
        filtered = X.copy()
        filtered[column] = filtered[column].apply(remove_short_tokens)
        return filtered


def stemming(text):
    """Return a list with ``stemmer.stem`` applied to each item of ``text``.

    Uses the module-level ``stemmer`` instance; item order is preserved.
    """
    return list(map(stemmer.stem, text))

class StemmingTransformer(BaseEstimator, TransformerMixin):
    """Pipeline step that runs ``stemming`` over one column of ``X``.

    Args:
        text_column: Label of the column that holds the tokens to stem.
    """

    def __init__(self, text_column):
        self.text_column = text_column

    def fit(self, X, y=None):
        """Nothing is learned from the data; return ``self`` unchanged."""
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with ``stemming`` applied to the text column."""
        column = self.text_column
        # Work on a copy so the caller's object is left untouched.
        stemmed = X.copy()
        stemmed[column] = stemmed[column].apply(stemming)
        return stemmed


def return_string(text):
    """Join the strings in ``text`` into one string, separated by single spaces."""
    return ' '.join(text)

class Return_String_Transformer(BaseEstimator, TransformerMixin):
    """Pipeline step that runs ``return_string`` over one column of ``X``.

    Args:
        text_column: Label of the column that holds the tokens to join.
    """

    def __init__(self, text_column):
        self.text_column = text_column

    def fit(self, X, y=None):
        """Nothing is learned from the data; return ``self`` unchanged."""
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with ``return_string`` applied to the text column."""
        column = self.text_column
        # Work on a copy so the caller's object is left untouched.
        joined = X.copy()
        joined[column] = joined[column].apply(return_string)
        return joined

# Simple one-row DataFrame used to test the models

In [None]:
# One-row frame with a single 'text' column, used as input for the models below.
new_X = pd.DataFrame([["I am very happy"]], columns=['text'])
new_X

Unnamed: 0,text
0,I am very happy


#Model 1 to test

In [None]:
# Load the saved baseline classifier from the mounted Google Drive.
# NOTE(review): presumably this is a pickled pipeline that refers to the
# transformer classes defined above by name, so they must be defined before
# loading; confirm against the training notebook.
decision_tree = joblib.load('/content/drive/MyDrive/Colab_Notebooks/PROJEKT_GRUPA_3/models_joblib/baseline_classifier.pkl')

In [None]:
# Predict on the raw one-row test frame; the bare name on the last line makes
# the notebook display the result.
predictions = decision_tree.predict(new_X)
predictions

array([1])

#Model 2 to test

In [None]:
# Load the saved logistic-regression model from the mounted Google Drive.
Log_reg = joblib.load('/content/drive/MyDrive/Colab_Notebooks/PROJEKT_GRUPA_3/models_joblib/logistic_regression_5.pkl')

In [None]:
# NOTE(review): this vectorizer is fitted right here on the single test
# sentence, so its vocabulary comes from that sentence alone. The models
# loaded from disk were presumably trained on features from a vectorizer
# fitted on the training corpus; if so, that fitted vectorizer should be
# loaded and used with ``transform`` instead of ``fit_transform`` - TODO confirm.
vectorizer = TfidfVectorizer(
    lowercase=True,
    max_features=10000,
    ngram_range=(1, 3),
    stop_words='english',
)

# Pass the text column, i.e. an iterable of documents. Iterating a DataFrame
# yields its column labels, so passing the frame itself would vectorize the
# label "text" rather than the sentence stored in the column.
new_X_vec = vectorizer.fit_transform(new_X['text'])

In [None]:
# Predict with the logistic-regression model on the vectorized test sentence.
# NOTE(review): new_X_vec comes from a vectorizer fitted in this notebook, so
# its feature count presumably differs from what Log_reg was trained on;
# verify that this call succeeds.
predictions = Log_reg.predict(new_X_vec)
predictions

#Model 3 to test

In [None]:
# Load the saved SVC model from the mounted Google Drive.
SVCmodel = joblib.load('/content/drive/MyDrive/Colab_Notebooks/PROJEKT_GRUPA_3/models_joblib/svc_5.pkl')

In [None]:
# Predict with the SVC model on the same vectorized test sentence.
# NOTE(review): new_X_vec comes from a vectorizer fitted in this notebook, so
# its feature count presumably differs from what SVCmodel was trained on;
# verify that this call succeeds.
predictions = SVCmodel.predict(new_X_vec)
predictions