In [2]:
import os
import pandas as pd
import re
import nltk
import string
import numpy as np

DATA_PATH = "data/preprocess"
# nltk.download("stopwords")

In [4]:
os.chdir("./../")

In [5]:
df_train = pd.read_parquet(os.path.join(DATA_PATH, "train.parquet"))
df_valid = pd.read_parquet(os.path.join(DATA_PATH, "valid.parquet"))
df_test = pd.read_parquet(os.path.join(DATA_PATH, "test.parquet"))

print ("Train set:", df_train.shape)
print ("Valid set:", df_valid.shape)
print ("Test set:", df_test.shape)

df_train.tail()

Train set: (6090, 14)
Valid set: (1523, 14)
Test set: (3263, 13)


Unnamed: 0_level_0,keyword,location,text,target,profile_tags,hash_tags,link_tags,n_profile_tags,n_hash_tags,n_link_tags,exclamation_count,question_count,location_ner,with_location
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
7025,mayhem,"Manavadar, Gujarat",They are the real heroes... RIP Brave hearts.....,0,[],[],[],0,0,0,0,0,"{'3rd Eye Chakra': None, '@symbolicjensen': No...",0
4689,engulfed,USA,Car engulfed in flames backs up traffic at Par...,1,[],[],[],0,0,0,0,0,"{'3rd Eye Chakra': None, '@symbolicjensen': No...",0
2388,collapsed,"Alexandria, Egypt.",Great British Bake Off's back and Dorret's cho...,1,[],[],[],0,0,0,0,0,"{'3rd Eye Chakra': None, '@symbolicjensen': No...",0
3742,destroyed,USA,Black Eye 9: A space battle occurred at Star O...,0,[],[],[],0,0,0,0,0,"{'3rd Eye Chakra': None, '@symbolicjensen': No...",0
3924,devastated,"Dorset, UK",???????????? @MikeParrActor absolutely devasta...,0,[@MikeParrActor],[#RossBarton],[],1,1,0,0,12,"{'3rd Eye Chakra': None, '@symbolicjensen': No...",1


In [4]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB


In [6]:
df_train["target"].value_counts()

0    3456
1    2634
Name: target, dtype: int64

In [9]:
def clean_text(text):
    text = text.lower()

    stop_words = set(nltk.corpus.stopwords.words("english"))
    text = re.sub(f"'[a-z]", "", text)
    text = re.sub(f"[{string.punctuation}]", "", text)
    tokens = nltk.tokenize.word_tokenize(text)
    stemmer = nltk.stem.porter.PorterStemmer()
    tokens = [stemmer.stem(word) for word in tokens if word not in stop_words]
    text = " ".join(tokens)

    return text



In [10]:
df_train["text_"] = df_train["text"].apply(clean_text)
df_valid["text_"] = df_valid["text"].apply(clean_text)
df_test["text_"] = df_test["text"].apply(clean_text)

In [27]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import TfidfVectorizer
from numpy.random import RandomState
from typing import Union

class TfIdfEmbedding(BaseEstimator, TransformerMixin):

    def __init__(self, column="text_clean") -> None:
        super().__init__()
        self.column = column
        self.vectorizer = TfidfVectorizer()

    def fit(self, X, y=None):

        if isinstance(X, pd.DataFrame):
            X = X[self.column].values

        self.vectorizer.fit(X)
        return self

    def transform(self, X, y=None):
        return self.vectorizer.transform(X).toarray()


# Embedding = TfIdfEmbedding
# embedding = Embedding()
# embedding.fit(X["text_"])
# embeddings = embedding.transform(X["text_"])
# embeddings

In [45]:
df_train.columns

Index(['keyword', 'location', 'text', 'target', 'profile_tags', 'hash_tags',
       'link_tags', 'n_profile_tags', 'n_hash_tags', 'n_link_tags',
       'exclamation_count', 'question_count', 'location_ner', 'with_location',
       'text_'],
      dtype='object')

In [56]:
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from src.models.utils import get_model
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.compose import ColumnTransformer

Model = get_model("LogisticRegressionClassifier")
X = df_train[["text_"]]
y = df_train[["target"]]

class TfIdfEmbedding(BaseEstimator, TransformerMixin):

    def fit(self, X, y=None):
        print (type(X))
        if isinstance(X, pd.DataFrame):
            print (X.head())
            print (X.columns)
        self.vectorizer = TfidfVectorizer()
        self.vectorizer.fit(X)
        return self

    def transform(self, X, y=None):
        if isinstance(X, pd.DataFrame):
            X = X.values

        X = self.vectorizer.transform(X)

        return X.toarray()

text_transformer = TfIdfEmbedding()
num_transformer = Pipeline(
    steps=[
        ("scaler", StandardScaler())
    ]
)


preprocessor = ColumnTransformer(
    transformers=[
        ("text_transformer", text_transformer, ["text_"]),
        # ("num_transformer", num_transformer, ['n_profile_tags', 'n_hash_tags', 'n_link_tags', 'exclamation_count', 'question_count', 'with_location'])
    ]
)

pipeline = Pipeline(
        steps=[
            # ("tfidf", TfidfVectorizer()),
            # ("scaler", StandardScaler()),
            ("preprocessor", preprocessor),
            ("decomposition", TruncatedSVD(n_components=500)),
            ("model", Model(max_iter=1000)),
        ]
    )

pipeline.fit(df_train.drop(columns=["target"]), df_train["target"])
# pipeline.predict(df_train["text_"])

<class 'pandas.core.frame.DataFrame'>
Index(['text_'], dtype='object')


AttributeError: 'numpy.ndarray' object has no attribute 'lower'

In [40]:
from sklearn.metrics import classification_report
y_true = df_valid["target"]
y_pred = pipeline.predict(df_valid["text_"].values)

print (classification_report(y_true, y_pred))

              precision    recall  f1-score   support

           0       0.75      0.86      0.80       886
           1       0.75      0.60      0.67       637

    accuracy                           0.75      1523
   macro avg       0.75      0.73      0.73      1523
weighted avg       0.75      0.75      0.74      1523



In [43]:
TfidfVectorizer.__base__


sklearn.feature_extraction.text.CountVectorizer