## LOAD DATA

In [1]:
import pandas as pd
from pathlib import Path

# Read the dataset from data/text_class.csv, located two directories above the
# kernel's current working directory (the path is resolved from Path().absolute()).
df = pd.read_csv(Path().absolute().parent.parent / "data/text_class.csv")

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer

import sys

sys.path.insert(0, str(Path().absolute().parent.parent))
from src.text_normalization import normalize_texts


def tfidf_texts(texts, ngram_range=(1, 2)):
    """Fit a TF-IDF vectorizer on ``texts`` and return it with the feature matrix.

    Args:
        texts: Raw documents; they are passed through ``normalize_texts``
            before being vectorized.
        ngram_range: ``(min_n, max_n)`` n-gram bounds forwarded to
            ``TfidfVectorizer``. Defaults to ``(1, 2)``.

    Returns:
        A ``(vectorizer, matrix)`` tuple: the fitted ``TfidfVectorizer`` and
        the TF-IDF features of the normalized corpus converted with
        ``.toarray()``.
    """
    corpus = normalize_texts(texts)
    vectorizer = TfidfVectorizer(ngram_range=ngram_range)
    # fit_transform learns the vocabulary and builds the matrix in one pass,
    # so the corpus is only tokenized once and a one-shot iterable returned by
    # normalize_texts is not exhausted before the transform step.
    features = vectorizer.fit_transform(corpus)
    # NOTE: the dense conversion can be memory-hungry for large corpora.
    return (vectorizer, features.toarray())

In [31]:
# Toy corpus: five copies of each sentence, labelled 1 and 0 respectively.
vectorizer, X = tfidf_texts(
    ["My name is Danilo Carlotti"] * 5 + ["Call me Ishmael"] * 5
)
y = [1] * 5 + [0] * 5

In [28]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.metrics import confusion_matrix

def train_model(X, y, with_smote=True):
    """Train an L1-penalized logistic regression and print its confusion matrix.

    The data is split with ``train_test_split`` (``random_state=0``). When
    ``with_smote`` is true, only the training portion is oversampled with
    SMOTE; the held-out portion is used as-is for the confusion matrix.

    Args:
        X: Feature matrix accepted by ``train_test_split``.
        y: Labels aligned with ``X``.
        with_smote: Whether to oversample the training split with SMOTE
            before fitting. Defaults to ``True``.

    Returns:
        The fitted ``LogisticRegression`` classifier.
    """
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    if with_smote:
        # Seeded like the split above so repeated runs produce the same
        # synthetic samples and therefore the same model.
        oversample = SMOTE(random_state=0)
        X_train, y_train = oversample.fit_resample(X_train, y_train)
    clf = LogisticRegression(penalty="l1", solver="liblinear")
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    # The header only matches the flattened matrix for a binary problem:
    # confusion_matrix derives its labels from the values present in y_test
    # and y_pred, so other label counts yield a different number of cells.
    print("tn, fp, fn, tp")
    print(confusion_matrix(y_test, y_pred).ravel())
    return clf

def words_interest_log_reg(classifier, feature_vectorizer=None):
    """List the features whose coefficient is non-zero, sorted ascending.

    Args:
        classifier: Fitted model exposing ``coef_``; only the first row
            (``coef_[0]``) is inspected.
        feature_vectorizer: Vectorizer that supplies the feature names. When
            omitted, the notebook-level ``vectorizer`` variable is used, which
            keeps existing single-argument calls working.

    Returns:
        A list of ``(coefficient, feature_name)`` tuples, sorted in ascending
        tuple order (coefficient first, then name).
    """
    source = vectorizer if feature_vectorizer is None else feature_vectorizer
    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in
    # 1.2; prefer its replacement and fall back for older installations.
    if hasattr(source, "get_feature_names_out"):
        var_names = source.get_feature_names_out()
    else:
        var_names = source.get_feature_names()
    weights = classifier.coef_[0]
    # Keep only features with a non-zero weight (the L1 penalty zeroes many).
    variables_of_interest = [
        (weight, var_names[position])
        for position, weight in enumerate(weights)
        if weight
    ]
    variables_of_interest.sort()
    return variables_of_interest

In [32]:
# Fit on the toy corpus with SMOTE oversampling disabled; train_model prints
# the flattened confusion matrix of the held-out split as a side effect.
clf = train_model(X, y, with_smote=False)

tn, fp, fn, tp
[1 0 2 0]


In [33]:
# Show the (coefficient, feature name) pairs with a non-zero coefficient,
# sorted in ascending order.
print(words_interest_log_reg(clf))

[(-0.4641016151363621, 'call'), (-0.00280673930170718, 'call ishmael')]
