# Notebook to pipeline

## Setup

In [None]:
# Shell escape: install the packages pinned in requirements.txt (user-level
# install, because of --user) before the imports below are executed.
!pip install --user -r requirements.txt

In [None]:
from pathlib import Path
import wget
import pandas as pd
import zipfile
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import f1_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

## Hyperparameters

## Load data

In [None]:
# Reproducibility: one seed shared by the row sampling and the train/test split.
seed = 42
# Number of complaint rows drawn from the cleaned dataset.
sample_size = 10_000

# Remote archive with the complaints data and the local folder it is saved to.
data_url = 'https://files.consumerfinance.gov/ccdb/complaints.csv.zip'
target_dir = './data/'

In [None]:
# wget.download only treats `out` as a directory when that directory already
# exists, so create it up front; otherwise a fresh checkout without ./data/
# fails before anything is downloaded.
Path(target_dir).mkdir(parents=True, exist_ok=True)

# Download the zipped complaints file; wget returns the path it was saved to.
file_name = wget.download(data_url, out=target_dir)

# Unpack every member of the archive next to the downloaded zip.
with zipfile.ZipFile(file_name, 'r') as zip_ref:
    zip_ref.extractall(target_dir)

In [None]:
# Build the absolute path of the extracted file: the first member listed in
# the archive, located inside the data folder under the working directory.
# NOTE(review): zip_ref is read here after its `with` block has exited --
# confirm this is intended, or capture the member name while the archive is
# still open.
csv_path = Path.cwd() / target_dir / zip_ref.namelist()[0]
df = pd.read_csv(csv_path)

## Preprocess data

In [None]:
# Keep only the label and free-text columns, drop rows missing either one,
# and switch to snake_case column names.
df = (
    df[['Product', 'Consumer complaint narrative']]
    .dropna()
    .rename(columns={
        'Product': 'product',
        'Consumer complaint narrative': 'consumer_complaint_narrative',
    })
)

# Encode each distinct product as an integer label. This is computed on the
# full cleaned frame, i.e. before the random subset is drawn.
df = df.assign(category_id=df['product'].factorize()[0])

# Work on a fixed-size, seeded random subset of the rows.
df = df.sample(n=sample_size, random_state=seed)

In [None]:
# Distinct (product, category_id) pairs ordered by id, used to build lookup
# tables in both directions.
category_id_df = (
    df[['product', 'category_id']]
    .drop_duplicates()
    .sort_values('category_id')
)
category_to_id = dict(category_id_df.values)
id_to_category = dict(category_id_df[['category_id', 'product']].values)

# Hold out 20% of the sampled complaints for evaluation; the split is seeded
# so it is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    df['consumer_complaint_narrative'],
    df['category_id'],
    test_size=0.2,
    random_state=seed,
)

## Define and train the model

In [None]:
# Two-step text classifier: token counts as features, then a linear SVM.
text_clf = Pipeline([
    ('vect', CountVectorizer()),
    # Pass the shared seed so the solver's random number generator is fixed
    # and repeated runs are reproducible, in line with the seeded sampling
    # and train/test split.
    ('clf', LinearSVC(random_state=seed)),
])
# Fit the vectorizer vocabulary and the classifier on the training split.
text_clf.fit(X_train, y_train)

## Evaluate the model

In [None]:
# Predict category ids for the held-out complaints.
y_pred = text_clf.predict(X_test)

# Single summary number: F1 averaged over classes with equal weight per class.
f1 = f1_score(y_true=y_test, y_pred=y_pred, average='macro')

# Per-class text report, kept for display in the next cell.
report = classification_report(y_true=y_test, y_pred=y_pred)

In [None]:
# Display the per-class classification report built in the evaluation cell.
print(report)