In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, PowerTransformer, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
import pickle
import pdfhelper

# Toggle for the exploratory plots: the plotting cells below only run when this is True.
charts = False

In [None]:
# To generate data, run this: python3 readpdf.py  -C
# NOTE(review): presumably that command is what writes cells.csv — confirm.
# `tbs` is the raw table used by the plots and by pdfhelper.preprocess_tbs_data below.
tbs = pd.read_csv('cells.csv')

In [None]:
if charts:
    # Distribution of `dx_raw`, split by `col_idx`.
    sns.displot(data=tbs, x="dx_raw", hue="col_idx")

In [None]:
if charts:
    # Distribution of `len`, split by `col_idx`.
    sns.displot(data=tbs, x="len", hue="col_idx")

In [None]:
if charts:
    # One stacked subplot per feature, each split by `col_idx`.
    features = ['int', 'starts_ws', 'ends_ws', 'all_letters']
    fig, axes = plt.subplots(len(features))

    for i, f in enumerate(features):
        # `sns.displot` is figure-level: it ignores `ax=` (with a warning) and opens
        # a new figure per call, leaving the subplot grid above empty. `sns.histplot`
        # is the axes-level histogram and draws onto the axis it is given.
        sns.histplot(tbs, x=f, hue='col_idx', ax=axes[i]).set(title=f)

In [None]:
# Split the raw table into features X and target y (both are fed to train_test_split below).
# NOTE(review): which columns are kept or dropped is decided inside pdfhelper — confirm there.
X, y = pdfhelper.preprocess_tbs_data(tbs)

In [None]:
# NOTE(review): the commented-out lines below reference a `cells` frame that is not
# defined in the visible cells; X and y now come from pdfhelper.preprocess_tbs_data.
# drop the output variable
#y = cells['col']
#y = cells['col_idx']
#X = cells.drop(columns=['col', 'col_idx'])

# drop input variables which only complicate things
#to_drop = ['y', 'text']
#X = X.drop(to_drop,axis=1)

# Hold out 20% of the rows as a test split; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=14)

In [None]:
# Float-typed columns are treated as the continuous features.
cont_vars = X_train.select_dtypes(include=float).columns.tolist()

# Fill missing values with the column median, then standardise.
cont_pipeline = make_pipeline(SimpleImputer(strategy='median'), StandardScaler())

# Smoke test: the pipeline must fit and transform the training columns
# (trailing `;` suppresses the cell output).
pd.DataFrame(data=cont_pipeline.fit_transform(X_train[cont_vars]), columns=cont_vars);

In [None]:
# Integer-typed columns are treated as the discrete features.
disc_vars = list(X_train.select_dtypes(include = int).columns)

if disc_vars:
    # Missing discrete values are replaced by the constant -1.
    disc_pipeline = make_pipeline(
        SimpleImputer(strategy = 'constant', fill_value = -1)
    )

    # Smoke test on the training split. The original indexed an undefined `train`
    # frame here, which raised NameError as soon as any integer column existed.
    pd.DataFrame(disc_pipeline.fit_transform(X_train[disc_vars]), columns = disc_vars);

In [None]:
# No categorical columns are configured, so the block below is currently skipped.
cat_vars = []

if cat_vars:
    # Missing categories become the literal 'unknown', then every category is one-hot encoded.
    cat_pipeline = make_pipeline(
        SimpleImputer(strategy = 'constant', fill_value = 'unknown'),
        OneHotEncoder()
    )

    # Smoke test on the training split. The original indexed an undefined `train`
    # frame here, which would raise NameError once `cat_vars` is populated.
    cat_pipeline.fit_transform(X_train[cat_vars]);
    #cat_pipeline.named_steps['onehotencoder'].get_feature_names(['original_lang','release_season']);

In [None]:
# Only the continuous pipeline is wired in; every other column is passed through unchanged.
preprocessor = ColumnTransformer(
    [
        ('continuous', cont_pipeline, cont_vars)
        # ('discrete', disc_pipeline, disc_vars),
        # ('categorical', cat_pipeline, cat_vars),
        # ('json', json_pipeline, json_vars)
    ],
    remainder='passthrough',
)
preprocessor.fit(X_train)

# Cell output: the column names produced by the fitted transformer.
preprocessor.get_feature_names_out()

In [None]:
import warnings; warnings.simplefilter('ignore')

import sklearn
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression, Lasso, Ridge, LassoLars
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from statistics import mean

def quick_eval(pipeline, X, y, verbose=True, cv=10, scoring='accuracy'):
    """
    Cross-validate a modeling pipeline and report its mean score.

    No held-out test data is involved: the score is the mean of the per-fold
    scores returned by ``cross_val_score`` on the ``X``/``y`` passed in.

    Parameters
    ----------
    pipeline : a Pipeline containing a step named ``'classifier'``.
    X, y : features and labels forwarded to ``cross_val_score``.
    verbose : when True, print the classifier's class name and the mean score.
    cv : cross-validation setting forwarded to ``cross_val_score`` (default 10).
    scoring : scoring name forwarded to ``cross_val_score`` (default ``'accuracy'``).

    Returns
    -------
    tuple
        ``(classifier, score)`` — the pipeline's ``'classifier'`` step and the
        mean cross-validation score. ``cross_val_score`` fits clones of the
        pipeline, so the returned classifier is not fitted by this call.
    """
    scores = cross_val_score(pipeline, X, y, cv=cv, scoring=scoring)
    score = mean(scores)
    classifier = pipeline.named_steps['classifier']

    if verbose:
        print(f"Algorithm: {classifier.__class__.__name__}")
        print(f"CV score: {score}")

    return classifier, score

# Pruning parameter shared by every tree in the sweep below.
ccp_alpha = 0  # 0.03

# The broad model line-up is immediately replaced by the max_depth sweep on the next statement.
classifiers = [LogisticRegression(), SVC(), DecisionTreeClassifier(), RandomForestClassifier()]
classifiers = [
    DecisionTreeClassifier(max_depth=n, ccp_alpha=ccp_alpha)
    for n in (10, 15, 20, 25)
]
# Regression candidates; not used by the loop below.
regressors = [
    LinearRegression(),
    Lasso(alpha=.5),
    Ridge(alpha=.1),
    LassoLars(alpha=.1),
    DecisionTreeRegressor(),
    RandomForestRegressor(),
    AdaBoostRegressor(),
    GradientBoostingRegressor(),
]

# Cross-validate each candidate behind the shared preprocessor; blank line between reports.
for r in classifiers:
    pipe = Pipeline([('preprocessor', preprocessor), ('classifier', r)])
    quick_eval(pipe, X_train, y_train)
    print()

In [None]:
# Fit the chosen tree behind the shared preprocessor.
r = DecisionTreeClassifier(max_depth=25, ccp_alpha=0.03)
pipe = Pipeline(steps = [
    ('preprocessor', preprocessor),
    ('classifier', r)
])
m = pipe.fit(X_train, y_train)
clf = m.named_steps['classifier']

# The classifier was trained on the preprocessor's output, so the pruning path is
# computed on those same transformed features rather than on the raw X_train
# (which has not been imputed or scaled).
X_train_prepared = m.named_steps['preprocessor'].transform(X_train)
path = clf.cost_complexity_pruning_path(X_train_prepared, y_train)
ccp_alphas, impurities = path.ccp_alphas, path.impurities

# Plot impurity against alpha; the last point of the path is left out of the plot.
fig, ax = plt.subplots()
ax.plot(ccp_alphas[:-1], impurities[:-1], marker="o", drawstyle="steps-post")
ax.set_xlabel("effective alpha")
ax.set_ylabel("total impurity of leaves")
ax.set_title("Total Impurity vs effective alpha for training set");

In [None]:
# Column names emitted by the fitted preprocessor, used to label the tree's features.
nms = list(m.named_steps['preprocessor'].get_feature_names_out())
# Plain-text rendering of the fitted classifier's decision rules.
t = sklearn.tree.export_text(m.named_steps['classifier'], feature_names=nms)
# `clf` is the classifier step taken from the fitted pipeline in the previous cell.
print("{} nodes".format(clf.tree_.node_count))
print(t)

In [None]:
# Predict on the held-out split; the accuracy is the cell output.
y_pred = pipe.predict(X_test)
accuracy_score(y_test, y_pred)

In [None]:
# Confusion matrix on the held-out split, with rows and columns ordered by the classifier's classes.
cm = sklearn.metrics.confusion_matrix(y_test, y_pred, labels=clf.classes_)
disp = sklearn.metrics.ConfusionMatrixDisplay(cm, display_labels=clf.classes_)
disp.plot()
plt.show()

In [None]:
# Serialise the whole fitted pipeline (preprocessor + classifier) to disk.
# NOTE(review): only unpickle this file from a trusted source — pickle.load can run arbitrary code.
with open('col_classifier.pkl', 'wb') as file:
    pickle.dump(pipe, file)

In [None]:
# NOTE(review): this loop rebinds `row_dict` on every iteration and never uses it, so
# afterwards it only holds the last test row as a dict — looks like an unfinished stub.
for index, row in X_test.iterrows():
    row_dict = row.to_dict()

In [None]:
# Load table.csv; the next cell runs a per-column quality check on it.
p = pd.read_csv('table.csv')
# Disabled ad-hoc lookup: rename one column to 'n' and select the rows where n == 950.
# NOTE(review): the Hebrew column name below reads roughly "request number" — confirm.
#f = 'מספר בקשה'
#p.rename(columns={f: 'n'}, inplace=True)
#p.query('n == 950')

In [None]:
# quality check: for every column, print its name followed by its value frequencies
for c in p:
    print(c, p[c].value_counts(), sep="\n")