# Warnings

In [None]:
# Notebook-wide setup: imports plus silencing of library log/warning output.
# Statement order matters here: the environment variable is set before the
# `import tensorflow` at the bottom of this cell.
import warnings
import os
import torch
# Presumably lowers TensorFlow's native (C++) log verbosity — TODO confirm the
# meaning of level '3' against the TensorFlow docs.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
# NOTE: this binds the name `logging` to transformers' logging helper, not to
# the standard-library `logging` module.
from transformers import logging
logging.set_verbosity_error()
# Suppress every Python warning for the rest of the session.
warnings.filterwarnings('ignore')
# Presumably turns off parallelism in the `tokenizers` library — TODO confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
from transformers import AutoTokenizer, AutoModel
import tensorflow as tf


# Vectors

### BoW (bag of words)

In [None]:
# Bag-of-words features: fit a CountVectorizer on `corpus` and keep the
# resulting document-term matrix in `X`.
# `corpus` is not defined in this cell; it must already exist in the session.
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
# Alternative kept for reference (fixed vocabulary taken from a Counter);
# `Counter` is only used by this commented-out variant:
# counts = Counter(words)
# cv = CountVectorizer(vocabulary=counts.keys())
# cv.fit_transform(i).toarray()[0].tolist()


vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)

### TF-IDF

In [None]:
# TF-IDF features: fit a TfidfVectorizer on `data` and keep the matrix in `x`.
# `data` is not defined in this cell; it must already exist in the session.
# NOTE: this rebinds the global name `vectorizer`, which the BoW cell also uses.
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(
    # Float thresholds are document-frequency fractions per the scikit-learn
    # docs: terms above max_df or below min_df are dropped from the vocabulary.
    max_df=0.5,
    min_df=0.0005,
    # Use both single words and two-word sequences as features.
    ngram_range=(1, 2)
)
x = vectorizer.fit_transform(data)

### split & shuffle

In [None]:
def split_train_test(corpus, labels, ratios=False, random_state=41,
                     default_test_size=0.3):
    """Split a labelled corpus into train and test parts, one class at a time.

    Every class is extracted with ``data_by_class_selection`` and split on its
    own with scikit-learn's ``train_test_split``. The per-class pieces are
    concatenated in processing order, so the returned lists are grouped by
    class (they are not shuffled across classes).

    Args:
        corpus: Sequence of documents.
        labels: Sequence of class labels aligned with ``corpus``.
        ratios: Optional mapping ``{label: (a, b)}``. For each entry the value
            ``b / a`` is passed to ``train_test_split`` as ``test_size``, and
            only the classes listed in the mapping end up in the result. Any
            falsy value (the default) means "every class found in ``labels``,
            each split with ``default_test_size``".
        random_state: Seed forwarded to every ``train_test_split`` call.
        default_test_size: Test share used when ``ratios`` is not given.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)`` of plain lists.
    """
    # Imported locally so the function works regardless of which notebook
    # cells have been executed before it.
    import pandas as pd
    from sklearn.model_selection import train_test_split

    if ratios:
        test_sizes = {label: pair[1] / pair[0] for label, pair in ratios.items()}
    else:
        # value_counts() lists the distinct labels, most frequent first; that
        # order determines the order of the classes in the output.
        distinct_labels = pd.Series(labels).value_counts().keys()
        test_sizes = {label: default_test_size for label in distinct_labels}

    X_train, X_test, y_train, y_test = [], [], [], []
    buckets = (X_train, X_test, y_train, y_test)
    for label, test_size in test_sizes.items():
        class_corpus, class_labels = data_by_class_selection(corpus, labels, label)
        # train_test_split returns the parts in the same order as `buckets`.
        parts = train_test_split(
            class_corpus,
            class_labels,
            test_size=test_size,
            random_state=random_state,
        )
        for bucket, part in zip(buckets, parts):
            bucket.extend(part)
    return X_train, X_test, y_train, y_test

def data_by_class_selection(corpus, labels, c):
    """Return the documents whose label equals ``c``, with their labels.

    ``corpus`` and ``labels`` are walked in lockstep (stopping at the shorter
    one); the original order of the matching items is preserved.

    Returns:
        Tuple ``(documents, labels)`` of two new lists of equal length.
    """
    matching = [(doc, label) for doc, label in zip(corpus, labels) if label == c]
    selected_docs = [doc for doc, _ in matching]
    selected_labels = [label for _, label in matching]
    return selected_docs, selected_labels

def shuffle_data(corpus, labels):
    """Shuffle ``corpus`` and ``labels`` in place using one shared permutation.

    Document/label pairs stay aligned. Both arguments are modified through
    slice assignment and the same two objects are returned, so callers may
    use either the return value or the objects they passed in.

    Args:
        corpus: Mutable sequence of documents.
        labels: Mutable sequence of labels aligned with ``corpus``.

    Returns:
        Tuple ``(corpus, labels)`` — the input objects, now shuffled.
    """
    # Local import: the visible notebook never imports `random` globally.
    import random

    combined = list(zip(corpus, labels))
    if not combined:
        # zip(*[]) yields nothing, so the two-target unpacking below would
        # raise ValueError; there is nothing to shuffle anyway.
        return corpus, labels
    random.shuffle(combined)
    corpus[:], labels[:] = zip(*combined)
    return corpus, labels

### preprocess

In [71]:
from pymystem3 import Mystem
import re
# One shared Mystem instance, created when this cell runs and reused by
# preprocess() for lemmatisation.
stemmer = Mystem()
def preprocess(t, stem=True):
    """Normalise Russian text to space-separated lowercase Cyrillic tokens.

    Steps:
      1. Lowercase the text and drop every character that is not a lowercase
         Russian letter, a space or a full stop.
      2. Turn full stops into spaces.
      3. If ``stem`` is true, pass the text through the module-level
         ``stemmer`` (``lemmatize``) and join the returned pieces.
      4. Drop tokens of one or two characters and join the rest with single
         spaces.

    Args:
        t: Input text.
        stem: Whether to lemmatise with the module-level ``stemmer``.

    Returns:
        The cleaned string (possibly empty), without leading/trailing spaces.
    """
    allowed = frozenset('абвгдеёжзийклмнопрстуфхцчшщъыьэюя .')
    x = ''.join(ch for ch in t.lower() if ch in allowed)
    x = x.replace('.', ' ')
    if stem:
        x = ''.join(stemmer.lemmatize(x))
    # split()/join already collapses whitespace runs and trims both ends, so
    # no extra regex substitution or strip() is needed afterwards.
    return ' '.join(token for token in x.split() if len(token) > 2)

In [70]:
def clean_tesseract(x):
    """Clean a raw text string: strip quotes, backslashes and extra whitespace.

    Double quotes, literal two-character ``\\n`` sequences (backslash + "n")
    and any remaining backslashes are each replaced by a space; then every
    run of whitespace is collapsed to one space and the ends are trimmed.

    Args:
        x: Input string.

    Returns:
        The cleaned string.
    """
    import re

    x = x.replace('"', ' ')
    # Escaped newlines must go before the generic backslash replacement,
    # otherwise a stray "n" would be left behind.
    x = x.replace('\\n', ' ')
    x = x.replace('\\', ' ')
    # Raw string: '\s' in a normal string literal is an invalid escape.
    return re.sub(r'\s+', ' ', x).strip()

# Quality

In [None]:
# Quality report: raw confusion matrix, the same matrix as per-row integer
# percentages, macro F1 and accuracy.
# `y_true` and `y_pred` are not defined in this cell; they must already exist.
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score, accuracy_score
import numpy as np
m = confusion_matrix(y_true, y_pred)
new_m = []  # row-percentage matrix; also read by the plotting cell below
l = []      # percentages of the row currently being processed
print(m)
for i in m:
    s = sum(i)  # total count in this row of the matrix
    for ii in i:
        # Share of the row total, as a rounded percentage.
        # NOTE(review): a row whose sum is 0 makes this a division by zero —
        # confirm that such rows cannot occur for the data used here.
        l.append(round(ii / s * 100))
    new_m.append(l)
    l = []
print(np.array(new_m))

print(f1_score(y_true, y_pred, average='macro'))
print(accuracy_score(y_true, y_pred))

In [None]:
# Prints whatever `__doc__` holds in the current namespace.
# NOTE(review): looks like a leftover from a copied example — confirm it is
# still wanted.
print(__doc__)
import itertools
import numpy as np
import matplotlib.pyplot as plt
# NOTE(review): `svm` and `datasets` are not used by any code visible in this
# notebook export.
from sklearn import svm, datasets
# `train_test_split` is also what split_train_test() (defined in an earlier
# cell) calls.
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """Print a confusion matrix and draw it as an annotated heat map.

    With ``normalize=True`` each row of ``cm`` is divided by its row sum
    before the matrix is printed and plotted. The drawing goes to the current
    matplotlib figure; nothing is returned.

    Args:
        cm: 2-D matrix of counts (or already prepared values).
        classes: Tick labels used on both axes.
        normalize: Whether to row-normalise ``cm`` first.
        title: Plot title.
        cmap: Colormap passed to ``imshow``.
    """
    if normalize:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        cm = cm.astype('float') / row_totals
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix without normalization')
    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    positions = np.arange(len(classes))
    plt.xticks(positions, classes, rotation=45)
    plt.yticks(positions, classes)

    # Fractions get two decimals, raw counts are printed as integers.
    cell_format = '.2f' if normalize else 'd'
    # Cells above half of the maximum get white text for contrast.
    half_max = cm.max() / 2.
    n_rows, n_cols = cm.shape[0], cm.shape[1]
    for row in range(n_rows):
        for col in range(n_cols):
            value = cm[row, col]
            text_color = "white" if value > half_max else "black"
            plt.text(col, row, format(value, cell_format),
                     horizontalalignment="center",
                     color=text_color)

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    
    
# Imported here so this cell does not depend on an import made elsewhere.
from sklearn.utils.multiclass import unique_labels

# Row-percentage matrix (`new_m`) produced by the metrics cell, as integers.
cnf_matrix = np.array(new_m).astype(int)

np.set_printoptions(precision=2)

# Plot the row-percentage confusion matrix.
# confusion_matrix() orders its rows and columns by the sorted set of labels
# that occur in y_true or y_pred, so the tick labels must use that same order;
# unique_labels() returns exactly that. (Ordering by prediction frequency
# would attach the wrong class names to the axes.)
plt.figure(num=None, figsize=(13, 11), dpi=80, facecolor='w', edgecolor='k')
plot_confusion_matrix(cnf_matrix, classes=list(unique_labels(y_true, y_pred)),
                      title='Confusion matrix huyatrix')

plt.show()

### Jaccard index

In [1]:
def jaccard(x1, x2):
    """Return the Jaccard index of the whitespace-separated token sets.

    The value is ``|A ∩ B| / |A ∪ B|`` rounded to two decimals, where A and B
    are the sets of tokens of ``x1`` and ``x2``. When neither string contains
    a token the index is undefined and the sentinel ``233.0`` is returned.
    """
    tokens_a = set(x1.split())
    tokens_b = set(x2.split())
    combined = tokens_a | tokens_b
    if not combined:
        # Empty union: the ratio would be 0 / 0.
        return 233.
    shared = tokens_a & tokens_b
    ratio = len(shared) / len(combined)
    return float(format(ratio, '.2f'))
jaccard('Дяденька король цветной капусты хотца', 'Дяденька король цветной капусты')

0.8

# Open

In [5]:
# Scratch arithmetic; the result is only displayed, not assigned to a name.
(70*2)/60

2.3333333333333335

In [9]:
def get_files(path, ext):
    """Recursively collect the files under ``path`` whose name ends with ``ext``.

    Args:
        path: Directory to walk (sub-directories included).
        ext: File-name suffix to match, with or without the leading dot
            (``'csv'`` and ``'.csv'`` both match ``data.csv``). The match is
            case-sensitive and anchored at the end of the name, so a file
            such as ``csv_notes.txt`` is not returned for ``'csv'``.

    Returns:
        List of matching file paths, each joined with the directory it was
        found in.

    Raises:
        ValueError: If no matching file is found. This includes the case of a
            missing ``path``, because ``os.walk`` then yields nothing.
    """
    import os

    paths_to_files = []
    for root, _dirs, files in os.walk(path):
        for name in files:
            if name.endswith(ext):
                paths_to_files.append(os.path.join(root, name))
    if not paths_to_files:
        raise ValueError(f'No files ending with {ext!r} found under {path!r}')
    return paths_to_files
# get_files('<path>', 'csv')

In [None]:
# Merge the in-memory `user_data` into the JSON dump on disk (creating the
# file on first use), then reset `user_data` for the next batch.
# `user_data` is not defined in this cell; it must already exist.
import json

dump_path = 'personal_info_wall_media_sep152020.json'
try:
    # Keep the try body minimal: only the read can legitimately hit a
    # missing file.
    with open(dump_path) as json_file:
        data = json.load(json_file)
except FileNotFoundError:
    # Nothing on disk yet: the dump starts out as the current batch.
    data = user_data
else:
    data.update(user_data)
# `with` guarantees the output file is flushed and closed.
with open(dump_path, 'w') as json_file:
    json.dump(data, json_file)
user_data = {}

In [None]:
# Load labels (first column) and documents (second column) from the CSV file,
# skipping its header row.
import csv

new_labels1, new_corpus1 = [], []
# newline='' lets the csv module do its own newline handling, as its
# documentation requires (needed for quoted fields with embedded line breaks).
with open('type_march_.csv', 'rt', newline='') as csvfile:
    readCSV = csv.reader(csvfile, delimiter=',')
    next(readCSV, None)  # skip the header row (no-op on an empty file)
    for row in readCSV:
        new_labels1.append(row[0])
        new_corpus1.append(row[1])

# Notes

In [None]:
# Flatten one level of nesting: every item of every sub-iterable of `t`, in
# order. `t` is not defined in this cell; it must already exist.
flat_list = [item for sublist in t for item in sublist]

In [None]:
from copy import copy
def top_n_max(series, top=2, diraction=max):
    """Return the ``top`` largest (or smallest) distinct values of ``series``.

    Args:
        series: Object exposing ``tolist()`` whose values are hashable and
            mutually comparable.
        top: How many distinct values to return. A non-positive value yields
            an empty list.
        diraction: Selection direction (parameter name kept as is for
            backward compatibility). The built-in ``max`` (default) selects
            the largest values; any other value, e.g. ``min``, selects the
            smallest ones.

    Returns:
        List of at most ``top`` distinct values in ascending order.
    """
    if top <= 0:
        # Without this guard `values[-0:]` would return the whole list.
        return []
    distinct = sorted(set(series.tolist()))
    if diraction == max:
        return distinct[-top:]
    return distinct[:top]