In [12]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
import gensim.downloader
import pandas as pd
import numpy as np
import pickle
from collections import Counter
from tqdm import tqdm


In [13]:
from gensim.models import Word2Vec

In [14]:
class Naive:
    def fit(self, data, target):
        self.target = Counter(target).most_common(1)
    def predict(self, data):
        lens = data.shape[0]
        return [self.target[0][0]] * lens

In [39]:
def get_data(columns, prefix):
    name_file = f"./data/{columns}_{prefix}.pickle"
    with open(name_file, 'rb') as f:
        matrix = pickle.load(f)
    mas = matrix.toarray()
    if prefix == 'word_exist':
        mas = (matrix != 0).astype('int')
    return mas

with open('./data/data.pickle', 'rb') as f:
    df = pickle.load(f)
df = df.reset_index()

In [25]:
def predict_result(model):
    itog_tabel = []
    for prefix in tqdm(prefix_mas):
        for columns in tqdm(data_columns):
            matrix = get_data(columns, prefix)
            bool_mas = df[columns].isna()
            target = labelencoder.fit_transform(df.loc[~bool_mas, 'target'])
            X_train, X_test, y_train, y_test = train_test_split(matrix, target, test_size=0.2, random_state=42, stratify = target)
            model.fit(X_train, y_train)
            accuracy = accuracy_score(y_test, model.predict(X_test))
            itog_tabel.append([prefix, columns, accuracy])
    return itog_tabel

In [17]:
labelencoder = LabelEncoder()
target = labelencoder.fit_transform(df['target'])

In [18]:
data_columns = ['my_preproces', 'just_token', 'stemming', 'lemmatization', 
'stemming+', 'misspelling', 'lemmatization + misspelling']
prefix_mas = ['word_exist', 'tfidf', 'word_count']

In [19]:
bool_mas = (df['my_preproces'].isna() | df['my_preproces'].duplicated())
target = labelencoder.fit_transform(df.loc[~bool_mas, 'target'])

## Итоговая таблица 

In [22]:
model = LogisticRegression(n_jobs=-1, random_state=42)
columns = 'my_preproces'
prefix = 'word_exist'
matrix = get_data(columns, prefix)
bool_mas = df[columns].isna()
target = labelencoder.fit_transform(df.loc[~bool_mas, 'target'])
X_train, X_test, y_train, y_test = train_test_split(matrix, target, test_size=0.2, random_state=42, stratify = target)
model.fit(X_train, y_train)
accuracy = accuracy_score(y_test, model.predict(X_test))
[prefix, columns, accuracy]

['word_exist', 'my_preproces', 0.88]

In [40]:
logreg = LogisticRegression(n_jobs=-1, random_state=42)
logreg_result = predict_result(logreg)

In [27]:
reg = pd.DataFrame(logreg_result, columns = ['вес слова', 'тип обработки', 'качество'])
reg.to_csv('data/logreg_result.csv')

In [28]:
naive = Naive()
naive_result = predict_result(naive)

  0%|          | 0/3 [00:00<?, ?it/s]
  0%|          | 0/7 [00:00<?, ?it/s][A
 43%|████▎     | 3/7 [00:00<00:00, 22.14it/s][A
100%|██████████| 7/7 [00:00<00:00, 24.07it/s][A
 33%|███▎      | 1/3 [00:00<00:00,  3.41it/s]
  0%|          | 0/7 [00:00<?, ?it/s][A
 14%|█▍        | 1/7 [00:00<00:00,  8.94it/s][A
 29%|██▊       | 2/7 [00:00<00:00,  5.80it/s][A
 43%|████▎     | 3/7 [00:00<00:00,  5.51it/s][A
 57%|█████▋    | 4/7 [00:00<00:00,  5.40it/s][A
 71%|███████▏  | 5/7 [00:00<00:00,  5.85it/s][A
 86%|████████▌ | 6/7 [00:00<00:00,  6.64it/s][A
100%|██████████| 7/7 [00:01<00:00,  6.36it/s][A
 67%|██████▋   | 2/3 [00:01<00:00,  1.30it/s]
  0%|          | 0/7 [00:00<?, ?it/s][A
 14%|█▍        | 1/7 [00:00<00:00,  8.64it/s][A
 29%|██▊       | 2/7 [00:00<00:00,  5.97it/s][A
 43%|████▎     | 3/7 [00:00<00:00,  6.31it/s][A
 57%|█████▋    | 4/7 [00:00<00:00,  5.76it/s][A
 71%|███████▏  | 5/7 [00:00<00:00,  6.22it/s][A
100%|██████████| 7/7 [00:00<00:00,  7.05it/s][A
100%|███████

In [29]:
naive = pd.DataFrame(naive_result, columns = ['вес слова', 'тип обработки', 'качество'])
naive.to_csv('data/naive_result.csv')

In [30]:
# naive

## GridSearch

In [32]:
columns = 'my_preproces'
prefix = 'word_exist'

parameters = {'penalty':('l1', 'l2', 'none')}
model = LogisticRegression(n_jobs=-1, random_state=42)
grid = GridSearchCV(model, parameters)


matrix = get_data(columns, prefix)
X_train, X_test, y_train, y_test = train_test_split(matrix, target, test_size=0.2, random_state=42, stratify = target)
grid.fit(X_train, y_train)

# print(grid.best_score_)
print(grid.best_params_)

accuracy = accuracy_score(y_test, grid.predict(X_test))
tabel = [prefix, columns, accuracy]

print(tabel)

Traceback (most recent call last):
  File "/Users/ila/opt/anaconda3/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/ila/opt/anaconda3/lib/python3.8/site-packages/sklearn/linear_model/_logistic.py", line 1304, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/Users/ila/opt/anaconda3/lib/python3.8/site-packages/sklearn/linear_model/_logistic.py", line 442, in _check_solver
    raise ValueError("Solver %s supports only 'l2' or 'none' penalties, "
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.



{'penalty': 'l2'}
['word_exist', 'my_preproces', 0.88]


## word2vec

In [33]:
glove_vectors = gensim.downloader.load('glove-twitter-25')

In [34]:
def text_to_vec(sen):
    try:
        vec = [glove_vectors.wv[str(word)] for word in sen if str(word) in glove_vectors.wv]
        conc_vec = np.concatenate(vec)
        conc_vec_zer = np.append(conc_vec, np.zeros(500 - len(conc_vec)))
        conc_vec_zer = [float(i) for i in conc_vec_zer]
        return list(conc_vec_zer)
    except:
        return None

In [35]:
vec = df['my_preproces'].apply(lambda sen: text_to_vec(sen))
target_min = target[~vec.isna()]
vec = vec[~vec.isna()]
vec = np.array(list(vec))

  vec = [glove_vectors.wv[str(word)] for word in sen if str(word) in glove_vectors.wv]


In [36]:
model = RandomForestClassifier(n_jobs=-1, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(vec, target_min, test_size=0.2, random_state=42, stratify = target_min)
model.fit(X_train, y_train)
accuracy = accuracy_score(y_test, model.predict(X_test))
accuracy

0.8636959370904325