In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression 
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer
from sklearn.feature_selection import SelectFromModel

import warnings
warnings.filterwarnings("ignore")

In [2]:
positive = pd.read_csv('positive.csv', sep=';', usecols=[3], names=['text'])
positive['label'] = ['positive'] * len(positive)

negative = pd.read_csv('negative.csv', sep=';', usecols=[3], names=['text'])
negative['label'] = ['negative'] * len(negative)

df = positive.append(negative)

In [3]:
df.head(4)

Unnamed: 0,text,label
0,"@first_timee хоть я и школота, но поверь, у на...",positive
1,"Да, все-таки он немного похож на него. Но мой ...",positive
2,RT @KatiaCheh: Ну ты идиотка) я испугалась за ...,positive
3,"RT @digger2912: ""Кто то в углу сидит и погибае...",positive


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 226834 entries, 0 to 111922
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   text    226834 non-null  object
 1   label   226834 non-null  object
dtypes: object(2)
memory usage: 5.2+ MB


In [5]:
x_train, x_test, y_train, y_test = train_test_split(df.text, df.label)

### Задание 1.

Задание: обучите три классификатора:

1) На токенах с высокой частотой

2) На токенах со средней частотой

3) На токенах с низкой частотой

Сравните полученные результаты, оцените какие токены наиболее важные для классификации.

In [6]:
vec = CountVectorizer(ngram_range=(1, 1))
bow = vec.fit_transform(x_train)
clf = LogisticRegression(random_state=41)
clf.fit(bow, y_train)
pred = clf.predict(vec.transform(x_test))
print(classification_report(pred, y_test))

              precision    recall  f1-score   support

    negative       0.76      0.76      0.76     27846
    positive       0.77      0.76      0.77     28863

    accuracy                           0.76     56709
   macro avg       0.76      0.76      0.76     56709
weighted avg       0.76      0.76      0.76     56709



### Задание 2.

Найти фичи с наибольшей значимостью и вывести их.

In [7]:
smf = SelectFromModel(clf, threshold=-np.inf, max_features=10)
smf.fit(bow, y_train)
feature_idx = smf.get_support()
feature_name = bow[feature_idx]

In [8]:
print(clf.coef_)

[[ 0.06476293  0.25575382  0.7494573  ...  0.02641659 -0.19247105
  -0.19247105]]


### Задание 3.

1) Сравнить count/tf-idf/hashing векторайзеры/полносвязанную сетку (построить classification_report)

2) Подобрать оптимальный размер для hashing векторайзера

3) Убедиться что для сетки нет переобучения

#### Count векторайзер

In [9]:
vec = CountVectorizer(ngram_range=(1, 1))
bow = vec.fit_transform(x_train)
clf = LogisticRegression(random_state=41)
clf.fit(bow, y_train)
pred = clf.predict(vec.transform(x_test))
print(classification_report(pred, y_test))

              precision    recall  f1-score   support

    negative       0.76      0.76      0.76     27846
    positive       0.77      0.76      0.77     28863

    accuracy                           0.76     56709
   macro avg       0.76      0.76      0.76     56709
weighted avg       0.76      0.76      0.76     56709



#### TF-IDF векторайзер

In [10]:
vec = TfidfVectorizer(ngram_range=(1, 1))
bow = vec.fit_transform(x_train)
clf = LogisticRegression(random_state=41)
clf.fit(bow, y_train)
pred = clf.predict(vec.transform(x_test))
print(classification_report(pred, y_test))

              precision    recall  f1-score   support

    negative       0.73      0.77      0.75     26695
    positive       0.78      0.75      0.77     30014

    accuracy                           0.76     56709
   macro avg       0.76      0.76      0.76     56709
weighted avg       0.76      0.76      0.76     56709



#### Hashing векторайзер

In [11]:
vec = HashingVectorizer(n_features=2**4)
bow = vec.fit_transform(x_train)
clf = LogisticRegression(random_state=41)
clf.fit(bow, y_train)
pred = clf.predict(vec.transform(x_test))
print(classification_report(pred, y_test))

              precision    recall  f1-score   support

    negative       0.50      0.54      0.52     25965
    positive       0.59      0.55      0.57     30744

    accuracy                           0.54     56709
   macro avg       0.54      0.54      0.54     56709
weighted avg       0.55      0.54      0.55     56709



Попробуем также hashing векторайзер с большим количеством признаком и с применением L2-регуляризации:

In [12]:
vec = HashingVectorizer(n_features=2**20)
bow = vec.fit_transform(x_train)
clf = LogisticRegression(random_state=41)
clf.fit(bow, y_train)
pred = clf.predict(vec.transform(x_test))
print(classification_report(pred, y_test))

              precision    recall  f1-score   support

    negative       0.72      0.76      0.74     26712
    positive       0.77      0.74      0.76     29997

    accuracy                           0.75     56709
   macro avg       0.75      0.75      0.75     56709
weighted avg       0.75      0.75      0.75     56709



In [13]:
vec = HashingVectorizer(n_features=2**20, norm='l2', alternate_sign=False)
bow = vec.fit_transform(x_train)
clf = LogisticRegression(random_state=41)
clf.fit(bow, y_train)
pred = clf.predict(vec.transform(x_test))
print(classification_report(pred, y_test))

              precision    recall  f1-score   support

    negative       0.72      0.76      0.74     26732
    positive       0.77      0.74      0.76     29977

    accuracy                           0.75     56709
   macro avg       0.75      0.75      0.75     56709
weighted avg       0.75      0.75      0.75     56709

