### Создание признакового пространства

1. Создайте мешок слов с помощью sklearn.feature_extraction.text.CountVectorizer.fit_transform(). Применим его к 'tweet_stemmed' и 'tweet_lemmatized' отдельно.
 * Игнорируем слова, частота которых в документе строго превышает порог 0.9 с помощью max_df.
 * Ограничим количество слов, попадающий в мешок, с помощью max_features = 1000.
 * Исключим стоп-слова с помощью stop_words='english'. 
 * Отобразим Bag-of-Words модель как DataFrame. columns необходимо извлечь с помощью   CountVectorizer.get_feature_names().

In [1]:
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings("ignore")

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
df = pd.read_pickle('df.pkl')
df.head()

Unnamed: 0,id,label,tweet,clean_tweet,tweet_token,tweet_token_filtered,tweet_stemmed,tweet_lemmatized
0,1,0.0,@user when a father is dysfunctional and is s...,when father is dysfunctional and is so selfish...,"[when, father, is, dysfunctional, and, is, so,...","[father, dysfunctional, selfish, drags, kids, ...","[father, dysfunct, selfish, drag, kid, dysfunc...","[father, dysfunctional, selfish, drag, kid, dy..."
1,2,0.0,@user @user thanks for #lyft credit i can't us...,thanks for lyft credit cannot use cause they d...,"[thanks, for, lyft, credit, can, not, use, cau...","[thanks, lyft, credit, use, cause, offer, whee...","[thank, lyft, credit, use, caus, offer, wheelc...","[thanks, lyft, credit, use, cause, offer, whee..."
2,3,0.0,bihday your majesty,bihday your majesty,"[bihday, your, majesty]","[bihday, majesty]","[bihday, majesti]","[bihday, majesty]"
3,4,0.0,#model i love u take with u all the time in ...,model love you take with you all the time in ur,"[model, love, you, take, with, you, all, the, ...","[model, love, take, time, ur]","[model, love, take, time, ur]","[model, love, take, time, ur]"
4,5,0.0,factsguide: society now #motivation,factsguide society now motivation,"[factsguide, society, now, motivation]","[factsguide, society, motivation]","[factsguid, societi, motiv]","[factsguide, society, motivation]"


In [3]:
list_stemmed = np.array(df['tweet_stemmed'].apply(lambda x: " ".join(x)))
list_lemmatized = np.array(df['tweet_lemmatized'].apply(lambda x: " ".join(x)))

In [4]:
stemmed_count_vectorizer = CountVectorizer(max_df = 0.9, max_features = 1000, stop_words = 'english')
lemmatized_count_vectorizer = CountVectorizer(max_df = 0.9, max_features = 1000, stop_words = 'english')

In [5]:
bag_count_stemmed = stemmed_count_vectorizer.fit_transform(list_stemmed)

In [6]:
feature_names_stemmed = stemmed_count_vectorizer.get_feature_names()
pd.DataFrame(bag_count_stemmed.toarray(), columns = feature_names_stemmed).head()

Unnamed: 0,abl,absolut,accept,account,act,action,actor,actual,ad,adapt,...,yeah,year,yesterday,yo,yoga,york,young,youtub,yr,yummi
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
bag_count_lemmatized = lemmatized_count_vectorizer.fit_transform(list_lemmatized)

In [8]:
feature_names_lemmatized = lemmatized_count_vectorizer.get_feature_names()
pd.DataFrame(bag_count_lemmatized.toarray(), columns = feature_names_lemmatized).head()

Unnamed: 0,able,absolutely,account,act,action,actor,actually,adapt,add,adventure,...,year,yes,yesterday,yo,yoga,york,young,youtube,yr,yummy
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


2. Создайте мешок слов с помощью sklearn.feature_extraction.text.TfidfVectorizer.fit_transform(). Применим его к 'tweet_stemmed' и 'tweet_lemmatized' отдельно.
 * Игнорируем слова, частота которых в документе строго превышает порог 0.9 с помощью max_df.
 * Ограничим количество слов, попадающий в мешок, с помощью max_features = 1000.
 * Исключим стоп-слова с помощью stop_words='english'.
 * Отобразим Bag-of-Words модель как DataFrame. columns необходимо извлечь с помощью TfidfVectorizer.get_feature_names().

In [9]:
stemmed_tfidf_vectorizer = TfidfVectorizer(max_df = 0.9, max_features = 1000, stop_words = 'english')
lemmatized_tfidf_vectorizer = TfidfVectorizer(max_df = 0.9, max_features = 1000, stop_words = 'english')

In [10]:
bag_tfidf_stemmed = stemmed_tfidf_vectorizer.fit_transform(list_stemmed)

In [11]:
feature_names_stemmed = stemmed_tfidf_vectorizer.get_feature_names()
pd.DataFrame(bag_tfidf_stemmed.toarray(), columns = feature_names_stemmed).head()

Unnamed: 0,abl,absolut,accept,account,act,action,actor,actual,ad,adapt,...,yeah,year,yesterday,yo,yoga,york,young,youtub,yr,yummi
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
bag_tfidf_lemmatized = lemmatized_tfidf_vectorizer.fit_transform(list_lemmatized)

In [13]:
feature_names_lemmatized = lemmatized_tfidf_vectorizer.get_feature_names()
pd.DataFrame(bag_tfidf_lemmatized.toarray(), columns = feature_names_lemmatized).head()

Unnamed: 0,able,absolutely,account,act,action,actor,actually,adapt,add,adventure,...,year,yes,yesterday,yo,yoga,york,young,youtube,yr,yummy
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


3. Проверьте ваши векторайзеры на корпусе который использовали на вебинаре, составьте таблицу метод векторизации и скор который вы получили (в методах векторизации по изменяйте параметры что бы добиться лучшего скора) обратите внимание как падает/растёт скор при уменьшении количества фичей, и изменении параметров, так же попробуйте применить к векторайзерам PCA для сокращения размерности посмотрите на качество сделайте выводы

In [14]:
data = open('corpus.txt').read()
labels, texts = [], []
for i, line in enumerate(data.split("\n")):
    content = line.split()
    labels.append(content[0])
    texts.append(" ".join(content[1:]))

# создаем df
trainDF = pd.DataFrame()
trainDF['text'] = texts
trainDF['label'] = labels
trainDF.head(5)

Unnamed: 0,text,label
0,Stuning even for the non-gamer: This sound tra...,__label__2
1,The best soundtrack ever to anything.: I'm rea...,__label__2
2,Amazing!: This soundtrack is my favorite music...,__label__2
3,Excellent Soundtrack: I truly like this soundt...,__label__2
4,"Remember, Pull Your Jaw Off The Floor After He...",__label__2


In [15]:
from sklearn import model_selection, preprocessing, linear_model
from sklearn.metrics import accuracy_score

In [16]:
X_train, X_test, y_train, y_test = model_selection.train_test_split(trainDF['text'], trainDF['label'])

In [17]:
classifier = linear_model.LogisticRegression()

# labelEncode целевую переменную
encoder = preprocessing.LabelEncoder()
y_train = encoder.fit_transform(y_train)
y_test = encoder.fit_transform(y_test)

vectorizers = []

vectorizers.append(('CountVectorizer', 
                    CountVectorizer(analyzer='word', token_pattern=r'\w{1,}', stop_words='english')))
vectorizers.append(('TfidfVectorizer', 
                    TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', stop_words='english')))

results = []

for name, vectorizer in vectorizers:
    vectorizer.fit(trainDF['text'])
    
    xtrain_vect = vectorizer.transform(X_train)
    xtest_vect = vectorizer.transform(X_test)
    
    classifier.fit(xtrain_vect, y_train)
    predictions = classifier.predict(xtest_vect)
    
    f1_score = accuracy_score(y_test, predictions)

    results.append((name, f1_score))

stock_model = pd.DataFrame(results, columns=['model', 'f1_score'])
stock_model.sort_values('f1_score', ascending=False)

Unnamed: 0,model,f1_score
1,TfidfVectorizer,0.852
0,CountVectorizer,0.8496


Подбор параметров

In [18]:
max_df = [1, 0.9, 0.5, 0.1]
max_features = [3000, 1500, 1000, 350]

In [19]:
vectorizers = []

for i in max_df:
    for j in max_features:
        vectorizers.append((f'CountVectorizer_max_df{i}_max_feat{j}',
                       CountVectorizer(analyzer='word',
                                       token_pattern=r'\w{1,}',
                                       max_df=i,
                                       max_features=j,
                                       stop_words='english')))
        vectorizers.append((f'TfidfVectorizer_max_df{i}_max_feat{j}',
                       TfidfVectorizer(analyzer='word',
                                       token_pattern=r'\w{1,}',
                                       max_df=i,
                                       max_features=j,
                                       stop_words='english')))

In [20]:
results = []

for name, vectorizer in vectorizers:

    vectorizer.fit(trainDF['text'])
    
    xtrain_vect = vectorizer.transform(X_train)
    xtest_vect = vectorizer.transform(X_test)

    classifier.fit(xtrain_vect, y_train)
    predictions = classifier.predict(xtest_vect)

    f1_score = accuracy_score(y_test, predictions)

    results.append((name, f1_score))

different_models = pd.DataFrame(results, columns=['model', 'f1_score'])
different_models.sort_values('f1_score', ascending=False)

Unnamed: 0,model,f1_score
17,TfidfVectorizer_max_df0.5_max_feat3000,0.854
9,TfidfVectorizer_max_df0.9_max_feat3000,0.854
11,TfidfVectorizer_max_df0.9_max_feat1500,0.8492
19,TfidfVectorizer_max_df0.5_max_feat1500,0.8492
25,TfidfVectorizer_max_df0.1_max_feat3000,0.8424
8,CountVectorizer_max_df0.9_max_feat3000,0.8404
16,CountVectorizer_max_df0.5_max_feat3000,0.8404
13,TfidfVectorizer_max_df0.9_max_feat1000,0.8372
21,TfidfVectorizer_max_df0.5_max_feat1000,0.8372
27,TfidfVectorizer_max_df0.1_max_feat1500,0.832


In [21]:
final_result = pd.concat([different_models, stock_model], axis=0)
final_result.sort_values('f1_score', ascending=False)

Unnamed: 0,model,f1_score
17,TfidfVectorizer_max_df0.5_max_feat3000,0.854
9,TfidfVectorizer_max_df0.9_max_feat3000,0.854
1,TfidfVectorizer,0.852
0,CountVectorizer,0.8496
19,TfidfVectorizer_max_df0.5_max_feat1500,0.8492
11,TfidfVectorizer_max_df0.9_max_feat1500,0.8492
25,TfidfVectorizer_max_df0.1_max_feat3000,0.8424
8,CountVectorizer_max_df0.9_max_feat3000,0.8404
16,CountVectorizer_max_df0.5_max_feat3000,0.8404
13,TfidfVectorizer_max_df0.9_max_feat1000,0.8372


### Выводы

1. Лучший результат показал TfidfVectorizer с параметрами max_df = 0.5 и max_features = 3000 - 0.8540.
2. Параметры влияют на f1_score, в конкретном случае при заданных значениях параметров результаты f1_score лучше чем стоковые.

In [22]:
from sklearn.decomposition import TruncatedSVD

In [23]:
vect = []

vect.append(('CountVectorizer', 
                    CountVectorizer(analyzer='word', token_pattern=r'\w{1,}', stop_words='english')))
vect.append(('TfidfVectorizer', 
                    TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', stop_words='english')))

In [24]:
results = []
for name, vectorizer in vect:
    for n in [100, 200, 500, 1000]:
        vectorizer.fit(trainDF['text'])

        svd = TruncatedSVD(n_components=n, random_state=42)

        xtrain_vect = vectorizer.transform(X_train)
        xtest_vect = vectorizer.transform(X_test)

        xtrain_vect_svd = svd.fit_transform(xtrain_vect)
        xtest_vect_svd = svd.transform(xtest_vect)

        classifier.fit(xtrain_vect_svd, y_train)
        predictions = classifier.predict(xtest_vect_svd)

        f1_score = accuracy_score(y_test, predictions)

        results.append((str(name) + '_' + str(n), f1_score))
pca_model = pd.DataFrame(results, columns=['model', 'f1_score'])
pca_model.sort_values('f1_score', ascending=False)

Unnamed: 0,model,f1_score
3,CountVectorizer_1000,0.8492
7,TfidfVectorizer_1000,0.8464
6,TfidfVectorizer_500,0.8448
2,CountVectorizer_500,0.8424
5,TfidfVectorizer_200,0.8292
1,CountVectorizer_200,0.8236
4,TfidfVectorizer_100,0.8164
0,CountVectorizer_100,0.8028


### Вывод

1. Уменьшение размерности матрицы методом SVD ухудшило результаты. Тем не менее в некоторых случаях полезно применить всё таки SVD. 