# ДЗ 2. Создание признакового пространства

Продолжим обработку данных с Твиттера.

In [1]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn import model_selection, preprocessing, linear_model
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, median_absolute_error, r2_score
from sklearn import decomposition

import warnings 
warnings.filterwarnings("ignore", category=DeprecationWarning)
import os

### Загрузка и подготовка данных

In [2]:
combine_df = pd.read_pickle("data/combine_df.pkl")
combine_df.head()

Unnamed: 0,id,label,tweet,result,token,token_filtered,tweet_stemmed,tweet_lemmatized
0,1,0.0,@user when a father is dysfunctional and is s...,when father is dysfunctional and is so selfish...,"[when, father, is, dysfunctional, and, is, so,...","[father, dysfunctional, selfish, drags, kids, ...","[father, dysfunct, selfish, drag, kid, dysfunc...","[father, dysfunctional, selfish, drag, kid, dy..."
1,2,0.0,@user @user thanks for #lyft credit i can't us...,thanks for lyft credit cannot use cause they d...,"[thanks, for, lyft, credit, can, not, use, cau...","[thanks, lyft, credit, use, cause, offer, whee...","[thank, lyft, credit, use, caus, offer, wheelc...","[thanks, lyft, credit, use, cause, offer, whee..."
2,3,0.0,bihday your majesty,bihday your majesty,"[bihday, your, majesty]","[bihday, majesty]","[bihday, majesti]","[bihday, majesty]"
3,4,0.0,#model i love u take with u all the time in ...,model love you take with you all the time in ur,"[model, love, you, take, with, you, all, the, ...","[model, love, take, time, ur]","[model, love, take, time, ur]","[model, love, take, time, ur]"
4,5,0.0,factsguide: society now #motivation,factsguide society now motivation,"[factsguide, society, now, motivation]","[factsguide, society, motivation]","[factsguid, societi, motiv]","[factsguide, society, motivation]"


In [3]:
# Преобразуем данные и выделим их в отдельный датафрейм
combine_df['tweet_stemmed'] = combine_df['tweet_stemmed'].apply(lambda tokens: ' '.join(tokens))
combine_df['tweet_lemmatized'] = combine_df['tweet_lemmatized'].apply(lambda tokens: ' '.join(tokens))

### Мешок слов с помощью CountVectorizer

**Создайте мешок слов с помощью sklearn.feature_extraction.text.CountVectorizer.fit_transform()**  

Применим его к 'tweet_stemmed' и 'tweet_lemmatized' отдельно.  

- Игнорируем слова, частота которых в документе строго превышает порог 0.9 с помощью max_df.  
- Ограничим количество слов, попадающий в мешок, с помощью max_features = 1000.  
- Исключим стоп-слова с помощью stop_words='english'.  
- Отобразим Bag-of-Words модель как DataFrame. columns необходимо извлечь с помощью CountVectorizer.get_feature_names().  

In [4]:
count_vectorizer = CountVectorizer(ngram_range=(1, 1), 
                                   analyzer='word', 
                                   binary=False, 
                                   tokenizer=str.split, 
                                   stop_words="english", 
                                   max_df=0.9, 
                                   max_features=1000)

In [5]:
# Создаем the Bag-of-Words модель для tweet_stemmed
bag_of_words_stemmed = count_vectorizer.fit_transform(combine_df['tweet_stemmed'])

In [6]:
# Отобразим Bag-of-Words модель как DataFrame
feature_names = count_vectorizer.get_feature_names()
stemmed_count = pd.DataFrame(bag_of_words_stemmed.toarray(), columns = feature_names)
stemmed_count.head()

Unnamed: 0,abl,absolut,accept,account,act,action,actor,actual,ad,adapt,...,yeah,year,yesterday,yo,yoga,york,young,youtub,yr,yummi
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
# Создаем the Bag-of-Words модель для tweet_lemmatized
bag_of_words_lemmatized = count_vectorizer.fit_transform(combine_df['tweet_lemmatized'])

In [8]:
# Отобразим Bag-of-Words модель как DataFrame
feature_names = count_vectorizer.get_feature_names()
lemmatized_count = pd.DataFrame(bag_of_words_lemmatized.toarray(), columns = feature_names)
lemmatized_count.head()

Unnamed: 0,able,absolutely,account,act,action,actor,actually,adapt,add,adventure,...,year,yes,yesterday,yo,yoga,york,young,youtube,yr,yummy
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Мешок слов с помощью TfidfVectorizer

**Создайте мешок слов с помощью sklearn.feature_extraction.text.TfidfVectorizer.fit_transform()**

Применим его к 'tweet_stemmed' и 'tweet_lemmatized' отдельно.  

- Игнорируем слова, частота которых в документе строго превышает порог 0.9 с помощью max_df.  
- Ограничим количество слов, попадающий в мешок, с помощью max_features = 1000.  
- Исключим стоп-слова с помощью stop_words='english'.  
- Отобразим Bag-of-Words модель как DataFrame. columns необходимо извлечь с помощью TfidfVectorizer.get_feature_names().  

In [9]:
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 1), 
                                   analyzer='word', 
                                   binary=False, 
                                   tokenizer=str.split, 
                                   stop_words='english', 
                                   max_df=0.9, 
                                   max_features=1000)

In [10]:
# Создаем the Bag-of-Words модель для tweet_stemmed
bag_of_words_stemmed = tfidf_vectorizer.fit_transform(combine_df['tweet_stemmed'])

In [11]:
# Отобразим Bag-of-Words модель как DataFrame
feature_names = tfidf_vectorizer.get_feature_names()
stemmed_tfidf = pd.DataFrame(bag_of_words_stemmed.toarray(), columns=feature_names)
stemmed_tfidf.head()

Unnamed: 0,abl,absolut,accept,account,act,action,actor,actual,ad,adapt,...,yeah,year,yesterday,yo,yoga,york,young,youtub,yr,yummi
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
# Создаем the Bag-of-Words модель для tweet_lemmatized
bag_of_words_lemmatized = tfidf_vectorizer.fit_transform(combine_df['tweet_lemmatized'])

In [13]:
# Отобразим Bag-of-Words модель как DataFrame
feature_names = tfidf_vectorizer.get_feature_names()
lemmatized_tfidf = pd.DataFrame(bag_of_words_lemmatized.toarray(), columns=feature_names)
lemmatized_tfidf.head()

Unnamed: 0,able,absolutely,account,act,action,actor,actually,adapt,add,adventure,...,year,yes,yesterday,yo,yoga,york,young,youtube,yr,yummy
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Сравнение векторайзеров

Проверьте ваши векторайзеры на корпусе который использовали на вебинаре, составьте таблицу метод векторизации и скор который вы получили (в методах векторизации по изменяйте параметры что бы добиться лучшего скора) обратите внимание как падает/растёт скор при уменьшении количества фичей, и изменении параметров, так же попробуйте применить к векторайзерам PCA для сокращения размерности посмотрите на качество сделайте выводы.

**Base score**

In [14]:
combine_df['label'] = combine_df['label'].apply(lambda y: 0 if np.isnan(y) else y)
y = combine_df['label'].astype('float64')

In [15]:
pd.options.display.float_format = '{:.8f}'.format

In [18]:
features = [['stemmed_count', stemmed_count], ['stemmed_tfidf', stemmed_tfidf], 
           ['lemmatized_count', lemmatized_count], ['lemmatized_tfidf', lemmatized_tfidf]]

In [19]:
results = []

for feature in features:
    X_train, X_test, y_train, y_test = model_selection.train_test_split(feature[1], y)
    model = LinearRegression()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    results.append([
        feature[0], 
        mean_squared_error(y_test, y_pred), 
        mean_absolute_error(y_test, y_pred),
        median_absolute_error(y_test, y_pred), 
        r2_score(y_test, y_pred)
    ])

In [20]:
score_df = pd.DataFrame(results, columns=['model', 'mse', 'mae', 'median', 'r2 score'])
score_df

Unnamed: 0,model,mse,mae,median,r2 score
0,stemmed_count,0.03566267,0.08347376,0.03426095,0.17008151
1,stemmed_tfidf,0.0350199,0.08205668,0.02947686,0.19747529
2,lemmatized_count,1.0615079800861264e+19,29389059.72624626,0.03780336,-2.5724316117142687e+20
3,lemmatized_tfidf,1.370113735014475e+18,10558503.59014027,0.03061897,-3.118660824446664e+19


**Уменьшение размерности**

In [21]:
max_features = [500, 3000]

In [22]:
for feature in max_features:
    
    tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 2), 
                                   analyzer='word', 
                                   binary=False, 
                                   tokenizer=str.split, 
                                   stop_words='english', 
                                   max_df=0.9, 
                                   max_features=feature)

    X_train, X_test, y_train, y_test = model_selection.train_test_split(lemmatized_tfidf, y)

    model = LinearRegression()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    results.append([
        f'{max_features} lemmatized tfidf',
        mean_squared_error(y_test, y_pred), 
        mean_absolute_error(y_test, y_pred),
        median_absolute_error(y_test, y_pred), 
        r2_score(y_test, y_pred)
    ])

In [23]:
score_df = pd.DataFrame(results, columns=['model', 'mse', 'mae', 'median', 'r2 score'])
score_df

Unnamed: 0,model,mse,mae,median,r2 score
0,stemmed_count,0.03566267,0.08347376,0.03426095,0.17008151
1,stemmed_tfidf,0.0350199,0.08205668,0.02947686,0.19747529
2,lemmatized_count,1.0615079800861264e+19,29389059.72624626,0.03780336,-2.5724316117142687e+20
3,lemmatized_tfidf,1.370113735014475e+18,10558503.59014027,0.03061897,-3.118660824446664e+19
4,"[500, 3000] lemmatized tfidf",0.03617191,0.08231934,0.0289344,0.21857878
5,"[500, 3000] lemmatized tfidf",2.838143080018048e+19,58322493.56283243,0.02991418,-6.482000464394361e+20


**PCA**

In [24]:
max_features = [50, 300]

In [25]:
for feature in max_features:
    pca = decomposition.PCA(n_components=feature)
    pca.fit(lemmatized_tfidf)

    lemmatized_tfidf_pca = pca.transform(lemmatized_tfidf)
    X_train, X_test, y_train, y_test = model_selection.train_test_split(lemmatized_tfidf_pca, y)

    model = LinearRegression()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    results.append([
        f'PCA {max_features} lemmatized tfidf',
        mean_squared_error(y_test, y_pred), 
        mean_absolute_error(y_test, y_pred),
        median_absolute_error(y_test, y_pred), 
        r2_score(y_test, y_pred)])

In [26]:
score_df = pd.DataFrame(results, columns=['model', 'mse', 'mae', 'median', 'r2 score'])
score_df

Unnamed: 0,model,mse,mae,median,r2 score
0,stemmed_count,0.03566267,0.08347376,0.03426095,0.17008151
1,stemmed_tfidf,0.0350199,0.08205668,0.02947686,0.19747529
2,lemmatized_count,1.0615079800861264e+19,29389059.72624626,0.03780336,-2.5724316117142687e+20
3,lemmatized_tfidf,1.370113735014475e+18,10558503.59014027,0.03061897,-3.118660824446664e+19
4,"[500, 3000] lemmatized tfidf",0.03617191,0.08231934,0.0289344,0.21857878
5,"[500, 3000] lemmatized tfidf",2.838143080018048e+19,58322493.56283243,0.02991418,-6.482000464394361e+20
6,"PCA [50, 300] lemmatized tfidf",0.04080016,0.08850586,0.04956678,0.07596445
7,"PCA [50, 300] lemmatized tfidf",0.03721058,0.0829861,0.03160749,0.19486282
