# Предобработка текста с помощью Python

Осуществим предобработку данных с Твиттера, чтобы очищенные данные в дальнейшем использовать для задачи классификации. Данный датасет содержит негативные (label = 1) и нейтральные (label = 0) высказывания. Для работы объединим train_df и test_df.

Задания:

1. Удалим @user из всех твитов с помощью паттерна "@[\w]*". Для этого создадим функцию:
- для того, чтобы найти все вхождения паттерна в тексте, необходимо использовать re.findall(pattern, input_txt)
- для замены @user на пробел необходимо использовать re.sub()
2. Изменим регистр твитов на нижний с помощью .lower().
3. Заменим сокращения с апострофами (пример: ain't, can't) на их полные формы, используя apostrophe_dict. Для этого необходимо сделать функцию: для каждого слова в тексте (for word in text.split()) проверить, есть ли оно в словаре apostrophe_dict в качестве ключа (сокращённого слова), и если есть, заменить ключ на значение (полную версию слова).
4. Заменим сокращения на их полные формы, используя short_word_dict. Для этого воспользуемся функцией, используемой в предыдущем пункте.
5. Заменим эмотиконы на их словесные обозначения (пример: ":)" → "happy"), используя emoticon_dict. Для этого воспользуемся функцией, используемой в предыдущем пункте.
6. Заменим пунктуацию на пробелы, используя re.sub() и паттерн r'[^\w\s]'.
7. Заменим спец. символы на пробелы, используя re.sub() и паттерн r'[^a-zA-Z0-9]'.
8. Заменим числа на пробелы, используя re.sub() и паттерн r'[^a-zA-Z]'.
9. Удалим из текста слова длиной в 1 символ, используя ' '.join([w for w in x.split() if len(w)>1]).
10. Поделим твиты на токены с помощью nltk.tokenize.word_tokenize, создав новый столбец 'tweet_token'.
11. Удалим стоп-слова из токенов, используя nltk.corpus.stopwords. Создадим столбец 'tweet_token_filtered' без стоп-слов.
12. Применим стемминг к токенам с помощью nltk.stem.PorterStemmer. Создадим столбец 'tweet_stemmed' после применения стемминга.
13. Применим лемматизацию к токенам с помощью nltk.stem.wordnet.WordNetLemmatizer. Создадим столбец 'tweet_lemmatized' после применения лемматизации.
14. Сохраним результат предобработки в pickle-файл.


In [4]:
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
import pickle
import warnings 
warnings.filterwarnings("ignore", category=DeprecationWarning)
import os

In [37]:
# Maps text emoticons to a coarse sentiment word ("happy" / "sad").
# Keys are matched against whole whitespace-delimited tokens, so a key must
# not contain spaces itself.
emoticon_dict = {
":)": "happy",
":-)": "happy",  # plain ASCII-hyphen form, the one people actually type
":‑)": "happy",  # non-ASCII hyphen variant kept from the original table
":-]": "happy",
":-3": "happy",
":->": "happy",
"8-)": "happy",
":-}": "happy",
":o)": "happy",
":c)": "happy",
":^)": "happy",
"=]": "happy",
"=)": "happy",
"<3": "happy",
":-(": "sad",
":(": "sad",
":c": "sad",
":<": "sad",
":[": "sad",
">:[": "sad",
":{": "sad",
">:(": "sad",
":-c": "sad",
":-<": "sad",  # was ":-< " (trailing space), which could never match a token
":-[": "sad",
":-||": "sad"
}

## Загрузка данных

In [2]:
train_df = pd.read_csv('train_tweets.csv')
train_df.head()

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation


In [3]:
test_df = pd.read_csv('test_tweets.csv')
test_df.head()

Unnamed: 0,id,tweet
0,31963,#studiolife #aislife #requires #passion #dedic...
1,31964,@user #white #supremacists want everyone to s...
2,31965,safe ways to heal your #acne!! #altwaystohe...
3,31966,is the hp and the cursed child book up for res...
4,31967,"3rd #bihday to my amazing, hilarious #nephew..."


In [5]:
# Stack train and test rows into one frame with a fresh 0..n-1 index.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat is the supported equivalent and yields the same result
# (test rows get NaN in the 'label' column they do not have).
tweet_df = pd.concat([train_df, test_df], ignore_index=True, sort=False)
tweet_df.head()

Unnamed: 0,id,label,tweet
0,1,0.0,@user when a father is dysfunctional and is s...
1,2,0.0,@user @user thanks for #lyft credit i can't us...
2,3,0.0,bihday your majesty
3,4,0.0,#model i love u take with u all the time in ...
4,5,0.0,factsguide: society now #motivation


In [6]:
tweet_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49159 entries, 0 to 49158
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   id      49159 non-null  int64  
 1   label   31962 non-null  float64
 2   tweet   49159 non-null  object 
dtypes: float64(1), int64(1), object(1)
memory usage: 1.1+ MB


### 1. Удаление @user из всех твитов

In [23]:
def del_user_from_text(text):
    """Replace every @mention in ``text`` with a single space.

    A mention is ``@`` followed by zero or more word characters
    (pattern ``@[\\w]*``). Surrounding text and whitespace are left untouched.

    Args:
        text: The raw tweet string.

    Returns:
        The tweet with each mention replaced by one space.
    """
    # Substitute with the pattern directly, in one pass. Collecting the
    # matches first and then re.sub-ing each found handle corrupts longer
    # handles that share a prefix: "@ab @abc" would leave a stray "c".
    return re.sub(r'@[\w]*', ' ', text)

In [24]:
tweet_df['tweet'][1]

"@user @user thanks for #lyft credit i can't use cause they don't offer wheelchair vans in pdx.    #disapointed #getthanked"

In [25]:
del_user_from_text(tweet_df['tweet'][1])

"    thanks for #lyft credit i can't use cause they don't offer wheelchair vans in pdx.    #disapointed #getthanked"

In [26]:
tweet_df['tweet'] = tweet_df['tweet'].apply(del_user_from_text)
tweet_df

Unnamed: 0,id,label,tweet
0,1,0.0,when a father is dysfunctional and is so se...
1,2,0.0,thanks for #lyft credit i can't use cause ...
2,3,0.0,bihday your majesty
3,4,0.0,#model i love u take with u all the time in ...
4,5,0.0,factsguide: society now #motivation
...,...,...,...
49154,49155,,thought factory: left-right polarisation! #tru...
49155,49156,,feeling like a mermaid ð #hairflip #neverre...
49156,49157,,#hillary #campaigned today in #ohio((omg)) &am...
49157,49158,,"happy, at work conference: right mindset leads..."


### 2. Изменение регистра твитов

In [27]:
tweet_df['tweet'] = tweet_df['tweet'].str.lower()
tweet_df

Unnamed: 0,id,label,tweet
0,1,0.0,when a father is dysfunctional and is so se...
1,2,0.0,thanks for #lyft credit i can't use cause ...
2,3,0.0,bihday your majesty
3,4,0.0,#model i love u take with u all the time in ...
4,5,0.0,factsguide: society now #motivation
...,...,...,...
49154,49155,,thought factory: left-right polarisation! #tru...
49155,49156,,feeling like a mermaid ð #hairflip #neverre...
49156,49157,,#hillary #campaigned today in #ohio((omg)) &am...
49157,49158,,"happy, at work conference: right mindset leads..."


### 3. Замена сокращений с апострофами на полные формы

In [28]:
# Lookup table: lowercase English contraction (with apostrophe) -> expanded form.
# Ambiguous contractions list both readings in one string separated by " / "
# (e.g. "he'd" -> "he had / he would"); the slash is ordinary punctuation in
# the value. Some expansions contain a capital "I" even though the keys are
# lowercase.
apostrophe_dict = {
"ain't": "am not / are not",
"aren't": "are not / am not",
"can't": "cannot",
"can't've": "cannot have",
"'cause": "because",
"could've": "could have",
"couldn't": "could not",
"couldn't've": "could not have",
"didn't": "did not",
"doesn't": "does not",
"don't": "do not",
"hadn't": "had not",
"hadn't've": "had not have",
"hasn't": "has not",
"haven't": "have not",
"he'd": "he had / he would",
"he'd've": "he would have",
"he'll": "he shall / he will",
"he'll've": "he shall have / he will have",
"he's": "he has / he is",
"how'd": "how did",
"how'd'y": "how do you",
"how'll": "how will",
"how's": "how has / how is",
"i'd": "I had / I would",
"i'd've": "I would have",
"i'll": "I shall / I will",
"i'll've": "I shall have / I will have",
"i'm": "I am",
"i've": "I have",
"isn't": "is not",
"it'd": "it had / it would",
"it'd've": "it would have",
"it'll": "it shall / it will",
"it'll've": "it shall have / it will have",
"it's": "it has / it is",
"let's": "let us",
"ma'am": "madam",
"mayn't": "may not",
"might've": "might have",
"mightn't": "might not",
"mightn't've": "might not have",
"must've": "must have",
"mustn't": "must not",
"mustn't've": "must not have",
"needn't": "need not",
"needn't've": "need not have",
"o'clock": "of the clock",
"oughtn't": "ought not",
"oughtn't've": "ought not have",
"shan't": "shall not",
"sha'n't": "shall not",
"shan't've": "shall not have",
"she'd": "she had / she would",
"she'd've": "she would have",
"she'll": "she shall / she will",
"she'll've": "she shall have / she will have",
"she's": "she has / she is",
"should've": "should have",
"shouldn't": "should not",
"shouldn't've": "should not have",
"so've": "so have",
"so's": "so as / so is",
"that'd": "that would / that had",
"that'd've": "that would have",
"that's": "that has / that is",
"there'd": "there had / there would",
"there'd've": "there would have",
"there's": "there has / there is",
"they'd": "they had / they would",
"they'd've": "they would have",
"they'll": "they shall / they will",
"they'll've": "they shall have / they will have",
"they're": "they are",
"they've": "they have",
"to've": "to have",
"wasn't": "was not",
"we'd": "we had / we would",
"we'd've": "we would have",
"we'll": "we will",
"we'll've": "we will have",
"we're": "we are",
"we've": "we have",
"weren't": "were not",
"what'll": "what shall / what will",
"what'll've": "what shall have / what will have",
"what're": "what are",
"what's": "what has / what is",
"what've": "what have",
"when's": "when has / when is",
"when've": "when have",
"where'd": "where did",
"where's": "where has / where is",
"where've": "where have",
"who'll": "who shall / who will",
"who'll've": "who shall have / who will have",
"who's": "who has / who is",
"who've": "who have",
"why's": "why has / why is",
"why've": "why have",
"will've": "will have",
"won't": "will not",
"won't've": "will not have",
"would've": "would have",
"wouldn't": "would not",
"wouldn't've": "would not have",
"y'all": "you all",
"y'all'd": "you all would",
"y'all'd've": "you all would have",
"y'all're": "you all are",
"y'all've": "you all have",
"you'd": "you had / you would",
"you'd've": "you would have",
"you'll": "you shall / you will",
"you'll've": "you shall have / you will have",
"you're": "you are",
"you've": "you have"
}

In [33]:
def replacement_in_text(text, word_dict):
    """Replace whole whitespace-delimited words of ``text`` found in ``word_dict``.

    Each maximal run of non-whitespace characters is looked up in
    ``word_dict``; on a hit it is replaced by the mapped value, otherwise it
    is kept. All original whitespace is preserved.

    Args:
        text: The string to process.
        word_dict: Mapping from a word (exact, case-sensitive token) to its
            replacement string.

    Returns:
        The text with matching words replaced.
    """
    def _expand(match):
        token = match.group()
        return word_dict.get(token, token)

    # Match whole tokens only. Substituting the key as a bare pattern over
    # the full string also rewrites it inside other words ("u" -> "you"
    # turns "but" into "byout") and makes the result depend on set iteration
    # order. A callable replacement is inserted literally, so backslashes in
    # dictionary values are not treated as regex escapes.
    return re.sub(r'\S+', _expand, text)

In [34]:
replacement_in_text(tweet_df['tweet'][1], apostrophe_dict)

'    thanks for #lyft credit i cannot use cause they do not offer wheelchair vans in pdx.    #disapointed #getthanked'

In [35]:
tweet_df['tweet'] = tweet_df['tweet'].apply(lambda x: replacement_in_text(x, apostrophe_dict))
tweet_df

Unnamed: 0,id,label,tweet
0,1,0.0,when a father is dysfunctional and is so se...
1,2,0.0,thanks for #lyft credit i cannot use cause...
2,3,0.0,bihday your majesty
3,4,0.0,#model i love u take with u all the time in ...
4,5,0.0,factsguide: society now #motivation
...,...,...,...
49154,49155,,thought factory: left-right polarisation! #tru...
49155,49156,,feeling like a mermaid ð #hairflip #neverre...
49156,49157,,#hillary #campaigned today in #ohio((omg)) &am...
49157,49158,,"happy, at work conference: right mindset leads..."


### 4. Замена сокращений на их полные формы

In [32]:
# Lookup table: lowercase chat/SMS abbreviation -> expanded phrase.
# Several expansions themselves contain punctuation, apostrophes or a
# capital "I" (e.g. "fwiw" -> "for what it's worth").
short_word_dict = {
"121": "one to one",
"a/s/l": "age, sex, location",
"adn": "any day now",
"afaik": "as far as I know",
"afk": "away from keyboard",
"aight": "alright",
"alol": "actually laughing out loud",
"b4": "before",
"b4n": "bye for now",
"bak": "back at the keyboard",
"bf": "boyfriend",
"bff": "best friends forever",
"bfn": "bye for now",
"bg": "big grin",
"bta": "but then again",
"btw": "by the way",
"cid": "crying in disgrace",
"cnp": "continued in my next post",
"cp": "chat post",
"cu": "see you",
"cul": "see you later",
"cul8r": "see you later",
"cya": "bye",
"cyo": "see you online",
"dbau": "doing business as usual",
"fud": "fear, uncertainty, and doubt",
"fwiw": "for what it's worth",
"fyi": "for your information",
"g": "grin",
"g2g": "got to go",
"ga": "go ahead",
"gal": "get a life",
"gf": "girlfriend",
"gfn": "gone for now",
"gmbo": "giggling my butt off",
"gmta": "great minds think alike",
"h8": "hate",
"hagn": "have a good night",
"hdop": "help delete online predators",
"hhis": "hanging head in shame",
"iac": "in any case",
"ianal": "I am not a lawyer",
"ic": "I see",
"idk": "I don't know",
"imao": "in my arrogant opinion",
"imnsho": "in my not so humble opinion",
"imo": "in my opinion",
"iow": "in other words",
"ipn": "I’m posting naked",
"irl": "in real life",
"jk": "just kidding",
"l8r": "later",
"ld": "later, dude",
"ldr": "long distance relationship",
"llta": "lots and lots of thunderous applause",
"lmao": "laugh my ass off",
"lmirl": "let's meet in real life",
"lol": "laugh out loud",
"ltr": "longterm relationship",
"lulab": "love you like a brother",
"lulas": "love you like a sister",
"luv": "love",
"m/f": "male or female",
"m8": "mate",
"milf": "mother I would like to fuck",
"oll": "online love",
"omg": "oh my god",
"otoh": "on the other hand",
"pir": "parent in room",
"ppl": "people",
"r": "are",
"rofl": "roll on the floor laughing",
"rpg": "role playing games",
"ru": "are you",
"shid": "slaps head in disgust",
"somy": "sick of me yet",
"sot": "short of time",
"thanx": "thanks",
"thx": "thanks",
"ttyl": "talk to you later",
"u": "you",
"ur": "you are",
"uw": "you’re welcome",
"wb": "welcome back",
"wfm": "works for me",
"wibni": "wouldn't it be nice if",
"wtf": "what the fuck",
"wtg": "way to go",
"wtgp": "want to go private",
"ym": "young man",
"gr8": "great"
}

In [36]:
tweet_df['tweet'] = tweet_df['tweet'].apply(lambda x: replacement_in_text(x, short_word_dict))
tweet_df

Unnamed: 0,id,label,tweet
0,1,0.0,when a father is dysfunctional and is so se...
1,2,0.0,thanks for #lyft credit i cannot use cause...
2,3,0.0,bihday your majesty
3,4,0.0,#model i love you take with you all the time...
4,5,0.0,factsguide: society now #motivation
...,...,...,...
49154,49155,,thought factory: left-right polarisation! #tru...
49155,49156,,feeling like a mermaid ð #hairflip #neverre...
49156,49157,,#hillary #campaigned today in #ohio((omg)) &am...
49157,49158,,"happy, at work conference: right mindset leads..."


### 5. Замена эмотиконов на слова

In [38]:
tweet_df['tweet'] = tweet_df['tweet'].apply(lambda x: replacement_in_text(x, emoticon_dict))
tweet_df

Unnamed: 0,id,label,tweet
0,1,0.0,when a father is dysfunctional and is so se...
1,2,0.0,thanks for #lyft credit i cannot use cause...
2,3,0.0,bihday your majesty
3,4,0.0,#model i love you take with you all the time...
4,5,0.0,factsguide: society now #motivation
...,...,...,...
49154,49155,,thought factory: left-right polarisation! #tru...
49155,49156,,feeling like a mermaid ð #hairflip #neverre...
49156,49157,,#hillary #campaigned today in #ohio((omg)) &am...
49157,49158,,"happy, at work conference: right mindset leads..."


### 6. Замена пунктуации на пробелы

In [39]:
def del_punctuation_from_text(text):
    """Replace each punctuation character in ``text`` with a space.

    "Punctuation" here means any character that is neither a word character
    (letters, digits, underscore) nor whitespace.
    """
    not_word_or_space = r'[^\w\s]'
    return re.sub(not_word_or_space, ' ', text)

In [42]:
tweet_df['tweet'][3]

'#model   i love you take with you all the time in yourð\x9f\x93±!!! ð\x9f\x98\x99ð\x9f\x98\x8eð\x9f\x91\x84ð\x9f\x91\x85ð\x9f\x92¦ð\x9f\x92¦ð\x9f\x92¦  '

In [43]:
del_punctuation_from_text(tweet_df['tweet'][3])

' model   i love you take with you all the time in yourð       ð   ð   ð   ð  \x85ð   ð   ð     '

In [44]:
tweet_df['tweet'] = tweet_df['tweet'].apply(del_punctuation_from_text)
tweet_df

Unnamed: 0,id,label,tweet
0,1,0.0,when a father is dysfunctional and is so se...
1,2,0.0,thanks for lyft credit i cannot use cause...
2,3,0.0,bihday your majesty
3,4,0.0,model i love you take with you all the time...
4,5,0.0,factsguide society now motivation
...,...,...,...
49154,49155,,thought factory left right polarisation tru...
49155,49156,,feeling like a mermaid ð hairflip neverre...
49156,49157,,hillary campaigned today in ohio omg am...
49157,49158,,happy at work conference right mindset leads...


### 7. Замена спец. символов на пробелы

In [45]:
def del_special_character_from_text(text):
    """Replace every character other than ASCII letters and digits with a space.

    Whitespace characters such as tabs and newlines are also outside the
    allowed set, so they become plain spaces too.
    """
    not_ascii_alnum = r'[^a-zA-Z0-9]'
    return re.sub(not_ascii_alnum, ' ', text)

In [46]:
tweet_df['tweet'][3]

' model   i love you take with you all the time in yourð       ð   ð   ð   ð  \x85ð   ð   ð     '

In [47]:
del_special_character_from_text(tweet_df['tweet'][3])

' model   i love you take with you all the time in your                                      '

In [48]:
tweet_df['tweet'] = tweet_df['tweet'].apply(del_special_character_from_text)
tweet_df

Unnamed: 0,id,label,tweet
0,1,0.0,when a father is dysfunctional and is so se...
1,2,0.0,thanks for lyft credit i cannot use cause...
2,3,0.0,bihday your majesty
3,4,0.0,model i love you take with you all the time...
4,5,0.0,factsguide society now motivation
...,...,...,...
49154,49155,,thought factory left right polarisation tru...
49155,49156,,feeling like a mermaid hairflip neverre...
49156,49157,,hillary campaigned today in ohio omg am...
49157,49158,,happy at work conference right mindset leads...


### 8. Замена чисел на пробелы

In [49]:
def del_number_from_text(text):
    """Replace every character that is not an ASCII letter with a space.

    Digits are the intended target, but any other non-letter character
    (including whitespace other than a space) is turned into a space as well.
    """
    not_ascii_letter = r'[^a-zA-Z]'
    return re.sub(not_ascii_letter, ' ', text)

In [58]:
tweet_df['tweet'][9]

'     welcome here    I am   it has   it is so  gr8   '

In [60]:
del_number_from_text(tweet_df['tweet'][9])

'     welcome here    I am   it has   it is so  gr    '

In [61]:
tweet_df['tweet'] = tweet_df['tweet'].apply(del_number_from_text)
tweet_df

Unnamed: 0,id,label,tweet
0,1,0.0,when a father is dysfunctional and is so se...
1,2,0.0,thanks for lyft credit i cannot use cause...
2,3,0.0,bihday your majesty
3,4,0.0,model i love you take with you all the time...
4,5,0.0,factsguide society now motivation
...,...,...,...
49154,49155,,thought factory left right polarisation tru...
49155,49156,,feeling like a mermaid hairflip neverre...
49156,49157,,hillary campaigned today in ohio omg am...
49157,49158,,happy at work conference right mindset leads...


### 9. Удаление слов длиной в 1 символ

In [62]:
def del_word_of_len_1_from_text(text):
    """Drop one-character words from ``text``.

    The text is split on whitespace, words longer than one character are
    kept, and the survivors are re-joined with single spaces (so runs of
    whitespace collapse and leading/trailing whitespace disappears).
    """
    long_enough = filter(lambda piece: len(piece) > 1, text.split())
    return ' '.join(long_enough)

In [63]:
tweet_df['tweet'][12]

'i get to see my daddy today        days  gettingfed'

In [64]:
del_word_of_len_1_from_text(tweet_df['tweet'][12])

'get to see my daddy today days gettingfed'

In [65]:
tweet_df['tweet'] = tweet_df['tweet'].apply(del_word_of_len_1_from_text)
tweet_df

Unnamed: 0,id,label,tweet
0,1,0.0,when father is dysfunctional and is so selfish...
1,2,0.0,thanks for lyft credit cannot use cause they d...
2,3,0.0,bihday your majesty
3,4,0.0,model love you take with you all the time in your
4,5,0.0,factsguide society now motivation
...,...,...,...
49154,49155,,thought factory left right polarisation trump ...
49155,49156,,feeling like mermaid hairflip neverready forma...
49156,49157,,hillary campaigned today in ohio omg amp used ...
49157,49158,,happy at work conference right mindset leads t...


### 10. Токенизация

In [66]:
def get_token_from_text(text):
    """Split ``text`` into tokens with NLTK's ``word_tokenize``.

    Returns whatever ``word_tokenize`` returns for the given string
    (a list of token strings).
    """
    return word_tokenize(text)

In [67]:
tweet_df['tweet'][17]

'retweet if you agree'

In [68]:
get_token_from_text(tweet_df['tweet'][17])

['retweet', 'if', 'you', 'agree']

In [69]:
tweet_df['tweet_token'] = tweet_df['tweet'].apply(get_token_from_text)
tweet_df

Unnamed: 0,id,label,tweet,tweet_token
0,1,0.0,when father is dysfunctional and is so selfish...,"[when, father, is, dysfunctional, and, is, so,..."
1,2,0.0,thanks for lyft credit cannot use cause they d...,"[thanks, for, lyft, credit, can, not, use, cau..."
2,3,0.0,bihday your majesty,"[bihday, your, majesty]"
3,4,0.0,model love you take with you all the time in your,"[model, love, you, take, with, you, all, the, ..."
4,5,0.0,factsguide society now motivation,"[factsguide, society, now, motivation]"
...,...,...,...,...
49154,49155,,thought factory left right polarisation trump ...,"[thought, factory, left, right, polarisation, ..."
49155,49156,,feeling like mermaid hairflip neverready forma...,"[feeling, like, mermaid, hairflip, neverready,..."
49156,49157,,hillary campaigned today in ohio omg amp used ...,"[hillary, campaigned, today, in, ohio, omg, am..."
49157,49158,,happy at work conference right mindset leads t...,"[happy, at, work, conference, right, mindset, ..."


### 11. Удаление стоп-слов

In [70]:
# English stop words, loaded on first use and then reused for every call.
_ENGLISH_STOP_WORDS = None


def del_stop_word_from_token(token):
    """Return the tokens of ``token`` that are not English stop words.

    Args:
        token: An iterable of token strings (one tokenized tweet).

    Returns:
        A new list with the stop words removed, order preserved. Matching
        is exact and case-sensitive against NLTK's English stop-word list.
    """
    global _ENGLISH_STOP_WORDS
    # The stop-word list does not change between calls, so build the set
    # once instead of re-reading the corpus for every DataFrame row.
    if _ENGLISH_STOP_WORDS is None:
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words("english"))
    return [word for word in token if word not in _ENGLISH_STOP_WORDS]

In [71]:
tweet_df['tweet_token_filtered'] = tweet_df['tweet_token'].apply(del_stop_word_from_token)
tweet_df

Unnamed: 0,id,label,tweet,tweet_token,tweet_token_filtered
0,1,0.0,when father is dysfunctional and is so selfish...,"[when, father, is, dysfunctional, and, is, so,...","[father, dysfunctional, selfish, drags, kids, ..."
1,2,0.0,thanks for lyft credit cannot use cause they d...,"[thanks, for, lyft, credit, can, not, use, cau...","[thanks, lyft, credit, use, cause, offer, whee..."
2,3,0.0,bihday your majesty,"[bihday, your, majesty]","[bihday, majesty]"
3,4,0.0,model love you take with you all the time in your,"[model, love, you, take, with, you, all, the, ...","[model, love, take, time]"
4,5,0.0,factsguide society now motivation,"[factsguide, society, now, motivation]","[factsguide, society, motivation]"
...,...,...,...,...,...
49154,49155,,thought factory left right polarisation trump ...,"[thought, factory, left, right, polarisation, ...","[thought, factory, left, right, polarisation, ..."
49155,49156,,feeling like mermaid hairflip neverready forma...,"[feeling, like, mermaid, hairflip, neverready,...","[feeling, like, mermaid, hairflip, neverready,..."
49156,49157,,hillary campaigned today in ohio omg amp used ...,"[hillary, campaigned, today, in, ohio, omg, am...","[hillary, campaigned, today, ohio, omg, amp, u..."
49157,49158,,happy at work conference right mindset leads t...,"[happy, at, work, conference, right, mindset, ...","[happy, work, conference, right, mindset, lead..."


### 12. Применение стемминга к токенам

In [72]:
def get_stemming_from_token(token):
    """Return the Porter stem of every token, in the original order.

    A fresh ``PorterStemmer`` is created for each call.
    """
    stemmer = PorterStemmer()
    stems = []
    for item in token:
        stems.append(stemmer.stem(item))
    return stems

In [73]:
tweet_df['tweet_stemmed'] = tweet_df['tweet_token'].apply(get_stemming_from_token)
tweet_df

Unnamed: 0,id,label,tweet,tweet_token,tweet_token_filtered,tweet_stemmed
0,1,0.0,when father is dysfunctional and is so selfish...,"[when, father, is, dysfunctional, and, is, so,...","[father, dysfunctional, selfish, drags, kids, ...","[when, father, is, dysfunct, and, is, so, self..."
1,2,0.0,thanks for lyft credit cannot use cause they d...,"[thanks, for, lyft, credit, can, not, use, cau...","[thanks, lyft, credit, use, cause, offer, whee...","[thank, for, lyft, credit, can, not, use, caus..."
2,3,0.0,bihday your majesty,"[bihday, your, majesty]","[bihday, majesty]","[bihday, your, majesti]"
3,4,0.0,model love you take with you all the time in your,"[model, love, you, take, with, you, all, the, ...","[model, love, take, time]","[model, love, you, take, with, you, all, the, ..."
4,5,0.0,factsguide society now motivation,"[factsguide, society, now, motivation]","[factsguide, society, motivation]","[factsguid, societi, now, motiv]"
...,...,...,...,...,...,...
49154,49155,,thought factory left right polarisation trump ...,"[thought, factory, left, right, polarisation, ...","[thought, factory, left, right, polarisation, ...","[thought, factori, left, right, polaris, trump..."
49155,49156,,feeling like mermaid hairflip neverready forma...,"[feeling, like, mermaid, hairflip, neverready,...","[feeling, like, mermaid, hairflip, neverready,...","[feel, like, mermaid, hairflip, neverreadi, fo..."
49156,49157,,hillary campaigned today in ohio omg amp used ...,"[hillary, campaigned, today, in, ohio, omg, am...","[hillary, campaigned, today, ohio, omg, amp, u...","[hillari, campaign, today, in, ohio, omg, amp,..."
49157,49158,,happy at work conference right mindset leads t...,"[happy, at, work, conference, right, mindset, ...","[happy, work, conference, right, mindset, lead...","[happi, at, work, confer, right, mindset, lead..."


### 13. Применение лемматизации к токенам

In [74]:
def get_lemmatization_from_token(token):
    """Lemmatize every token with WordNet, treating each one as a verb.

    A fresh ``WordNetLemmatizer`` is created for each call; the result keeps
    the original token order.
    """
    lemmatizer = WordNetLemmatizer()
    lemmas = []
    for item in token:
        lemmas.append(lemmatizer.lemmatize(item, pos=wordnet.VERB))
    return lemmas

In [75]:
tweet_df['tweet_lemmatized'] = tweet_df['tweet_token'].apply(get_lemmatization_from_token)
tweet_df

Unnamed: 0,id,label,tweet,tweet_token,tweet_token_filtered,tweet_stemmed,tweet_lemmatized
0,1,0.0,when father is dysfunctional and is so selfish...,"[when, father, is, dysfunctional, and, is, so,...","[father, dysfunctional, selfish, drags, kids, ...","[when, father, is, dysfunct, and, is, so, self...","[when, father, be, dysfunctional, and, be, so,..."
1,2,0.0,thanks for lyft credit cannot use cause they d...,"[thanks, for, lyft, credit, can, not, use, cau...","[thanks, lyft, credit, use, cause, offer, whee...","[thank, for, lyft, credit, can, not, use, caus...","[thank, for, lyft, credit, can, not, use, caus..."
2,3,0.0,bihday your majesty,"[bihday, your, majesty]","[bihday, majesty]","[bihday, your, majesti]","[bihday, your, majesty]"
3,4,0.0,model love you take with you all the time in your,"[model, love, you, take, with, you, all, the, ...","[model, love, take, time]","[model, love, you, take, with, you, all, the, ...","[model, love, you, take, with, you, all, the, ..."
4,5,0.0,factsguide society now motivation,"[factsguide, society, now, motivation]","[factsguide, society, motivation]","[factsguid, societi, now, motiv]","[factsguide, society, now, motivation]"
...,...,...,...,...,...,...,...
49154,49155,,thought factory left right polarisation trump ...,"[thought, factory, left, right, polarisation, ...","[thought, factory, left, right, polarisation, ...","[thought, factori, left, right, polaris, trump...","[think, factory, leave, right, polarisation, t..."
49155,49156,,feeling like mermaid hairflip neverready forma...,"[feeling, like, mermaid, hairflip, neverready,...","[feeling, like, mermaid, hairflip, neverready,...","[feel, like, mermaid, hairflip, neverreadi, fo...","[feel, like, mermaid, hairflip, neverready, fo..."
49156,49157,,hillary campaigned today in ohio omg amp used ...,"[hillary, campaigned, today, in, ohio, omg, am...","[hillary, campaigned, today, ohio, omg, amp, u...","[hillari, campaign, today, in, ohio, omg, amp,...","[hillary, campaign, today, in, ohio, omg, amp,..."
49157,49158,,happy at work conference right mindset leads t...,"[happy, at, work, conference, right, mindset, ...","[happy, work, conference, right, mindset, lead...","[happi, at, work, confer, right, mindset, lead...","[happy, at, work, conference, right, mindset, ..."


### 14. Сохранение результата

In [76]:
# Persist the preprocessed DataFrame (all derived columns included) so later
# notebooks can load it without repeating the pipeline.
with open('tweet_prep.pickle', 'wb') as out_file:
    pickle.dump(tweet_df, out_file)