### Подключение библиотек и загрузка датасета

In [23]:
import numpy as np
import pandas as pd
import re
import nltk
import string

pd.options.mode.chained_assignment = None

full_df = pd.read_csv('tweet_small.csv')

### Просмотр данных

In [24]:
df = full_df[['text']]
df.info()
df.text[0]

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 93 entries, 0 to 92
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    93 non-null     object
dtypes: object(1)
memory usage: 876.0+ bytes


'@AppleSupport causing the reply to be disregarded and the tapped notification under the keyboard is opened😡😡😡'

### Перевод к нижнему регистру

In [25]:
df['text_lower'] = df['text'].str.lower()
df.head()

Unnamed: 0,text,text_lower
0,@AppleSupport causing the reply to be disregar...,@applesupport causing the reply to be disregar...
1,@105835 Your business means a lot to us. Pleas...,@105835 your business means a lot to us. pleas...
2,@76328 I really hope you all change but I'm su...,@76328 i really hope you all change but i'm su...
3,@105836 LiveChat is online at the moment - htt...,@105836 livechat is online at the moment - htt...
4,@VirginTrains see attached error message. I've...,@virgintrains see attached error message. i've...


### Удаление пунктуации

In [26]:
PUNCT_TO_REMOVE = string.punctuation
def remove_punctuation(text):
    return text.translate(str.maketrans('', '', PUNCT_TO_REMOVE))


df['text_wo_punct'] = df['text_lower'].apply(remove_punctuation)
df.head()

Unnamed: 0,text,text_lower,text_wo_punct
0,@AppleSupport causing the reply to be disregar...,@applesupport causing the reply to be disregar...,applesupport causing the reply to be disregard...
1,@105835 Your business means a lot to us. Pleas...,@105835 your business means a lot to us. pleas...,105835 your business means a lot to us please ...
2,@76328 I really hope you all change but I'm su...,@76328 i really hope you all change but i'm su...,76328 i really hope you all change but im sure...
3,@105836 LiveChat is online at the moment - htt...,@105836 livechat is online at the moment - htt...,105836 livechat is online at the moment https...
4,@VirginTrains see attached error message. I've...,@virgintrains see attached error message. i've...,virgintrains see attached error message ive tr...


### Удаление стоп-слов

In [29]:
from nltk.corpus import stopwords
nltk.download('stopwords')
', '.join(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/artemgolubnichiy/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


"i, me, my, myself, we, our, ours, ourselves, you, you're, you've, you'll, you'd, your, yours, yourself, yourselves, he, him, his, himself, she, she's, her, hers, herself, it, it's, its, itself, they, them, their, theirs, themselves, what, which, who, whom, this, that, that'll, these, those, am, is, are, was, were, be, been, being, have, has, had, having, do, does, did, doing, a, an, the, and, but, if, or, because, as, until, while, of, at, by, for, with, about, against, between, into, through, during, before, after, above, below, to, from, up, down, in, out, on, off, over, under, again, further, then, once, here, there, when, where, why, how, all, any, both, each, few, more, most, other, some, such, no, nor, not, only, own, same, so, than, too, very, s, t, can, will, just, don, don't, should, should've, now, d, ll, m, o, re, ve, y, ain, aren, aren't, couldn, couldn't, didn, didn't, doesn, doesn't, hadn, hadn't, hasn, hasn't, haven, haven't, isn, isn't, ma, mightn, mightn't, mustn, mus

In [30]:
STOPWORDS = set(stopwords.words('english'))
def remove_stopwords(text):
    return ' '.join([word for word in str(text).split(' ') if word not in STOPWORDS])

df['text_wo_stop'] = df['text_wo_punct'].apply(remove_stopwords)
df.head()

Unnamed: 0,text,text_lower,text_wo_punct,text_wo_stop
0,@AppleSupport causing the reply to be disregar...,@applesupport causing the reply to be disregar...,applesupport causing the reply to be disregard...,applesupport causing reply disregarded tapped ...
1,@105835 Your business means a lot to us. Pleas...,@105835 your business means a lot to us. pleas...,105835 your business means a lot to us please ...,105835 business means lot us please dm name zi...
2,@76328 I really hope you all change but I'm su...,@76328 i really hope you all change but i'm su...,76328 i really hope you all change but im sure...,76328 really hope change im sure wont dont
3,@105836 LiveChat is online at the moment - htt...,@105836 livechat is online at the moment - htt...,105836 livechat is online at the moment https...,105836 livechat online moment httpstcosy94vtu...
4,@VirginTrains see attached error message. I've...,@virgintrains see attached error message. i've...,virgintrains see attached error message ive tr...,virgintrains see attached error message ive tr...


### Удаление наиболее частых слов

In [31]:
from collections import Counter
cnt = Counter()
for text in df['text_wo_stop'].values:
    for word in text.split():
        cnt[word]+=1

cnt.most_common(10)


[('us', 25),
 ('dm', 19),
 ('help', 18),
 ('thanks', 13),
 ('httpstcogdrqu22ypt', 12),
 ('applesupport', 11),
 ('please', 11),
 ('phone', 9),
 ('hi', 9),
 ('ive', 8)]

In [32]:
FREQWORDS = set([w for (w,wc) in cnt.most_common(10)])

def remove_freqwords(text):
    return ' '.join([word for word in str(text).split(' ') if word not in FREQWORDS])

df['text_wo_stop_freq'] = df['text_wo_stop'].apply(remove_freqwords)
df.head()

Unnamed: 0,text,text_lower,text_wo_punct,text_wo_stop,text_wo_stop_freq
0,@AppleSupport causing the reply to be disregar...,@applesupport causing the reply to be disregar...,applesupport causing the reply to be disregard...,applesupport causing reply disregarded tapped ...,causing reply disregarded tapped notification ...
1,@105835 Your business means a lot to us. Pleas...,@105835 your business means a lot to us. pleas...,105835 your business means a lot to us please ...,105835 business means lot us please dm name zi...,105835 business means lot name zip code additi...
2,@76328 I really hope you all change but I'm su...,@76328 i really hope you all change but i'm su...,76328 i really hope you all change but im sure...,76328 really hope change im sure wont dont,76328 really hope change im sure wont dont
3,@105836 LiveChat is online at the moment - htt...,@105836 livechat is online at the moment - htt...,105836 livechat is online at the moment https...,105836 livechat online moment httpstcosy94vtu...,105836 livechat online moment httpstcosy94vtu...
4,@VirginTrains see attached error message. I've...,@virgintrains see attached error message. i've...,virgintrains see attached error message ive tr...,virgintrains see attached error message ive tr...,virgintrains see attached error message tried ...


### Удаление лишнего

In [33]:
df.drop(['text_wo_punct', 'text_wo_stop'], axis=1, inplace=True)

In [34]:
n_rare_words = 10
RAREWORDS = set([w for (w,wc) in cnt.most_common()[:-n_rare_words-1:-1]])
RAREWORDS

{'browser',
 'green',
 'httpstco9281okeebk',
 'including',
 'keen',
 'lee',
 'line',
 'log',
 'slowdown',
 'thin'}

In [35]:
def remove_rarewords(text):
    return ' '.join([word for word in str(text).split(' ') if word not in RAREWORDS])

df['text_wo_stop_freq_rare'] = df['text_wo_stop_freq'].apply(remove_rarewords)
df.head()

Unnamed: 0,text,text_lower,text_wo_stop_freq,text_wo_stop_freq_rare
0,@AppleSupport causing the reply to be disregar...,@applesupport causing the reply to be disregar...,causing reply disregarded tapped notification ...,causing reply disregarded tapped notification ...
1,@105835 Your business means a lot to us. Pleas...,@105835 your business means a lot to us. pleas...,105835 business means lot name zip code additi...,105835 business means lot name zip code additi...
2,@76328 I really hope you all change but I'm su...,@76328 i really hope you all change but i'm su...,76328 really hope change im sure wont dont,76328 really hope change im sure wont dont
3,@105836 LiveChat is online at the moment - htt...,@105836 livechat is online at the moment - htt...,105836 livechat online moment httpstcosy94vtu...,105836 livechat online moment httpstcosy94vtu...
4,@VirginTrains see attached error message. I've...,@virgintrains see attached error message. i've...,virgintrains see attached error message tried ...,virgintrains see attached error message tried ...


### Стемминг

In [36]:
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()

def stem_words(text):
    return ' '.join([stemmer.stem(word) for word in text.split()])

df['text_stemmed'] = df['text_wo_stop_freq_rare'].apply(stem_words)
df.head()

Unnamed: 0,text,text_lower,text_wo_stop_freq,text_wo_stop_freq_rare,text_stemmed
0,@AppleSupport causing the reply to be disregar...,@applesupport causing the reply to be disregar...,causing reply disregarded tapped notification ...,causing reply disregarded tapped notification ...,caus repli disregard tap notif keyboard opened😡😡😡
1,@105835 Your business means a lot to us. Pleas...,@105835 your business means a lot to us. pleas...,105835 business means lot name zip code additi...,105835 business means lot name zip code additi...,105835 busi mean lot name zip code addit detai...
2,@76328 I really hope you all change but I'm su...,@76328 i really hope you all change but i'm su...,76328 really hope change im sure wont dont,76328 really hope change im sure wont dont,76328 realli hope chang im sure wont dont
3,@105836 LiveChat is online at the moment - htt...,@105836 livechat is online at the moment - htt...,105836 livechat online moment httpstcosy94vtu...,105836 livechat online moment httpstcosy94vtu...,105836 livechat onlin moment httpstcosy94vtu8k...
4,@VirginTrains see attached error message. I've...,@virgintrains see attached error message. i've...,virgintrains see attached error message tried ...,virgintrains see attached error message tried ...,virgintrain see attach error messag tri leav v...
