### Подкючение библиотек и загрузка датасета

In [2]:
import numpy as np
import pandas as pd
import re
import nltk
import string

pd.options.mode.chained_assignment = None

full_df = pd.read_csv('tweet_small.csv')

### Просмотр данных

In [3]:
print(full_df.head())

   tweet_id     author_id  inbound                      created_at  \
0    119237        105834     True  Wed Oct 11 06:55:44 +0000 2017   
1    119238  ChaseSupport    False  Wed Oct 11 13:25:49 +0000 2017   
2    119239        105835     True  Wed Oct 11 13:00:09 +0000 2017   
3    119240  VirginTrains    False  Tue Oct 10 15:16:08 +0000 2017   
4    119241        105836     True  Tue Oct 10 15:17:21 +0000 2017   

                                                text response_tweet_id  \
0  @AppleSupport causing the reply to be disregar...            119236   
1  @105835 Your business means a lot to us. Pleas...               NaN   
2  @76328 I really hope you all change but I'm su...            119238   
3  @105836 LiveChat is online at the moment - htt...            119241   
4  @VirginTrains see attached error message. I've...            119243   

   in_response_to_tweet_id  
0                      NaN  
1                 119239.0  
2                      NaN  
3                 

In [4]:
df = full_df[['text']]
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 93 entries, 0 to 92
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    93 non-null     object
dtypes: object(1)
memory usage: 876.0+ bytes


In [5]:
print(df)

                                                 text
0   @AppleSupport causing the reply to be disregar...
1   @105835 Your business means a lot to us. Pleas...
2   @76328 I really hope you all change but I'm su...
3   @105836 LiveChat is online at the moment - htt...
4   @VirginTrains see attached error message. I've...
..                                                ...
88  @105860 I wish Amazon had an option of where I...
89  They reschedule my shit for tomorrow https://t...
90  @105861 Hey Sara, sorry to hear of the issues ...
91  @Tesco bit of both - finding the layout cumber...
92  @105861 If that doesn't help please DM your fu...

[93 rows x 1 columns]


### Перевод к нижнему регистру

In [6]:
df['text_lower'] = df['text'].str.lower()
print(df.head())

                                                text  \
0  @AppleSupport causing the reply to be disregar...   
1  @105835 Your business means a lot to us. Pleas...   
2  @76328 I really hope you all change but I'm su...   
3  @105836 LiveChat is online at the moment - htt...   
4  @VirginTrains see attached error message. I've...   

                                          text_lower  
0  @applesupport causing the reply to be disregar...  
1  @105835 your business means a lot to us. pleas...  
2  @76328 i really hope you all change but i'm su...  
3  @105836 livechat is online at the moment - htt...  
4  @virgintrains see attached error message. i've...  


### Удаление пунктуации

In [8]:
PUNCT_TO_REMOVE = string.punctuation
def remove_punctuation(text):
    return text.translate(str.maketrans('', '', PUNCT_TO_REMOVE))

df['text_wo_punct'] = df['text_lower'].apply(remove_punctuation)
df.head()

Unnamed: 0,text,text_lower,text_wo_punct
0,@AppleSupport causing the reply to be disregar...,@applesupport causing the reply to be disregar...,applesupport causing the reply to be disregard...
1,@105835 Your business means a lot to us. Pleas...,@105835 your business means a lot to us. pleas...,105835 your business means a lot to us please ...
2,@76328 I really hope you all change but I'm su...,@76328 i really hope you all change but i'm su...,76328 i really hope you all change but im sure...
3,@105836 LiveChat is online at the moment - htt...,@105836 livechat is online at the moment - htt...,105836 livechat is online at the moment https...
4,@VirginTrains see attached error message. I've...,@virgintrains see attached error message. i've...,virgintrains see attached error message ive tr...


### Удаление стоп-слов

In [9]:
from nltk.corpus import stopwords
nltk.download('stopwords')
', '.join(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/artemgolubnichiy/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


"i, me, my, myself, we, our, ours, ourselves, you, you're, you've, you'll, you'd, your, yours, yourself, yourselves, he, him, his, himself, she, she's, her, hers, herself, it, it's, its, itself, they, them, their, theirs, themselves, what, which, who, whom, this, that, that'll, these, those, am, is, are, was, were, be, been, being, have, has, had, having, do, does, did, doing, a, an, the, and, but, if, or, because, as, until, while, of, at, by, for, with, about, against, between, into, through, during, before, after, above, below, to, from, up, down, in, out, on, off, over, under, again, further, then, once, here, there, when, where, why, how, all, any, both, each, few, more, most, other, some, such, no, nor, not, only, own, same, so, than, too, very, s, t, can, will, just, don, don't, should, should've, now, d, ll, m, o, re, ve, y, ain, aren, aren't, couldn, couldn't, didn, didn't, doesn, doesn't, hadn, hadn't, hasn, hasn't, haven, haven't, isn, isn't, ma, mightn, mightn't, mustn, mus

In [10]:
STOPWORDS = set(stopwords.words('english'))
def remove_stopwords(text):
    return ' '.join([word for word in str(text).split(' ') if word not in STOPWORDS])

df['text_wo_stop'] = df['text_wo_punct'].apply(remove_stopwords)
df.head()

Unnamed: 0,text,text_lower,text_wo_punct,text_wo_stop
0,@AppleSupport causing the reply to be disregar...,@applesupport causing the reply to be disregar...,applesupport causing the reply to be disregard...,applesupport causing reply disregarded tapped ...
1,@105835 Your business means a lot to us. Pleas...,@105835 your business means a lot to us. pleas...,105835 your business means a lot to us please ...,105835 business means lot us please dm name zi...
2,@76328 I really hope you all change but I'm su...,@76328 i really hope you all change but i'm su...,76328 i really hope you all change but im sure...,76328 really hope change im sure wont dont
3,@105836 LiveChat is online at the moment - htt...,@105836 livechat is online at the moment - htt...,105836 livechat is online at the moment https...,105836 livechat online moment httpstcosy94vtu...
4,@VirginTrains see attached error message. I've...,@virgintrains see attached error message. i've...,virgintrains see attached error message ive tr...,virgintrains see attached error message ive tr...
