In [57]:
import re
import emoji
import pandas as pd

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer

In [58]:
# Location of the training CSV, relative to the notebook's working directory.
path = 'data/tweet/train.csv'

# Read the whole file into a DataFrame.
data = pd.read_csv(filepath_or_buffer=path)

# Preview the first five rows.
data.head(n=5)

Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative
2,088c60f138,my boss is bullying me...,bullying me,negative
3,9642c003ef,what interview! leave me alone,leave me alone,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative


In [59]:
# Count missing values per column (isna is the canonical spelling of isnull).
data.isna().sum()

textID           0
text             1
selected_text    1
sentiment        0
dtype: int64

In [60]:
# Each step rebinds `data` to a new frame instead of mutating in place:
# an in-place drop on a boolean-filtered slice can raise pandas'
# SettingWithCopyWarning, and in-place edits make the cell unsafe to re-run.
data = data.dropna()

# Keep only the polar classes (positive / negative).
data = data[data['sentiment'] != 'neutral']

# errors='ignore' lets this cell be executed again after the columns are gone.
data = data.drop(columns=['textID', 'selected_text'], errors='ignore')
data.shape

(16363, 2)

In [61]:
# Work on the first 1000 rows only. head() returns a slice of the parent frame,
# so take an explicit copy: later cells assign new column values into `data`,
# which on a slice can trigger pandas' SettingWithCopyWarning.
data = data.head(1000).copy()

## Cleaning Data

The steps are as follows:
- Replace repeated characters (more than twice in a row) with two characters
- Remove all emails
- Replace URLs with 'link'
- Replace all user mentions with 'user_mention'
- Replace hashtags by keeping the words and removing the '#'
- Convert emoji to descriptive text
- Remove all colons from the emoji text
- Replace underscores with spaces
- Remove numbers and punctuation
- Lowercase all text
- Tokenize the text
- Remove stopwords
- Lemmatize the text
- Stem the text
- Join the text back together

In [62]:
# NLTK's English stop-word list, as a set for O(1) membership tests.
stop_words = set(stopwords.words('english'))

# Negation terms that must NOT be treated as stop words
# (presumably because they can flip the sentiment of a tweet).
negative_words = {
    "not", "no", "nor", "never", "n't",
    "wasn't", "weren't", "isn't", "aren't",
    "don't", "didn't",
    "hasn't", "haven't", "hadn't",
    "won't", "wouldn't", "couldn't", "shouldn't",
    "mustn't", "mightn't", "needn't",
}

# Stop words actually removed during cleaning: everything except the negations.
custom_stop_words = stop_words.difference(negative_words)

# Shared token normalisers, built once and reused for every tweet.
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

In [63]:
def clean_text(text):
  """Normalise a raw tweet into a space-joined string of stemmed tokens.

  Processing order:
    1. Remove e-mail addresses and replace URLs with the token 'link'.
    2. Squeeze any character repeated three or more times down to two.
    3. Replace @mentions with 'user_mention'; strip '#' from hashtags.
    4. Convert emoji to their text names, then drop colons, turn
       underscores into spaces and remove remaining punctuation.
    5. Lowercase, tokenize, keep alphabetic tokens that are not in
       ``custom_stop_words``, then lemmatize and stem each token.

  Uses the module-level ``custom_stop_words``, ``lemmatizer`` and ``stemmer``.

  Args:
    text: Raw tweet text.

  Returns:
    The cleaned text as a single string. Non-string input (e.g. NaN from a
    missing cell) yields an empty string instead of raising.
  """
  if not isinstance(text, str):
    return ''

  # E-mails and URLs are handled BEFORE squeezing repeated characters:
  # squeezing first turns "www" into "ww", so the www-branch could never match.
  text = re.sub(r'\S+@\S+', '', text)
  # The dot is escaped so that only a literal "www." prefix counts as a URL.
  text = re.sub(r'http\S+|www\.\S+', 'link', text)

  text = re.sub(r'(.)\1{2,}', r'\1\1', text)
  text = re.sub(r'@\w+', 'user_mention', text)
  text = re.sub(r'#(\w+)', r'\1', text)

  text = emoji.demojize(text)
  text = re.sub(r':', '', text)
  # Also splits the 'user_mention' placeholder into two words.
  text = text.replace('_', ' ')
  text = re.sub(r'[^\w\s]', '', text)

  # Single pass: filter, then lemmatize, then stem each surviving token.
  tokens = [
    stemmer.stem(lemmatizer.lemmatize(token))
    for token in word_tokenize(text.lower())
    if token.isalpha() and token not in custom_stop_words
  ]
  return ' '.join(tokens)

# Clean every tweet; the cleaned string overwrites the raw 'text' column.
data['text'] = data['text'].map(clean_text)

### Mapping categorical data
Here we map `positive` to 1 and `negative` to 0.

In [64]:
# Integer codes for the two sentiment classes.
label_codes = {'positive': 1, 'negative': 0}

# Labels missing from the mapping become NaN (Series.map semantics).
data['sentiment'] = data['sentiment'].map(label_codes)
data.head()

Unnamed: 0,text,sentiment
1,soo sad miss san diego,0
2,bo bulli,0
3,interview leav alon,0
4,son couldnt put releas alreadi bought,0
6,feed babi fun smile coo,1
